├── .gitignore
├── LICENSE
├── SeqRecordLite.py
├── ab1.py
├── analyze_oligos.py
├── bin
│   ├── compute_divergence.py
│   ├── explode_fasta.py
│   ├── fasta2idt.py
│   ├── fasta2lenhist.py
│   ├── fasta2tiles.py
│   ├── fasta2uniq.py
│   ├── fasta_rm_newlines.py
│   ├── fasta_sort_by_abundance.py
│   ├── generate_otu_table.py
│   ├── generic_script.py
│   ├── idt2fasta.py
│   ├── make_timeseries_figures.py
│   ├── qiime_cluster_jobs_LSF.py
│   ├── quality_hist.py
│   ├── sff2fastq_trimmed.py
│   ├── streamgraph_html.py
│   ├── timeseries2json.py
│   └── timeseries2streamgraph.py
├── blast.py
├── blat.py
├── countdata.py
├── daemonize.py
├── degex.py
├── exonerate.py
├── graphtools.py
├── lsf.py
├── mplextensions.py
├── oligoTm.py
├── primers.py
├── pyutils.py
├── qPCR2melting.py
├── qPCR2quantitation.py
├── sanger.py
├── scale.py
├── seqtools.py
├── statstools.py
├── stitch.py
├── streamgraph.py
├── timeseries.py
└── unafold.py
/.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.so 3 | *.o 4 | build/ 5 | .DS_Store -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof.
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /SeqRecordLite.py: -------------------------------------------------------------------------------- 1 | import copy 2 | 3 | from Bio.Seq import Seq, UnknownSeq 4 | from Bio.SeqRecord import SeqRecord 5 | from Bio.SeqFeature import SeqFeature, FeatureLocation 6 | from Bio.Alphabet import NucleotideAlphabet 7 | 8 | 9 | class SeqRecordLite(object): 10 | """SeqRecord wrapper that allows simpler attribute access. 11 | 12 | The underlying data structure is actually a biopython `SeqRecord` object. 
13 | This class wraps it in a way that maintains the simple-to-use interface to 14 | get at some common annotations. It also knows how to print out it's data 15 | as IMGT-flavored INSDC (e.g., GenBank/EMBL). 16 | """ 17 | 18 | def __init__(self, biopython_object=None): 19 | 20 | # first we define our underlying SeqRecord object 21 | if biopython_object == None: 22 | self._record = SeqRecord(seq=UnknownSeq(0,alphabet=NucleotideAlphabet()),id='',name='',description='') 23 | elif isinstance(biopython_object,Seq): 24 | self._record = SeqRecord(seq=copy.deepcopy(biopython_object),id='',name='',description='') 25 | elif isinstance(biopython_object,SeqRecord): 26 | self._record = copy.deepcopy(biopython_object) 27 | 28 | # define dictionary of features for faster lookup 29 | self._features = {} 30 | for (i,feature) in enumerate(self._record.features): 31 | self._features.setdefault(feature.type,[]).append(i) 32 | 33 | 34 | def __getattr__(self,name): 35 | # This function should only get called if I am looking for an attribute that 36 | # didn't already have a getter defined or a default method. In this case, I 37 | # search the annotations dictionary or the features table of the underlying 38 | # SeqRecord to try to find the information. 39 | if name in self._record.annotations: 40 | return self._record.annotations[name] 41 | elif name in self._features: 42 | return [self._record.features[i] for i in self._features[name]] 43 | raise AttributeError 44 | 45 | 46 | # define properties to access some common SeqRecord interface 47 | 48 | @property 49 | def seq(self): 50 | return self._record.seq 51 | 52 | @seq.setter 53 | def seq(self,s): 54 | self._record.seq = s 55 | 56 | @property 57 | def annotations(self): 58 | return self._record.annotations 59 | 60 | @property 61 | def id(self): 62 | return self._record.id 63 | 64 | @id.setter 65 | def id(self,i): 66 | self._record.id = i 67 | 68 | @property 69 | def description(self): 70 | return self._record.description 71 | 72 | @description.setter 73 | def description(self,d): 74 | self._record.description = d 75 | 76 | @property 77 | def name(self): 78 | return self._record.name 79 | 80 | @name.setter 81 | def name(self,n): 82 | self._record.name = n 83 | 84 | @property 85 | def features(self): 86 | return self._record.features 87 | 88 | def format(self,*args,**kw): 89 | return self._record.format(*args,**kw) 90 | 91 | 92 | # manipulation of SeqRecord parts 93 | 94 | def add_feature(self,start=None,end=None,type='',strand=None,qualifiers=None): 95 | if start == None or end == None: 96 | raise ValueError, "if there is no spanning location...use an annotation?" 
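        # Coordinates are handed straight to Biopython's FeatureLocation, which is
        # 0-based with an exclusive end. A minimal, hypothetical usage sketch:
        #   rec = SeqRecordLite()
        #   rec.add_feature(start=0, end=9, type='V-REGION', strand=1)  # first nine bases
        #   rec.has_feature('V-REGION')  # -> True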
97 | location = FeatureLocation(start,end) 98 | feature = SeqFeature(location=location,type=type,strand=strand,qualifiers=qualifiers) 99 | self._record.features.append(feature) 100 | self._features.setdefault(feature.type,[]).append(len(self._record.features) - 1) 101 | return self 102 | 103 | def has_feature(self,type): 104 | return type in self._features 105 | 106 | def del_feature(self,type): 107 | idxs = self._features.pop(type) 108 | idxs.sort(reverse=True) 109 | for i in idxs: 110 | self._record.features.pop(i) 111 | return self 112 | 113 | 114 | # some standard interface 115 | 116 | def __len__(self): 117 | return len(self.seq) 118 | 119 | def __str__(self): 120 | return self.__repr__() 121 | 122 | def __repr__(self): 123 | return self.format('imgt') 124 | -------------------------------------------------------------------------------- /ab1.py: -------------------------------------------------------------------------------- 1 | # Downloaded from http://www.interactive-biosoftware.com/open-source/ABIFReader.py 2 | # on 14 November 2010. 3 | # 4 | # Python implementation of an ABIF file reader according to Applied Biosystems' specificatons, 5 | # see http://www.appliedbiosystems.com/support/software_community/ABIF_File_Format.pdf 6 | # 7 | # This code is published by Interactive Biosoftware, France, 8 | # see http://www.interactive-biosoftware.com/ 9 | # under GPL license, 10 | # see http://www.gnu.org/licenses/gpl.html 11 | # 12 | # Author: Francis Wolinski 13 | # Version: 1.0, March 2007 14 | # Copyright (c) Francis Wolinski 2007 15 | # 16 | # User Manual 17 | # 18 | # Conversion of ABIF data types to Python types (see struct.unpack method): 19 | # type 1 = byte -> integer 20 | # type 2 = char -> string 21 | # type 3 = word -> long 22 | # type 4 = short -> integer 23 | # type 5 = long -> integer 24 | # type 7 = float -> float 25 | # type 8 = double -> float 26 | # type 10 = date -> datetime.date instance 27 | # type 11 = time -> datetime.time instance 28 | # type 12 = thumb -> tuple 29 | # type 13 = bool -> True or False 30 | # type 18 = pString -> string 31 | # type 19 = cString -> string 32 | # type = 1024+ = user -> NotImplemented: to be overwritten in user's code in ABIFReader.readNextUserData method 33 | # type = other -> NotImplemented 34 | # 35 | # from ABIFReader import * 36 | # reader = ABIFReader() # creates an instance of ABIFReader 37 | # reader.version # version of ABIF file 38 | # reader.showEntries() # print all entries of ABIF file " () / ()" 39 | # data = reader.getData([, ]) # read data for entry named with number , by default is 1 40 | # reader.close() # close the file, since it is kept open 41 | # 42 | 43 | import struct 44 | import datetime 45 | 46 | ABIF_TYPES = {1: 'byte', 2: 'char', 3: 'word', 4: 'short', 5: 'long', 7: 'float', 8: 'double',\ 47 | 10: 'date', 11: 'time', 12: 'thumb', 13: 'bool', 18: 'pString', 19: 'cString'} 48 | 49 | class ABIFReader: 50 | def __init__(self, fn): 51 | self.filename = fn 52 | self.file = open(fn, 'rb') 53 | self.type = self.readNextString(4) 54 | if self.type != 'ABIF': 55 | self.close() 56 | raise SystemExit("error: No ABIF file '%s'" % fn) 57 | self.version = self.readNextShort() 58 | dir = DirEntry(self) 59 | self.seek(dir.dataoffset) 60 | self.entries = [DirEntry(self) for i in range(dir.numelements)] 61 | 62 | def getData(self, name, num = 1): 63 | entry = self.getEntry(name, num) 64 | if not entry: 65 | raise SystemExit("error: Entry '%s (%i)' not found in '%s'" % (name, num, self.filename)) 66 | 
self.seek(entry.mydataoffset()) 67 | data = self.readData(entry.elementtype, entry.numelements) 68 | if data != NotImplemented and len(data) == 1: 69 | return data[0] 70 | else: 71 | return data 72 | 73 | def showEntries(self): 74 | for e in self.entries: 75 | print e 76 | 77 | def getEntry(self, name, num): 78 | for e in self.entries: 79 | if e.name == name and e.number == num: 80 | return e 81 | return None 82 | 83 | def readData(self, type, num): 84 | if type == 1: 85 | return [self.readNextByte() for i in range(num)] 86 | elif type == 2: 87 | return self.readNextString(num) 88 | elif type == 3: 89 | return [self.readNextUnsignedInt() for i in range(num)] 90 | elif type == 4: 91 | return [self.readNextShort() for i in range(num)] 92 | elif type == 5: 93 | return [self.readNextLong() for i in range(num)] 94 | elif type == 7: 95 | return [self.readNextFloat() for i in range(num)] 96 | elif type == 8: 97 | return [self.readNextDouble() for i in range(num)] 98 | elif type == 10: 99 | return [self.readNextDate() for i in range(num)] 100 | elif type == 11: 101 | return [self.readNextTime() for i in range(num)] 102 | elif type == 12: 103 | return [self.readNextThumb() for i in range(num)] 104 | elif type == 13: 105 | return [self.readNextBool() for i in range(num)] 106 | elif type == 18: 107 | return self.readNextpString() 108 | elif type == 19: 109 | return self.readNextcString() 110 | elif type >= 1024: 111 | return self.readNextUserData(type, num) 112 | else: 113 | return NotImplemented 114 | 115 | def readNextBool(self): 116 | return readNextByte(self) == 1 117 | 118 | def readNextByte(self): 119 | return self.primUnpack('B', 1) 120 | 121 | def readNextChar(self): 122 | return self.primUnpack('c', 1) 123 | 124 | def readNextcString(self): 125 | chars = [] 126 | while True: 127 | c = self.readNextChar() 128 | if ord(c) == 0: 129 | return ''.join(chars) 130 | else: 131 | chars.append(c) 132 | 133 | def readNextDate(self): 134 | return datetime.date(self.readNextShort(), self.readNextByte(), self.readNextByte()) 135 | 136 | def readNextDouble(self): 137 | return self.primUnpack('>d', 8) 138 | 139 | def readNextInt(self): 140 | return self.primUnpack('>i', 4) 141 | 142 | def readNextFloat(self): 143 | return self.primUnpack('>f', 4) 144 | 145 | def readNextLong(self): 146 | return self.primUnpack('>l', 4) 147 | 148 | def readNextpString(self): 149 | nb = self.readNextByte() 150 | chars = [self.readNextChar() for i in range(nb)] 151 | return ''.join(chars) 152 | 153 | def readNextShort(self): 154 | return self.primUnpack('>h', 2) 155 | 156 | def readNextString(self, size): 157 | chars = [self.readNextChar() for i in range(size)] 158 | return ''.join(chars) 159 | 160 | def readNextThumb(self): 161 | return (self.readNextLong(), self.readNextLong(), self.readNextByte(), self.readNextByte()) 162 | 163 | def readNextTime(self): 164 | return datetime.time(self.readNextByte(), self.readNextByte(), self.readNextByte(), self.readNextByte()) 165 | 166 | def readNextUnsignedInt(self): 167 | return self.primUnpack('>I', 4) 168 | 169 | def readNextUserData(self, type, num): 170 | # to be overwritten in user's code 171 | return NotImplemented 172 | 173 | def primUnpack(self, format, nb): 174 | x = struct.unpack(format, self.file.read(nb)) 175 | return x[0] 176 | 177 | def close(self): 178 | self.file.close() 179 | 180 | def seek(self, pos): 181 | self.file.seek(pos) 182 | 183 | def tell(self): 184 | return self.file.tell() 185 | 186 | class DirEntry: 187 | def __init__(self, reader): 188 | self.name = 
reader.readNextString(4) 189 | self.number = reader.readNextInt() 190 | self.elementtype = reader.readNextShort() 191 | self.elementsize = reader.readNextShort() 192 | self.numelements = reader.readNextInt() 193 | self.datasize = reader.readNextInt() 194 | self.dataoffsetpos = reader.tell() 195 | self.dataoffset = reader.readNextInt() 196 | self.datahandle = reader.readNextInt() 197 | 198 | def __str__(self): 199 | return "%s (%i) / %s (%i)" % (self.name, self.number, self.mytype(), self.numelements) 200 | 201 | def mydataoffset(self): 202 | if self.datasize <= 4: 203 | return self.dataoffsetpos 204 | else: 205 | return self.dataoffset 206 | 207 | def mytype(self): 208 | if self.elementtype < 1024: 209 | return ABIF_TYPES.get(self.elementtype, 'unknown') 210 | else: 211 | return 'user' -------------------------------------------------------------------------------- /analyze_oligos.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from Bio import SeqIO 4 | import numpy as np 5 | 6 | import seqtools 7 | import oligoTm 8 | import unafold 9 | import blat 10 | 11 | # ================== 12 | # = Output primers = 13 | # ================== 14 | 15 | def output_primers(primers,names): 16 | datum = ('name','sequence','len','Tm',r'%GC','ss-dG','BLAT') 17 | header = "\n%-25s %-30s %-4s %-5s %-4s %-7s %-5s\n" % datum 18 | sys.stdout.write(header) 19 | 20 | lens = map(len,primers) 21 | Tms = map(oligoTm.oligo_Tm,primers) 22 | gcs = map(lambda p: seqtools.gc_content(p)*100,primers) 23 | dGs = map(lambda p: unafold.hybrid_ss_min(p,NA='DNA',sodium=0.05),primers) 24 | # trunc_primers = [p[-min(18,min(lens)):] for p in primers] 25 | trunc_primers = primers # NO TRUNCATION 26 | seqrecords = map(lambda t: seqtools.make_SeqRecord(*t),zip(names,trunc_primers)) 27 | # blat_hits = map(blat.search_sequence,seqrecords) 28 | 29 | for datum in zip(names,primers,lens,Tms,gcs,dGs): #,blat_hits): 30 | primer_string = "%-25s %-30s %-4i %-5.1f %-4.0f %-7.1f\n" % datum 31 | sys.stdout.write(primer_string) 32 | 33 | summary_data = lambda d: (np.mean(d),np.std(d),np.min(d),np.max(d)) 34 | 35 | sys.stdout.write('\nsummary:\n') 36 | sys.stdout.write('num primers: %i\n' % len(primers)) 37 | sys.stdout.write('len mean: %5.1f std: %5.1f min: %5.1f max %5.1f\n' % summary_data(lens)) 38 | sys.stdout.write('Tm mean: %5.1f std: %5.1f min: %5.1f max %5.1f\n' % summary_data(Tms)) 39 | sys.stdout.write('%%GC mean: %5.1f std: %5.1f min: %5.1f max %5.1f\n' % summary_data(gcs)) 40 | sys.stdout.write('dGs mean: %5.1f std: %5.1f min: %5.1f max %5.1f\n' % summary_data(dGs)) 41 | # sys.stdout.write('BLAT mean: %5.1f std: %5.1f min: %5.1f max %5.1f total: %5.1f\n' % (summary_data(blat_hits)+(np.sum(blat_hits),))) 42 | 43 | if __name__ == '__main__': 44 | 45 | if len(sys.argv) == 3: 46 | inhandle = open(sys.argv[1],'r') 47 | outhandle = open(sys.argv[2],'w') 48 | elif len(sys.argv) == 2: 49 | inhandle = open(sys.argv[1],'r') 50 | outhandle = sys.stdout 51 | elif len(sys.argv) == 1: 52 | inhandle = sys.stdin 53 | outhandle = sys.stdout 54 | 55 | seqrecords = list(SeqIO.parse(inhandle,'fasta')) 56 | names = [rec.id for rec in seqrecords] 57 | primers = [seqtools.get_string(rec) for rec in seqrecords] 58 | 59 | if not blat.is_server_running(): 60 | blat_server = blat.start_gfServer() 61 | 62 | output_primers(primers,names) 63 | 64 | # if blat.is_server_running(): 65 | # blat.stop_gfServer( blat_server ) 66 | 67 | -------------------------------------------------------------------------------- 
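A minimal sketch (not part of the repository) of how the per-oligo metrics tabulated by output_primers() above can be computed for a single primer, using the same helper calls the script makes; it assumes the repo's seqtools, oligoTm, and unafold modules are importable, that UNAFold's hybrid-ss-min is installed, and the primer sequence is a placeholder:

    import seqtools, oligoTm, unafold
    primer = 'ACGTGACCTAGGCATTCGAT'  # placeholder 20-mer
    tm = oligoTm.oligo_Tm(primer)                               # melting temperature
    gc = seqtools.gc_content(primer) * 100                      # %GC
    dg = unafold.hybrid_ss_min(primer, NA='DNA', sodium=0.05)   # single-strand folding dG
    print "len=%i Tm=%.1f %%GC=%.0f ss-dG=%.1f" % (len(primer), tm, gc, dg)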
/bin/compute_divergence.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | import subprocess 4 | import argparse 5 | 6 | argparser = argparse.ArgumentParser(description=None) 7 | argparser.add_argument('-q','--query',required=True) 8 | argparser.add_argument('-t','--target',required=True) 9 | argparser.add_argument('-o','--output',required=True) 10 | argparser.add_argument('-u','--usearch',default='usearch') 11 | args = argparser.parse_args() 12 | 13 | usearch_cmd = "%s --query %s --db %s --nofastalign --nousort --minlen 1 --maxaccepts 0 --maxrejects 0 --global --id 0 --userout %s --userfields query+target+id0+id1+id2+id3+id4+gaps+intgaps+qloz+qhiz+tloz+thiz+ql+tl+cols+intcols" 14 | 15 | p = subprocess.Popen(usearch_cmd % (args.usearch,args.query,args.target,args.output),shell=True) 16 | p.wait() 17 | -------------------------------------------------------------------------------- /bin/explode_fasta.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | import os 4 | import sys 5 | import argparse 6 | 7 | from Bio import SeqIO 8 | 9 | from pyutils import cleanup_id 10 | 11 | argparser = argparse.ArgumentParser(description=None) 12 | argparser.add_argument('input_file',nargs='?',type=argparse.FileType('r'),default=sys.stdin) 13 | argparser.add_argument('output_dir',nargs='?',default=os.getcwd()) 14 | args = argparser.parse_args() 15 | 16 | for record in SeqIO.parse(args.input_file,'fasta'): 17 | output_file = os.path.join(args.output_dir,'%s.fasta' % cleanup_id(record.id)) 18 | with open(output_file,'w') as op: 19 | print >>op, record.format('fasta') 20 | -------------------------------------------------------------------------------- /bin/fasta2idt.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | import sys 4 | 5 | import seqtools 6 | 7 | if len(sys.argv) == 3: 8 | inhandle = open(sys.argv[1],'r') 9 | outhandle = open(sys.argv[2],'w') 10 | elif len(sys.argv) == 2: 11 | inhandle = open(sys.argv[1],'r') 12 | outhandle = sys.stdout 13 | elif len(sys.argv) == 1: 14 | inhandle = sys.stdin 15 | outhandle = sys.stdout 16 | 17 | for (descr,seq) in seqtools.FastaIterator(inhandle): 18 | print >>outhandle, "%s\t%s" % (descr,seq) 19 | -------------------------------------------------------------------------------- /bin/fasta2lenhist.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | 3 | import os 4 | import sys 5 | import argparse 6 | 7 | import numpy as np 8 | import matplotlib as mpl 9 | mpl.use('agg') 10 | import matplotlib.pyplot as plt 11 | 12 | import seqtools 13 | 14 | argparser = argparse.ArgumentParser(description=None) 15 | argparser.add_argument('positional',nargs='*') 16 | argparser.add_argument('--log',action='store_true') 17 | args = argparser.parse_args() 18 | 19 | if len(args.positional) == 2: 20 | inhandle = open(args.positional[0],'r') 21 | outfile = args.positional[1] 22 | elif len(args.positional) == 1: 23 | inhandle = open(args.positional[0],'r') 24 | outfile = 'lenhist.png' 25 | elif len(args.positional) == 0: 26 | inhandle = sys.stdin 27 | outfile = 'lenhist.png' 28 | 29 | read_lengths = [] 30 | for (name,read) in seqtools.FastaIterator(inhandle): 31 | read_lengths.append(len(read)) 32 | 33 | print "Number of reads: %i" % len(read_lengths) 34 | print "Shortest read length: %i bp" % min(read_lengths) 35 | print "Longest read length: %i bp" % max(read_lengths) 36 | print "Median read length: %i bp" % np.median(read_lengths) 37 | print "Mean read length: %i bp" % np.mean(read_lengths) 38 | 39 | if not args.log: 40 | fig = plt.figure() 41 | ax = fig.add_subplot(111) 42 | ax.hist(read_lengths,bins=range(max(read_lengths)+1),linewidth=0,log=False) 43 | ax.set_xlabel('Read length') 44 | fig.savefig(outfile) 45 | else: 46 | fig = plt.figure() 47 | ax = fig.add_subplot(111) 48 | ax.hist(read_lengths,bins=range(max(read_lengths)+1),linewidth=0,log=True) 49 | ax.set_xlabel('Read length') 50 | fig.savefig(outfile) 51 | -------------------------------------------------------------------------------- /bin/fasta2tiles.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | import sys 4 | import optparse 5 | 6 | import blast 7 | 8 | parser = optparse.OptionParser() 9 | parser.add_option('-s','--size',type='int') 10 | parser.add_option('-o','--offset',type='int') 11 | parser.add_option('-p','--blastp',action='store_true') 12 | (options, args) = parser.parse_args() 13 | 14 | if len(args) == 2: 15 | inhandle = open(args[0],'r') 16 | outhandle = open(args[1],'w') 17 | elif len(args) == 1: 18 | inhandle = open(args[0],'r') 19 | outhandle = sys.stdout 20 | elif len(args) == 0: 21 | inhandle = sys.stdin 22 | outhandle = sys.stdout 23 | 24 | 25 | #----------------------------------------------------------------------------- 26 | 27 | def fasta_parser(handle): 28 | # taken from biopython 29 | 30 | #Skip any text before the first record (e.g. blank lines, comments) 31 | while True: 32 | line = handle.readline() 33 | if line == "" : return #Premature end of file, or just empty? 
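        # scan forward until the first '>' header line; any preamble lines are ignored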
34 | if line[0] == ">": 35 | break 36 | 37 | while True: 38 | if line[0]!=">": 39 | raise ValueError("Records in Fasta files should start with '>' character") 40 | descr = line[1:].rstrip() 41 | 42 | lines = [] 43 | line = handle.readline() 44 | while True: 45 | if not line : break 46 | if line[0] == ">": break 47 | lines.append(line.rstrip().replace(" ","").replace("\r","")) 48 | line = handle.readline() 49 | 50 | yield (descr,"".join(lines)) 51 | 52 | if not line : return #StopIteration 53 | assert False, "Should not reach this line" 54 | 55 | #----------------------------------------------------------------------------- 56 | 57 | tile_size = options.size 58 | tile_offset = options.offset 59 | 60 | for (descr,seq) in fasta_parser(inhandle): 61 | pos = 0 62 | num = 1 63 | while pos < len(seq): 64 | if pos+tile_size >= len(seq): # last tile in seq 65 | tile = seq[-tile_size:] 66 | start = len(seq) - tile_size 67 | end = len(seq) 68 | else: 69 | tile = seq[pos:pos+tile_size] 70 | start = pos 71 | end = pos+tile_size 72 | 73 | if options.blastp == True: 74 | num_hits = blast.number_genome_qblast_protein_hits(tile) 75 | print >>outhandle, '>%s|tile%03i|%i|%i|%i\n%s' % (descr,num,start,end,num_hits,tile) 76 | else: 77 | print >>outhandle, '>%s|tile%03i|%i|%i\n%s' % (descr,num,start,end,tile) 78 | 79 | pos += tile_offset 80 | num += 1 81 | -------------------------------------------------------------------------------- /bin/fasta2uniq.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import seqtools 4 | 5 | if len(sys.argv) == 3: 6 | inhandle = open(sys.argv[1],'r') 7 | outhandle = open(sys.argv[2],'w') 8 | elif len(sys.argv) == 2: 9 | inhandle = open(sys.argv[1],'r') 10 | outhandle = sys.stdout 11 | elif len(sys.argv) == 1: 12 | inhandle = sys.stdin 13 | outhandle = sys.stdout 14 | 15 | all_seqs = [] 16 | uniq_seqs = set() 17 | 18 | for (descr,seq) in seqtools.FastaIterator(inhandle): 19 | all_seqs.append((descr,seq)) 20 | uniq_seqs.add(seq) 21 | 22 | for (descr,seq) in all_seqs: 23 | if seq in uniq_seqs: 24 | outhandle.write('>%s\n%s\n' % (descr,seq)) 25 | uniq_seqs.remove(seq) 26 | 27 | -------------------------------------------------------------------------------- /bin/fasta_rm_newlines.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | import sys 4 | 5 | from Bio import SeqIO 6 | 7 | if len(sys.argv) == 3: 8 | inhandle = open(sys.argv[1],'r') 9 | outhandle = open(sys.argv[2],'w') 10 | elif len(sys.argv) == 2: 11 | inhandle = open(sys.argv[1],'r') 12 | outhandle = sys.stdout 13 | elif len(sys.argv) == 1: 14 | inhandle = sys.stdin 15 | outhandle = sys.stdout 16 | 17 | # SeqIO.write does not allow access to the wrap parameter 18 | # SeqIO.write(SeqIO.parse(inhandle,'fasta'),outhandle,'fasta') 19 | 20 | print_fasta = lambda r: outhandle.write(">%s\n%s\n" % (r.description,r.seq.tostring())) 21 | map(print_fasta,SeqIO.parse(inhandle,'fasta')) 22 | -------------------------------------------------------------------------------- /bin/fasta_sort_by_abundance.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | 3 | import sys 4 | import argparse 5 | from collections import defaultdict 6 | 7 | from seqtools import FastaIterator 8 | 9 | argparser = argparse.ArgumentParser(description=None) 10 | argparser.add_argument('input',nargs='?',type=argparse.FileType('r'),default=sys.stdin) 11 | argparser.add_argument('output',nargs='?',type=argparse.FileType('w'),default=sys.stdout) 12 | args = argparser.parse_args() 13 | 14 | counts = defaultdict(list) 15 | for (name,seq) in FastaIterator(args.input): 16 | counts[seq].append(name) 17 | 18 | for seq in sorted(counts.keys(), key=lambda k: len(counts[k]), reverse=True): 19 | for name in counts[seq]: 20 | args.output.write(">%s\n%s\n" % (name,seq)) 21 | -------------------------------------------------------------------------------- /bin/generate_otu_table.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | import optparse 4 | import os 5 | 6 | import vdj.analysis 7 | 8 | option_parser = optparse.OptionParser() 9 | option_parser.add_option('-m','--mapping_file') 10 | (options,args) = option_parser.parse_args() 11 | 12 | if len(args) != 2: 13 | raise ValueError, "need input and output filenames" 14 | 15 | # Read sample mapping file 16 | mapping_handle = open(options.mapping_file,'r') 17 | samples = [(line.split('\t')[0].strip(),line.split('\t')[2].strip()) for line in mapping_handle if not line.startswith('#')] 18 | mapping_handle.close() 19 | 20 | # Load count data 21 | infilename = args[0] 22 | inhandle = open(infilename,'r') 23 | (uniq_feature_values,countdict) = vdj.analysis.vdjxml2countdict(inhandle,['barcode','clone']) 24 | inhandle.close() 25 | 26 | # Convert to matrix form 27 | countmatrix = vdj.analysis.countdict2matrix(['barcode','clone'],uniq_feature_values,countdict).transpose() 28 | 29 | # Reorder columns to correspond to mapping file order 30 | sample_idxs = dict([(v,i) for (i,v) in enumerate(uniq_feature_values['barcode'])]) 31 | argsort = [sample_idxs[sample] for (sample,descr) in samples] 32 | countmatrix = countmatrix[:,argsort] 33 | 34 | # Dump OTU table 35 | basename = os.path.basename(args[0]) 36 | outfilename = args[1] 37 | outhandle = open(outfilename,'w') 38 | 39 | print >>outhandle, "#OTU counts %s" % basename 40 | header = "OTU ID" 41 | # for (sample,descr) in samples: header += "\t%s" % ('_'.join([sample,descr])) 42 | for (sample,descr) in samples: header += "\t%s" % sample 43 | print >>outhandle, header 44 | 45 | for (label,countvector) in zip(uniq_feature_values['clone'],countmatrix): 46 | line = label 47 | for count in countvector: 48 | line += "\t%i" % int(count) 49 | print >>outhandle, line 50 | 51 | outhandle.close() 52 | -------------------------------------------------------------------------------- /bin/generic_script.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | 3 | if __name__ == '__main__': 4 | import sys 5 | import argparse 6 | import os 7 | argparser = argparse.ArgumentParser(description=None) 8 | argparser.add_argument('positional',type=int,nargs='*') 9 | argparser.add_argument('input_file',nargs='?',type=argparse.FileType('r'),default=sys.stdin) 10 | argparser.add_argument('output_dir',nargs='?',default=os.getcwd()) 11 | argparser.add_argument('--option',dest='xxx',action='store_const',const=5,default=None) 12 | args = argparser.parse_args() 13 | 14 | if len(args.positional) == 2: 15 | inhandle = open(args.positional[0],'r') 16 | outhandle = open(args.positional[1],'w') 17 | elif len(args.positional) == 1: 18 | inhandle = open(args.positional[0],'r') 19 | outhandle = sys.stdout 20 | elif len(args.positional) == 0: 21 | inhandle = sys.stdin 22 | outhandle = sys.stdout 23 | 24 | 25 | 26 | # OR 27 | 28 | if __name__ == '__main__': 29 | import sys 30 | 31 | if len(sys.argv) == 3: 32 | inhandle = open(sys.argv[1],'r') 33 | outhandle = open(sys.argv[2],'w') 34 | elif len(sys.argv) == 2: 35 | inhandle = open(sys.argv[1],'r') 36 | outhandle = sys.stdout 37 | elif len(sys.argv) == 1: 38 | inhandle = sys.stdin 39 | outhandle = sys.stdout 40 | -------------------------------------------------------------------------------- /bin/idt2fasta.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import seqtools 4 | 5 | if len(sys.argv) == 3: 6 | inhandle = open(sys.argv[1],'r') 7 | outhandle = open(sys.argv[2],'w') 8 | elif len(sys.argv) == 2: 9 | inhandle = open(sys.argv[1],'r') 10 | outhandle = sys.stdout 11 | elif len(sys.argv) == 1: 12 | inhandle = sys.stdin 13 | outhandle = sys.stdout 14 | 15 | for line in inhandle: 16 | if line.strip() == '': 17 | print >>outhandle, '' 18 | continue 19 | descr = line.split()[0] 20 | seq = line.split()[1] 21 | print >>outhandle, ">%s\n%s" % (descr,seq) 22 | -------------------------------------------------------------------------------- /bin/make_timeseries_figures.py: -------------------------------------------------------------------------------- 1 | #!
/usr/bin/env python 2 | 3 | import optparse 4 | 5 | import numpy as np 6 | import matplotlib as mpl 7 | mpl.use('Agg') 8 | import matplotlib.pyplot as plt 9 | import matplotlib.collections 10 | 11 | import vdj 12 | import vdj.analysis 13 | import timeseries 14 | 15 | option_parser = optparse.OptionParser() 16 | option_parser.add_option('-r','--threshold',type='float') 17 | option_parser.add_option('-o','--outputbasename') 18 | option_parser.add_option('-q','--quantify') 19 | option_parser.add_option('-n','--normalize',action='store_true') 20 | (options,args) = option_parser.parse_args() 21 | 22 | if len(args) == 1: 23 | inhandle = open(args[0],'r') 24 | else: 25 | raise ValueError, "Must give a single argument that is a timeseries data file" 26 | 27 | data = timeseries.load_timeseries(inhandle) 28 | labels = data['labels'] 29 | times = data['times'] 30 | timeseriesmatrix = data['matrix'] 31 | 32 | try: 33 | sums = data['sums'] 34 | except KeyError: 35 | sums = timeseriesmatrix.sum(axis=0) 36 | 37 | # normalize if desired 38 | if options.normalize: 39 | timeseriesmatrix = np.float_(timeseriesmatrix) / np.asarray(sums) 40 | 41 | # define which time series to plot 42 | if options.threshold: 43 | idxs = np.sum(timeseriesmatrix>=options.threshold,axis=1)>0 # breaks threshold at least once 44 | else: 45 | idxs = np.asarray([True]*timeseriesmatrix.shape[0]) 46 | # idxs = np.sum(time_series_freqs>0,axis=1)>2 # seen at least twice 47 | # idxs_bool = np.logical_and(idxs_bool_1,idxs_bool_2) 48 | # idxs_bool = np.array([False]*len(reference_clones)) 49 | print "Number of lines plotted: %i" % np.sum(idxs) 50 | 51 | # ================== 52 | # = Make the plots = 53 | # ================== 54 | 55 | # get output names 56 | if options.outputbasename: 57 | outputbasename = options.outputbasename 58 | else: 59 | outputbasename = '.'.join(args[0].split('.')[:-1]) 60 | 61 | random_color = lambda: '#%02x%02x%02x' % tuple(np.random.randint(0,256,3)) 62 | 63 | segments = [zip(times,timeseries) for timeseries in timeseriesmatrix[idxs]] 64 | colors = [random_color() for i in xrange(len(segments))] 65 | lines = mpl.collections.LineCollection(segments,colors=colors,linewidths=0.5) 66 | lines.set_alpha(0.75) 67 | 68 | fig = plt.figure() 69 | ax = fig.add_subplot(111) 70 | ax.add_collection(lines) 71 | ax.spines['top'].set_visible(False) 72 | ax.spines['right'].set_visible(False) 73 | ax.spines['bottom'].set_position(('outward',5)) 74 | ax.spines['left'].set_position(('outward',5)) 75 | ax.xaxis.set_ticks_position('bottom') 76 | ax.yaxis.set_ticks_position('left') 77 | ax.xaxis.set_major_locator(mpl.ticker.FixedLocator(times)) 78 | ax.set_xlim([times.min(),times.max()]) 79 | ax.autoscale_view(scalex=False,scaley=True) 80 | # ax.set_yscale('log') 81 | ax.set_xlabel('time') 82 | ax.set_ylabel(options.quantify+' frequency') 83 | # fig.show() 84 | fig.savefig(outputbasename+'.%stimeseries.png' % options.quantify) 85 | fig.savefig(outputbasename+'.%stimeseries.pdf' % options.quantify) 86 | 87 | # segments = [np.asarray(zip(times,timeseries)) for timeseries in timeseriesmatrix[idxs]] 88 | # segments = [segment[segment[:,1]>0] for segment in segments if segment[:,1].sum()>0] 89 | # lines = mpl.collections.LineCollection(segments,colors=colors,linewidths=0.5) 90 | # lines.set_alpha(0.75) 91 | 92 | figlog = plt.figure() 93 | ax = figlog.add_subplot(111) 94 | ax.add_collection(lines) 95 | ax.spines['top'].set_visible(False) 96 | ax.spines['right'].set_visible(False) 97 | ax.spines['bottom'].set_position(('outward',5)) 98 | 
ax.spines['left'].set_position(('outward',5)) 99 | ax.xaxis.set_ticks_position('bottom') 100 | ax.yaxis.set_ticks_position('left') 101 | ax.xaxis.set_major_locator(mpl.ticker.FixedLocator(times)) 102 | ax.set_yscale('log') 103 | ax.set_xlim([times.min(),times.max()]) 104 | ax.set_xlabel('time') 105 | ax.set_ylabel(options.quantify+' frequency') 106 | # fig.show() 107 | figlog.savefig(outputbasename+'.%stimeseries.log.png' % options.quantify) 108 | figlog.savefig(outputbasename+'.%stimeseries.log.pdf' % options.quantify) 109 | -------------------------------------------------------------------------------- /bin/qiime_cluster_jobs_LSF.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | import os 4 | import datetime 5 | import optparse 6 | 7 | import lsf 8 | 9 | option_parser = optparse.OptionParser() 10 | option_parser.add_option('-m','--make_jobs',action='store_true') 11 | option_parser.add_option('-s','--submit_jobs',action='store_true') 12 | option_parser.add_option('-q','--queue',default='normal_serial') 13 | option_parser.add_option('-l','--log_dir') 14 | (options,args) = option_parser.parse_args() 15 | 16 | # check that we get the qiime-required arguments 17 | if len(args) == 2: 18 | jobs_list_file = args[0] 19 | job_id = args[1] 20 | else: 21 | raise ValueError, "Didn't get the right command line arguments" 22 | 23 | # make a directory for holding LSF log files 24 | if options.log_dir == None: 25 | log_dir = os.path.join(os.environ['HOME'],'qiime_parallel_logs') 26 | else: 27 | log_dir = options.log_dir 28 | 29 | if not os.path.exists(log_dir): 30 | os.mkdir(log_dir,0755) 31 | 32 | # submit the jobs 33 | jobs_handle = open(jobs_list_file,'r') 34 | job_ids = [] 35 | logs = [] 36 | for (i,line) in enumerate(jobs_handle): 37 | datetimestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S') 38 | log = os.path.join( log_dir, 'job_%i_%s.log' % (i,datetimestamp) ) 39 | job_id = lsf.submit_to_LSF(options.queue,log,line.strip()) 40 | job_ids.append(job_id) 41 | logs.append(log) 42 | jobs_handle.close() 43 | -------------------------------------------------------------------------------- /bin/quality_hist.py: -------------------------------------------------------------------------------- 1 | #!
/usr/bin/env python 2 | 3 | import sys 4 | import random 5 | 6 | from Bio import SeqIO 7 | 8 | import numpy as np 9 | import scipy as sp 10 | import scipy.stats 11 | 12 | import matplotlib as mpl 13 | mpl.use('agg') 14 | import matplotlib.pyplot as plt 15 | 16 | from pbs import wc 17 | 18 | input_file = sys.argv[1] 19 | output_file = sys.argv[2] 20 | 21 | num_lines = int(wc(input_file, '-l').split()[0]) 22 | assert(num_lines % 4 == 0) 23 | num_reads = num_lines / 4 24 | 25 | if num_reads > 10000000: 26 | idxs = set(sorted(random.sample(xrange(num_reads),10000000))) 27 | 28 | qualities = [] 29 | for (i,record) in enumerate(SeqIO.parse(input_file, 'fastq')): 30 | if num_reads <= 10000000 or i in idxs: 31 | qualities.append(record.letter_annotations['phred_quality']) 32 | 33 | if i % 10000 == 0: 34 | sys.stdout.write("%i " % i) 35 | sys.stdout.flush() 36 | 37 | qualities = np.array(qualities) 38 | 39 | positions = range(1, qualities.shape[1]+1) 40 | 41 | p5 = sp.stats.scoreatpercentile(qualities, 5) 42 | p25 = sp.stats.scoreatpercentile(qualities, 25) 43 | p50 = sp.stats.scoreatpercentile(qualities, 50) 44 | p75 = sp.stats.scoreatpercentile(qualities, 75) 45 | p95 = sp.stats.scoreatpercentile(qualities, 95) 46 | 47 | fig = plt.figure() 48 | ax = fig.add_subplot(111) 49 | ax.scatter(positions,p5, s=3, c='k', linewidths=0, zorder=2) 50 | ax.scatter(positions,p95, s=3, c='k', linewidths=0, zorder=2) 51 | for (pos, low, high) in zip(positions, p25, p75): 52 | ax.plot([pos, pos], [low, high], color='#bdbdbd', lw=1, zorder=1) 53 | ax.scatter(positions, p50, s=6, c='r', linewidths=0, zorder=3) 54 | ax.set_xlabel('position') 55 | ax.set_ylabel('phred score') 56 | ax.set_xlim([positions[0]-1, positions[-1]+1]) 57 | ax.set_ylim([0, 45]) 58 | fig.savefig(output_file) 59 | -------------------------------------------------------------------------------- /bin/sff2fastq_trimmed.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | import sys 4 | import argparse 5 | 6 | from Bio import SeqIO 7 | 8 | argparser = argparse.ArgumentParser(description=None) 9 | argparser.add_argument('input_file',nargs='?',type=argparse.FileType('rb'),default=sys.stdin) 10 | argparser.add_argument('output_file',nargs='?',type=argparse.FileType('w'),default=sys.stdout) 11 | args = argparser.parse_args() 12 | 13 | for record in SeqIO.parse(args.input_file,'sff'): 14 | start = record.annotations['clip_qual_left'] 15 | end = record.annotations['clip_qual_right'] 16 | args.output_file.write( record[start:end].format('fastq') ) 17 | -------------------------------------------------------------------------------- /bin/streamgraph_html.py: -------------------------------------------------------------------------------- 1 | streamgraph_html = r""" 2 | 3 | 4 | Visualization 5 | 6 | 7 | 8 | 9 | 113 | 114 | 115 | 116 | """ -------------------------------------------------------------------------------- /bin/timeseries2json.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | 3 | import sys 4 | import optparse 5 | import json 6 | 7 | import timeseries 8 | 9 | option_parser = optparse.OptionParser() 10 | # option_parser.add_option('-x','--xxx',dest='xxxx',type='int') 11 | (options,args) = option_parser.parse_args() 12 | 13 | if len(args) == 2: 14 | inhandle = open(args[0],'r') 15 | outhandle = open(args[1],'w') 16 | elif len(args) == 1: 17 | inhandle = open(args[0],'r') 18 | outhandle = sys.stdout 19 | elif len(args) == 0: 20 | inhandle = sys.stdin 21 | outhandle = sys.stdout 22 | 23 | data = timeseries.load_timeseries(inhandle) 24 | 25 | # eliminate numpy-ness of objects before JSON output 26 | np_matrix = data['matrix'] 27 | py_matrix = [] 28 | for row in np_matrix: 29 | py_matrix.append(list(row)) 30 | data['matrix'] = py_matrix 31 | data['labels'] = list(data['labels']) 32 | 33 | for label in data.keys(): 34 | if label == 'labels' or label == 'matrix': 35 | continue 36 | data[label] = list(data[label]) 37 | 38 | json.dump(data,outhandle) 39 | -------------------------------------------------------------------------------- /bin/timeseries2streamgraph.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | import optparse 4 | import colorsys 5 | 6 | import numpy as np 7 | import matplotlib as mpl 8 | mpl.use('Agg') 9 | import matplotlib.pyplot as plt 10 | 11 | import scale 12 | import timeseries 13 | import streamgraph 14 | 15 | option_parser = optparse.OptionParser() 16 | option_parser.add_option('-f','--filter',type='choice',choices=['none','seen2','sum2','sum3'],default='none') 17 | (options,args) = option_parser.parse_args() 18 | 19 | if len(args) == 2: 20 | inhandle = open(args[0],'r') 21 | else: 22 | raise ValueError, "need input and output names" 23 | 24 | data = timeseries.load_timeseries(inhandle) 25 | matrix = data['matrix'] 26 | labels = np.asarray(data['labels']) 27 | times = data['times'] 28 | sums = data['sums'] 29 | 30 | streams = matrix / sums 31 | 32 | # determine colors for the streamgraph 33 | colors = [] 34 | time_idxs = np.arange(streams.shape[1]) 35 | onset_time = lambda stream: np.min(time_idxs[stream > 0]) 36 | weight = lambda stream: np.sum(stream) 37 | Hscale = scale.linear(range(len(times))).range(0,1-1./len(times)) 38 | Lscale = scale.root(streams.sum(axis=1)).range(0.8,0.5).power(4) 39 | for stream in streams: 40 | h = Hscale(onset_time(stream)) 41 | l = Lscale(weight(stream)) 42 | colors.append( colorsys.hls_to_rgb(h,l,1) + (1.,) ) 43 | colors = np.array(colors) 44 | 45 | # sort streamgraphs appropriately 46 | argsort_onset = streamgraph.argsort_onset(streams) 47 | streams = streams[argsort_onset] 48 | matrix = matrix[argsort_onset] 49 | colors = colors[argsort_onset] 50 | 51 | # argsort_inside_out = streamgraph.argsort_inside_out(streams) 52 | # streams = streams[argsort_inside_out] 53 | # colors = colors[argsort_inside_out] 54 | 55 | # filter out some clones 56 | if options.filter == 'none': 57 | filter_idxs = np.ones(streams.shape[0]) > 0 # all streams 58 | elif options.filter == 'seen2': 59 | filter_idxs = np.sum(streams > 0, axis=1) >= 2 # seen twice 60 | elif options.filter == 'sum2': 61 | filter_idxs = np.sum(matrix, axis=1) >= 2 # sum=2 62 | elif options.filter == 'sum3': 63 | filter_idxs = np.sum(matrix, axis=1) >= 3 # sum=3 64 | else: 65 | raise ValueError, "what filter do you want me to use?" 
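# filter_idxs is a boolean mask over the rows of `streams` (one row per clone);
# the plotting below hands only the kept rows to streamgraph.streamgraph(), with
# colors already assigned above by onset time (hue) and total weight (lightness)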
66 | 67 | fig = plt.figure(figsize=(24,16)) 68 | ax = fig.add_subplot(111) 69 | streamgraph.streamgraph(ax, streams[filter_idxs], x=times, colors=colors[filter_idxs]) 70 | streamgraph.format_streamgraph(ax) 71 | ax.xaxis.set_ticks(times) 72 | ax.autoscale_view() 73 | # fig.show() 74 | fig.savefig(args[1],dpi=120) 75 | -------------------------------------------------------------------------------- /blast.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from Bio.Blast import NCBIWWW 3 | from Bio.Blast import NCBIXML 4 | 5 | def number_genome_qblast_hits(seqreclist): 6 | fastastring = ''.join([rec.format('fasta') for rec in seqreclist]) 7 | results_handle = NCBIWWW.qblast('blastn','nr',fastastring,expect=1.,word_size=7,nucl_reward=1,nucl_penalty=-3,hitlist_size=1000) 8 | blast_records = NCBIXML.parse(results_handle) 9 | 10 | hits = [len(record.alignments) for record in blast_records] 11 | 12 | return hits 13 | 14 | def number_genome_qblast_protein_hits(sequence): 15 | results_handle = NCBIWWW.qblast('blastp','nr',sequence,expect=100,word_size=3,hitlist_size=1000) 16 | blast_records = NCBIXML.parse(results_handle) 17 | num_hits = sum([len(record.alignments) for record in blast_records]) 18 | return num_hits 19 | 20 | 21 | 22 | # def number_genome_qblast_hits(seqlist): 23 | # fastastring = '' 24 | # for (i,seq) in enumerate(seqlist): fastastring += '>seq%i\n%s\n' % (i,seq) 25 | # 26 | # results_handle = NCBIWWW.qblast('blastn','nr',fastastring,expect=0.1,word_size=7,nucl_reward=1,nucl_penalty=-3,hitlist_size=500) 27 | # blast_records = NCBIXML.parse(results_handle) 28 | # 29 | # total_hits = 0 30 | # for record in blast_records: total_hits += len(record.alignments) 31 | # 32 | # # print total_hits 33 | # # sys.stdout.flush() 34 | # 35 | # return total_hits 36 | -------------------------------------------------------------------------------- /blat.py: -------------------------------------------------------------------------------- 1 | # BLAT tools 2 | # based on Sri's code 3 | 4 | import sys 5 | import subprocess 6 | import os 7 | import signal 8 | import time 9 | 10 | import seqtools 11 | 12 | hg_idx = '~/genome/hg19.2bit' 13 | 14 | def start_gfServer(file2idx=hg_idx,tileSize=11,stepSize=2,minMatch=2,maxGap=4,repMatch=1000000,debug=False): 15 | params = (tileSize,stepSize,minMatch,maxGap,repMatch,file2idx) 16 | cmd = "gfServer start -tileSize=%i -stepSize=%i -minMatch=%i -maxGap=%i -repMatch=%i localhost 17779 %s" % params 17 | if debug: print "Command is:\n%s" % cmd 18 | p = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE) 19 | time.sleep(660) 20 | print "Finished starting up BLAT server (hopefully)." 21 | return p 22 | 23 | def is_server_running(): 24 | p = subprocess.Popen('ps -A',shell=True,stdout=subprocess.PIPE) 25 | lines = p.stdout.readlines() 26 | for line in lines: 27 | if 'gfServer' in line: 28 | return True 29 | return False 30 | 31 | def stop_gfServer(p=None): 32 | if p != None: 33 | os.kill(p.pid,signal.SIGTERM) 34 | time.sleep(5) 35 | else: 36 | pids = [] 37 | p = subprocess.Popen('ps -A',shell=True,stdout=subprocess.PIPE) 38 | lines = p.stdout.readlines() 39 | for line in lines: 40 | if 'gfServer' in line: 41 | pids.append(int(line.split()[0])) 42 | for pid in pids: 43 | os.kill(pid,signal.SIGTERM) 44 | time.sleep(5) 45 | 46 | # HACK/BUG: for some reason gfClient is doubling the directory prefix. 
It works if 47 | # file2idx='/' 48 | # def search_sequences(seqs,file2idx=hg_idx,minScore=20,minIdentity=70,debug=False): 49 | def search_sequences(seqs,file2idx='/',minScore=15,minIdentity=70,debug=False): 50 | if not is_server_running(): 51 | raise RuntimeError, "BLAT server not running." 52 | 53 | # generate query 54 | if hasattr(seqs[0],'format'): 55 | query = ''.join([s.format('fasta') for s in seqs]) 56 | else: 57 | query = ''.join(['>query%i\n%s\n' % (i,s) for (i,s) in enumerate(seqs)]) 58 | 59 | # define and run command 60 | nibdir = os.path.dirname(file2idx) 61 | params = (minScore,minIdentity,nibdir) 62 | cmd = "gfClient -minScore=%i -minIdentity=%i -nohead localhost 17779 %s /dev/stdin /dev/stdout" % params 63 | if debug: print cmd 64 | p = subprocess.Popen(cmd,shell=True,stdin=subprocess.PIPE,stdout=subprocess.PIPE) 65 | p.stdin.write( query ) 66 | p.stdin.close() 67 | 68 | # process output 69 | num = 0 70 | for line in p.stdout: 71 | if debug: print line 72 | if line == "Output is in /dev/stdout\n": 73 | continue 74 | num += 1 75 | 76 | return num 77 | 78 | # HACK/BUG: for some reason gfClient is doubling the directory prefix. It works if 79 | # file2idx='/' 80 | def search_sequence(seq,file2idx='/',minScore=15,minIdentity=50,debug=False): 81 | return search_sequences([seq],file2idx,minScore,minIdentity,debug) 82 | -------------------------------------------------------------------------------- /countdata.py: -------------------------------------------------------------------------------- 1 | """ 2 | countdata.py 3 | 4 | Functions for stats and analysis of count data. 5 | 6 | """ 7 | 8 | import sys 9 | 10 | import numpy as np 11 | import scipy as sp 12 | # import scipy.stats 13 | 14 | # ============================================================================== 15 | 16 | # ====================== 17 | # = Count manipulation = 18 | # ====================== 19 | 20 | def sample2counts(sample, categories=0): 21 | """Return count vector from list of samples. 22 | 23 | Take vector of samples and return a vector of counts. The elts 24 | refer to indices in something that would ultimately map to the 25 | originating category (like from a multinomial). Therefore, if there 26 | are, say, 8 categories, then valid values in sample should be 0-7. 27 | If categories is not given, then i compute it from the highest value 28 | present in sample (+1). 29 | 30 | """ 31 | counts = np.bincount(sample) 32 | if (categories > 0) and (categories > len(counts)): 33 | counts = np.append( counts, np.zeros(categories-len(counts)) ) 34 | return counts 35 | 36 | def counts2sample(counts): 37 | """Computes a consistent sample from a vector of counts. 
38 | 39 | Takes a vector of counts and returns a vector of indices x 40 | such that len(x) = sum(c) and each elt of x is the index of 41 | a corresponding elt in c 42 | 43 | """ 44 | x = np.ones(np.sum(counts),dtype=np.int_) 45 | 46 | start_idx = 0 47 | end_idx = 0 48 | for i in xrange(len(counts)): 49 | start_idx = end_idx 50 | end_idx = end_idx + counts[i] 51 | x[start_idx:end_idx] = x[start_idx:end_idx] * i 52 | return x 53 | 54 | # ============================================================================== 55 | 56 | # ======================== 57 | # = Percentile functions = 58 | # ======================== 59 | 60 | def scoreatpercentile(values,rank): 61 | return sp.stats.scoreatpercentile(values,rank) 62 | 63 | def percentileofscore(values,score): 64 | values.sort() 65 | return values.searchsorted(score) / np.float_(len(values)) 66 | 67 | #The scipy version does some funny histogramming thing 68 | #def percentileofscore(values,score): 69 | # return stats.percentileofscore(values,score,kind='weak') 70 | 71 | # ============================================================================== 72 | 73 | # ============ 74 | # = q-values = 75 | # ============ 76 | 77 | def qvalues(p,lambd=np.arange(0,0.91,0.05),method='bootstrap',B=100,smoothlog = False,robust=False): 78 | """Compute q-values using Storey method from array of p-values. 79 | 80 | Adapted from his R software. 81 | 82 | """ 83 | # check validity of values 84 | p = np.array(p) 85 | if np.min(p)<0 or np.max(p)>1: 86 | raise Exception, "p-values not in valid range" 87 | 88 | m = len(p) 89 | 90 | pi0 = np.zeros(len(lambd)) 91 | 92 | for i in np.arange(len(lambd)): 93 | pi0[i] = np.mean(p >= lambd[i]) / (1-lambd[i]) 94 | 95 | if method == 'bootstrap': 96 | minpi0 = np.min(pi0) 97 | mse = np.zeros(len(lambd)) 98 | pi0_boot = np.zeros(len(lambd)) 99 | for i in np.arange( B ): 100 | p_boot = p[ np.random.randint(0,m,m) ] 101 | for j in np.arange( len(lambd) ): 102 | pi0_boot[j] = np.mean(p_boot >= lambd[j]) / (1-lambd[j]) 103 | mse += (pi0_boot - minpi0)**2 104 | pi0 = np.min(pi0[mse == np.min(mse)]) 105 | print pi0.shape 106 | pi0 = np.min(pi0,axis=1) 107 | elif method == 'smoother': 108 | # TODO 109 | print "Not implemented yet" 110 | return 111 | 112 | if pi0 <= 0: 113 | raise Exception, "The estimated pi0 <=0. May be problem with pvalues." 114 | 115 | # calculate estimated q-values 116 | u = np.argsort(p) 117 | v = qvalrank(p) 118 | 119 | qvalue = pi0*m*p/v 120 | if robust == True: 121 | qvalue = pi0*m*p/(v*(1-(1-p)**m)) 122 | 123 | qvalue[u[m-1]] = np.min( [qvalue[u[m-1]], 1] ) 124 | for i in np.arange(m-2,-1,-1): 125 | qvalue[u[i]] = np.min( [qvalue[u[i]], qvalue[u[i+1]], 1] ) 126 | 127 | return qvalue 128 | 129 | def qvalrank(x): 130 | idx = np.argsort(x) 131 | levels = np.unique(x) # sorted unique-d list 132 | bin = levels.searchsorted(x) 133 | tbl = np.bincount(bin) 134 | cs = np.cumsum(tbl) 135 | 136 | tbl = cs.repeat(tbl) 137 | tbl2 = np.zeros(len(tbl),np.int_) 138 | tbl2[idx] = tbl 139 | 140 | return tbl2 141 | 142 | # ============================================================================== 143 | 144 | # ==================== 145 | # = Compute p-values = 146 | # ==================== 147 | 148 | def pval_KalZtest(n1,N1,n2,N2): 149 | """Compute p-value using Kal Z-test for count data. 150 | 151 | Compute pval using Z-test, as published in 152 | Kal et al, 1999, Mol Biol Cell 10:1859. 
153 | 154 | Z = (p1-p2) / sqrt( p0 * (1-p0) * (1/N1 + 1/N2) ) 155 | where p1 = n1/N1, p2=n2/N2, and p0=(n1+n2)/(N1+N2) 156 | You reject if |Z| > Z_a/2 where a is sig lev. Here 157 | we return the p-value itself. 158 | 159 | """ 160 | if n1==0 and n2==0: 161 | return 1.0 162 | 163 | n1 = np.float_(n1) 164 | N1 = np.float_(N1) 165 | n2 = np.float_(n2) 166 | N2 = np.float_(N2) 167 | 168 | p0 = (n1+n2)/(N1+N2) 169 | p1 = n1/N1 170 | p2 = n2/N2 171 | 172 | Z = (p1-p2) / np.sqrt( p0 * (1-p0) * ((1/N1) + (1/N2)) ) 173 | 174 | pval = 2 * sp.stats.norm.cdf(-1*abs(Z)) 175 | 176 | return pval 177 | 178 | def pval_KalZtest_vec(n1,N1,n2,N2): 179 | assert n1.shape[0] == n2.shape[0] 180 | 181 | p0 = (n1+n2)/(float(N1)+N2) 182 | p1 = n1/float(N1) 183 | p2 = n2/float(N2) 184 | 185 | p0[(n1 == 0) & (n2 == 0)] = 0.5 186 | 187 | Z = (p1-p2) / np.sqrt( p0 * (1.-p0) * ((1./N1) + (1./N2)) ) 188 | 189 | pval = 2 * sp.stats.norm.cdf(-1*abs(Z)) 190 | pval[(n1 == 0) & (n2 == 0)] = -1. 191 | 192 | return pval 193 | 194 | def pval_logRatioMC(n1,N1,n2,N2): 195 | pass 196 | 197 | def pvals_logRatioMC(counts1, counts2, B=1e6, pseudocount=1, verbose=False): 198 | """Compute component-wise p-values of difference between two count vectors 199 | using Monte Carlo sampling of log ratios. 200 | 201 | Null hypothesis is that data is from same multinomial. Parameters estimated 202 | by combining both count vectors. Zeros are handled by adding pseudocount to 203 | each element. 204 | 205 | The test statistic is log Ratio, which is computed for each component. 206 | 207 | Two random count vectors are generated, and and component-wise log ratio 208 | is computed. For each component, it is recorded whether the abs random log 209 | ratio was greater than or less than the abs test statistic value. This is 210 | performed B times. The absolute value makes the test two-sided and symmetric. 211 | 212 | The achieved significance level (ASL) is returned for each component. 213 | 214 | """ 215 | if len(counts1) != len(counts2): raise ValueError, "Counts vectors have different lengths." 216 | 217 | counts1 = np.asarray(counts1, dtype=np.float) 218 | counts2 = np.asarray(counts2, dtype=np.float) 219 | 220 | total1 = int(np.round(np.sum(counts1))) 221 | total2 = int(np.round(np.sum(counts2))) 222 | 223 | countsMLE = counts1 + counts2 + pseudocount 224 | counts1 = counts1 + pseudocount # note: counts1 and counts2 are changed at this point 225 | counts2 = counts2 + pseudocount 226 | 227 | normcounts1 = counts1 / np.sum(counts1) 228 | normcounts2 = counts2 / np.sum(counts2) 229 | 230 | testabslogratios = np.abs(np.log10(normcounts2 / normcounts1)) 231 | 232 | probvec = countsMLE / np.sum(countsMLE) 233 | 234 | atleastasextreme = np.zeros(len(counts1)) 235 | 236 | for i in xrange(B): 237 | if verbose and i % 10 == 0: 238 | sys.stdout.write("%i " % i) 239 | sys.stdout.flush() 240 | 241 | randcounts1 = np.float_(np.random.multinomial(total1, probvec)) + pseudocount 242 | randcounts2 = np.float_(np.random.multinomial(total2, probvec)) + pseudocount 243 | 244 | normrandcounts1 = randcounts1 / np.sum(randcounts1) 245 | normrandcounts2 = randcounts2 / np.sum(randcounts2) 246 | 247 | randabslogratios = np.abs(np.log10(normrandcounts2 / normrandcounts1)) 248 | 249 | atleastasextreme += np.float_(randabslogratios >= testabslogratios) 250 | 251 | ASL = atleastasextreme / B 252 | 253 | return ASL 254 | 255 | def pvals_counts(counts1,counts2,method='KalZtest'): 256 | """Compute component-wise p-values of difference between two count vectors. 
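    For example (hypothetical count vectors from two samples over the same
    four categories):
        pvals = pvals_counts([120, 3, 0, 45], [95, 19, 2, 40], method='KalZtest')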
257 | 258 | method can be one of: 259 | KalZtest 260 | MonteCarlo 261 | 262 | """ 263 | if len(counts1) != len(counts2): raise ValueError, "Counts vectors have different lengths." 264 | 265 | pvals = np.zeros(len(counts1)) 266 | N1 = np.sum(counts1) 267 | N2 = np.sum(counts2) 268 | 269 | if method == 'KalZtest': 270 | for i in xrange(len(pvals)): 271 | pvals[i] = pval_KalZtest(counts1[i],N1,counts2[i],N2) 272 | elif method == 'MonteCarlo': 273 | pvals = pvals_logRatioMC(counts1,counts2,B=1e6,pseudocounts=1) 274 | else: 275 | raise Exception, method + " is not a recognized method for computing p-values." 276 | 277 | return pvals 278 | 279 | # ============================================================================== 280 | 281 | # ========================== 282 | # = Random data generation = 283 | # ========================== 284 | 285 | def gen_rand_count_vec(numComponents,numCounts,fracNull,probvecNull,probvecAlt): 286 | pass 287 | 288 | # ============================================================================== 289 | # ============================================================================== 290 | # ============================================================================== 291 | # ============================================================================== 292 | # ============================================================================== 293 | # ============================================================================== -------------------------------------------------------------------------------- /daemonize.py: -------------------------------------------------------------------------------- 1 | # from Python Cookbook, 2nd edition, Recipe 9.13 2 | import sys, os 3 | ''' Module to fork the current process as a daemon. 4 | NOTE: don't do any of this if your daemon gets started by inetd! inetd 5 | does all you need, including redirecting standard file descriptors; 6 | the chdir( ) and umask( ) steps are the only ones you may still want. 7 | ''' 8 | def daemonize (stdin='/dev/null', stdout='/dev/null', stderr='/dev/null'): 9 | ''' Fork the current process as a daemon, redirecting standard file 10 | descriptors (by default, redirects them to /dev/null). 11 | ''' 12 | # Perform first fork. 13 | try: 14 | pid = os.fork( ) 15 | if pid > 0: 16 | sys.exit(0) # Exit first parent. 17 | except OSError, e: 18 | sys.stderr.write("fork #1 failed: (%d) %s\n" % (e.errno, e.strerror)) 19 | sys.exit(1) 20 | # Decouple from parent environment. 21 | os.chdir("/") 22 | os.umask(0) 23 | os.setsid( ) 24 | # Perform second fork. 25 | try: 26 | pid = os.fork( ) 27 | if pid > 0: 28 | sys.exit(0) # Exit second parent. 29 | except OSError, e: 30 | sys.stderr.write("fork #2 failed: (%d) %s\n" % (e.errno, e.strerror)) 31 | sys.exit(1) 32 | # The process is now daemonized, redirect standard file descriptors. 
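    # (Flushing first avoids losing any output still buffered on the old
    # descriptors; dup2 then points fds 0/1/2 at the new files, so later
    # reads/writes on stdin/stdout/stderr go to them.)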
33 | for f in sys.stdout, sys.stderr: f.flush( ) 34 | si = file(stdin, 'r') 35 | so = file(stdout, 'a+') 36 | se = file(stderr, 'a+', 0) 37 | os.dup2(si.fileno( ), sys.stdin.fileno( )) 38 | os.dup2(so.fileno( ), sys.stdout.fileno( )) 39 | os.dup2(se.fileno( ), sys.stderr.fileno( )) 40 | 41 | # def _example_main ( ): 42 | # ''' Example main function: print a count & timestamp each second ''' 43 | # import time 44 | # sys.stdout.write('Daemon started with pid %d\n' % os.getpid( ) ) 45 | # sys.stdout.write('Daemon stdout output\n') 46 | # sys.stderr.write('Daemon stderr output\n') 47 | # c = 0 48 | # while True: 49 | # sys.stdout.write('%d: %s\n' % (c, time.ctime( ))) 50 | # sys.stdout.flush( ) 51 | # c = c + 1 52 | # time.sleep(1) 53 | # if _ _name_ _ == "_ _main_ _": 54 | # daemonize('/dev/null','/tmp/daemon.log','/tmp/daemon.log') 55 | # _example_main( ) 56 | -------------------------------------------------------------------------------- /degex.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | # take a FASTA file of DNA sequences (short oligos) with IUPAC degeneracies 4 | # and ambiguities and expand all combinatorial possibilities into a new FASTA 5 | # file 6 | # 7 | # works by implementing a recursive depth first search 8 | 9 | IUPAC_vals = {'A': 'A', 10 | 'B': 'CGT', 11 | 'C': 'C', 12 | 'D': 'AGT', 13 | 'G': 'G', 14 | 'H': 'ACT', 15 | 'K': 'GT', 16 | 'M': 'AC', 17 | 'N': 'GATC', 18 | 'R': 'AG', 19 | 'S': 'CG', 20 | 'T': 'T', 21 | 'V': 'ACG', 22 | 'W': 'AT', 23 | 'X': 'GATC', 24 | 'Y': 'CT'} 25 | 26 | # ====================== 27 | # = Depth first search = 28 | # ====================== 29 | 30 | class dfs_node: 31 | def __init__(self, cum, rem): 32 | self.visited = False 33 | self.neighbors = [] 34 | self.cumul_seq = cum 35 | self.remain_seq = rem 36 | 37 | # to use: must supply: 38 | # 1. a list where the sequences will be pushed and 39 | # 2. 
a dfs_node with cumul_seq empty and remain_seq = IUPAC DNA sequence 40 | 41 | def dfs_expand_seq( curr_dfs_node, cum_list ): 42 | curr_dfs_node.visited = True 43 | 44 | # if we are not at the end of the tree yet 45 | if len(curr_dfs_node.remain_seq) > 0: 46 | # construct neighbors of current dfs_node based on remaining sequence 47 | for nucleotide in IUPAC_vals[ curr_dfs_node.remain_seq[0] ]: 48 | curr_dfs_node.neighbors.append( dfs_node(curr_dfs_node.cumul_seq + nucleotide, curr_dfs_node.remain_seq[1:]) ) 49 | 50 | # implement recursive DFS 51 | for neighbor in curr_dfs_node.neighbors: 52 | if neighbor.visited == False: 53 | dfs_expand_seq( neighbor, cum_list ) 54 | 55 | # we should only run this when there are no neighbors left 56 | elif len(curr_dfs_node.remain_seq) == 0: 57 | cum_list.append(curr_dfs_node.cumul_seq) 58 | 59 | def expand_seq(seq): 60 | expanded_list = [] 61 | start_node = dfs_node('',seq) 62 | dfs_expand_seq( start_node, expanded_list ) 63 | return expanded_list 64 | 65 | # ======== 66 | # = MAIN = 67 | # ======== 68 | 69 | if __name__ == '__main__': 70 | import sys 71 | 72 | from Bio import SeqIO 73 | 74 | if len(sys.argv) == 3: 75 | inhandle = open(sys.argv[1],'r') 76 | outhandle = open(sys.argv[2],'w') 77 | elif len(sys.argv) == 2: 78 | inhandle = open(sys.argv[1],'r') 79 | outhandle = sys.stdout 80 | elif len(sys.argv) == 1: 81 | inhandle = sys.stdin 82 | outhandle = sys.stdout 83 | 84 | for record in SeqIO.parse(inhandle,'fasta'): 85 | seq = record.seq.tostring().upper() 86 | expanded_seqs = expand_seq( seq ) 87 | for (i,s) in enumerate(expanded_seqs): 88 | outhandle.write(">%s|%i\n%s\n" % (record.description,i+1,s)) # write fasta output 89 | -------------------------------------------------------------------------------- /exonerate.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | import subprocess 4 | 5 | import seqtools 6 | 7 | class ExonerateCommand(object): 8 | """Build command for exonerate""" 9 | 10 | options_list = [ 11 | 'query', 12 | 'target', 13 | 'querytype', 14 | 'targettype', 15 | 'querychunkid', 16 | 'querychunktotal', 17 | 'targetchunkid', 18 | 'targetchunktotal', 19 | 'verbose', 20 | 'exhaustive', 21 | 'bigseq', 22 | 'forcescan', 23 | 'saturatethreshold', 24 | 'customserver', 25 | 'fastasuffix', 26 | 'model', 27 | 'score', 28 | 'percent', 29 | 'showalignment', 30 | 'showsugar', 31 | 'showcigar', 32 | 'showvulgar', 33 | 'showquerygff', 34 | 'showtargetgff', 35 | # 'ryo', NOTE: this is left out as it requires special handling 36 | 'bestn', 37 | 'subopt', 38 | 'gappedextension', 39 | 'refine', 40 | 'refineboundary', 41 | 'dpmemory', 42 | 'compiled', 43 | 'terminalrangeint', 44 | 'terminalrangeext', 45 | 'joinrangeint', 46 | 'joinrangeext', 47 | 'spanrangeint', 48 | 'spanrangeext', 49 | 'extensionthreshold', 50 | 'singlepass', 51 | 'joinfilter', 52 | 'annotation', 53 | 'softmaskquery', 54 | 'softmasktarget', 55 | 'dnasubmat', 56 | 'proteinsubmat', 57 | 'fsmmemory', 58 | 'forcefsm', 59 | 'wordjump', 60 | 'gapopen', 61 | 'gapextend', 62 | 'codongapopen', 63 | 'codongapextend', 64 | 'minner', 65 | 'maxner', 66 | 'neropen', 67 | 'minintron', 68 | 'maxintron', 69 | 'intronpenalty', 70 | 'frameshift', 71 | 'useaatla', 72 | 'geneticcode', 73 | 'hspfilter', 74 | 'useworddropoff', 75 | 'seedrepeat', 76 | 'dnawordlen', 77 | 'proteinwordlen', 78 | 'codonnwordlen', 79 | 'dnahspdropoff', 80 | 'proteinhspdropoff', 81 | 'codonhspdropoff', 82 | 'dnahspthreshold', 83 | 'proteinhspthreshold', 84 | 
'codonhspthreshold', 85 | 'dnawordlimit', 86 | 'proteinwordlimit', 87 | 'codonwordlimit', 88 | 'geneseed', 89 | 'geneseedrepeat', 90 | 'alignmentwidth', 91 | 'forwardcoordinates', 92 | 'quality', 93 | 'splice3', 94 | 'splice5', 95 | 'forcegtag'] 96 | 97 | 98 | def __init__(self, *args, **kw): 99 | # register preset handlers 100 | self.register = { 101 | 'affine:local' : self.preset_affinelocal, 102 | 'affine:global' : self.preset_affineglobal, 103 | 'findend' : self.preset_findend, 104 | 'parsable' : self.preset_parsable, 105 | 'pretty' : self.preset_pretty, 106 | 'bestonly' : self.preset_bestonly, 107 | 'ungapped' : self.preset_ungapped 108 | } 109 | 110 | # these attributes must be handled special, and set manually at the start 111 | self.options = {} 112 | self.ryo = None 113 | 114 | # first execute any registered functions 115 | for a in args: 116 | self.register[a]() 117 | 118 | # check for ryo output and save it (needs special handling) 119 | if kw.has_key('ryo'): self.ryo = kw.pop('ryo') 120 | 121 | # then set all the manual options supplied 122 | self.options.update(kw) 123 | 124 | # set standard options in case they weren't given initially 125 | # they can still be overwritten 126 | self.softset_default() 127 | 128 | # return self 129 | 130 | def __setattr__(self,name,value): 131 | """Allows setting of options by acting on object attributes. 132 | 133 | For example: 134 | cmd = ExonerateCommand() 135 | cmd.querytype = 'dna' 136 | 137 | Catches the special cases of ryo and options. 138 | ryo needs to be set manually 139 | options shouldn't be overwritten, but lets you... 140 | """ 141 | if name in ExonerateCommand.options_list: 142 | self.options[name] = value 143 | else: 144 | object.__setattr__(self,name,value) 145 | 146 | def __getattr__(self,name): 147 | if name in ExonerateCommand.options_list: 148 | return self.options[name] 149 | else: 150 | raise AttributeError 151 | 152 | def build_command(self): 153 | self.cmd = 'exonerate' 154 | for (option,value) in self.options.iteritems(): 155 | self.cmd += ' --%s %s' % (option,value) 156 | 157 | # handle ryo output using raw string 158 | if self.ryo is not None: 159 | self.cmd += r' --%s "%s"' % ('ryo',self.ryo) 160 | 161 | return self.cmd 162 | 163 | def softset_default(self): 164 | """Conditionally override options to a reasonable default.""" 165 | if not self.options.has_key('model'): 166 | self.model = 'affine:local' 167 | if not self.options.has_key('querytype'): 168 | self.querytype = 'dna' 169 | if not self.options.has_key('targettype'): 170 | self.targettype = 'dna' 171 | 172 | def hardset_preset(self,*args): 173 | for a in args: 174 | register[a](self) 175 | 176 | def preset_affinelocal(self): 177 | self.model = 'affine:local' 178 | 179 | def preset_affineglobal(self): 180 | self.model = 'affine:global' 181 | self.exhaustive = True 182 | 183 | def preset_ungapped(self): 184 | self.model = 'ungapped' 185 | self.exhaustive = True 186 | 187 | def preset_findend(self): 188 | self.model = 'affine:overlap' 189 | self.exhaustive = True 190 | 191 | def preset_parsable(self): 192 | self.verbose = 0 193 | # self.showalignment = False 194 | # self.showvulgar = False 195 | self.ryo = r'aln_summary: %qi %ql %qab %qae %qS %ti %tl %tab %tae %tS %s %et %ei %pi\n' 196 | 197 | def preset_pretty(self): 198 | self.showalignment = True 199 | self.showvulgar = True 200 | self.showsugar = True 201 | 202 | def preset_bestonly(self): 203 | self.bestn = 1 204 | 205 | def run_exonerate(cmd,query=None,target=None): 206 | """Run exonerate using given 
ExonerateCommand object 207 | 208 | query and target must refer to files 209 | """ 210 | # check query and target are set properly 211 | if query is not None: cmd.query = query 212 | if target is not None: cmd.target = target 213 | try: 214 | cmd.query 215 | cmd.target 216 | except KeyError: 217 | print "cmd.query or cmd.target is not set" 218 | raise 219 | 220 | # submit process 221 | p = subprocess.Popen(cmd.build_command(),shell=True,stdout=subprocess.PIPE) 222 | aln = p.stdout.read() 223 | p.wait() 224 | return aln 225 | 226 | def run_exonerate2(cmd,query,target,queryname='query',targetname='target',debug=False): 227 | """Perform pairwise alignment using cmd ExonerateCommand object 228 | 229 | query and target are sequences 230 | """ 231 | # TODO: see if this can be implemented without writing to temporary files 232 | 233 | # write seqs to tempfiles 234 | (fdq,queryfile) = tempfile.mkstemp() 235 | (fdt,targetfile) = tempfile.mkstemp() 236 | iopq = open(queryfile,'w') 237 | iopt = open(targetfile,'w') 238 | print >>iopq, ">%s\n%s\n" % (queryname,query) 239 | print >>iopt, ">%s\n%s\n" % (targetname,target) 240 | iopq.close() 241 | iopt.close() 242 | os.close(fdq) 243 | os.close(fdt) 244 | 245 | try: 246 | # perform alignment 247 | cmd.query = queryfile 248 | cmd.target = targetfile 249 | aln = run_exonerate(cmd) 250 | finally: 251 | # clean up 252 | os.remove(queryfile) 253 | os.remove(targetfile) 254 | 255 | if debug: print aln 256 | 257 | return aln 258 | 259 | def iter_alnsummary(rawaln): 260 | """Return alnsummary line from rawaln.""" 261 | for line in rawaln.split('\n'): 262 | if line.startswith('aln_summary'): 263 | yield line 264 | 265 | def extract_alnsummary(rawaln): 266 | """Return alnsummary line from rawaln.""" 267 | return iter_alnsummary(rawaln).next() 268 | 269 | def iter_vulgar(rawaln): 270 | """Return vulgar line from rawaln.""" 271 | for line in rawaln.split('\n'): 272 | if line.startswith('vulgar'): 273 | yield line 274 | 275 | def extract_vulgar(rawaln): 276 | """Return vulgar line from rawaln.""" 277 | return iter_vulgar(rawaln).next() 278 | 279 | def iter_alnsummary_vulgar(rawaln): 280 | for (alnsummary,vulgar_commands) in zip(iter_alnsummary(rawaln),iter_vulgar(rawaln)): 281 | yield (alnsummary,vulgar_commands) 282 | 283 | def parse_alnsummary(rawalnsummary): 284 | """Parse alnsummary line from exonerate using 'parsable' preset. 285 | 286 | Takes an alnsummary line from an alignment that was generated from an ryo 287 | 'parsable' preset. 288 | """ 289 | # 'aln_summary: %qi %ql %qab %qae %qS %ti %tl %tab %tae %tS %s %et %ei %pi\n' 290 | data = rawalnsummary.split() 291 | 292 | aln = {} 293 | aln['query_id'] = data[1] 294 | aln['query_len'] = int(data[2]) 295 | aln['query_aln_begin'] = int(data[3]) 296 | aln['query_aln_end'] = int(data[4]) 297 | aln['query_strand'] = data[5] 298 | aln['target_id'] = data[6] 299 | aln['target_len'] = int(data[7]) 300 | aln['target_aln_begin'] = int(data[8]) 301 | aln['target_aln_end'] = int(data[9]) 302 | aln['target_strand'] = data[10] 303 | aln['score'] = int(data[11]) 304 | aln['equiv_total'] = int(data[12]) 305 | aln['equiv_id'] = int(data[13]) 306 | aln['percent_id'] = float(data[14]) 307 | 308 | return aln 309 | 310 | def parse_aln(rawaln): 311 | """Parse raw alignment from exonerate using 'parsable' preset. 312 | 313 | Takes a raw alignment and searches for an alnsummary line (generated from 314 | an ryo 'parsable' preset) and parses it. 
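
    A rough usage sketch (names from this module; the sequences are made-up
    placeholders, not data from a real run):

        cmd = ExonerateCommand('affine:local', 'parsable', 'bestonly')
        raw = run_exonerate2(cmd, 'ACGTACGTACGT', 'TTACGTACGTACGTAA')
        aln = parse_aln(raw)   # dict with query_id, score, percent_id, ...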
315 | """ 316 | for line in rawaln.split('\n'): 317 | if line.strip().startswith('aln_summary'): 318 | rawalnsummary = line.strip() 319 | break 320 | else: 321 | raise ValueError, "aln_summary line not found in raw aln:\n%s" % rawaln 322 | 323 | return parse_alnsummary(rawalnsummary) 324 | 325 | def parse_vulgar(rawvulgar): 326 | """Parse vulgar line 327 | 328 | Takes vulgar line from alignment output 329 | 330 | returns only the non-sugar part that allows you to build the aln 331 | """ 332 | data = rawvulgar.split()[10:] 333 | cmds = [] 334 | for i in range(0,len(data),3): 335 | cmds.append( (data[0],int(data[1]),int(data[2])) ) 336 | return cmds 337 | 338 | def build_aln(alnsummary,vulgar_commands,queryseq,targetseq): 339 | """Build full alignment from exonerate using 'parsable' preset and vulgar output""" 340 | 341 | queryname = alnsummary['query_id'] 342 | targetname = alnsummary['target_id'] 343 | 344 | # process strands. the position vars below will always progress 345 | # from 0->len(seq), so the seqs must be revcomped accordingly 346 | 347 | queryposition = alnsummary['query_aln_begin'] 348 | targetposition = alnsummary['target_aln_begin'] 349 | if alnsummary['query_strand'] == '-': 350 | queryseq = seqtools.reverse_complement(queryseq) 351 | queryposition = len(queryseq) - queryposition 352 | if alnsummary['target_strand'] == '-': 353 | targetseq = seqtools.reverse_complement(targetseq) 354 | targetposition = len(targetseq) - targetposition 355 | pad = abs(queryposition - targetposition) 356 | 357 | # build alignment 358 | queryaln = '' 359 | targetaln = '' 360 | 361 | # process necessary padding 362 | if queryposition > targetposition: 363 | targetaln = ' ' * pad 364 | else: 365 | queryaln = ' ' * pad 366 | 367 | # add pre-aln sequence 368 | queryaln += queryseq[0:queryposition] 369 | targetaln += targetseq[0:targetposition] 370 | 371 | # walk through alignment (from vulgar output) 372 | for cmd in vulgar_commands: 373 | if cmd[0] == 'M': 374 | assert(cmd[1]==cmd[2]) 375 | queryaln += queryseq[queryposition:queryposition+cmd[1]] 376 | targetaln += targetseq[targetposition:targetposition+cmd[2]] 377 | queryposition += cmd[1] 378 | targetposition += cmd[2] 379 | elif cmd[0] == 'G': 380 | assert( (cmd[1]==0) != (cmd[1]==0) ) # xor 381 | if cmd[1] == 0: 382 | queryaddendum = '-' * cmd[2] 383 | targetaddendum = targetseq[targetposition:targetposition+cmd[2]] 384 | elif cmd[2] == 0: 385 | queryaddendum = queryseq[queryposition:queryposition+cmd[1]] 386 | targetaddendum = '-' * cmd[1] 387 | queryaln += queryaddendum 388 | targetaln += targetaddendum 389 | queryposition += cmd[1] 390 | targetposition += cmd[2] 391 | else: 392 | raise ValueError, "I do not understand the vulgar command %s" % cmd[0] 393 | 394 | # add any post-aln sequence 395 | queryaln += queryseq[queryposition:] 396 | targetaln += targetseq[targetposition:] 397 | 398 | return (queryaln,targetaln) 399 | -------------------------------------------------------------------------------- /graphtools.py: -------------------------------------------------------------------------------- 1 | import pygraphviz as pgv 2 | 3 | import scale 4 | 5 | def load_immunitree_nodes(infile): 6 | G = pgv.AGraph(strict=True,directed=True) 7 | with open(infile,'r') as ip: 8 | ip.next() # burn header 9 | for line in ip: 10 | data = [d.strip() for d in line.split(',')] 11 | 12 | node = data[0] 13 | parent = data[1] 14 | size = int(data[2]) 15 | muts = len(data[-1].split('-')) 16 | 17 | G.add_node(node,xlabel="[%s] %i" % (node,size),size=size) 18 | 
if parent != '0': 19 | G.add_edge(parent,node,label=muts) 20 | 21 | return G 22 | 23 | def format_immunitree_graph(G): 24 | min_size = max(min([int(node.attr['size']) for node in G.nodes_iter()]),1) 25 | max_size = max([int(node.attr['size']) for node in G.nodes_iter()]) 26 | min_area = 0.3 27 | max_area = 1.3 28 | area_scale = scale.root(min_size,max_size).range(min_area,max_area).power(2) 29 | for node in G.nodes_iter(): 30 | node.attr['fixedsize'] = True 31 | if int(node.attr['size']) == 0: 32 | node.attr['shape'] = 'point' 33 | else: 34 | node.attr['shape'] = 'circle' 35 | node.attr['height'] = area_scale(int(node.attr['size'])) 36 | 37 | for edge in G.edges_iter(): 38 | pass 39 | 40 | G.graph_attr['forcelabels'] = True 41 | G.layout(prog='dot') 42 | -------------------------------------------------------------------------------- /lsf.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import time 4 | 5 | # =================== 6 | # = LSF Dispatching = 7 | # =================== 8 | 9 | def submit_to_LSF(queue, LSFopfile, duration, cmd_to_submit, mem_usage=None): 10 | # wrap command to submit in quotations 11 | cmd_to_submit = r"'%s'" % cmd_to_submit.strip(r'"') 12 | LSF_params = {'LSFoutput': LSFopfile, 13 | 'queue': queue, 14 | 'duration': duration} 15 | LSF_cmd = 'rbsub -q%(queue)s -W %(duration)s -o%(LSFoutput)s' % LSF_params 16 | if mem_usage != None: 17 | LSF_cmd += r' -R "rusage[mem=%d]"' % mem_usage 18 | cmd = ' '.join([LSF_cmd, cmd_to_submit]) 19 | p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) 20 | #p.wait() 21 | return p.stdout.read().split('<')[1].split('>')[0] 22 | 23 | def parse_LSF_report(filename): 24 | jobID = -1 25 | finished = False 26 | succeeded = False 27 | 28 | ip = open(filename) 29 | for line in ip: 30 | if line.startswith('Subject:') and 'Job' in line: 31 | jobID = line.split()[2].rstrip(':') 32 | if 'Done' in line or 'Exited' in line: 33 | finished = True 34 | if 'TERM_REQUEUE_ADMIN' in line: # for when rbsub requeues 35 | finished = False 36 | if 'Successfully completed.' 
in line: 37 | succeeded = True 38 | ip.close() 39 | 40 | return (jobID,finished,succeeded) 41 | 42 | def wait_for_LSF_jobs(jobIDs,logfiles,interval=120): 43 | while len(jobIDs) > 0: 44 | time.sleep(interval) 45 | # parse logfiles to see which jobs finished in the interim 46 | for logfile in logfiles: 47 | if not os.path.exists(logfile): # (job not finished) 48 | continue 49 | (jobID,finished,succeeded) = parse_LSF_report(logfile) 50 | if jobID != -1 and finished and succeeded: 51 | jobIDs.remove(jobID) 52 | logfiles.remove(logfile) 53 | elif jobID != -1 and finished and not succeeded: 54 | raise ValueError, "Job %s failed" % jobID 55 | 56 | # DEPRECATED: USES bjobs TO TEST FOR JOB COMPLETION 57 | # def wait_for_LSF_jobs(PIDs,interval=30): 58 | # finished = False 59 | # while not finished: 60 | # time.sleep(interval) 61 | # p = subprocess.Popen('bjobs',shell=True,stdout=subprocess.PIPE) 62 | # #p.wait() 63 | # status = p.stdout.read().split('\n') 64 | # if status[0].split()[0] != 'JOBID': 65 | # finished = False 66 | # continue 67 | # runningprocesses = [line.split()[0] for line in status if line.split() != [] and line.split()[0] != 'JOBID'] 68 | # finished = True 69 | # for pid in PIDs: 70 | # if pid in runningprocesses: 71 | # finished = False -------------------------------------------------------------------------------- /mplextensions.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import numpy as np 4 | import scipy as sp 5 | import scipy.stats 6 | 7 | import matplotlib as mpl 8 | import matplotlib.pyplot as plt 9 | 10 | def jitter(data, bins=100): 11 | data = np.asarray(data) 12 | (hist,edges) = np.histogram(data,bins=bins) 13 | hist = np.float_(hist) / max(hist) 14 | idxs = np.searchsorted(edges[:-2],data) 15 | return hist[idxs] 16 | 17 | def jitter_x(x,y,width=None,bins=100): 18 | x = np.asarray(x) 19 | y = np.asarray(y) 20 | 21 | x_argsort = np.argsort(x) 22 | x_uniq = sorted(list(set(x))) 23 | 24 | # find smallest interval between any two x-values 25 | if width == None: 26 | if len(x_uniq) == 1: 27 | width = 1. 28 | else: 29 | interval = min([x[x_argsort[i+1]]-x[x_argsort[i]] for i in xrange(len(x)-1)]) 30 | width = interval / 3. 31 | 32 | x_jit = [] 33 | y_jit = [] 34 | for val in x_uniq: 35 | idx = (x==val) 36 | scaling_factors = jitter(y[idx],bins=bins) 37 | for (x_val,y_val,scaling) in zip(x[idx],y[idx],scaling_factors): 38 | x_jit.append( x_val + width * scaling * random.choice([-1,1]) * np.random.uniform(0,1)) 39 | y_jit.append( y_val ) 40 | 41 | return (x_jit,y_jit) 42 | 43 | 44 | # def jitter_x(x,y,width=None): 45 | # x = np.asarray(x) 46 | # y = np.asarray(y) 47 | # 48 | # x_argsort = np.argsort(x) 49 | # x_uniq = sorted(list(set(x))) 50 | # 51 | # # find smallest interval between any two x-values 52 | # if width == None: 53 | # interval = min([x[x_argsort[i+1]]-x[x_argsort[i]] for i in xrange(len(x)-1)]) 54 | # width = interval / 3. 
55 | # 56 | # x_jit = [] 57 | # y_jit = [] 58 | # for val in x_uniq: 59 | # idx = (x==val) 60 | # kernel = sp.stats.kde.gaussian_kde(y[idx]) 61 | # kernel_max = max([kernel(v) for v in set(y[idx])]) 62 | # for (x_val,y_val) in zip(x[idx],y[idx]): 63 | # x_jit.append( x_val + np.random.uniform(-1,1) * width * kernel(y_val) / kernel_max) 64 | # y_jit.append( y_val ) 65 | # 66 | # return (x_jit,y_jit) 67 | 68 | 69 | class ConstWidthRectangle(mpl.patches.Patch): 70 | 71 | def __init__(self, x, y1, y2, w, **kwargs): 72 | self.x = x 73 | self.y1 = y1 74 | self.y2 = y2 75 | self.w = w 76 | mpl.patches.Patch.__init__(self,**kwargs) 77 | 78 | def get_path(self): 79 | return mpl.path.Path.unit_rectangle() 80 | 81 | def get_transform(self): 82 | box = np.array([[self.x,self.y1], 83 | [self.x,self.y2]]) 84 | box = self.axes.transData.transform(box) 85 | 86 | w = self.w * self.axes.bbox.width / 2.0 87 | 88 | box[0,0] -= w 89 | box[1,0] += w 90 | 91 | return mpl.transforms.BboxTransformTo(mpl.transforms.Bbox(box)) 92 | 93 | class ConstWidthLine(mpl.lines.Line2D): 94 | 95 | def __init__(self,x,y,w,**kwargs): 96 | self.x = x 97 | self.y = y 98 | self.w = w 99 | mpl.lines.Line2D.__init__(self,[0,1],[0,0],**kwargs) # init to unit line 100 | 101 | def get_transform(self): 102 | # define transform that takes unit horiz line seg 103 | # and places it in correct position using display 104 | # coords 105 | 106 | box = np.array([[self.x,self.y], 107 | [self.x,self.y+1]]) 108 | box = self.axes.transData.transform(box) 109 | 110 | w = self.w * self.axes.bbox.width / 2.0 111 | 112 | box[0,0] -= w 113 | box[1,0] += w 114 | 115 | #xdisp,ydisp = self.axes.transData.transform_point([self.x,self.y]) 116 | #xdisp -= w 117 | #xleft = xdisp - w 118 | #xright = xdisp + w 119 | 120 | return mpl.transforms.BboxTransformTo(mpl.transforms.Bbox(box)) 121 | #return mpl.transforms.Affine2D().scale(w,1).translate(xdisp,ydisp) 122 | 123 | def draw(self,renderer): 124 | # the ONLY purpose of redefining this function is to force the Line2D 125 | # object to execute recache(). 
Otherwise, certain changes in the scale 126 | # do not invalidate the Line2D object, and the transform will not be 127 | # recomputed (and so the Axes coords computed earlier will be obsolete) 128 | self.recache() 129 | return mpl.lines.Line2D.draw(self,renderer) 130 | 131 | 132 | class ConstHeightRectangle(mpl.patches.Patch): 133 | 134 | def __init__(self, x1, x2, y, h, **kwargs): 135 | self.x1 = x1 136 | self.x2 = x2 137 | self.y = y 138 | self.h = h 139 | mpl.patches.Patch.__init__(self,**kwargs) 140 | 141 | def get_path(self): 142 | return mpl.path.Path.unit_rectangle() 143 | 144 | def get_transform(self): 145 | box = np.array([[self.x1,self.y], 146 | [self.x2,self.y]]) 147 | box = self.axes.transData.transform(box) 148 | 149 | h = self.h * self.axes.bbox.height / 2.0 150 | 151 | box[0,1] -= h 152 | box[1,1] += h 153 | 154 | return mpl.transforms.BboxTransformTo(mpl.transforms.Bbox(box)) 155 | 156 | class ConstHeightLine(mpl.lines.Line2D): 157 | 158 | def __init__(self,x,y,h,**kwargs): 159 | self.x = x 160 | self.y = y 161 | self.h = h 162 | mpl.lines.Line2D.__init__(self,[0,0],[0,1],**kwargs) # init to unit line 163 | 164 | # self.x = x 165 | # self.y = y 166 | # self.w = w 167 | # mpl.lines.Line2D.__init__(self,[0,1],[0,0],**kwargs) # init to unit line 168 | 169 | def get_transform(self): 170 | # define transform that takes unit horiz line seg 171 | # and places it in correct position using display 172 | # coords 173 | 174 | box = np.array([[self.x,self.y], 175 | [self.x+1,self.y]]) 176 | box = self.axes.transData.transform(box) 177 | 178 | h = self.h * self.axes.bbox.height / 2.0 179 | 180 | box[0,1] -= h 181 | box[1,1] += h 182 | 183 | #xdisp,ydisp = self.axes.transData.transform_point([self.x,self.y]) 184 | #xdisp -= w 185 | #xleft = xdisp - w 186 | #xright = xdisp + w 187 | 188 | return mpl.transforms.BboxTransformTo(mpl.transforms.Bbox(box)) 189 | #return mpl.transforms.Affine2D().scale(w,1).translate(xdisp,ydisp) 190 | 191 | def draw(self,renderer): 192 | # the ONLY purpose of redefining this function is to force the Line2D 193 | # object to execute recache(). 
Otherwise, certain changes in the scale 194 | # do not invalidate the Line2D object, and the transform will not be 195 | # recomputed (and so the Axes coords computed earlier will be obsolete) 196 | self.recache() 197 | return mpl.lines.Line2D.draw(self,renderer) 198 | 199 | 200 | def boxplot(ax, x, positions=None, widths=None, vert=1): 201 | # adapted from matplotlib 202 | 203 | # convert x to a list of vectors 204 | if hasattr(x, 'shape'): 205 | if len(x.shape) == 1: 206 | if hasattr(x[0], 'shape'): 207 | x = list(x) 208 | else: 209 | x = [x,] 210 | elif len(x.shape) == 2: 211 | nr, nc = x.shape 212 | if nr == 1: 213 | x = [x] 214 | elif nc == 1: 215 | x = [x.ravel()] 216 | else: 217 | x = [x[:,i] for i in xrange(nc)] 218 | else: 219 | raise ValueError, "input x can have no more than 2 dimensions" 220 | if not hasattr(x[0], '__len__'): 221 | x = [x] 222 | col = len(x) 223 | 224 | # get some plot info 225 | if positions is None: 226 | positions = range(1, col + 1) 227 | if widths is None: 228 | widths = min(0.3/len(positions),0.05) 229 | if isinstance(widths, float) or isinstance(widths, int): 230 | widths = np.ones((col,), float) * widths 231 | 232 | # loop through columns, adding each to plot 233 | for i,pos in enumerate(positions): 234 | d = np.ravel(x[i]) 235 | row = len(d) 236 | if row==0: 237 | # no data, skip this position 238 | continue 239 | # get distrib info 240 | q1, med, q3 = mpl.mlab.prctile(d,[25,50,75]) 241 | dmax = np.max(d) 242 | dmin = np.min(d) 243 | 244 | line_color = '#074687' 245 | face_color = '#96B7EC' 246 | if vert == 1: 247 | medline = ConstWidthLine(pos,med,widths[i],color=line_color,zorder=3) 248 | box = ConstWidthRectangle(pos,q1,q3,widths[i],facecolor=face_color,edgecolor=line_color,zorder=2) 249 | vertline = mpl.lines.Line2D([pos,pos],[dmin,dmax],color=line_color,zorder=1) 250 | else: 251 | medline = ConstHeightLine(med,pos,widths[i],color=line_color,zorder=3) 252 | box = ConstHeightRectangle(q1,q3,pos,widths[i],facecolor=face_color,edgecolor=line_color,zorder=2) 253 | vertline = mpl.lines.Line2D([dmin,dmax],[pos,pos],color=line_color,zorder=1) 254 | 255 | ax.add_line(vertline) 256 | ax.add_patch(box) 257 | ax.add_line(medline) 258 | 259 | 260 | # define colormap for -1 to 1 (green-black-red) like gene expression 261 | _redgreencdict = {'red': [(0.0, 0.0, 0.0), 262 | (0.5, 0.0, 0.0), 263 | (1.0, 1.0, 0.0)], 264 | 265 | 'green':[(0.0, 0.0, 1.0), 266 | (0.5, 0.0, 0.0), 267 | (1.0, 0.0, 0.0)], 268 | 269 | 'blue': [(0.0, 0.0, 0.0), 270 | (0.5, 0.0, 0.0), 271 | (1.0, 0.0, 0.0)]} 272 | 273 | redgreen = mpl.colors.LinearSegmentedColormap('redgreen',_redgreencdict,256) 274 | redgreen.set_bad(color='w') 275 | 276 | 277 | def compute_log_view_lim(data): 278 | lo_lim = 10**np.floor(np.log10(np.min(data))) 279 | hi_lim = 10**np.ceil(np.log10(np.max(data))) 280 | return (lo_lim, hi_lim) 281 | 282 | def generate_counthist(counts, label, view_lim=[1e-6,1e0,1e0,1e5]): 283 | """Generate count size histogram. 
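    A usage sketch (made-up clone counts; the output file name is just an
    example):
        fig = generate_counthist({'cloneA': 120, 'cloneB': 3, 'cloneC': 1}, 'sample 1')
        fig.savefig('counthist.png')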
284 | 285 | counts -- dictionary of (key,count) pairs 286 | label -- for the legend 287 | """ 288 | max_size = max(counts.values()) 289 | num_chains = sum(counts.values()) 290 | sizes = np.arange(1,max_size+1) 291 | freqs = np.float_(sizes) / num_chains 292 | (hist,garbage) = np.histogram(counts.values(),bins=sizes) 293 | idxs = hist > 0 294 | 295 | fig = plt.figure() 296 | 297 | ax = fig.add_subplot(111) 298 | ax2 = ax.twiny() 299 | 300 | ax.spines['top'].set_position(('outward',5)) 301 | ax.spines['right'].set_visible(False) 302 | ax.spines['bottom'].set_position(('outward',5)) 303 | ax.spines['left'].set_position(('outward',5)) 304 | ax.xaxis.set_ticks_position('bottom') 305 | ax.yaxis.set_ticks_position('left') 306 | ax.plot(freqs[idxs],hist[idxs],marker='o',linestyle='None',color='#e31a1c',markeredgewidth=0,markersize=4,clip_on=False,label=label) 307 | ax.set_xscale('log') 308 | ax.set_yscale('log') 309 | ax.set_xlim(view_lim[:2]) 310 | ax.set_ylim(view_lim[2:]) 311 | 312 | ax2.spines['top'].set_position(('outward',5)) 313 | ax2.spines['right'].set_visible(False) 314 | ax2.spines['bottom'].set_visible(False) 315 | ax2.spines['left'].set_visible(False) 316 | ax2.xaxis.set_ticks_position('top') 317 | ax2.yaxis.set_ticks_position('none') 318 | ax2.set_xscale('log') 319 | ax2.set_xlim([view_lim[0]*num_chains,view_lim[1]*num_chains]) 320 | 321 | ax.set_xlabel('junction frequency (bottom) or count (top)') 322 | ax.set_ylabel('number of junctions') 323 | 324 | leg = ax.legend(loc=0,numpoints=1,prop=mpl.font_manager.FontProperties(size='small')) 325 | leg.get_frame().set_visible(False) 326 | 327 | return fig 328 | 329 | def generate_counthistline(counts, label, view_lim=[1e-6,1e0,1e0,1e5]): 330 | """Generate count size histogram. 331 | 332 | counts -- dictionary of (key,count) pairs 333 | label -- for the legend 334 | """ 335 | max_size = max(counts.values()) 336 | num_chains = sum(counts.values()) 337 | bins = np.logspace(0,np.log10(max_size),21) 338 | bins_freqs = np.float_(bins) / num_chains 339 | (hist,garbage) = np.histogram(counts.values(),bins=bins) 340 | 341 | fig = plt.figure() 342 | 343 | ax = fig.add_subplot(111) 344 | ax2 = ax.twiny() 345 | 346 | ax.spines['top'].set_position(('outward',5)) 347 | ax.spines['right'].set_visible(False) 348 | ax.spines['bottom'].set_position(('outward',5)) 349 | ax.spines['left'].set_position(('outward',5)) 350 | ax.xaxis.set_ticks_position('bottom') 351 | ax.yaxis.set_ticks_position('left') 352 | ax.plot(bins_freqs,list(hist)+[hist[-1]],color='#e31a1c',drawstyle='steps-post',clip_on=False,label=label) 353 | ax.set_xscale('log') 354 | ax.set_yscale('log') 355 | ax.set_xlim(view_lim[:2]) 356 | ax.set_ylim(view_lim[2:]) 357 | 358 | ax2.spines['top'].set_position(('outward',5)) 359 | ax2.spines['right'].set_visible(False) 360 | ax2.spines['bottom'].set_visible(False) 361 | ax2.spines['left'].set_visible(False) 362 | ax2.xaxis.set_ticks_position('top') 363 | ax2.yaxis.set_ticks_position('none') 364 | ax2.set_xscale('log') 365 | ax2.set_xlim([view_lim[0]*num_chains,view_lim[1]*num_chains]) 366 | 367 | ax.set_xlabel('junction frequency (bottom) or count (top)') 368 | ax.set_ylabel('number of junctions') 369 | 370 | leg = ax.legend(loc=0,numpoints=1,prop=mpl.font_manager.FontProperties(size='small')) 371 | leg.get_frame().set_visible(False) 372 | 373 | return fig 374 | 375 | def generate_rankaccum(counts,label,view_lim=[1e0,1e5,1e-6,1e0]): 376 | """Generate rankaccum curve. 
377 | 378 | counts -- dictionary of (key,count) pairs 379 | label -- for the legend 380 | """ 381 | num_chains = sum(counts.values()) 382 | freqs = np.float_(counts.values()) / num_chains 383 | 384 | fig = plt.figure() 385 | 386 | ax = fig.add_subplot(111) 387 | ax2 = ax.twinx() 388 | 389 | ax.spines['top'].set_visible(False) 390 | ax.spines['right'].set_position(('outward',5)) 391 | ax.spines['bottom'].set_position(('outward',5)) 392 | ax.spines['left'].set_position(('outward',5)) 393 | ax.xaxis.set_ticks_position('bottom') 394 | ax.yaxis.set_ticks_position('left') 395 | ax.plot(range(1,len(counts.values())+1),sorted(freqs,reverse=True),marker='o',linestyle='None',color='#377db8',markeredgewidth=0,markersize=4,clip_on=False,label=label) 396 | ax.set_xscale('log') 397 | ax.set_yscale('log') 398 | ax.set_xlim(view_lim[:2]) 399 | ax.set_ylim(view_lim[2:]) 400 | 401 | ax2.spines['top'].set_visible(False) 402 | ax2.spines['right'].set_position(('outward',5)) 403 | ax2.spines['bottom'].set_visible(False) 404 | ax2.spines['left'].set_visible(False) 405 | ax2.xaxis.set_ticks_position('none') 406 | ax2.yaxis.set_ticks_position('right') 407 | ax2.set_yscale('log') 408 | ax2.set_ylim([view_lim[2]*num_chains,view_lim[3]*num_chains]) 409 | 410 | ax.set_xlabel('rank') 411 | ax.set_ylabel('junction frequency (left) or count (right)') 412 | 413 | leg = ax.legend(loc=0,numpoints=1,prop=mpl.font_manager.FontProperties(size='small')) 414 | leg.get_frame().set_visible(False) 415 | 416 | return fig 417 | 418 | -------------------------------------------------------------------------------- /oligoTm.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | from Bio.Seq import Seq 4 | from Bio.SeqRecord import SeqRecord 5 | 6 | def oligoTm(seqobj): 7 | """Computes the melting temp based on the NN model. 
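    Roughly: nearest-neighbor dH/dS sums plus initiation terms, a monovalent-
    salt correction on dS, and
        Tm = 1000*dH / (dS + R*ln(C_eff)) - 273.15
    minus a DMSO correction, as implemented below (C_eff stands for the
    effective primer concentration term used in the code).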
8 | 9 | (Originated from Kun Zhang) 10 | """ 11 | 12 | if isinstance(seqobj,SeqRecord): 13 | seq = seqobj.seq.tostring().upper() 14 | elif isinstance(seqobj,Seq): 15 | seq = seqobj.tostring().upper() 16 | elif isinstance(seqobj,str): 17 | seq = seqobj.upper() 18 | 19 | # set the default Tm parameters 20 | C_primer = 250.0 # nM 21 | C_Mg = 0.0 # mM #1.5 # 10mM intracellular concentration 22 | C_MonovalentIon = 50.0 #mM #10mM Tris-Cl in 9N Ligase 23 | C_dNTP = 0.0 #mM #0.8 #mM 24 | percentage_DMSO = 0 25 | percentage_annealed = 50 26 | 27 | percentage_annealed = percentage_annealed/100.0 28 | percentage_DMSO = percentage_DMSO/100.0 29 | 30 | #Some constants 31 | R = 1.987 32 | deltaH = dict() 33 | deltaS = dict() 34 | deltaH = { "AA": -7.6, "TT": -7.6, "AT": -7.2, "TA": -7.2, "CA": -8.5, "TG": -8.5, "GT": -8.4, "AC": -8.4,"CT": -7.8, "AG": -7.8, "GA": -8.2, "TC": -8.2,"CG": -10.6,"GC": -9.8, "GG": -8.0, "CC": -8.0, "A": 2.2, "T": 2.2, "G": 0.0, "C": 0.0} 35 | deltaS = { "AA": -21.3, "TT": -21.3, "AT": -20.4, "TA": -21.3, "CA": -22.7, "TG": -22.7, "GT": -22.4, "AC": -22.4, "CT": -21.0, "AG": -21.0, "GA": -22.2, "TC": -22.2,"CG": -27.2, "GC": -24.4, "GG": -19.9, "CC":-19.9, "A": 6.9, "T": 6.9, "G": 0.0, "C": 0.0} 36 | 37 | C_SodiumEquivalent = C_MonovalentIon + 120 * math.sqrt(C_Mg-C_dNTP) 38 | seqLength = len(seq) 39 | dH = 0.2 + deltaH[str(seq[0])] + deltaH[str(seq[len(seq)-1])] 40 | dS = -5.7 + deltaS[seq[0]] + deltaS[seq[len(seq)-1]] 41 | for i in range(0, seqLength - 1): 42 | dH += deltaH[str(seq[i:i+2])] 43 | dS += deltaS[seq[i:i+2]] 44 | dS = dS + 0.368 * seqLength * math.log(C_SodiumEquivalent/1000.0) 45 | #val = math.log(C_primer*(1-percentage_annealed)/percentage_annealed) 46 | Tm =(dH * 1000) / (dS + R * (math.log(C_primer*(1-percentage_annealed)/percentage_annealed)-21.4164)) - 273.15 - 0.75*percentage_DMSO 47 | return Tm 48 | 49 | oligo_Tm = oligoTm -------------------------------------------------------------------------------- /primers.py: -------------------------------------------------------------------------------- 1 | import oligoTm 2 | import unafold 3 | import seqtools 4 | 5 | def generate_candidates(seq,minlen=18,maxlen=30): 6 | candidates = [] 7 | for start in xrange(len(seq)): 8 | length = minlen 9 | while length <= maxlen and start+length <= len(seq): 10 | candidates.append( seq[start:start+length] ) 11 | length += 1 12 | return candidates 13 | 14 | def choose_PCR_primer(seq,target_Tm=62.): 15 | candidates = generate_candidates(seq) 16 | 17 | # filter for Tm 18 | candidates = filter(lambda s: abs(oligoTm.oligoTm(s) - target_Tm) <= 2, candidates) 19 | if len(candidates) == 0: 20 | raise ValueError, "No primer candidates meet Tm cutoffs" 21 | 22 | # filter for 0.4-0.6 GC content 23 | candidates = filter(lambda s: abs(seqtools.gc_content(s) - 0.5) <= 0.1,candidates) 24 | if len(candidates) == 0: 25 | raise ValueError, "No primer candidates meet GC content cutoffs" 26 | 27 | # rank on secondary structure minimization 28 | candidates.sort(key=unafold.hybrid_ss_min) 29 | 30 | return candidates[0] 31 | -------------------------------------------------------------------------------- /pyutils.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import string 3 | import collections 4 | import contextlib 5 | 6 | @contextlib.contextmanager 7 | def as_handle(handleish, mode='r', **kwargs): 8 | """Open handleish as file. 
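    Works with either a path or an already-open handle, e.g. (hypothetical
    file name):
        with as_handle('reads.fasta') as fp:
            data = fp.read()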
9 | 10 | Stolen from Biopython 11 | """ 12 | if isinstance(handleish, basestring): 13 | with open(handleish, mode, **kwargs) as fp: 14 | yield fp 15 | else: 16 | yield handleish 17 | 18 | # for generating 'safe' filenames from identifiers 19 | cleanup_table = string.maketrans('/*|><+ ','_____p_') 20 | def cleanup_id(identifier): 21 | return identifier.translate(cleanup_table) 22 | 23 | 24 | class nesteddict(collections.defaultdict): 25 | """Nested dictionary structure. 26 | 27 | Based on Stack Overflow question 635483 28 | """ 29 | def __init__(self,default=None): 30 | if default == None: 31 | collections.defaultdict.__init__(self, nesteddict) 32 | else: 33 | collections.defaultdict.__init__(self, default) 34 | self.locked = False 35 | 36 | def lock(self): 37 | # self.default_factory = raiseKeyError 38 | self.default_factory = None 39 | self.locked = True 40 | for value in self.itervalues(): 41 | if isinstance(value, nesteddict): 42 | value.lock() 43 | 44 | def unlock(self): 45 | self.default_factory = nesteddict 46 | self.locked = False 47 | for value in self.itervalues(): 48 | if isinstance(value, nesteddict): 49 | value.unlock() 50 | 51 | def islocked(self): 52 | return self.locked 53 | 54 | def todict(self): 55 | raise NotImplementedError 56 | for (key,val) in self.iteritems(): 57 | if isinstance(val,nesteddict): 58 | val.todict() 59 | self[key] = dict(val) 60 | self = dict(self) 61 | 62 | @staticmethod 63 | def asdict(d): 64 | d = copy.deepcopy(d) 65 | for (key,val) in d.iteritems(): 66 | if isinstance(val,nesteddict): 67 | d[key] = nesteddict.asdict(val) 68 | return dict(d) 69 | 70 | def nested_setdefault(self,keylist,default): 71 | curr_dict = self 72 | for key in keylist[:-1]: 73 | curr_dict = curr_dict[key] 74 | key = keylist[-1] 75 | return curr_dict.setdefault(key,default) 76 | 77 | def nested_get(self,keylist,default): 78 | curr_dict = self 79 | for key in keylist[:-1]: 80 | curr_dict = curr_dict[key] 81 | key = keylist[-1] 82 | return curr_dict.get(key,default) 83 | 84 | def nested_assign(self,keylist,val): 85 | curr_dict = self 86 | for key in keylist[:-1]: 87 | curr_dict = curr_dict[key] 88 | key = keylist[-1] 89 | curr_dict[key] = val 90 | return self 91 | 92 | def walk(self): 93 | for (key,value) in self.iteritems(): 94 | if isinstance(value, nesteddict): 95 | for tup in value.walk(): 96 | yield (key,) + tup 97 | else: 98 | yield (key,value) 99 | 100 | # these functions below implement special cases of nesteddict, where the 101 | # deepest-level dict is of a particular type (e.g., int for counter, set 102 | # for uniq objects, etc.) 103 | # 104 | # These functions could be implemented with nested_setdefault and 105 | # nested_get, but would be less efficient since they would have to 106 | # traverse the dict structure more times. 
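    # For example (hypothetical keys), after
    #     d = nesteddict()
    #     d.nested_increment(('sampleA', 'IGHV1-2'), 3)
    #     d.nested_add(('sampleA', 'reads'), 'read_0001')
    # we expect d['sampleA']['IGHV1-2'] == 3 and
    # d['sampleA']['reads'] == set(['read_0001']).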
107 | 108 | def nested_increment(self,keylist,increment=1): 109 | curr_dict = self 110 | for key in keylist[:-1]: 111 | curr_dict = curr_dict[key] 112 | key = keylist[-1] 113 | curr_dict[key] = curr_dict.get(key,0) + increment 114 | 115 | def nested_add(self,keylist,obj): 116 | curr_dict = self 117 | for key in keylist[:-1]: 118 | curr_dict = curr_dict[key] 119 | key = keylist[-1] 120 | curr_dict.setdefault(key,set()).add(obj) 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | # class ModuleWrapper(object): 131 | # """Wrap a module to allow user-defined __getattr__ 132 | # 133 | # see http://stackoverflow.com/questions/2447353/getattr-on-a-module 134 | # """ 135 | # def __init__(self, module, usergetattr): 136 | # self.module = module 137 | # self.usergetattr = usergetattr 138 | # 139 | # def __getattr__(self, name): 140 | # return self.usergetattr(self,name) 141 | -------------------------------------------------------------------------------- /qPCR2melting.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import matplotlib as mpl 4 | import matplotlib.pyplot as plt 5 | 6 | def qPCR2melting(inputfile,output_formats): 7 | outputbasename = os.path.splitext(os.path.basename(inputfile))[0] 8 | 9 | # Learn some things about the data: 10 | # How many curves are there? 11 | ip = open(inputfile,'r') 12 | for line in ip: 13 | if line.startswith('Step'): 14 | # Verify the fields in the line: 15 | fields = line.split(',') 16 | if fields[0] != 'Step' or fields[1] != 'Cycle' or fields[2] != 'Dye' or fields[3] != 'Temp.': 17 | raise ValueError, 'Expected line like: "Step,Cycle,Dye,Temp.,..."' 18 | curve_labels = fields[4:-1] # (skip the above four fields and last extra comma) 19 | break 20 | # What step is the melting at? 21 | for line in ip: # advance to data set characterization 22 | if line.strip() == 'Analysis Options': 23 | break 24 | for line in ip: 25 | if line.startswith("Step") and "Melting Curve" in line: 26 | line_id = line.split()[1].strip(':') 27 | break 28 | ip.close() 29 | 30 | # Create data structures 31 | temps = [] 32 | curves = [[] for curve in curve_labels] 33 | 34 | # Load the data 35 | ip = open(inputfile,'r') 36 | for line in ip: # advance to data 37 | if line.startswith('Step'): 38 | break 39 | for line in ip: 40 | if line.strip() == '': 41 | break 42 | if line.split(',')[0] == line_id: 43 | temps.append(float(line.split(',')[3])) 44 | data = map(float,line.split(',')[4:-1]) 45 | for (i,value) in enumerate(data): 46 | curves[i].append(value) 47 | 48 | # Make the plots 49 | fig = plt.figure() 50 | ax = fig.add_subplot(111) 51 | for (label,curve) in zip(curve_labels,curves): 52 | ax.plot(temps,curve,label=label) 53 | ax.legend(loc=2) 54 | ax.set_xlabel('Temperature') 55 | ax.set_ylabel('Fluorescence (a.u.)') 56 | for format in output_formats: 57 | fig.savefig(outputbasename+'.melting.'+format) 58 | 59 | if __name__ == '__main__': 60 | import sys 61 | import optparse 62 | 63 | output_formats = set() 64 | def append_format(option,opt_str,value,parser): 65 | output_formats.add(opt_str.strip('-')) 66 | 67 | option_parser = optparse.OptionParser() 68 | option_parser.add_option('--png',action='callback',callback=append_format) 69 | option_parser.add_option('--pdf',action='callback',callback=append_format) 70 | option_parser.add_option('--eps',action='callback',callback=append_format) 71 | (options,args) = option_parser.parse_args() 72 | 73 | if len(args) != 1: 74 | raise ValueError, "Must give a single file as input." 
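    # Typical invocation (hypothetical export file from the qPCR instrument):
    #     python qPCR2melting.py --pdf --png run1_export.csv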
75 | 76 | output_formats = list(output_formats) 77 | if output_formats == []: 78 | output_formats.append('pdf') 79 | output_formats.append('png') 80 | inputfile = args[0] 81 | 82 | qPCR2melting(inputfile,output_formats) 83 | -------------------------------------------------------------------------------- /qPCR2quantitation.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import matplotlib as mpl 4 | import matplotlib.pyplot as plt 5 | 6 | def qPCR2quantitation(inputfile,output_formats): 7 | outputbasename = os.path.splitext(os.path.basename(inputfile))[0] 8 | 9 | # Learn some things about the data: 10 | # How many curves are there? 11 | ip = open(inputfile,'r') 12 | for line in ip: 13 | if line.startswith('Step'): 14 | # Verify the fields in the line: 15 | fields = line.split(',') 16 | if fields[0] != 'Step' or fields[1] != 'Cycle' or fields[2] != 'Dye' or fields[3] != 'Temp.': 17 | raise ValueError, 'Expected line like: "Step,Cycle,Dye,Temp.,..."' 18 | curve_labels = fields[4:-1] # (skip the above four fields and last extra comma) 19 | break 20 | # What step is the quantitation at? 21 | for line in ip: # advance to data set characterization 22 | if line.strip() == 'Analysis Options': 23 | break 24 | for line in ip: 25 | if line.startswith("Step") and "Quantitation" in line: 26 | line_id = line.split()[1].strip(':') 27 | break 28 | ip.close() 29 | 30 | # Create data structures 31 | cycles = [] 32 | curves = [[] for curve in curve_labels] 33 | 34 | # Load the data 35 | ip = open(inputfile,'r') 36 | for line in ip: # advance to data 37 | if line.startswith('Step'): 38 | break 39 | for line in ip: 40 | if line.strip() == '': 41 | break 42 | if line.split(',')[0] == line_id: 43 | cycles.append(int(line.split(',')[1])) 44 | data = map(float,line.split(',')[4:-1]) 45 | for (i,value) in enumerate(data): 46 | curves[i].append(value) 47 | 48 | # Make the plots 49 | fig = plt.figure() 50 | ax = fig.add_subplot(111) 51 | for (label,curve) in zip(curve_labels,curves): 52 | ax.plot(cycles,curve,label=label) 53 | ax.legend(loc=2) 54 | ax.set_xlabel('Cycles') 55 | ax.set_ylabel('Fluorescence (a.u.)') 56 | for format in output_formats: 57 | fig.savefig(outputbasename+'.quantitation.'+format) 58 | 59 | if __name__ == '__main__': 60 | import sys 61 | import optparse 62 | 63 | output_formats = set() 64 | def append_format(option,opt_str,value,parser): 65 | output_formats.add(opt_str.strip('-')) 66 | 67 | option_parser = optparse.OptionParser() 68 | option_parser.add_option('--png',action='callback',callback=append_format) 69 | option_parser.add_option('--pdf',action='callback',callback=append_format) 70 | option_parser.add_option('--eps',action='callback',callback=append_format) 71 | (options,args) = option_parser.parse_args() 72 | 73 | if len(args) != 1: 74 | raise ValueError, "Must give a single file as input." 
75 | 76 | output_formats = list(output_formats) 77 | if output_formats == []: 78 | output_formats.append('pdf') 79 | output_formats.append('png') 80 | inputfile = args[0] 81 | 82 | qPCR2quantitation(inputfile,output_formats) 83 | -------------------------------------------------------------------------------- /sanger.py: -------------------------------------------------------------------------------- 1 | import exonerate 2 | 3 | standard_primers = { # 5' -> 3' 4 | 'M13R' : 'caggaaacagctatgac', 5 | 'M13F-20' : 'gtaaaacgacggccag', 6 | 'T3' : 'attaaccctcactaaaggga', 7 | 'T7' : 'taatacgactcactataggg' 8 | } 9 | 10 | standard_vectors = { 11 | 'pCR4-TOPO-left' : 'catgattacgccaagctcagaattaaccctcactaaagggactagtcctgcaggtttaaacgaattcgccctt', 12 | 'pCR4-TOPO-right' : 'aagggcgaattcgcggccgctaaattcaattcgccctatagtgagtcgtattacaattca', 13 | 'pCR4Blunt-TOPO-left' : 'catgattacgccaagctcagaattaaccctcactaaagggactagtcctgcaggtttaaacgaattcgccctt', 14 | 'pCR4Blunt-TOPO-right' : 'aagggcgaattcgcggccgctaaattcaattcgccctatagtgagtcgtattacaattca' 15 | } 16 | 17 | 18 | 19 | 20 | def trimleft(left,read): 21 | """Align 2 seqs, forcing alignment of right-end of left. 22 | 23 | ...RRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR... 24 | ...LLLLLLLLLLLLLLLLLLLLLLLLLLL <- (forced aln here) 25 | 26 | Uses exonerate. 27 | """ 28 | # perform alignment 29 | cmd = exonerate.ExonerateCommand('findend','parsable','bestonly') 30 | rawaln = exonerate.run_exonerate2(cmd,left,read) 31 | if rawaln == '': return read 32 | aln = exonerate.parse_aln(rawaln) 33 | 34 | # check that the right-end of left was successfully placed 35 | if aln['query_len'] != aln['query_aln_end']: 36 | raise ValueError, "failed to align right-end of left sequence" 37 | 38 | # check that both strands are + orientation 39 | if aln['query_strand'] != '+': 40 | raise ValueError, "query strand has been reversed" 41 | if aln['target_strand'] != '+': 42 | raise ValueError, "target strand has been reversed" 43 | 44 | # return trimmed sequence 45 | return read[aln['target_aln_end']:] 46 | 47 | def trimright(right,read): 48 | """Align 2 seqs, forcing alignment of left-end of right. 49 | 50 | ...DDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD... 51 | (forced aln here) -> RRRRRRRRRRRRRRRRRRRRRRRRRRRRR... 52 | 53 | Uses exonerate. 54 | """ 55 | # perform alignment 56 | cmd = exonerate.ExonerateCommand('findend','parsable','bestonly') 57 | rawaln = exonerate.run_exonerate2(cmd,right,read) 58 | if rawaln == '': return read 59 | aln = exonerate.parse_aln(rawaln) 60 | 61 | # check that the left-end of right was successfully placed 62 | if aln['query_aln_begin'] != 0: 63 | raise ValueError, "failed to align left-end of right sequence" 64 | 65 | # check that both strands are + orientation 66 | if aln['query_strand'] != '+': 67 | raise ValueError, "query strand has been reversed" 68 | if aln['target_strand'] != '+': 69 | raise ValueError, "target strand has been reversed" 70 | 71 | # return trimmed sequence 72 | return read[:aln['target_aln_begin']] 73 | 74 | # =============== 75 | # = UNFINISHED: = 76 | # =============== 77 | 78 | def bidirectional_alignment(forward,reverse): 79 | """Align forward and reverse sequence of bidirectional Sanger reads. 80 | 81 | forward and reverse sequences must already be in the same 'sense' (i.e., 82 | reverse should be revcomped if necessary so that both strands in alignment 83 | are '+'). 84 | 85 | Forces alignment of both ends (right end of forward, and left end of 86 | reverse). 87 | 88 | ...FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF 89 | RRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR... 
90 | 91 | Uses exonerate. 92 | """ 93 | # perform alignment 94 | cmd = exonerate.ExonerateCommand('findend','parsable','bestonly') 95 | rawaln = exonerate.run_exonerate2(cmd,forward,reverse) 96 | aln = exonerate.parse_aln(rawaln) 97 | 98 | # check that both strands are + orientation 99 | if aln['query_strand'] != '+': 100 | raise ValueError, "query strand has been reversed" 101 | if aln['target_strand'] != '+': 102 | raise ValueError, "target strand has been reversed" 103 | 104 | # check that right end of forward and left end of reverse are placed 105 | if aln['query_len'] != aln['query_aln_end']: 106 | raise ValueError, "failed to align right-end of forward sequence" 107 | if aln['query_aln_begin'] != 0: 108 | raise ValueError, "failed to align left-end of right sequence" 109 | -------------------------------------------------------------------------------- /scale.py: -------------------------------------------------------------------------------- 1 | import math 2 | import types 3 | import numbers 4 | import bisect 5 | 6 | import numpy as np 7 | 8 | def is_iterable(x): 9 | try: 10 | iter(x) 11 | return True 12 | except TypeError: 13 | return False 14 | 15 | class quantitative(object): 16 | """Implement abstract quantitative scale.""" 17 | 18 | def __init__(self, *args): 19 | self._domain = [0,1] 20 | self._range = [0,1] 21 | 22 | self._transform = lambda x: x 23 | self._inverse = lambda y: y 24 | 25 | self.domain(*args) 26 | 27 | def _in_domain(self,x): 28 | return (x >= min(self._domain)) and (x <= max(self._domain)) 29 | 30 | def _in_range(self,y): 31 | return (y >= min(self._range)) and (y <= max(self._range)) 32 | 33 | def __call__(self,x): 34 | if not self._in_domain(x): 35 | raise ValueError, "outside domain" 36 | segment = bisect.bisect_right(self._domain,x) - 1 37 | if segment + 1 == len(self._domain): segment -= 1 # deal with extra endpoint (fully closed interval), e.g., [0,1) [1,2) [2,3] 38 | return (self._transform(x) - self._transform(self._domain[segment])) / (self._transform(self._domain[segment+1]) - self._transform(self._domain[segment])) * (self._range[segment+1] - self._range[segment]) + self._range[segment] 39 | 40 | def domain(self,*args): 41 | if len(args) == 0: 42 | return self._domain 43 | elif is_iterable(args[0]): # given array of data from which to determine domain 44 | if len(args[0]) < 2: raise ValueError, "domain specification needs at least two numbers" 45 | self._domain = [np.min(args[0]),np.max(args[0])] 46 | else: # given explicit values for piecewise domain 47 | if len(args) != len(set(args)): 48 | raise ValueError, "domain values must be unique" 49 | if list(args) != sorted(list(args)) and list(args)[::-1] != sorted(list(args)): # FIGURE THIS OUT 50 | raise ValueError, "domain values must be sorted" 51 | self._domain = args 52 | 53 | self._domain = map(float,self._domain) 54 | map(self._transform,self._domain) # test that transform is defined on domain 55 | 56 | return self 57 | 58 | def range(self,*args): 59 | if len(args) == 0: 60 | return self._range 61 | elif is_iterable(args[0]): # given array of data from which to determine range 62 | if len(args[0]) != len(self._domain): raise ValueError, "range specification needs at least two numbers" 63 | self._range = [np.min(args[0]),np.max(args[0])] 64 | else: # given explicit values for piecewise range 65 | if len(args) != len(set(args)): 66 | raise ValueError, "range values must be unique" 67 | if list(args) != sorted(list(args)) and list(args)[::-1] != sorted(list(args)): # FIGURE THIS OUT 68 | raise 
ValueError, "range values must be sorted" 69 | self._range = args 70 | 71 | if len(args) != len(self._domain): 72 | raise ValueError, "range specification must have same number of points as domain" 73 | 74 | return self 75 | 76 | def invert(self,y): 77 | if not self._in_range(x): 78 | raise ValueError, "outside range" 79 | segment = bisect.bisect_right(self._range,y) - 1 80 | if segment == len(self._range): segment -= 1 # deal with extra endpoint (fully closed interval), e.g., [0,1) [1,2) [2,3] 81 | return self._inverse((y - self._range[segment]) / (self._range[segment+1] - self._range[segment]) * (self._transform(self._domain[segment+1]) - self._transform(self._domain[segment])) + self._transform(self._domain[segment])) 82 | 83 | linear = quantitative 84 | 85 | class log(quantitative): 86 | """Implementation of log scale""" 87 | 88 | def __init__(self, *args): 89 | self._domain = [1,10] 90 | quantitative.__init__(self,*args) 91 | self.base(10) 92 | 93 | def base(self,*args): 94 | if len(args) == 0: 95 | return self._base 96 | else: 97 | self._base = args[0] 98 | self._logbase = math.log(self._base) 99 | self._transform = lambda x: math.log(x) / self._logbase 100 | self._inverse = lambda y: self._base ** y 101 | return self 102 | 103 | class root(quantitative): 104 | """root scale""" 105 | 106 | def __init__(self, *args): 107 | quantitative.__init__(self,*args) 108 | self.power(2) 109 | 110 | def power(self,*args): 111 | if len(args) == 0: 112 | return self._power 113 | else: 114 | self._power = args[0] 115 | self._transform = lambda x: x**(1./self._power) 116 | self._inverse = lambda y: y**self._power 117 | return self 118 | 119 | 120 | 121 | # class ordinal(object): 122 | # """Implementation for ordinal scale""" 123 | # 124 | # def __init__(self, *args): 125 | # Scale.__init__(self) 126 | # self._domain = [] 127 | # self._indices = {} 128 | # self._range = [] 129 | # self._band = 0 130 | # self.domain(*args) 131 | # return self 132 | # 133 | # def scale(self,x): 134 | # if x not in self._indices: 135 | # self._domain.append(x) 136 | # self._indices[x] = len(self._domain) - 1 137 | # return self._range[ self._indices[x] % len(self._range) ] 138 | # 139 | # def domain(self,*args): 140 | # if len(args) == 0: 141 | # return self._domain 142 | # 143 | # try: 144 | # iter(args[0]) # test for array type 145 | # array = args[0] 146 | # if len(args) > 1: 147 | # array = map(args[1],array) 148 | # except TypeError: 149 | # array = args 150 | # 151 | # self._domain = list(set(array)) 152 | # self._indices = pv.numerate(self._domain) 153 | # 154 | # return self 155 | # 156 | # def range(self,*args): 157 | # if len(args) == 0: 158 | # return self._range 159 | # 160 | # try: 161 | # iter(args[0]) # test for array type 162 | # array = args[0] 163 | # if len(args) > 1: 164 | # array = map(args[1],array) 165 | # except TypeError: 166 | # array = args 167 | # 168 | # if isinstance(array[0],types.StringType): 169 | # array = map(pv.color,array) 170 | # 171 | # self._range = array 172 | # 173 | # return self 174 | # 175 | # def split(self,_min,_max): 176 | # step = float(_max - _min) / length(self.domain()) 177 | # self._range = range(_min + step / 2., _max, step) 178 | # return self 179 | # 180 | # def splitFlush(self,_min,_max): 181 | # n = len(self.domain()) 182 | # step = float(_max - _min) / (n - 1) 183 | # if n == 1: 184 | # self._range = (_min + _max) / 2. 
185 | # else: 186 | # self._range = range(_min, _max + step / 2., step) 187 | # return self 188 | # 189 | # def splitBanded(self,_min,_max,band=1): 190 | # if band < 0: 191 | # n = len(self.domain()) 192 | # total = -band * n 193 | # remaining = _max - _min - total 194 | # padding = remaining / float(n + 1) 195 | # self._range = range(_min + padding, _max, padding - band) 196 | # self._band = -band 197 | # else: 198 | # step = float(_max - _min) / (len(self.domain()) + (1 - band)) 199 | # self._range = range(_min + step * (1 - band), _max, step) 200 | # self._band = step * band 201 | # return self 202 | # 203 | # def by(self,f): 204 | # raise NotImplementedError 205 | # 206 | # class quantile(Scale): 207 | # """quantile scale""" 208 | # 209 | # def __init__(self, *args): 210 | # Scale.__init__(self) 211 | # self._num_quantiles = -1 212 | # self._max_quantile_index = -1 213 | # self._quantile_boundaries = [] 214 | # self._domain = [] 215 | # self._y = linear() # the range 216 | # self.domain(*args) 217 | # return self 218 | # 219 | # def scale(self,x): 220 | # return self._y(max(0, min(self._max_quantile_index, bisect.bisect_right(self._quantile_boundaries, x) - 1)) / float(self._max_quantile_index)) 221 | # 222 | # def quantiles(self,*args): 223 | # if len(args) == 0: 224 | # return self._quantile_boundaries 225 | # 226 | # self._num_quantiles = int(args[0]) 227 | # 228 | # if self._num_quantiles < 0: 229 | # self._quantile_boundaries = [self._domain[0]] + self._domain 230 | # self._max_quantile_index = len(self._domain) - 1 231 | # else: 232 | # self._quantile_boundaries = [self._domain[0]] 233 | # for i in range(1,self._num_quantiles+1): 234 | # self._quantile_boundaries.append( self._domain[ int(float(i) * (len(self._domain) - 1) / self._num_quantiles) ] ) 235 | # self._max_quantile_index = self._num_quantiles - 1 236 | # 237 | # return self 238 | # 239 | # def domain(self,*args): 240 | # if len(args) == 0: 241 | # return self._domain 242 | # 243 | # try: 244 | # iter(args[0]) 245 | # array = args[0] 246 | # if len(args) > 1: 247 | # array = map(args[1],array) 248 | # except TypeError: 249 | # array = args 250 | # 251 | # self._domain = array 252 | # self._domain.sort() 253 | # self.quantiles(self._num_quantiles) 254 | # return self 255 | # 256 | # def range(self,*args): 257 | # if len(args) == 0: 258 | # return self._y.range() 259 | # 260 | # self._y.range(*args) 261 | # return self 262 | # 263 | # def by(self,f): 264 | # raise NotImplementedError 265 | # 266 | -------------------------------------------------------------------------------- /seqtools.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import json 3 | import string 4 | import random 5 | import itertools 6 | 7 | from Bio import Alphabet 8 | from Bio.Seq import Seq 9 | from Bio.SeqRecord import SeqRecord 10 | from Bio.SeqFeature import SeqFeature, FeatureLocation 11 | from Bio import pairwise2 12 | 13 | import numpy as np 14 | import scipy as sp 15 | import scipy.stats 16 | 17 | from jellyfish import hamming_distance 18 | 19 | import unafold 20 | from pyutils import as_handle 21 | 22 | random.seed() 23 | 24 | # ============================== 25 | # = General sequence utilities = 26 | # ============================== 27 | 28 | def substitute(seq,pos,sub): 29 | return seq[:pos] + sub + seq[pos+1:] 30 | 31 | 32 | complement_table = string.maketrans('ACGTRYSWKMBDHVN','TGCAYRSWMKVHDBN') 33 | 34 | def reverse(seq): 35 | return seq[::-1] 36 | 37 | 38 | def complement(seq): 39 
| return seq.upper().translate(complement_table) 40 | 41 | 42 | def reverse_complement(seq): 43 | """Compute reverse complement of sequence. 44 | 45 | Mindful of IUPAC ambiguities. 46 | Return all uppercase. 47 | """ 48 | return reverse(complement(seq)) 49 | # return seq.upper().translate(complement_table)[::-1] 50 | 51 | def translate(seq): 52 | return Seq(seq.replace('-','N'),Alphabet.DNAAlphabet()).translate().tostring() 53 | 54 | 55 | def gc_content(seq): 56 | gc = seq.lower().count('g') + seq.lower().count('c') 57 | return float(gc) / len(seq) 58 | 59 | 60 | def random_dna_seq(n): 61 | choice = random.choice 62 | return reduce(lambda cumul,garbage:cumul+choice('ACGT'),xrange(n),'') 63 | 64 | global_align = lambda seq1,seq2: pairwise2.align.globalms(seq1,seq2,0.5,-0.75,-2.,-1.5,one_alignment_only=True)[0] 65 | 66 | def percent_id(seq1,seq2): 67 | alignment = global_align(seq1,seq2) 68 | return (1. - hamming_distance(alignment[0],alignment[1]) / float(len(alignment[0]))) * 100. 69 | 70 | 71 | # barcode mapping fns 72 | def barcode_hamming(observed,barcodes): 73 | """Compute entropy of probabilistic barcode assignment. 74 | 75 | observed -- SeqRecord of the barcode 76 | barcodes -- list of barcode possibilities (python strings) 77 | """ 78 | obs_seq = observed.seq.tostring() 79 | distances = [(barcode,hamming_distance(obs_seq,barcode)) for barcode in barcodes] 80 | closest = min(distances,key=lambda p: p[1]) 81 | return closest # tuple of (barcode, distance) 82 | 83 | def barcode_probabilities(observed,barcodes): 84 | """Compute entropy of probabilistic barcode assignment. 85 | 86 | observed -- 'fastq' SeqRecord of the barcode 87 | barcodes -- list of barcode possibilities (python strings) 88 | """ 89 | obs_seq = np.array(list(observed.seq.tostring())) 90 | obs_qual = np.array(observed.letter_annotations['phred_quality']) 91 | barcodes = np.array([list(bc) for bc in barcodes]) 92 | 93 | choice = np.zeros(barcodes.shape, dtype=np.int) 94 | choice[barcodes == obs_seq] = 1 95 | choice[barcodes != obs_seq] = 2 96 | choice[:, obs_seq == 'N'] = 0 97 | 98 | N = np.zeros((1,barcodes.shape[1])) 99 | E = np.log1p(-np.power(10, -obs_qual / 10.)) 100 | D = -np.log(3) - (obs_qual / 10.) * np.log(3) 101 | 102 | B = np.exp(np.sum(np.choose(choice, [N,E,D]), axis=1)) 103 | return B / np.sum(B) 104 | 105 | def barcode_entropy(observed, barcodes): 106 | """Compute entropy of probabilistic barcode assignment. 
107 | 108 | observed -- 'fastq' SeqRecord of the barcode 109 | barcodes -- list of barcode possibilities (python strings) 110 | """ 111 | P = barcode_probabilities(observed, barcodes) 112 | return sp.stats.entropy(P) 113 | 114 | 115 | # for generating 'safe' filenames from identifiers 116 | cleanup_table = string.maketrans('/*|><+ ','_____p_') 117 | def cleanup_id(identifier): 118 | return identifier.translate(cleanup_table) 119 | 120 | 121 | def seqhist(seqlist): 122 | seqdict = dict() 123 | for seq in seqlist: 124 | seqdict[seq] = seqdict.get(seq,0) + 1 125 | return seqdict 126 | 127 | def seqmode(seqs): 128 | if isinstance(seqs,list): 129 | seqs = seqhist(seqs) 130 | return max(seqs.iterkeys(),key=lambda k: seqs[k]) 131 | 132 | def dimer_dG(seq1,seq2): 133 | """Compute a primer-dimer score using UNAFOLD hybrid_min""" 134 | scores = [] 135 | subseqs1 = [] 136 | subseqs2 = [] 137 | for i in xrange( min(len(seq1),len(seq2)) ): 138 | subseqs1.append( seq1[-i-1:] ) 139 | subseqs2.append( seq2[-i-1:] ) 140 | scores = unafold.hybrid_min_list(subseqs1,subseqs2,NA='DNA') 141 | return -min(scores) 142 | 143 | def dimer_overlap(seq1,seq2,weight_3=10): 144 | """Compute a primer-dimer score by counting overlaps 145 | 146 | weight_3 is the num of 3' bases to add extra weight to either primer 147 | """ 148 | # import pdb 149 | # pdb.set_trace() 150 | overlap_score = lambda s1,s2: sum(1 if c1.lower() == c2.lower() else -1 for c1, c2 in itertools.izip(s1,s2)) 151 | seq2rc = reverse_complement(seq1) 152 | scores = [] 153 | for i in xrange( min(len(seq1),len(seq2)) ): 154 | subseq1 = seq1[-i-1:] 155 | subseq2 = seq2rc[:i+1] 156 | score = 0 157 | if (i+1) <= 2*weight_3: 158 | score += overlap_score(subseq1,subseq2) * 2 159 | else: 160 | score += overlap_score(subseq1[:weight_3],subseq2[:weight_3]) * 2 161 | score += overlap_score(subseq1[weight_3:-weight_3],subseq2[weight_3:-weight_3]) 162 | score += overlap_score(subseq1[-weight_3:],subseq2[-weight_3:]) * 2 163 | scores.append(score) 164 | return max(scores) 165 | 166 | # ========================== 167 | # = Manual FASTA iteration = 168 | # ========================== 169 | 170 | # taken from biopython 171 | 172 | identity = string.maketrans('','') 173 | nonalpha = identity.translate(identity,string.ascii_letters) 174 | 175 | def FastaIterator(handleish,title2ids=lambda s: s): 176 | with as_handle(handleish,'r') as handle: 177 | while True: 178 | line = handle.readline() 179 | if line == '' : return 180 | if line[0] == '>': 181 | break 182 | 183 | while True: 184 | if line[0] != '>': 185 | raise ValueError("Records in Fasta files should start with '>' character") 186 | descr = title2ids(line[1:].rstrip()) 187 | fullline = '' 188 | line = handle.readline() 189 | while True: 190 | if not line : break 191 | if line[0] == '>': break 192 | fullline += line.translate(identity,nonalpha) 193 | line = handle.readline() 194 | 195 | yield (descr,fullline) 196 | 197 | if not line : return #StopIteration 198 | assert False, "Should not reach this line" 199 | 200 | 201 | # ============================ 202 | # = biopython-specific tools = 203 | # ============================ 204 | 205 | def make_SeqRecord(name,seq): 206 | return SeqRecord(Seq(seq),id=name,name=name,description=name) 207 | 208 | 209 | def get_string(seqobj): 210 | if isinstance(seqobj,SeqRecord): 211 | seq = seqobj.seq.tostring().upper() 212 | elif isinstance(seqobj,Seq): 213 | seq = seqobj.tostring().upper() 214 | elif isinstance(seqobj,str): 215 | seq = seqobj.upper() 216 | return seq 217 | 218 | 219 | 
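# --- Added illustrative usage of the general utilities above (a sketch, not part of the original
# module; 'reads.fasta' is a hypothetical file):
#
#   >>> reverse_complement('ACGTN')
#   'NACGT'
#   >>> seqmode(['AAA', 'AAA', 'CCC'])      # most abundant sequence in a list
#   'AAA'
#   >>> for (descr, seq) in FastaIterator('reads.fasta'):
#   ...     print descr, len(seq)           # sequences come back with newlines and non-letters stripped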
def get_features(feature_list,feature_type): 220 | target_features = [] 221 | for feature in feature_list: 222 | if feature.type == feature_type: 223 | target_features.append(feature) 224 | return target_features 225 | 226 | 227 | def advance_to_features(feature_iter,feature_types): 228 | # note, here feature_types is a list of possible stopping points 229 | for feature in feature_iter: 230 | if feature.type in feature_types: 231 | return feature 232 | raise ValueError, "didn't find %s in record" % feature_types 233 | 234 | 235 | def advance_to_feature(feature_iter,feature_type): 236 | return advance_to_features(feature_iter,[feature_type]) 237 | 238 | def map_feature( feature, coord_mapping, offset=0, erase=[] ): 239 | new_feature = copy.deepcopy(feature) 240 | new_start = coord_mapping[feature.location.start.position][-1] + offset 241 | new_end = coord_mapping[feature.location.end.position][0] + offset 242 | new_location = FeatureLocation(new_start,new_end) 243 | new_feature.location = new_location 244 | for qual in erase: 245 | new_feature.qualifiers.pop(qual,None) 246 | return new_feature 247 | 248 | def copy_features( record_from, record_to, coord_mapping, offset=0, erase=[], replace=False ): 249 | if replace: 250 | # index record_to features: 251 | feature_index = {} 252 | for (i,feature) in enumerate(record_to.features): 253 | feature_index.setdefault(feature.type,[]).append(i) 254 | 255 | feat_idx_to_delete = [] 256 | for feature in record_from.features: 257 | if replace: 258 | feat_idx_to_delete += feature_index.get(feature.type,[]) 259 | new_feature = map_feature( feature, coord_mapping, offset, erase ) 260 | record_to.features.append(new_feature) 261 | 262 | if replace: 263 | for idx in sorted(feat_idx_to_delete,reverse=True): 264 | record_to.features.pop(idx) 265 | 266 | def translate_features( record ): 267 | for feature in record.features: 268 | offset = int(feature.qualifiers.get('codon_start',[1])[0]) - 1 269 | feature.qualifiers['translation'] = feature.extract(record.seq)[offset:].translate() 270 | 271 | # SeqRecord <-> JSON-serializable 272 | 273 | def simplifySeq(seq): 274 | obj = {} 275 | obj['__Seq__'] = True 276 | obj['seq'] = seq.tostring() 277 | obj['alphabet'] = seq.alphabet.__repr__().rstrip(')').rstrip('(') 278 | return obj 279 | 280 | def complicateSeq(obj): 281 | if '__Seq__' not in obj: 282 | raise ValueError, "object must be converable to Bio.Seq" 283 | 284 | # Figure out which alphabet to use 285 | try: 286 | alphabet = Alphabet.__getattribute__(obj['alphabet'])() 287 | except AttributeError: 288 | pass 289 | try: 290 | alphabet = Alphabet.IUPAC.__getattribute__(obj['alphabet'])() 291 | except AttributeError: 292 | raise 293 | 294 | seq = Seq(obj['seq'],alphabet=alphabet) 295 | return seq 296 | 297 | def simplifySeqFeature(feature): 298 | obj = {} 299 | obj['__SeqFeature__'] = True 300 | obj['location'] = (feature.location.nofuzzy_start,feature.location.nofuzzy_end) 301 | obj['type'] = feature.type 302 | obj['strand'] = feature.strand 303 | obj['id'] = feature.id 304 | obj['qualifiers'] = feature.qualifiers 305 | return obj 306 | 307 | def complicateSeqFeature(obj): 308 | if '__SeqFeature__' not in obj: 309 | raise ValueError, "object must be converable to Bio.SeqFeature" 310 | location = FeatureLocation(*obj['location']) 311 | feature = SeqFeature(location=location,type=obj['type'],strand=obj['strand'],id=obj['id'],qualifiers=obj['qualifiers']) 312 | return feature 313 | 314 | def simplifySeqRecord(record): 315 | obj = {} 316 | obj['__SeqRecord__'] = True 
317 | obj['seq'] = simplifySeq(record.seq) 318 | obj['id'] = record.id 319 | obj['name'] = record.name 320 | obj['description'] = record.description 321 | obj['dbxrefs'] = record.dbxrefs 322 | obj['annotations'] = record.annotations 323 | obj['letter_annotations'] = record.letter_annotations # should work because it is actually a _RestrictedDict obj which subclasses dict 324 | obj['features'] = map(simplifySeqFeature,record.features) 325 | return obj 326 | 327 | def complicateSeqRecord(obj): 328 | if '__SeqRecord__' not in obj: 329 | raise ValueError, "object must be converable to Bio.SeqRecord" 330 | features = map(complicateSeqFeature,obj['features']) 331 | record = SeqRecord(seq=complicateSeq(obj['seq']),id=obj['id'],name=obj['name'],description=obj['description'],dbxrefs=obj['dbxrefs'],features=features,annotations=obj['annotations'],letter_annotations=obj['letter_annotations']) 332 | return record 333 | -------------------------------------------------------------------------------- /statstools.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import numpy as np 4 | import scipy as sp 5 | import scipy.stats 6 | import scipy.spatial 7 | 8 | random.seed() 9 | # random.seed(1) 10 | 11 | np.random.seed() 12 | # np.random.seed(1) 13 | 14 | permutation = np.random.permutation 15 | randint = np.random.randint 16 | 17 | def random_read(seq,read_len): 18 | position = randint(0,len(seq)-read_len+1) 19 | return (position,seq[position:position+read_len]) 20 | 21 | def sample_with_replacement(population,len,choose=random.choice): 22 | """Sample from a population with replacement 23 | 24 | Taken from Python Cookbook, 2nd ed, recipe 18.3 25 | """ 26 | s = [] 27 | for i in xrange(len): 28 | s.append(choose(population)) 29 | return s 30 | 31 | def multinomial_sample(n,p): 32 | """Return sample variates from multinomial 33 | 34 | NOTE: the numpy/scipy multinomial function will return a vector same 35 | length as p with the number of observations of each type. This function 36 | will return a vector of length n that has the actual observations. 37 | 38 | n - number of experiments 39 | p - vector or parameters; must sum to 1 40 | """ 41 | if sum(p) != 1.: raise ValueError, "p must sum to 1" 42 | uniform_sample = np.random.uniform(size=n) 43 | p_cum = np.cumsum(p) 44 | return np.searchsorted(p_cum,uniform_sample,side='right') 45 | 46 | def permsamp(x, nperm, theta): 47 | '''sample nperm times from the permutation distribution of the data x (numpy) 48 | theta is the function that takes the data and computes the statistic 49 | it must know how the data is "encoded" in x 50 | 51 | returns a vector of nperm th_star values 52 | ''' 53 | N = len(x) 54 | 55 | def perm_iter(): 56 | for i in xrange(nperm): 57 | yield x[permutation(N)] 58 | 59 | th_star = np.asarray( map(theta,perm_iter()) ) 60 | 61 | return th_star 62 | 63 | def bootstrap(x, nboot, theta): 64 | '''return n bootstrap replications of theta from x''' 65 | 66 | N = len(x) 67 | 68 | def rand_iter(): 69 | for i in xrange(nboot): 70 | yield x[randint(0,N,N)] 71 | 72 | th_star = np.asarray( map(theta,rand_iter()) ) 73 | 74 | return th_star 75 | 76 | def sample2counts(sample, categories=0): 77 | """Return count vector from list of samples. 78 | 79 | Take vector of samples and return a vector of counts. The elts 80 | refer to indices in something that would ultimately map to the 81 | originating category (like from a multinomial). 
Therefore, if there 82 | are, say, 8 categories, then valid values in sample should be 0-7. 83 | If categories is not given, then i compute it from the highest value 84 | present in sample (+1). 85 | 86 | """ 87 | counts = np.bincount(sample) 88 | if (categories > 0) and (categories > len(counts)): 89 | counts = np.append( counts, np.zeros(categories-len(counts)) ) 90 | return counts 91 | 92 | def counts2sample(counts): 93 | """Computes a consistent sample from a vector of counts. 94 | 95 | Takes a vector of counts and returns a vector of indices x 96 | such that len(x) = sum(c) and each elt of x is the index of 97 | a corresponding elt in c 98 | 99 | """ 100 | x = np.ones(np.sum(counts),dtype=np.int_) 101 | 102 | start_idx = 0 103 | end_idx = 0 104 | for i in xrange(len(counts)): 105 | start_idx = end_idx 106 | end_idx = end_idx + counts[i] 107 | x[start_idx:end_idx] = x[start_idx:end_idx] * i 108 | return x 109 | 110 | def density2d(x,y): 111 | x = np.asarray(x).ravel() 112 | y = np.asarray(y).ravel() 113 | data = np.r_['0,2',x,y] 114 | kde = sp.stats.kde.gaussian_kde(data) 115 | return kde(data) 116 | 117 | def entropy_bootstrap(pk,size,N=1000): 118 | """Compute bootstrapped entropy values. 119 | 120 | pk is a multinomial vector (will be normalized) 121 | size is the number of objects to draw from a multinomial at each iter 122 | N is number of bootstrap replicates 123 | """ 124 | pk = np.asarray(pk,dtype=np.float) 125 | pk = pk / np.sum(pk) 126 | 127 | entropies = [] 128 | for i in xrange(N): 129 | entropies.append( sp.stats.entropy(np.random.multinomial(size,pk)) ) 130 | 131 | return entropies 132 | 133 | def entropy_bootstrap2(pk,N=1000,total=0): 134 | """Compute bootstrapped entropy values. 135 | 136 | pk is a count vector 137 | sum is the total number of objects to draw from 138 | N is number of bootstrap replicates 139 | """ 140 | n = sum(pk) 141 | if total == 0: 142 | total = n 143 | 144 | pk = list(pk) 145 | pk.append(total-n) 146 | pk = np.asarray(pk,dtype=np.float) 147 | pk = pk / np.sum(pk) 148 | 149 | entropies = [] 150 | for i in xrange(N): 151 | entropies.append( sp.stats.entropy(np.random.multinomial(n,pk)[:-1]) ) 152 | 153 | return entropies 154 | 155 | def silhouette(Y,T): 156 | """Emulate MATLAB silhouette fn for cluster quality. 
157 | 158 | Y -- condensed-form pairwise distance matrix 159 | T -- cluster assignments 160 | 161 | Based on StackOverflow #6644445 162 | """ 163 | n = len(T) # number of objects 164 | clusters = set(T) # the cluster labels 165 | 166 | X = sp.spatial.distance.squareform(Y) 167 | 168 | s = np.zeros(n) 169 | for i in xrange(n): 170 | incluster = T==T[i] 171 | incluster[i] = False 172 | if np.sum(incluster) == 0: 173 | continue 174 | 175 | outcluster = lambda j: T==j 176 | 177 | # incluster average dist 178 | a = np.mean( X[incluster,i] ) 179 | 180 | # min outcluster avg dist 181 | b = np.min([np.mean( X[outcluster(j),i] ) for j in (clusters-set([T[i]]))]) 182 | 183 | s[i] = (b - a) / np.max([a,b]) 184 | 185 | return s 186 | -------------------------------------------------------------------------------- /stitch.py: -------------------------------------------------------------------------------- 1 | from numpy import array, power, log, log10, log1p, choose, sum 2 | from Bio import SeqIO 3 | from itertools import izip 4 | from seqtools import reverse_complement 5 | from scipy.stats import entropy 6 | def stitch(record1, record2): 7 | seq1 = array([record1.seq.tostring()]) 8 | seq2 = array([reverse_complement(record2.seq.tostring())]) 9 | seq1.dtype = '|S1' 10 | seq2.dtype = '|S1' 11 | quals1 = array(record1.letter_annotations['phred_quality']) 12 | quals2 = array(record2.letter_annotations['phred_quality'][::-1]) 13 | 14 | log10p_consensus_1 = log1p(-power(10, -quals1 / 10.)) / log(10) 15 | log10p_consensus_2 = log1p(-power(10, -quals2 / 10.)) / log(10) 16 | log10p_error_1 = -log10(3) - (quals1 / 10.) 17 | log10p_error_2 = -log10(3) - (quals2 / 10.) 18 | 19 | min_overlap = 1 20 | max_overlap = max(len(record1), len(record2)) 21 | overlaps = {} 22 | for overlap in range(1, max_overlap): 23 | s1 = seq1[-overlap:] 24 | s2 = seq2[:overlap] 25 | q1 = quals1[-overlap:] 26 | q2 = quals2[:overlap] 27 | lpc1 = log10p_consensus_1[-overlap:] 28 | lpc2 = log10p_consensus_2[:overlap] 29 | lpe1 = log10p_error_1[-overlap:] 30 | lpe2 = log10p_error_2[:overlap] 31 | 32 | consensus = choose(q1 < q2, [s1, s2]) 33 | score = sum(choose(consensus == s1, [lpe1, lpc1])) + sum(choose(consensus == s2, [lpe2, lpc2])) + len(consensus) * log10(4) * 2 # last term is null hypothesis, p=1/4 34 | consensus.dtype = '|S%i' % len(consensus) 35 | overlaps[overlap] = (consensus[0],score) 36 | 37 | return overlaps 38 | 39 | import numpy as np 40 | 41 | if __name__ == '__main__': 42 | input_file1 = '/n/home00/laserson/data/MS_HIV_MiSeq_data_20120105/samples/HIV1.1.fastq' 43 | input_file2 = '/n/home00/laserson/data/MS_HIV_MiSeq_data_20120105/samples/HIV1.2.fastq' 44 | 45 | input_file1 = '/Users/laserson/Dropbox/stitcher/test.1.fastq' 46 | input_file2 = '/Users/laserson/Dropbox/stitcher/test.2.fastq' 47 | 48 | it = izip(SeqIO.parse(input_file1,'fastq'), SeqIO.parse(input_file2,'fastq')) 49 | (record1,record2) = it.next() 50 | overlaps = stitch(record1,record2) 51 | scores = [p[1] for p in overlaps.values()] 52 | (entropy(power(10,scores)), max(overlaps.items(),key=lambda i: i[1][1])) 53 | 54 | entropies = [] 55 | for (i,(rec1,rec2)) in enumerate(izip(SeqIO.parse(input_file1,'fastq'), SeqIO.parse(input_file2,'fastq'))): 56 | entropies.append(stitch(rec1,rec2)) 57 | if i == 1000: 58 | break -------------------------------------------------------------------------------- /streamgraph.py: -------------------------------------------------------------------------------- 1 | # Based on http://code.activestate.com/recipes/576633/ 2 | # which is based on: 3 | # Reference: 
'Stacked graphs- geometry & aesthetics' by Byron and Wattenberg 4 | # http://www.leebyron.com/else/streamgraph/download.php?file=stackedgraphs_byron_wattenberg.pdf 5 | 6 | import numpy as np 7 | import matplotlib as mpl 8 | # mpl.use('Agg') 9 | import matplotlib.pyplot as plt 10 | 11 | # baseline functions 12 | def baseline_symmetric(streams): 13 | """Symmetric baseline ('silhouette')""" 14 | g0 = -0.5 * np.sum(np.asarray(streams),axis=0) 15 | return g0 16 | 17 | def baseline_zero(streams): 18 | """Zero baseline""" 19 | return np.zeros(np.asarray(streams).shape[1]) 20 | 21 | def baseline_weighted_wiggle(streams): 22 | """Weighted-wiggle minimization 23 | 24 | NOTE: streams should already be ordered as desired 25 | """ 26 | streams = np.asarray(streams) 27 | 28 | # add a column of zeros on the left side of streams 29 | f = np.hstack( (np.zeros((streams.shape[0],1)),streams) ) 30 | df = np.diff(f) 31 | cum_sum_df = np.vstack( (np.zeros((1,df.shape[1])),np.cumsum(df,axis=0)) )[:-1,:] 32 | dg0 = (-1./np.sum(streams,axis=0)) * np.sum((0.5 * df + cum_sum_df) * streams,axis=0) 33 | g0 = np.cumsum(dg0) 34 | return g0 35 | 36 | # ordering functions 37 | def argsort_onset(streams): 38 | """Returns permutation indices (like argsort) for onset ordering.""" 39 | streams = np.asarray(streams) 40 | nonzero_idxs = [np.arange(streams.shape[1])[idxs] for idxs in (streams > 0)] 41 | onset_idxs = [np.min(nzi) if len(nzi) > 0 else streams.shape[1] for nzi in nonzero_idxs] 42 | return np.argsort(onset_idxs) 43 | 44 | def argsort_inside_out(streams): 45 | """Returns permutation indices (like argsort) for inside-out ordering.""" 46 | upper = [] 47 | lower = [] 48 | weight_up = 0 49 | weight_lo = 0 50 | for (i,stream) in enumerate(streams): 51 | if weight_up < weight_lo: 52 | upper.append(i) 53 | weight_up += np.sum(stream) 54 | else: 55 | lower.append(i) 56 | weight_lo += np.sum(stream) 57 | 58 | return upper[::-1] + lower 59 | 60 | def streamgraph(ax, streams, x=None, colors=None, baseline=baseline_weighted_wiggle, yoffset=0., whitebg=True): 61 | streams = np.asarray(streams) 62 | 63 | g0 = baseline(streams) + yoffset 64 | 65 | if x == None: 66 | x = range(streams.shape[1]) 67 | 68 | if colors == None: 69 | colors = map(mpl.cm.bone,np.random.uniform(size=streams.shape[0])) 70 | 71 | layers = [] 72 | g_lo = g0 73 | for stream in streams: 74 | g_hi = g_lo + stream 75 | verts_lo = zip(x,g_lo) 76 | verts_hi = zip(x[::-1],g_hi[::-1]) 77 | layer = verts_lo + verts_hi 78 | layers.append(layer) 79 | g_lo = g_hi 80 | 81 | polys = mpl.collections.PolyCollection(layers,facecolors=colors,linewidths=0, zorder=10) 82 | ax.add_collection(polys) 83 | 84 | # add an opaque white background to the streamgraph 85 | if whitebg == True: 86 | verts = np.asarray(zip(x,g0) + zip(x[::-1],g_hi[::-1])) 87 | bglayer = mpl.patches.Polygon(verts, closed=True, color='white', alpha=1, zorder=5) 88 | ax.add_patch(bglayer) 89 | 90 | return ax 91 | 92 | def format_streamgraph(ax): 93 | """Performs some common formatting operations for streamgraphs""" 94 | # kill the frame 95 | ax.spines['top'].set_visible(False) 96 | ax.spines['right'].set_visible(False) 97 | ax.spines['bottom'].set_visible(False) 98 | ax.spines['left'].set_visible(False) 99 | 100 | # set ticks 101 | ax.xaxis.set_ticks_position('bottom') 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | -------------------------------------------------------------------------------- /timeseries.py: -------------------------------------------------------------------------------- 1 | import numpy as 
np 2 | 3 | def load_timeseries(inhandle): 4 | """Load timeseries data from file. 5 | 6 | There may be 'control' lines that start with #, e.g.: 7 | #times 8 | #sums 9 | that are loaded into the returned data dictionary as: 10 | data['times'] = ... 11 | 12 | The control lines must contain a whitespace delimited list of numbers 13 | (float allowed) 14 | 15 | data['labels'] will contain all the labels (first entry of each timeseries 16 | row) in order 17 | 18 | data['matrix'] will contain the numpy array that has the actual data in it 19 | """ 20 | data = {} 21 | labels = [] 22 | matrix = [] 23 | for line in inhandle: 24 | if line.startswith('#'): 25 | tokens = line.split() 26 | label = tokens[0].lstrip('#') 27 | values = np.asarray(map(float,tokens[1:])) 28 | data[label] = values 29 | else: 30 | values = line.split() 31 | labels.append(values[0].strip()) 32 | matrix.append(map(int,values[1:])) 33 | data['labels'] = labels 34 | data['matrix'] = np.asarray(matrix) 35 | 36 | return data 37 | 38 | def write_timeseries(outhandle,**kw): 39 | """Write timeseries to file. 40 | 41 | Must provide labels and matrix. All other arguments must be lists of 42 | numbers (floats allowed) which get printed as "comments". They will be 43 | loaded by the load function as well. 44 | 45 | matrix must always be integer type. (If normalization is required, pass 46 | the proper column sums in as control line #sums). 47 | """ 48 | labels = kw.pop('labels') 49 | matrix = kw.pop('matrix') 50 | for (key,values) in kw.iteritems(): 51 | print >>outhandle, '#%s ' % key + ' '.join(map(str,values)) 52 | for (label,timeseries) in zip(labels,matrix): 53 | print >>outhandle, ' '.join(map(str,[label]+list(timeseries))) 54 | 55 | def normalized_timeseries(timeseries): 56 | return np.float_(timeseries) / timeseries.sum(axis=0) 57 | -------------------------------------------------------------------------------- /unafold.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import random 4 | import tempfile 5 | import subprocess 6 | 7 | def hybrid_ss_min(seq,NA='RNA',tmin=37,tinc=1,tmax=37,sodium=1,magnesium=0): 8 | cmd = 'hybrid-ss-min --quiet --NA=%s --tmin=%f --tinc=%f --tmax=%f --sodium=%f --magnesium=%f %s' % (NA,tmin,tinc,tmax,sodium,magnesium,seq) 9 | p = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE) 10 | p.wait() 11 | dG = float(p.stdout.read()) 12 | return dG 13 | 14 | def hybrid_min(seq1,seq2,NA='RNA',tmin=37,tinc=1,tmax=37,sodium=1,magnesium=0): 15 | cmd = 'hybrid-min --quiet --NA=%s --tmin=%f --tinc=%f --tmax=%f --sodium=%f --magnesium=%f %s %s' % (NA,tmin,tinc,tmax,sodium,magnesium,seq1,seq2) 16 | p = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE) 17 | p.wait() 18 | dG = float(p.stdout.read().split()[0]) 19 | return dG 20 | 21 | def hybrid_min_list(seqlist1,seqlist2,NA='RNA',tmin=37,tinc=1,tmax=37,sodium=1,magnesium=0): 22 | # set up temporary files 23 | temp_out_prefix = 'temporary_hybrid_%i_%i' % (os.getpid(),random.randint(0,10000)) 24 | seqfile1 = tempfile.NamedTemporaryFile(mode='w',dir='.',prefix='hybrid_min_temp',suffix='.fasta') 25 | seqfile2 = tempfile.NamedTemporaryFile(mode='w',dir='.',prefix='hybrid_min_temp',suffix='.fasta') 26 | for (i,seq) in enumerate(seqlist1): print >>seqfile1, ">1_%i\n%s" % (i,seq) 27 | for (i,seq) in enumerate(seqlist2): print >>seqfile2, ">2_%i\n%s" % (i,seq) 28 | seqfile1.file.flush() 29 | seqfile2.file.flush() 30 | 31 | # set up and execute command 32 | cmd = 'hybrid-min --NA=%s --tmin=%f --tinc=%f 
--tmax=%f --sodium=%f --magnesium=%f --output=%s %s %s' % (NA,tmin,tinc,tmax,sodium,magnesium,temp_out_prefix,seqfile1.name,seqfile2.name) 33 | p = subprocess.Popen(cmd,shell=True) 34 | p.wait() 35 | 36 | # read results 37 | ip = open(temp_out_prefix+'.dG','r') 38 | dGs = [] 39 | for line in ip: 40 | if line.startswith('#'): continue 41 | dGs.append(float(line.split()[1])) 42 | 43 | # clean up output 44 | seqfile1.close() 45 | seqfile2.close() 46 | for filename in glob.glob(temp_out_prefix+'*'): os.remove(filename) 47 | 48 | return dGs 49 | --------------------------------------------------------------------------------
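A minimal usage sketch for the UNAFold wrappers above, assuming the hybrid-ss-min and hybrid-min executables are on the PATH; the primer sequences below are made up for illustration:

    import unafold
    from seqtools import dimer_dG

    fwd = 'ACACTCTTTCCCTACACGACGCTCTTCCGATCT'     # hypothetical primer sequences
    rev = 'GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT'

    # minimum free energy of self-structure for a single oligo (more negative = more stable)
    dG_self = unafold.hybrid_ss_min(fwd, NA='DNA')

    # minimum free energy of the cross-hybridized duplex of the two oligos
    dG_duplex = unafold.hybrid_min(fwd, rev, NA='DNA')

    # 3'-anchored primer-dimer score that seqtools.dimer_dG builds on top of hybrid_min_list
    dimer_score = dimer_dG(fwd, rev)

    print dG_self, dG_duplex, dimer_score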