├── .gitignore
├── LICENSE
├── Minimus2-pipeline
│   ├── Minimus2_pipeline.py
│   └── README.md
├── POCP-calculator
│   ├── POCP-matrix.py
│   └── README.md
├── README.md
├── blast-matrix
│   ├── README.md
│   └── blast_identity_matrix.py
├── blast-wrapper
│   ├── README.md
│   ├── blast_wrapper.py
│   └── blastout2fasta.py
├── cdhit-clstr2tbl
│   ├── README.md
│   ├── cdhit_clstr2tbl.py
│   ├── test.clstr
│   └── test.clstr.tab
├── circular_genomes_from_gfa
│   ├── README.md
│   └── circular_genomes_from_gfa.py
├── download_uniprot_proteomes
│   ├── README.md
│   └── download_uniprot_proteomes_UPID.py
├── fasta-splitter
│   └── fasta_splitter.py
├── prodigal-wrapper
│   └── prodigal_run.py
├── prokka2kegg
│   ├── README.md
│   ├── idmapping_KO.tab.gz
│   ├── prokka2kegg.py
│   ├── prokka2kegg_batch.py
│   ├── sample.gbk
│   └── sample.kegg.out.txt
└── remove_duplicate_seqs
    ├── README.md
    └── remove_duplicate_seqs.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 |
50 | # Translations
51 | *.mo
52 | *.pot
53 |
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 | db.sqlite3
58 |
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 |
63 | # Scrapy stuff:
64 | .scrapy
65 |
66 | # Sphinx documentation
67 | docs/_build/
68 |
69 | # PyBuilder
70 | target/
71 |
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 |
75 | # pyenv
76 | .python-version
77 |
78 | # celery beat schedule file
79 | celerybeat-schedule
80 |
81 | # SageMath parsed files
82 | *.sage.py
83 |
84 | # Environments
85 | .env
86 | .venv
87 | env/
88 | venv/
89 | ENV/
90 | env.bak/
91 | venv.bak/
92 |
93 | # Spyder project settings
94 | .spyderproject
95 | .spyproject
96 |
97 | # Rope project settings
98 | .ropeproject
99 |
100 | # mkdocs documentation
101 | /site
102 |
103 | # mypy
104 | .mypy_cache/
105 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | GNU GENERAL PUBLIC LICENSE
2 | Version 3, 29 June 2007
3 |
4 | Copyright (C) 2007 Free Software Foundation, Inc.
5 | Everyone is permitted to copy and distribute verbatim copies
6 | of this license document, but changing it is not allowed.
7 |
8 | Preamble
9 |
10 | The GNU General Public License is a free, copyleft license for
11 | software and other kinds of works.
12 |
13 | The licenses for most software and other practical works are designed
14 | to take away your freedom to share and change the works. By contrast,
15 | the GNU General Public License is intended to guarantee your freedom to
16 | share and change all versions of a program--to make sure it remains free
17 | software for all its users. We, the Free Software Foundation, use the
18 | GNU General Public License for most of our software; it applies also to
19 | any other work released this way by its authors. You can apply it to
20 | your programs, too.
21 |
22 | When we speak of free software, we are referring to freedom, not
23 | price. Our General Public Licenses are designed to make sure that you
24 | have the freedom to distribute copies of free software (and charge for
25 | them if you wish), that you receive source code or can get it if you
26 | want it, that you can change the software or use pieces of it in new
27 | free programs, and that you know you can do these things.
28 |
29 | To protect your rights, we need to prevent others from denying you
30 | these rights or asking you to surrender the rights. Therefore, you have
31 | certain responsibilities if you distribute copies of the software, or if
32 | you modify it: responsibilities to respect the freedom of others.
33 |
34 | For example, if you distribute copies of such a program, whether
35 | gratis or for a fee, you must pass on to the recipients the same
36 | freedoms that you received. You must make sure that they, too, receive
37 | or can get the source code. And you must show them these terms so they
38 | know their rights.
39 |
40 | Developers that use the GNU GPL protect your rights with two steps:
41 | (1) assert copyright on the software, and (2) offer you this License
42 | giving you legal permission to copy, distribute and/or modify it.
43 |
44 | For the developers' and authors' protection, the GPL clearly explains
45 | that there is no warranty for this free software. For both users' and
46 | authors' sake, the GPL requires that modified versions be marked as
47 | changed, so that their problems will not be attributed erroneously to
48 | authors of previous versions.
49 |
50 | Some devices are designed to deny users access to install or run
51 | modified versions of the software inside them, although the manufacturer
52 | can do so. This is fundamentally incompatible with the aim of
53 | protecting users' freedom to change the software. The systematic
54 | pattern of such abuse occurs in the area of products for individuals to
55 | use, which is precisely where it is most unacceptable. Therefore, we
56 | have designed this version of the GPL to prohibit the practice for those
57 | products. If such problems arise substantially in other domains, we
58 | stand ready to extend this provision to those domains in future versions
59 | of the GPL, as needed to protect the freedom of users.
60 |
61 | Finally, every program is threatened constantly by software patents.
62 | States should not allow patents to restrict development and use of
63 | software on general-purpose computers, but in those that do, we wish to
64 | avoid the special danger that patents applied to a free program could
65 | make it effectively proprietary. To prevent this, the GPL assures that
66 | patents cannot be used to render the program non-free.
67 |
68 | The precise terms and conditions for copying, distribution and
69 | modification follow.
70 |
71 | TERMS AND CONDITIONS
72 |
73 | 0. Definitions.
74 |
75 | "This License" refers to version 3 of the GNU General Public License.
76 |
77 | "Copyright" also means copyright-like laws that apply to other kinds of
78 | works, such as semiconductor masks.
79 |
80 | "The Program" refers to any copyrightable work licensed under this
81 | License. Each licensee is addressed as "you". "Licensees" and
82 | "recipients" may be individuals or organizations.
83 |
84 | To "modify" a work means to copy from or adapt all or part of the work
85 | in a fashion requiring copyright permission, other than the making of an
86 | exact copy. The resulting work is called a "modified version" of the
87 | earlier work or a work "based on" the earlier work.
88 |
89 | A "covered work" means either the unmodified Program or a work based
90 | on the Program.
91 |
92 | To "propagate" a work means to do anything with it that, without
93 | permission, would make you directly or secondarily liable for
94 | infringement under applicable copyright law, except executing it on a
95 | computer or modifying a private copy. Propagation includes copying,
96 | distribution (with or without modification), making available to the
97 | public, and in some countries other activities as well.
98 |
99 | To "convey" a work means any kind of propagation that enables other
100 | parties to make or receive copies. Mere interaction with a user through
101 | a computer network, with no transfer of a copy, is not conveying.
102 |
103 | An interactive user interface displays "Appropriate Legal Notices"
104 | to the extent that it includes a convenient and prominently visible
105 | feature that (1) displays an appropriate copyright notice, and (2)
106 | tells the user that there is no warranty for the work (except to the
107 | extent that warranties are provided), that licensees may convey the
108 | work under this License, and how to view a copy of this License. If
109 | the interface presents a list of user commands or options, such as a
110 | menu, a prominent item in the list meets this criterion.
111 |
112 | 1. Source Code.
113 |
114 | The "source code" for a work means the preferred form of the work
115 | for making modifications to it. "Object code" means any non-source
116 | form of a work.
117 |
118 | A "Standard Interface" means an interface that either is an official
119 | standard defined by a recognized standards body, or, in the case of
120 | interfaces specified for a particular programming language, one that
121 | is widely used among developers working in that language.
122 |
123 | The "System Libraries" of an executable work include anything, other
124 | than the work as a whole, that (a) is included in the normal form of
125 | packaging a Major Component, but which is not part of that Major
126 | Component, and (b) serves only to enable use of the work with that
127 | Major Component, or to implement a Standard Interface for which an
128 | implementation is available to the public in source code form. A
129 | "Major Component", in this context, means a major essential component
130 | (kernel, window system, and so on) of the specific operating system
131 | (if any) on which the executable work runs, or a compiler used to
132 | produce the work, or an object code interpreter used to run it.
133 |
134 | The "Corresponding Source" for a work in object code form means all
135 | the source code needed to generate, install, and (for an executable
136 | work) run the object code and to modify the work, including scripts to
137 | control those activities. However, it does not include the work's
138 | System Libraries, or general-purpose tools or generally available free
139 | programs which are used unmodified in performing those activities but
140 | which are not part of the work. For example, Corresponding Source
141 | includes interface definition files associated with source files for
142 | the work, and the source code for shared libraries and dynamically
143 | linked subprograms that the work is specifically designed to require,
144 | such as by intimate data communication or control flow between those
145 | subprograms and other parts of the work.
146 |
147 | The Corresponding Source need not include anything that users
148 | can regenerate automatically from other parts of the Corresponding
149 | Source.
150 |
151 | The Corresponding Source for a work in source code form is that
152 | same work.
153 |
154 | 2. Basic Permissions.
155 |
156 | All rights granted under this License are granted for the term of
157 | copyright on the Program, and are irrevocable provided the stated
158 | conditions are met. This License explicitly affirms your unlimited
159 | permission to run the unmodified Program. The output from running a
160 | covered work is covered by this License only if the output, given its
161 | content, constitutes a covered work. This License acknowledges your
162 | rights of fair use or other equivalent, as provided by copyright law.
163 |
164 | You may make, run and propagate covered works that you do not
165 | convey, without conditions so long as your license otherwise remains
166 | in force. You may convey covered works to others for the sole purpose
167 | of having them make modifications exclusively for you, or provide you
168 | with facilities for running those works, provided that you comply with
169 | the terms of this License in conveying all material for which you do
170 | not control copyright. Those thus making or running the covered works
171 | for you must do so exclusively on your behalf, under your direction
172 | and control, on terms that prohibit them from making any copies of
173 | your copyrighted material outside their relationship with you.
174 |
175 | Conveying under any other circumstances is permitted solely under
176 | the conditions stated below. Sublicensing is not allowed; section 10
177 | makes it unnecessary.
178 |
179 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
180 |
181 | No covered work shall be deemed part of an effective technological
182 | measure under any applicable law fulfilling obligations under article
183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or
184 | similar laws prohibiting or restricting circumvention of such
185 | measures.
186 |
187 | When you convey a covered work, you waive any legal power to forbid
188 | circumvention of technological measures to the extent such circumvention
189 | is effected by exercising rights under this License with respect to
190 | the covered work, and you disclaim any intention to limit operation or
191 | modification of the work as a means of enforcing, against the work's
192 | users, your or third parties' legal rights to forbid circumvention of
193 | technological measures.
194 |
195 | 4. Conveying Verbatim Copies.
196 |
197 | You may convey verbatim copies of the Program's source code as you
198 | receive it, in any medium, provided that you conspicuously and
199 | appropriately publish on each copy an appropriate copyright notice;
200 | keep intact all notices stating that this License and any
201 | non-permissive terms added in accord with section 7 apply to the code;
202 | keep intact all notices of the absence of any warranty; and give all
203 | recipients a copy of this License along with the Program.
204 |
205 | You may charge any price or no price for each copy that you convey,
206 | and you may offer support or warranty protection for a fee.
207 |
208 | 5. Conveying Modified Source Versions.
209 |
210 | You may convey a work based on the Program, or the modifications to
211 | produce it from the Program, in the form of source code under the
212 | terms of section 4, provided that you also meet all of these conditions:
213 |
214 | a) The work must carry prominent notices stating that you modified
215 | it, and giving a relevant date.
216 |
217 | b) The work must carry prominent notices stating that it is
218 | released under this License and any conditions added under section
219 | 7. This requirement modifies the requirement in section 4 to
220 | "keep intact all notices".
221 |
222 | c) You must license the entire work, as a whole, under this
223 | License to anyone who comes into possession of a copy. This
224 | License will therefore apply, along with any applicable section 7
225 | additional terms, to the whole of the work, and all its parts,
226 | regardless of how they are packaged. This License gives no
227 | permission to license the work in any other way, but it does not
228 | invalidate such permission if you have separately received it.
229 |
230 | d) If the work has interactive user interfaces, each must display
231 | Appropriate Legal Notices; however, if the Program has interactive
232 | interfaces that do not display Appropriate Legal Notices, your
233 | work need not make them do so.
234 |
235 | A compilation of a covered work with other separate and independent
236 | works, which are not by their nature extensions of the covered work,
237 | and which are not combined with it such as to form a larger program,
238 | in or on a volume of a storage or distribution medium, is called an
239 | "aggregate" if the compilation and its resulting copyright are not
240 | used to limit the access or legal rights of the compilation's users
241 | beyond what the individual works permit. Inclusion of a covered work
242 | in an aggregate does not cause this License to apply to the other
243 | parts of the aggregate.
244 |
245 | 6. Conveying Non-Source Forms.
246 |
247 | You may convey a covered work in object code form under the terms
248 | of sections 4 and 5, provided that you also convey the
249 | machine-readable Corresponding Source under the terms of this License,
250 | in one of these ways:
251 |
252 | a) Convey the object code in, or embodied in, a physical product
253 | (including a physical distribution medium), accompanied by the
254 | Corresponding Source fixed on a durable physical medium
255 | customarily used for software interchange.
256 |
257 | b) Convey the object code in, or embodied in, a physical product
258 | (including a physical distribution medium), accompanied by a
259 | written offer, valid for at least three years and valid for as
260 | long as you offer spare parts or customer support for that product
261 | model, to give anyone who possesses the object code either (1) a
262 | copy of the Corresponding Source for all the software in the
263 | product that is covered by this License, on a durable physical
264 | medium customarily used for software interchange, for a price no
265 | more than your reasonable cost of physically performing this
266 | conveying of source, or (2) access to copy the
267 | Corresponding Source from a network server at no charge.
268 |
269 | c) Convey individual copies of the object code with a copy of the
270 | written offer to provide the Corresponding Source. This
271 | alternative is allowed only occasionally and noncommercially, and
272 | only if you received the object code with such an offer, in accord
273 | with subsection 6b.
274 |
275 | d) Convey the object code by offering access from a designated
276 | place (gratis or for a charge), and offer equivalent access to the
277 | Corresponding Source in the same way through the same place at no
278 | further charge. You need not require recipients to copy the
279 | Corresponding Source along with the object code. If the place to
280 | copy the object code is a network server, the Corresponding Source
281 | may be on a different server (operated by you or a third party)
282 | that supports equivalent copying facilities, provided you maintain
283 | clear directions next to the object code saying where to find the
284 | Corresponding Source. Regardless of what server hosts the
285 | Corresponding Source, you remain obligated to ensure that it is
286 | available for as long as needed to satisfy these requirements.
287 |
288 | e) Convey the object code using peer-to-peer transmission, provided
289 | you inform other peers where the object code and Corresponding
290 | Source of the work are being offered to the general public at no
291 | charge under subsection 6d.
292 |
293 | A separable portion of the object code, whose source code is excluded
294 | from the Corresponding Source as a System Library, need not be
295 | included in conveying the object code work.
296 |
297 | A "User Product" is either (1) a "consumer product", which means any
298 | tangible personal property which is normally used for personal, family,
299 | or household purposes, or (2) anything designed or sold for incorporation
300 | into a dwelling. In determining whether a product is a consumer product,
301 | doubtful cases shall be resolved in favor of coverage. For a particular
302 | product received by a particular user, "normally used" refers to a
303 | typical or common use of that class of product, regardless of the status
304 | of the particular user or of the way in which the particular user
305 | actually uses, or expects or is expected to use, the product. A product
306 | is a consumer product regardless of whether the product has substantial
307 | commercial, industrial or non-consumer uses, unless such uses represent
308 | the only significant mode of use of the product.
309 |
310 | "Installation Information" for a User Product means any methods,
311 | procedures, authorization keys, or other information required to install
312 | and execute modified versions of a covered work in that User Product from
313 | a modified version of its Corresponding Source. The information must
314 | suffice to ensure that the continued functioning of the modified object
315 | code is in no case prevented or interfered with solely because
316 | modification has been made.
317 |
318 | If you convey an object code work under this section in, or with, or
319 | specifically for use in, a User Product, and the conveying occurs as
320 | part of a transaction in which the right of possession and use of the
321 | User Product is transferred to the recipient in perpetuity or for a
322 | fixed term (regardless of how the transaction is characterized), the
323 | Corresponding Source conveyed under this section must be accompanied
324 | by the Installation Information. But this requirement does not apply
325 | if neither you nor any third party retains the ability to install
326 | modified object code on the User Product (for example, the work has
327 | been installed in ROM).
328 |
329 | The requirement to provide Installation Information does not include a
330 | requirement to continue to provide support service, warranty, or updates
331 | for a work that has been modified or installed by the recipient, or for
332 | the User Product in which it has been modified or installed. Access to a
333 | network may be denied when the modification itself materially and
334 | adversely affects the operation of the network or violates the rules and
335 | protocols for communication across the network.
336 |
337 | Corresponding Source conveyed, and Installation Information provided,
338 | in accord with this section must be in a format that is publicly
339 | documented (and with an implementation available to the public in
340 | source code form), and must require no special password or key for
341 | unpacking, reading or copying.
342 |
343 | 7. Additional Terms.
344 |
345 | "Additional permissions" are terms that supplement the terms of this
346 | License by making exceptions from one or more of its conditions.
347 | Additional permissions that are applicable to the entire Program shall
348 | be treated as though they were included in this License, to the extent
349 | that they are valid under applicable law. If additional permissions
350 | apply only to part of the Program, that part may be used separately
351 | under those permissions, but the entire Program remains governed by
352 | this License without regard to the additional permissions.
353 |
354 | When you convey a copy of a covered work, you may at your option
355 | remove any additional permissions from that copy, or from any part of
356 | it. (Additional permissions may be written to require their own
357 | removal in certain cases when you modify the work.) You may place
358 | additional permissions on material, added by you to a covered work,
359 | for which you have or can give appropriate copyright permission.
360 |
361 | Notwithstanding any other provision of this License, for material you
362 | add to a covered work, you may (if authorized by the copyright holders of
363 | that material) supplement the terms of this License with terms:
364 |
365 | a) Disclaiming warranty or limiting liability differently from the
366 | terms of sections 15 and 16 of this License; or
367 |
368 | b) Requiring preservation of specified reasonable legal notices or
369 | author attributions in that material or in the Appropriate Legal
370 | Notices displayed by works containing it; or
371 |
372 | c) Prohibiting misrepresentation of the origin of that material, or
373 | requiring that modified versions of such material be marked in
374 | reasonable ways as different from the original version; or
375 |
376 | d) Limiting the use for publicity purposes of names of licensors or
377 | authors of the material; or
378 |
379 | e) Declining to grant rights under trademark law for use of some
380 | trade names, trademarks, or service marks; or
381 |
382 | f) Requiring indemnification of licensors and authors of that
383 | material by anyone who conveys the material (or modified versions of
384 | it) with contractual assumptions of liability to the recipient, for
385 | any liability that these contractual assumptions directly impose on
386 | those licensors and authors.
387 |
388 | All other non-permissive additional terms are considered "further
389 | restrictions" within the meaning of section 10. If the Program as you
390 | received it, or any part of it, contains a notice stating that it is
391 | governed by this License along with a term that is a further
392 | restriction, you may remove that term. If a license document contains
393 | a further restriction but permits relicensing or conveying under this
394 | License, you may add to a covered work material governed by the terms
395 | of that license document, provided that the further restriction does
396 | not survive such relicensing or conveying.
397 |
398 | If you add terms to a covered work in accord with this section, you
399 | must place, in the relevant source files, a statement of the
400 | additional terms that apply to those files, or a notice indicating
401 | where to find the applicable terms.
402 |
403 | Additional terms, permissive or non-permissive, may be stated in the
404 | form of a separately written license, or stated as exceptions;
405 | the above requirements apply either way.
406 |
407 | 8. Termination.
408 |
409 | You may not propagate or modify a covered work except as expressly
410 | provided under this License. Any attempt otherwise to propagate or
411 | modify it is void, and will automatically terminate your rights under
412 | this License (including any patent licenses granted under the third
413 | paragraph of section 11).
414 |
415 | However, if you cease all violation of this License, then your
416 | license from a particular copyright holder is reinstated (a)
417 | provisionally, unless and until the copyright holder explicitly and
418 | finally terminates your license, and (b) permanently, if the copyright
419 | holder fails to notify you of the violation by some reasonable means
420 | prior to 60 days after the cessation.
421 |
422 | Moreover, your license from a particular copyright holder is
423 | reinstated permanently if the copyright holder notifies you of the
424 | violation by some reasonable means, this is the first time you have
425 | received notice of violation of this License (for any work) from that
426 | copyright holder, and you cure the violation prior to 30 days after
427 | your receipt of the notice.
428 |
429 | Termination of your rights under this section does not terminate the
430 | licenses of parties who have received copies or rights from you under
431 | this License. If your rights have been terminated and not permanently
432 | reinstated, you do not qualify to receive new licenses for the same
433 | material under section 10.
434 |
435 | 9. Acceptance Not Required for Having Copies.
436 |
437 | You are not required to accept this License in order to receive or
438 | run a copy of the Program. Ancillary propagation of a covered work
439 | occurring solely as a consequence of using peer-to-peer transmission
440 | to receive a copy likewise does not require acceptance. However,
441 | nothing other than this License grants you permission to propagate or
442 | modify any covered work. These actions infringe copyright if you do
443 | not accept this License. Therefore, by modifying or propagating a
444 | covered work, you indicate your acceptance of this License to do so.
445 |
446 | 10. Automatic Licensing of Downstream Recipients.
447 |
448 | Each time you convey a covered work, the recipient automatically
449 | receives a license from the original licensors, to run, modify and
450 | propagate that work, subject to this License. You are not responsible
451 | for enforcing compliance by third parties with this License.
452 |
453 | An "entity transaction" is a transaction transferring control of an
454 | organization, or substantially all assets of one, or subdividing an
455 | organization, or merging organizations. If propagation of a covered
456 | work results from an entity transaction, each party to that
457 | transaction who receives a copy of the work also receives whatever
458 | licenses to the work the party's predecessor in interest had or could
459 | give under the previous paragraph, plus a right to possession of the
460 | Corresponding Source of the work from the predecessor in interest, if
461 | the predecessor has it or can get it with reasonable efforts.
462 |
463 | You may not impose any further restrictions on the exercise of the
464 | rights granted or affirmed under this License. For example, you may
465 | not impose a license fee, royalty, or other charge for exercise of
466 | rights granted under this License, and you may not initiate litigation
467 | (including a cross-claim or counterclaim in a lawsuit) alleging that
468 | any patent claim is infringed by making, using, selling, offering for
469 | sale, or importing the Program or any portion of it.
470 |
471 | 11. Patents.
472 |
473 | A "contributor" is a copyright holder who authorizes use under this
474 | License of the Program or a work on which the Program is based. The
475 | work thus licensed is called the contributor's "contributor version".
476 |
477 | A contributor's "essential patent claims" are all patent claims
478 | owned or controlled by the contributor, whether already acquired or
479 | hereafter acquired, that would be infringed by some manner, permitted
480 | by this License, of making, using, or selling its contributor version,
481 | but do not include claims that would be infringed only as a
482 | consequence of further modification of the contributor version. For
483 | purposes of this definition, "control" includes the right to grant
484 | patent sublicenses in a manner consistent with the requirements of
485 | this License.
486 |
487 | Each contributor grants you a non-exclusive, worldwide, royalty-free
488 | patent license under the contributor's essential patent claims, to
489 | make, use, sell, offer for sale, import and otherwise run, modify and
490 | propagate the contents of its contributor version.
491 |
492 | In the following three paragraphs, a "patent license" is any express
493 | agreement or commitment, however denominated, not to enforce a patent
494 | (such as an express permission to practice a patent or covenant not to
495 | sue for patent infringement). To "grant" such a patent license to a
496 | party means to make such an agreement or commitment not to enforce a
497 | patent against the party.
498 |
499 | If you convey a covered work, knowingly relying on a patent license,
500 | and the Corresponding Source of the work is not available for anyone
501 | to copy, free of charge and under the terms of this License, through a
502 | publicly available network server or other readily accessible means,
503 | then you must either (1) cause the Corresponding Source to be so
504 | available, or (2) arrange to deprive yourself of the benefit of the
505 | patent license for this particular work, or (3) arrange, in a manner
506 | consistent with the requirements of this License, to extend the patent
507 | license to downstream recipients. "Knowingly relying" means you have
508 | actual knowledge that, but for the patent license, your conveying the
509 | covered work in a country, or your recipient's use of the covered work
510 | in a country, would infringe one or more identifiable patents in that
511 | country that you have reason to believe are valid.
512 |
513 | If, pursuant to or in connection with a single transaction or
514 | arrangement, you convey, or propagate by procuring conveyance of, a
515 | covered work, and grant a patent license to some of the parties
516 | receiving the covered work authorizing them to use, propagate, modify
517 | or convey a specific copy of the covered work, then the patent license
518 | you grant is automatically extended to all recipients of the covered
519 | work and works based on it.
520 |
521 | A patent license is "discriminatory" if it does not include within
522 | the scope of its coverage, prohibits the exercise of, or is
523 | conditioned on the non-exercise of one or more of the rights that are
524 | specifically granted under this License. You may not convey a covered
525 | work if you are a party to an arrangement with a third party that is
526 | in the business of distributing software, under which you make payment
527 | to the third party based on the extent of your activity of conveying
528 | the work, and under which the third party grants, to any of the
529 | parties who would receive the covered work from you, a discriminatory
530 | patent license (a) in connection with copies of the covered work
531 | conveyed by you (or copies made from those copies), or (b) primarily
532 | for and in connection with specific products or compilations that
533 | contain the covered work, unless you entered into that arrangement,
534 | or that patent license was granted, prior to 28 March 2007.
535 |
536 | Nothing in this License shall be construed as excluding or limiting
537 | any implied license or other defenses to infringement that may
538 | otherwise be available to you under applicable patent law.
539 |
540 | 12. No Surrender of Others' Freedom.
541 |
542 | If conditions are imposed on you (whether by court order, agreement or
543 | otherwise) that contradict the conditions of this License, they do not
544 | excuse you from the conditions of this License. If you cannot convey a
545 | covered work so as to satisfy simultaneously your obligations under this
546 | License and any other pertinent obligations, then as a consequence you may
547 | not convey it at all. For example, if you agree to terms that obligate you
548 | to collect a royalty for further conveying from those to whom you convey
549 | the Program, the only way you could satisfy both those terms and this
550 | License would be to refrain entirely from conveying the Program.
551 |
552 | 13. Use with the GNU Affero General Public License.
553 |
554 | Notwithstanding any other provision of this License, you have
555 | permission to link or combine any covered work with a work licensed
556 | under version 3 of the GNU Affero General Public License into a single
557 | combined work, and to convey the resulting work. The terms of this
558 | License will continue to apply to the part which is the covered work,
559 | but the special requirements of the GNU Affero General Public License,
560 | section 13, concerning interaction through a network will apply to the
561 | combination as such.
562 |
563 | 14. Revised Versions of this License.
564 |
565 | The Free Software Foundation may publish revised and/or new versions of
566 | the GNU General Public License from time to time. Such new versions will
567 | be similar in spirit to the present version, but may differ in detail to
568 | address new problems or concerns.
569 |
570 | Each version is given a distinguishing version number. If the
571 | Program specifies that a certain numbered version of the GNU General
572 | Public License "or any later version" applies to it, you have the
573 | option of following the terms and conditions either of that numbered
574 | version or of any later version published by the Free Software
575 | Foundation. If the Program does not specify a version number of the
576 | GNU General Public License, you may choose any version ever published
577 | by the Free Software Foundation.
578 |
579 | If the Program specifies that a proxy can decide which future
580 | versions of the GNU General Public License can be used, that proxy's
581 | public statement of acceptance of a version permanently authorizes you
582 | to choose that version for the Program.
583 |
584 | Later license versions may give you additional or different
585 | permissions. However, no additional obligations are imposed on any
586 | author or copyright holder as a result of your choosing to follow a
587 | later version.
588 |
589 | 15. Disclaimer of Warranty.
590 |
591 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
592 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
596 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
597 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
599 |
600 | 16. Limitation of Liability.
601 |
602 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
610 | SUCH DAMAGES.
611 |
612 | 17. Interpretation of Sections 15 and 16.
613 |
614 | If the disclaimer of warranty and limitation of liability provided
615 | above cannot be given local legal effect according to their terms,
616 | reviewing courts shall apply local law that most closely approximates
617 | an absolute waiver of all civil liability in connection with the
618 | Program, unless a warranty or assumption of liability accompanies a
619 | copy of the Program in return for a fee.
620 |
621 | END OF TERMS AND CONDITIONS
622 |
623 | How to Apply These Terms to Your New Programs
624 |
625 | If you develop a new program, and you want it to be of the greatest
626 | possible use to the public, the best way to achieve this is to make it
627 | free software which everyone can redistribute and change under these terms.
628 |
629 | To do so, attach the following notices to the program. It is safest
630 | to attach them to the start of each source file to most effectively
631 | state the exclusion of warranty; and each file should have at least
632 | the "copyright" line and a pointer to where the full notice is found.
633 |
634 |
635 | Copyright (C)
636 |
637 | This program is free software: you can redistribute it and/or modify
638 | it under the terms of the GNU General Public License as published by
639 | the Free Software Foundation, either version 3 of the License, or
640 | (at your option) any later version.
641 |
642 | This program is distributed in the hope that it will be useful,
643 | but WITHOUT ANY WARRANTY; without even the implied warranty of
644 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
645 | GNU General Public License for more details.
646 |
647 | You should have received a copy of the GNU General Public License
648 | along with this program. If not, see <https://www.gnu.org/licenses/>.
649 |
650 | Also add information on how to contact you by electronic and paper mail.
651 |
652 | If the program does terminal interaction, make it output a short
653 | notice like this when it starts in an interactive mode:
654 |
655 | Copyright (C)
656 | This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
657 | This is free software, and you are welcome to redistribute it
658 | under certain conditions; type `show c' for details.
659 |
660 | The hypothetical commands `show w' and `show c' should show the appropriate
661 | parts of the General Public License. Of course, your program's commands
662 | might be different; for a GUI interface, you would use an "about box".
663 |
664 | You should also get your employer (if you work as a programmer) or school,
665 | if any, to sign a "copyright disclaimer" for the program, if necessary.
666 | For more information on this, and how to apply and follow the GNU GPL, see
667 | <https://www.gnu.org/licenses/>.
668 |
669 | The GNU General Public License does not permit incorporating your program
670 | into proprietary programs. If your program is a subroutine library, you
671 | may consider it more useful to permit linking proprietary applications with
672 | the library. If this is what you want to do, use the GNU Lesser General
673 | Public License instead of this License. But first, please read
674 | <https://www.gnu.org/licenses/why-not-lgpl.html>.
675 |
--------------------------------------------------------------------------------
/Minimus2-pipeline/Minimus2_pipeline.py:
--------------------------------------------------------------------------------
#!/usr/bin/python3

"""
This pipeline is described in the Amos official website:
http://amos.sourceforge.net/wiki/index.php/Minimus2
All the parameters are as default

Usage:
$ python3 Minimus2_pipeline.py -s1 S1.fas -s2 S2.fas -o output_prefix

Sample:
$ python Minimus2_pipeline.py -s1 seq1.fas -s2 seq2.fas -o Minimus2_out/seq1-2

"""
import os
import argparse

__author__ = "Heyu Lin"
__contact__ = "heyu.lin(AT)student.unimelb.edu.au"

# Command-line interface: two input FASTA files and an output prefix.
parser = argparse.ArgumentParser()
# -s1: genome set 1 (used as the reference set by minimus2)
parser.add_argument('-s1', metavar='seq_1', dest='s1',
                    type=str, required=True)
# -s2: genome set 2, merged against -s1
parser.add_argument('-s2', metavar='seq_2', dest='s2',
                    type=str, required=True)
# -o: output prefix; may include a directory component, created by main()
parser.add_argument('-o', metavar='output', dest='o',
                    type=str, required=True)
args = parser.parse_args()
29 |
30 |
def create_dir(directory):
    """Ensure the parent directory of *directory* exists.

    `directory` is an output prefix such as 'Minimus2_out/seq1-2'; only its
    dirname component (if any) is created. A bare filename is a no-op.
    """
    parent = os.path.dirname(directory)
    if parent != '':
        # exist_ok avoids the check-then-create race of the original
        # os.path.exists() + os.makedirs() sequence.
        os.makedirs(parent, exist_ok=True)
36 |
37 |
def seq_num(fasta_file):
    """Return the number of sequences (lines starting with '>') in a FASTA file."""
    # 'with' guarantees the handle is closed; the original list comprehension
    # opened the file without ever closing it.
    with open(fasta_file) as fh:
        return sum(1 for line in fh if line.startswith(">"))
41 |
42 |
def cat_files(file_list, outfile):
    """Concatenate every file named in *file_list* into *outfile* (like `cat`)."""
    with open(outfile, 'w') as sink:
        for path in file_list:
            with open(path) as source:
                sink.writelines(source)
49 |
50 |
def run_toAmos(in_fas, out_afg):
    """Convert a FASTA file into an AMOS .afg file by shelling out to `toAmos`.

    in_fas: input FASTA path
    out_afg: output .afg path
    Failures are reported by toAmos itself on stderr; os.system's exit status
    is intentionally not checked (same as the original behavior).
    """
    cmd = ' '.join(['toAmos', '-s', in_fas, '-o', out_afg])
    # The original try/except merely re-raised the exception; it added nothing
    # and has been removed.
    print("\n", 'RUN toAmos'.center(50, '*'))
    print(cmd, "\n")
    os.system(cmd)
64 |
65 |
def run_minimus2(in_afg, refcount):
    """Run `minimus2` on an AMOS bank prefix.

    in_afg: bank prefix (e.g. 'out.cat'); refcount: number of sequences from
    the first input, passed as minimus2's REFCOUNT parameter.
    """
    cmd = ' '.join(['minimus2', in_afg, '-D', 'REFCOUNT=' + str(refcount)])
    # The original wrapped this in a try/except that only re-raised; removed.
    print("\n", 'RUN Minimus2'.center(50, '*'))
    print(cmd, "\n")
    os.system(cmd)
79 |
80 |
def main():
    """Drive the pipeline: concatenate inputs, convert with toAmos, merge with minimus2."""
    prefix = args.o
    create_dir(prefix)
    ref_count = seq_num(args.s1)  # contigs in -s1 become minimus2's REFCOUNT
    combined = prefix + '.cat.seq'
    cat_files([args.s1, args.s2], combined)
    run_toAmos(combined, prefix + '.cat.afg')
    run_minimus2(prefix + '.cat', ref_count)


if __name__ == '__main__':
    main()
92 |
--------------------------------------------------------------------------------
/Minimus2-pipeline/README.md:
--------------------------------------------------------------------------------
1 | # Minimus2 Pipeline
2 | Using `Minimus2` (a component of `Amos`) to merge two sets of genome contigs.
3 |
4 | This pipeline is described in the Amos official website: http://amos.sourceforge.net/wiki/index.php/Minimus2
5 |
6 | All the parameters are as default.
7 |
8 | ## Usage
9 | ```bash
10 | $ python Minimus2_pipeline.py -s1 S1.fas -s2 S2.fas -o output_prefix
11 | ```
12 | ## Sample
13 |
14 | ```bash
15 | $ python Minimus2_pipeline.py -s1 seq1.fas -s2 seq2.fas -o Minimus2_out/seq1-2
16 | ```
17 |
18 | ## Options
19 |
20 | - `-s1`: genome set 1 (fasta format; used as reference)
21 | - `-s2`: genome set 2 (fasta format)
22 | - `-o`: prefix of output (may include a directory path, which will be created if it does not exist)
23 |
24 | ## Require
25 | - Using **Python3**
26 | - Amos is installed, with at least `toAmos` and `minimus2` available in the $PATH
27 | - No 3rd party python modules required
28 |
29 | ## Output
30 |
31 | The following two files are the most important output:
32 |
33 | - prefix.fasta : merged contig sequences
34 | - prefix.singletons.seq : singleton sequences
35 |
36 | Consider to use `cat` command to combine these two files, in order to do downstream analysis.
37 |
38 | # Chinese Usage 中文使用说明
39 | Minimus2是Amos套件中的一个程序,主要用于进行两个基因组文件的合并与再拼接。
40 |
41 | ## 使用
42 | ```bash
43 | $ python Minimus2_pipeline.py -s1 S1.fas -s2 S2.fas -o output_prefix
44 | ```
45 | ## 示例
46 |
47 | ```bash
48 | $ python Minimus2_pipeline.py -s1 seq1.fas -s2 seq2.fas -o Minimus2_out/seq1-2
49 | ```
50 |
51 | ## 选项
52 |
53 | - `-s1`: 基因组1(fasta格式,将会用做参考序列)
54 | - `-s2`: 基因组2(fasta格式)
55 | - `-o`: 输出文件的前缀(可以包含前置路径名,路径若不存在则会被新建)
56 | ## 要求
57 | - 使用**Python3**
58 | - 无需第三方python模块
59 | - Amos已安装,并至少将`toAmos` 和 `minimus2` 两个组件放进$PATH中以便调用
60 |
61 | ## 输出
62 |
63 | 下面两个文件是所有输出文件中最重要的:
64 |
65 | - prefix.fasta : 合并的contigs文件
66 | - prefix.singletons.seq : 未合并的contigs
67 |
68 | 可以考虑使用 `cat` 命令将这两个文件合并进行下游分析。
--------------------------------------------------------------------------------
/POCP-calculator/POCP-matrix.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 |
3 | """
4 | Calculate the percentage of conserved proteins (POCP) between two or
5 | more genomes to estimate their evolutionary and phenotypic distance.
6 | An elegant matrix table will be created after the calculation.
7 |
8 | The program was written based on (Qin et al. 2014; doi: 10.1128/JB.01688-14)
9 |
10 | # Required:
11 | BLAST+ installed in $PATH
12 |
13 | # Usage:
14 | $ python POCP-matrix.py -i input_dir -o output_matrix.tab [-n 8] [--clean]
15 |
16 | # Options:
17 | -i: input directory contained more than 2 translated genome files (suffix: .faa)
18 | -o: output POCP matrix file
19 | -n: number of threads (optional, default: 3)
20 | --clean: blast output and databases created by this program will be removed (optional)
21 |
22 | """
23 |
24 | import sys, os, re
25 | import glob
26 | import itertools
27 | from math import factorial # used to compute the progress
28 | import subprocess
29 | import argparse
30 |
31 | __author__ = "Heyu Lin"
32 | __contact__ = "heyu.lin@student.unimelb.edu.au"
33 |
34 | """
35 | Deal with some options
36 | """
37 | parser = argparse.ArgumentParser()
38 | parser.add_argument('-i', '--input', metavar='input_directory', dest='i',
39 | type=str, required=True)
40 | parser.add_argument('-o', '--output', metavar='output_filename', dest='o',
41 | type=str, required=True)
42 | parser.add_argument('-n', '--num_threads', metavar='num_cpu',
43 | dest='n', type=int, default=3,
44 | help='specify the number of threads used by blast (default=3)')
45 | parser.add_argument('--clean', metavar='clean_blast_db_output',
46 | dest='c', nargs="?", const=True, default=False,
47 | help='redundant files created by this program will be removed if this argument is added')
48 | args=parser.parse_args()
49 |
50 | """
51 | Define functions
52 | """
53 |
def run_mkblastdb(fi, fo):
    '''
    Build a protein BLAST database with makeblastdb.

    fi: input fasta file
    fo: output database name
    '''
    cmd_para = [
        'makeblastdb',
        '-in', fi,
        '-dbtype', 'prot',
        '-parse_seqids',
        '-out', fo
    ]
    # DEVNULL instead of PIPE: subprocess.call() never reads the pipe, so a
    # verbose makeblastdb could fill the pipe buffer and deadlock (the docs
    # explicitly warn against stdout=PIPE with call()). The unused 'run'
    # variable and the re-raise-only try/except were also removed.
    subprocess.call(cmd_para, stdout=subprocess.DEVNULL)
70 |
def run_blastp(q, db, o, n):
    """
    Run blastp for the POCP comparison and print any unexpected stderr lines.

    q: query
    db: database
    o: output
    n: num_cpu
    """
    cmd_para = [
        'blastp',
        '-query', q,
        '-out', o,
        '-db', db,
        '-evalue', '1e-5',
        '-outfmt', "6 std qlen",
        '-max_target_seqs', '1',
        '-num_threads', str(n),
    ]
    try:
        proc = subprocess.Popen(cmd_para, stderr=subprocess.PIPE)
        _, err_bytes = proc.communicate()
        if err_bytes:
            # Suppress the well-known, harmless -max_target_seqs notice and
            # blank lines; surface everything else.
            for msg in err_bytes.decode("utf-8").split('\n'):
                if msg == '':
                    continue
                if "Warning: [blastp] Examining 5 or more matches is recommended" in msg:
                    continue
                print("Warning:", msg)
    except Exception as e:
        raise e
99 |
def num_sequnces(fasta):
    """Count the sequences (lines beginning with '>') in a FASTA file.

    (Function name kept as-is, typo included, for interface compatibility.)
    """
    # Stream line by line instead of slurping the whole file into memory and
    # regex-scanning it; a '^>' multiline match is exactly a line that starts
    # with '>', so the count is identical.
    count = 0
    with open(fasta, "r") as f:
        for line in f:
            if line.startswith(">"):
                count += 1
    return count
109 |
def comb(n, r):
    """Binomial coefficient C(n, r); used to size the progress bar."""
    return factorial(n) // (factorial(r) * factorial(n - r))
112 |
def POCP_calculator(pair, num_cpu):
    """Compute the percentage of conserved proteins (POCP) for a genome pair.

    pair: 2-tuple of .faa file paths; num_cpu: threads passed to blastp.
    Reciprocal blastp outputs are cached as .POCPout files and reused when
    already present. A query protein counts as conserved when identity >= 40%
    and query coverage >= 50% (Qin et al. 2014).
    Returns hits / (T1 + T2) * 100.
    """
    T1 = num_sequnces(pair[0])  # protein count of genome 1
    T2 = num_sequnces(pair[1])  # protein count of genome 2
    blastout_name1 = pair[0] + '--' + os.path.basename(pair[1]) + '.POCPout'
    if not os.path.exists(blastout_name1):  # reuse a cached result if present
        run_blastp(pair[0], pair[1]+'_POCP', blastout_name1, num_cpu)
    blastout_name2 = pair[1] + '--' + os.path.basename(pair[0]) + '.POCPout'
    if not os.path.exists(blastout_name2):
        run_blastp(pair[1], pair[0]+'_POCP', blastout_name2, num_cpu)
    hit_sum = 0 # Initialize the number of hit sequences
    for outfile in [blastout_name1, blastout_name2]:
        with open(outfile, 'r') as f:
            """
            qury_temp: used to test whether a query has only one hit region
            recd: In the case that a query has more than on alignabe region,
            only one hit that eligible should be count
            """
            qury_temp = 'temp'
            recd = False
            for line in f.readlines():
                items = line.split()
                qury = items[0]          # query sequence id
                iden = float(items[2])   # percent identity
                # outfmt "6 std qlen": items[3] = alignment length, items[12] = qlen
                qcov = float(items[3]) / float(items[12])
                if qury != qury_temp and iden >= 40 and qcov >= 0.5:
                    hit_sum += 1
                    recd = True # This query has been counted, and should not be
                                # counted again if it has another eligible regions
                elif qury == qury_temp and iden >= 40 and qcov >= 0.5:
                    if recd == False: # Although the sequence has two hit region,
                                      # the previous regions were not eligible
                        hit_sum += 1
                        recd = True
                # NOTE(review): recd is never reset when a new query id begins,
                # so a leftover True from the previous query may suppress the
                # count of a later eligible HSP of a new query whose first HSP
                # failed the thresholds — confirm whether this is intended.
                qury_temp = qury
    return hit_sum/(T1 + T2) * 100
148 |
def output_table(dict, items, out):
    """Write the lower-triangular POCP matrix to *out* (tab-delimited).

    dict: {(genome_a, genome_b): POCP} keyed by basename pairs in the order
          produced by itertools.combinations (a before b)
    items: list of genome basenames used as row/column labels
    out: output file path
    NOTE: the parameter name 'dict' shadows the builtin; kept unchanged for
    interface compatibility with existing callers.
    """
    with open(out, 'w') as fo:
        fo.write('POCP' + "\t" + "\t".join(items) + "\n")
        # (removed: unused local 'num = len(items)')
        for i in range(len(items)):
            lst = [os.path.basename(items[i])]
            for j in range(len(items)):
                if items[i] == items[j]:
                    lst.append('100')  # self comparison
                else:
                    # The upper triangle has no (j, i) key and renders as '~'.
                    lst.append(str(dict.get((items[j], items[i]), '~')))
            fo.write("\t".join(lst) + "\n")
162 |
def clean(pth):
    """Remove BLAST databases (*_POCP.p??) and outputs (*.POCPout) under *pth*."""
    for pattern in ('*_POCP.p??', '*.POCPout'):
        for leftover in glob.iglob(os.path.join(pth, pattern)):
            os.remove(leftover)
168 |
169 | """
170 | Main Program
171 | """
def main():
    """Build BLAST DBs, compute pairwise POCP values, and write the matrix."""
    genomes = glob.glob(os.path.join(args.i, '*.faa'))
    genomes_bn = list(map(os.path.basename, genomes))
    num_genomes = len(genomes)
    print(num_genomes, 'genomes have been read.')
    num_blastp = comb(num_genomes, 2) * 2  # total blastp runs (two per pair)
    # Make blast database for all the genomes
    for genome in genomes:
        run_mkblastdb(genome, genome + '_POCP')
    # Run blastp between every two genomes
    pocp_results = {}  # renamed from 'dict' to stop shadowing the builtin
    processed = 0
    for genome_pair in itertools.combinations(genomes, 2):
        genome_pair_bn = tuple(map(os.path.basename, genome_pair))
        pocp_results[genome_pair_bn] = POCP_calculator(genome_pair, args.n)
        processed += 2
        processed_perc = round(processed / num_blastp * 30)
        print("\r" + "[" + ">" * processed_perc + "]",
              "{}/{}".format(processed, num_blastp), end='')  # progress bar
        sys.stdout.flush()
    output_table(pocp_results, genomes_bn, args.o)
    if args.c == True:  # nargs='?': exactly True only for a bare '--clean'
        clean(args.i)
    print("\ndone.")

if __name__ == '__main__':
    main()
200 |
--------------------------------------------------------------------------------
/POCP-calculator/README.md:
--------------------------------------------------------------------------------
1 | # POCP Calculator
2 |
3 | Calculate the percentage of conserved proteins **(POCP)** between two or
4 | more genomes to estimate their evolutionary and phenotypic distance.
5 |
6 | POCP value could be used as a robust genomic index for establishing the **genus boundary** for prokaryotic groups. Generally, a POCP value of 50% could be used as a genus boundary for prokaryotic lineages according to [Qin et al (2014)](https://journals.asm.org/doi/10.1128/JB.01688-14)
7 |
8 | An elegant matrix table will be created after the calculation.
9 |
10 | The program was written based on the paper (*Qin et al. 2014; doi: [10.1128/JB.01688-14](https://journals.asm.org/doi/10.1128/JB.01688-14)*)
11 |
12 | ## Usage
13 |
14 | ```bash
15 | $ python POCP-matrix.py -i input_dir -o output_matrix.tab [-n 8] [--clean]
16 | ```
17 |
18 | ## Options
19 |
20 | - `-i`: input directory containing two or more translated genome files (suffix: .faa)
21 | - `-o`: output POCP matrix file
22 | - `-n`: number of threads (optional, default: 3)
23 | - `--clean`: blast output and databases created by this program will be removed (optional)
24 |
25 | ## Require
26 |
27 | - BLAST+ installed in `$PATH`
28 | - Using **Python3**
29 | - Works both on Windows and unix-like systems
30 | - No 3rd party python modules required
31 |
32 | ## Sample Output:
33 |
34 | | POCP | Genome1.faa | Genome2.faa | Genome3.faa | Genome4.faa |
35 | | ----------- | ----------- | ----------- | ----------- | ----------- |
36 | | Genome1.faa | 100 | ~ | ~ | ~ |
37 | | Genome2.faa | 77.25376031 | 100 | ~ | ~ |
38 | | Genome3.faa | 92.18714253 | 59.14082 | 100 | ~ |
39 | | Genome4.faa | 41.25224685 | 57.19096 | 66.48514 | 100 |
40 |
41 | > Please ensure that the length of every sequence header is less than 50 characters. Otherwise, Blast will be unable to create the database and will produce an error.
42 |
43 | # Chinese Usage 中文使用说明
44 |
45 | POCP_matrix.py脚本能够计算多个基因组之间的**POCP值**(保守蛋白百分比),用来判断原核生物在**属水平**上的遗传距离。POCP值在50%以上可以被认为是一个属的边界[Qin et al (2014)](https://journals.asm.org/doi/10.1128/JB.01688-14)。
46 |
47 | 该程序基于文献:(*Qin et al. 2014; doi: [10.1128/JB.01688-14](https://journals.asm.org/doi/10.1128/JB.01688-14)*)
48 |
49 | ## 使用
50 |
51 | ```bash
52 | $ python POCP-matrix.py -i input_dir -o output_matrix.tab [-n 8] [--clean]
53 | ```
54 |
55 | ## 选项
56 |
57 | - `-i`: 输入文件夹,至少含有两个基因组的蛋白质文件(后缀为.faa)
58 | - `-o`: 输出POCP表格的文件名
59 | - `-n`: 使用cpu核心数 (可选, 默认: 3)
60 | - `--clean`: 该程序计算过程中产生的blast数据库与结果将会被清除 (可选)
61 |
62 | ## 要求
63 | - Blast+已安装并存在环境变量`$PATH`中
64 | - 使用**Python3**
65 | - 在Windows和类unix系统中均可运行
66 | - 无需第三方python模块
67 |
68 | ## 输出示例:
69 |
70 | | POCP | Genome1.faa | Genome2.faa | Genome3.faa | Genome4.faa |
71 | | ----------- | ----------- | ----------- | ----------- | ----------- |
72 | | Genome1.faa | 100 | ~ | ~ | ~ |
73 | | Genome2.faa | 77.25376031 | 100 | ~ | ~ |
74 | | Genome3.faa | 92.18714253 | 59.14082 | 100 | ~ |
75 | | Genome4.faa | 41.25224685 | 57.19096 | 66.48514 | 100 |
76 |
77 | > 注意:faa文件中的header必须都小于50个字符,否则blast无法建库,会报错
78 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [](https://doi.org/10.5281/zenodo.4954426) []()
2 |
3 | # Bio-py
4 | Some useful python scripts for biologists.
5 |
6 | All the scripts in this repo are developed and tested on python >= 3.6.
7 |
8 | I would be glad if you submit issues or email me at (heyu.lin🅰️qut.edu.au) for questions, suggestions or other feedback.
9 |
10 | ## How to cite
11 | If you find these scripts useful for your scientific research, please consider citing this repo to make it easier for other peers to find it. This repo has been published with [Zenodo](https://doi.org/10.5281/zenodo.4954426), so you are encouraged to cite the tools as follows:
12 | > Lin, H. (2021). _SilentGene/Bio-py: Bio-py_. Zenodo. http://doi.org/10.5281/zenodo.4954426
13 |
--------------------------------------------------------------------------------
/blast-matrix/README.md:
--------------------------------------------------------------------------------
1 | # BLAST Matrix
2 |
3 | This script calculates pair-wise sequence identities for all sequences in a multifasta format file.
4 | A matrix table will be generated after the calculation, and a clustered heatmap will be drawn if required.
5 |
6 | ## Require
7 |
8 | - BLAST+ installed in $PATH
9 | - Biopython (with pandas > 0.21)
10 | - seaborn & scipy (for drawing clustered heatmap)
11 |
12 | ## Usage
13 |
14 | ```bash
15 | $ python blast_identity_matrix.py -i input_seqs.fasta [-o output_matrix.tsv] [--thread 4] [--program blastp] [--heatmap output_heatmap.pdf] [--clean]
16 | ```
17 |
18 | ## Options
19 |
20 | - `-i`: Input file in multi-sequence FASTA format
21 | - `-o`: Output matrix table in tab-delimited format [default: (input file name) + '_ident.tsv']
22 | - `-t`: Threads that would be used for makeblastdb and blast [default: 2]
23 | - `-p`: blast program that would be used (blastp or blastn) [default: blastp]
24 | - `--heatmap`: Draw clustered heatmap.
25 | - `--clean`: Clean temporary files. [default: False]
26 |
27 |
28 |
29 | # Chinese Usage 中文使用说明
30 |
31 | 此脚本会进行两两blast比较并计算一致性(identity)。输入一个含有多条fasta序列的文件,生成一个一致性数值矩阵。
32 |
33 | ## 要求
34 |
35 | - BLAST+ 安装在 `$PATH`
36 | - Python3.x
37 | - Biopython (包含pandas > 0.21)
38 | - seaborn & scipy (如果绘制聚类热图需要安装)
39 |
40 | ## 使用命令
41 |
42 | ```bash
43 | $ python blast_identity_matrix.py -i input_seqs.fasta [-o output_matrix.tsv] [--thread 4] [--program blastp] [--heatmap] [--clean]
44 | ```
45 |
46 | ## 可选项
47 |
48 | - `-i`: 输入文件。含有多条fasta序列的文件。
49 | - `-o`: 输出文件。tab分割的数值矩阵。[默认文件名: (输入文件名) + '_ident.tsv']
50 | - `-t`: makeblastdb和blast过程会调用的线程数。 [默认: 2]
51 | - `-p`: blast程序 (可选blastp或blastn) [默认: blastp]
52 | - `--heatmap`: 绘制聚簇热图.
53 | - `--clean`: 清除中间文件 [默认: False]
54 |
--------------------------------------------------------------------------------
/blast-matrix/blast_identity_matrix.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python3
2 |
3 | """
4 | This script calculates pair-wise sequence identities for all sequences in a multifasta format file.
5 | A matrix table will be generated after the calculation, and a clustered heatmap will be drawn if required.
6 |
7 | # Required:
8 | - BLAST+ installed in $PATH
9 | - Biopython (with pandas > 0.21)
10 | - seaborn & scipy (for drawing clustered heatmap)
11 |
12 | # Usage:
13 | $ python blast_identity_matrix.py -i input_seqs.fasta [-o output_matrix.tsv] [--heatmap output_heatmap.pdf] [--thread 4] [--program blastp] [--clean]
14 |
15 | # Options:
16 | -i: Input file in multi-sequence FASTA format
17 | -o: Output matrix table in tab-delimited format [default: (input file name) + '_ident.tsv']
18 | -t: Threads that would be used for makeblastdb and blast [default: 2]
19 | -p: blast program that would be used (blastp or blastn) [default: blastp]
20 | --heatmap: Draw clustered heatmap.
21 | --clean: Clean temporary files. [default: False]
22 | """
23 |
24 | import os
25 | from Bio import SeqIO
26 | import argparse
27 | import random
28 | import shutil
29 | from itertools import permutations
30 | import pandas as pd
31 | import subprocess
32 | from multiprocessing import Pool
33 |
34 | __author__ = "Heyu Lin"
35 | __contact__ = "heyu.lin@student.unimelb.edu.au"
36 |
# Command-line interface
parser = argparse.ArgumentParser()

parser.add_argument('-i', '--input', metavar='input_fasta_file', dest='i',
                    type=str, required=True,
                    help='Input file in multi-sequence FASTA format')
parser.add_argument('-o', '--output', metavar='output_table', dest='o',
                    type=str, required=False,
                    help='Output matrix table in tab-delimited format')
parser.add_argument('-t', '--threads', metavar='threads', dest='t',
                    type=int, required=False, default=2,
                    help='Threads that would be used for makeblastdb and blast')
parser.add_argument('-p', '--program', metavar='blast_program', dest='p',
                    type=str, required=False, default='blastp',
                    help='blast program that would be used (blastp or blastn)')
parser.add_argument('-m', '--heatmap', metavar='heatmap', dest='m',
                    type=str, required=False,
                    help='Draw clustered heatmap.')
parser.add_argument('--clean', metavar='clean', dest='c',
                    action='store_true', required=False,
                    help='Clean temporary files. Default: False')
args = parser.parse_args()

# Derived settings: default output name, per-run temporary folder, and the
# blast program / dbtype pair implied by -p.
input_faa = args.i
output_table = input_faa + '_ident.tsv' if args.o == None else args.o
# Random 6-digit suffix keeps concurrent runs from colliding.
tmp_folder = 'blast_matrix_tmp_' + str(random.randint(0,999999)).zfill(6)
if args.p == 'blastp':
    blast_program = 'blastp'
    data_type = 'prot'
elif args.p == 'blastn':
    blast_program = 'blastn'
    data_type = 'nucl'
else:
    raise AttributeError('Only blastp or blastn is supported!')

if not os.path.exists(tmp_folder):
    os.makedirs(tmp_folder)
else:
    # A leftover folder of the same name would mix stale and new results.
    raise IOError(f"Sorry, the temporary folder could not be created. Please remove the {tmp_folder} folder.")
75 |
76 |
def run_mkblastdb(fi, tp):
    '''
    Build a BLAST database next to the input file (output name: fi + '.db').

    fi: input fasta file
    tp: 'prot' or 'nucl'
    '''
    # The original placed the docstring AFTER this assignment, turning it into
    # a dead string literal; it is now a real docstring above.
    fo = fi + '.db'
    cmd_para = [
        'makeblastdb',
        '-in', fi,
        '-dbtype', tp,
        '-parse_seqids',
        '-out', fo
    ]
    try:
        # DEVNULL instead of open(os.devnull, 'wb'): the original handle was
        # never closed, leaking one file descriptor per call.
        subprocess.check_call(cmd_para,
                              stdout=subprocess.DEVNULL,
                              stderr=subprocess.STDOUT,
                              )
    except subprocess.CalledProcessError as exc:
        print('cmd:', exc.cmd)
        print("Status : FAIL", exc.returncode, exc.output)
101 |
102 |
def run_blast(q, o, db, e, b):
    '''
    Run one pairwise BLAST search (tabular outfmt 6, single thread).

    q: query fasta
    o: output path
    db: target database
    e: evalue threshold
    b: blast program name ('blastp' or 'blastn')
    (The original docstring documented nonexistent parameters f and n.)
    '''
    cmd_para = [
        b,
        '-query', q,
        '-out', o,
        '-db', db,
        '-evalue', str(e),
        '-outfmt', '6',
        # one thread per job: parallelism comes from the multiprocessing Pool
        '-num_threads', '1'
    ]
    try:
        # DEVNULL instead of an unclosed open(os.devnull) handle; the unused
        # 'res' variable was also removed.
        subprocess.check_call(cmd_para,
                              stdout=subprocess.DEVNULL,
                              stderr=subprocess.STDOUT,
                              )
    except subprocess.CalledProcessError as exc:
        print('cmd:', exc.cmd)
        print('output:', exc.output)
132 |
133 |
def blast_Parser(fi):
    '''
    Return the percent identity of the top hit in a tabular (outfmt 6) BLAST
    output file, or 0 if the file is empty (no hits).

    fi: blast output (format 6)
    '''
    if not os.path.getsize(fi):
        return 0

    # Only the first line (best hit) is needed, so read a single line instead
    # of readlines() on the whole file; column 3 is percent identity. The
    # original also shadowed the builtin 'input' as its handle name.
    with open(fi) as fh:
        first_hit = fh.readline().strip().split("\t")
    return float(first_hit[2])
145 |
146 |
def include_outputdir(s):
    """Prefix *s* with this run's temporary working folder."""
    return os.path.join(tmp_folder, s)
149 |
def draw_heatmap(df, out_pdf):
    """Draw a clustered heatmap of the identity matrix and save it to a PDF.

    seaborn/scipy are imported lazily so they are only required when the
    --heatmap option is actually used.
    """
    import seaborn as sns
    import scipy  # imported to fail early: clustermap needs scipy for clustering

    # Draw clustered heatmap
    cmap = sns.clustermap(df)

    # Save plot to a PDF file
    cmap.savefig(out_pdf)
159 |
160 |
if __name__ == "__main__":
    # Worker pool for the parallel makeblastdb / blast steps.
    pool = Pool(args.t)

    # Split the multi-fasta input into one file per sequence.
    seq_ids = []
    for seq_record in SeqIO.parse(input_faa, "fasta"):
        single_seq = include_outputdir(seq_record.id) + ".faa"
        SeqIO.write(seq_record, single_seq, "fasta")
        seq_ids.append(seq_record.id)

    # build parameters for mkblastdb
    mkblastdb_para = [(include_outputdir(i + '.faa'), data_type) for i in seq_ids]
    # run mkblastdb in parallel
    pool.starmap(run_mkblastdb, mkblastdb_para)

    # All-vs-all (ordered pairs) blast searches.
    blast_para = []  # build parameters for blast
    for query, targ in permutations(seq_ids, 2):
        blast_out = include_outputdir(query + '+' + targ + '_blast')
        blast_query = include_outputdir(query + '.faa')
        blast_targ = include_outputdir(targ + '.faa.db')
        blast_para.append((blast_query, blast_out, blast_targ, '1e-5', blast_program))

    data = {}
    pool.starmap(run_blast, blast_para)

    # Collect best-hit identities into a nested dict: data[query][target].
    for query, targ in permutations(seq_ids, 2):
        blast_out = include_outputdir(query + '+' + targ + '_blast')
        ident = blast_Parser(blast_out)
        # setdefault replaces the old `if data.get(query)` truthiness test,
        # which would mis-handle an existing-but-empty inner dict.
        data.setdefault(query, {})[targ] = ident

    df = pd.DataFrame(data).sort_index().sort_index(axis=1)

    # Summary statistics (the NaN diagonal is excluded from the mean).
    mean_ident = df.mean(skipna=True).mean()

    max_qur_tar = df.stack().idxmax()
    max_ident = df.loc[max_qur_tar]
    min_qur_tar = df.stack().idxmin()
    min_ident = df.loc[min_qur_tar]

    print('\n***** Statistics *****')
    print(f'Maximum Identity:\n{max_ident}%: {max_qur_tar[0]} -> {max_qur_tar[1]}')
    # typo fixed: "Mimimum" -> "Minimum"
    print(f'Minimum Identity:\n{min_ident}%: {min_qur_tar[0]} -> {min_qur_tar[1]}')
    print(f'Average Identity: {mean_ident}%')

    df = df.fillna(100)  # self-vs-self comparisons were never run
    df.to_csv(output_table, sep='\t')

    # Optionally remove the temporary per-sequence files.
    if args.c:
        shutil.rmtree(tmp_folder)

    ######## ~ draw clustered heatmap ~ ########
    if args.m:
        draw_heatmap(df, args.m)
221 |
--------------------------------------------------------------------------------
/blast-wrapper/README.md:
--------------------------------------------------------------------------------
1 | # blast-wrapper
2 | Pipeline for conducting **makeblastdb** and **blastp/blastn/blastx/tblastn** using one simple command.
3 |
4 | Show blast results in a **more elegant way**. Not only table headers, but also **query coverages** and the **original query sequences** are calculated and shown in the results.
5 |
6 | This script can also parse and filter the blast result by setting threshold of identity and coverage!
7 |
8 | ## Require
9 | - BLAST+ installed in `$PATH`
10 | - Using **Python3**
11 | - Works both on Windows and unix-like systems
12 | ## Usage
13 | ```
14 | $ python3 blast_wrapper.py -h
15 | usage: blast_wrapper.py [-h] -q query_fasta [-o output] [-df database_fasta]
16 | [-db database] [-e max_e-value] [-ms num_sequences]
17 | [-n num_cpu] [-b blast+ program]
18 | [-id identity_threshold] [-qc coverage_threshold]
19 | [--no_qseq [hide qseq column]] [-f output_format*]
20 |
21 | optional arguments:
22 | -h, --help show this help message and exit
23 | -q query_fasta, --query query_fasta
24 | -o output, --output output
25 | -df database_fasta, --database_fasta database_fasta
26 | fasta file to be used as database
27 | -db database, --database database
28 | blast database which has already been made
29 | -e max_e-value, --evalue max_e-value
30 | threshod e-value for blast (default=1e-5)
31 | -ms num_sequences, --max_target_seqs num_sequences
32 | specify the max_number of target seqs for hits per
33 | query (default=1)
34 | -n num_cpu, --num_threads num_cpu
35 | specify the number of threads used by blast
36 | (default=3)
37 | -b blast+ program, --blast_program blast+ program
38 | specify the blast program (default=blastp)
39 | -id identity_threshold, --identity identity_threshold
40 | specify the threshold of identity (default=0)
41 | -qc coverage_threshold, --qcov coverage_threshold
42 | specify the threshold of query coverage (default=0)
43 | --no_qseq [hide qseq column]
44 | no query sequences will be showed if this argument is
45 | added
46 | -f output_format*, --outfmt output_format*
47 | outfmt defined by blast+, it is dangerous to change
48 | the default value
49 | ```
50 | ## Sample Output
51 | qid | sid | ident% | aln_len | miss | gap | qstart | qend | sstart | send | qlen | slen | evalue | bitscore | qcov% | qseq
52 | --- | --- | ------ | ------- | ---- | --- | ------ | ---- | ------ | ---- | ---- | ---- | ------ | -------- | ----- | ----
53 | HC_02247 | HgcA_ND132 | 34.483 | 58 | 37 | 1 | 550 | 607 | 9 | 65 | 608 | 95 | 1.42e-08 | 43.1 | 9.4 | MEAVE...
54 | HC_00217 | HgcB_ND132 | 28.049 | 82 | 42 | 3 | 104 | 176 | 18 | 91 | 220 | 95 | 8.56e-06 | 33.5 | 32.7 | METVE...
55 | HC_01133 | MerA_RS | 31.567 | 453 | 286 | 12 | 6 | 445 | 9 | 450 | 466 | 480 | 2.88e-55 | 182 | 94.2 | MSKVH...
56 | HC_01413 | MerA_WE | 30.660 | 424 | 283 | 4 | 26 | 443 | 114 | 532 | 455 | 554 | 7.74e-63 | 204 | 91.6 | MDFFD...
57 | ## Simplest
58 | ```bash
59 | $ python blast_wrapper.py -q query.faa -df database.faa
60 | ```
61 | or if you already have an established database:
62 | ```bash
63 | $ python blast_warpper.py -q query.faa -db database
64 | ```
65 | ## Moderate
66 | ```bash
67 | $ python blast_wrapper.py -b blastn -q query.fna -o output -df database.fna -e 1e-10 -n 5
68 | ```
69 |
70 | ## Control freak
71 | ```bash
72 | $ python blast_wrapper.py -b blastx -q query.fna -o output -df database.faa -e 1e-10 -id 30 -qc 50 -n 5 -ms 3 --no_qseq
73 | ```
74 | *Any change to output format by -f option may lead to errors when parsing output results, although it's up to you to make any change*
75 |
76 | ## Note
77 | - blastp would be used if no algorithm is specified by option `-b blastn`.
78 | - The option `-q` is required to specify the query fasta file. The option `-df` or `-db` is required to specify the target database, either as a fasta file or as a database that has already been made by the makeblastdb command in the blast+ software.
79 | - If no output is specified by `-o`, the result will be created in the current directory following the naming pattern `QueryFileName_blast.out`.
80 | - If `-df` is specified, the database will be created in the same directory as the specified file, using the name pattern `DatabaseFasta.db`. If such a database already exists, the script will skip the makeblastdb step.
81 | - Using `-id` and `-qc` to set the threshold of **identity** and **query coverage**, respectively.
82 | - `--no_qseq` can be used when you don't want the original query sequences to appear in the final result. This may speed up the program to some extent.
83 | - 3 threads would be used by default, which could be modified by the `-n` option.
84 | - A custom function has been developed to take the place of the original `-max_target_seqs` option, since the latter one has been found to only generate the first hit, not the best hit.
85 |
86 | ## Tips
87 | If you happen to have a bunch of fasta files waiting for blast against a single database, try out the following bash command to make your life simpler: (eg. you are in the fasta files directory, and all the query files have a suffix `.faa`)
88 | ```bash
89 | $ for f in *.faa; do python3 blast_wrapper.py -q $f -df data.faa; done
90 | ```
91 |
92 | You can use the script `blastout2fasta.py` provided along with this blast wrapper to convert the output to `fasta` format.
93 |
94 | ```bash
95 | $ python3 blastout2fasta.py blast.out > blast_out.fa
96 | ```
97 |
98 |
99 |
100 | # Chinese Usage 中文使用说明
101 |
102 | blast-wrapper.py脚本能够通过简单的一行命令实现**建库**和**blast搜索**两个本地blast步骤。
103 |
104 | 使用该脚本还可以帮助我们以更优雅的方式阅读blast的结果。得到的表格不仅具有清晰的表头信息,且经过计算的**覆盖度**和**原查询序列**均可以显示在结果中,便于进一步分析解读。
105 |
106 | ## 要求
107 | - Blast+已安装并存在环境变量`$PATH`中
108 | - 使用**Python3**
109 | - 在Windows和类unix系统中均可运行
110 | ## 初级
111 |
112 | 大多数情况下,你只需要用如下的命令进行blastp:
113 |
114 | ```bash
115 | $ python blast_wrapper.py -q query.faa -df database.faa
116 | ```
117 | 如果你已经有一个通过blast+的makeblastdb建立的数据库,则:
118 | ```bash
119 | $ python blast_warpper.py -q query.faa -db database
120 | ```
121 | ## 中级
122 | ```bash
123 | $ python blast_wrapper.py -b blastn -q query.fna -o output -df database.fna -e 1e-10 -n 5
124 | ```
125 |
126 | ## 高级
127 | ```bash
128 | $ python blast_wrapper.py -b blastn -q query.fna -o output -df database.fna -e 1e-10 -id 30 -qc 50 -n 5 -ms 3 --no_qseq
129 | ```
130 | *虽然脚本支持通过选项-f来更改输出样式,但任何样式的更改都可能会导致后续分析结果呈现的错误*
131 |
132 | ## 注意
133 | - 默认使用blastp运行程序,可通过`-b blastn`来指定使用blastn。
134 | - 选项 `-q`是必选项,用来指定查询序列的文件位置。选项`-df`或者 `-db` 必须指定其一,分别可以指定用来建库的fasta文件或者已经建立的数据库位置。
135 | - 如果`-o`选项为缺省状态,则程序会在当前路径下新建文件名为 `QueryFileName_blast.out`格式的文件存放结果。
136 | - 如果指定了`-df`选项,则程序会在指定的fasta库相同路径下新建`DatabaseFasta.db`名称格式的数据库文件,如果该数据库被程序发现已经存在,则程序会自动跳过建库步骤,直接使用存在的数据库进行搜索。
137 | - 通过`-id`和`-qc`分别指定**一致性**和**覆盖度**的最小值以实现对结果的过滤
138 | - 可以使用`--no_seqs`选项来取消在结果中显示查询序列的原序列,这可能会在一定程度上加快程序运行的速度。
139 | - 程序默认的线程数是3个,可以使用`-n`选项来更改。
140 | - 编写了自定义的函数来代替原生`-max_target_seqs` 参数来筛选出最优的结果。因为原生参数实际只产出数据库中第一个匹配序列,而不是最优的序列。
141 |
142 |
143 | ## 输出示例
144 |
145 | qid | sid | ident% | aln_len | miss | gap | qstart | qend | sstart | send | qlen | slen | evalue | bitscore | qcov% | qseq
146 | --- | --- | ------ | ------- | ---- | --- | ------ | ---- | ------ | ---- | ---- | ---- | ------ | -------- | ----- | ----
147 | HC_02247 | HgcA_ND132 | 34.483 | 58 | 37 | 1 | 550 | 607 | 9 | 65 | 608 | 95 | 1.42e-08 | 43.1 | 9.4 | MEAVE...
148 | HC_00217 | HgcB_ND132 | 28.049 | 82 | 42 | 3 | 104 | 176 | 18 | 91 | 220 | 95 | 8.56e-06 | 33.5 | 32.7 | METVE...
149 | HC_01133 | MerA_RS | 31.567 | 453 | 286 | 12 | 6 | 445 | 9 | 450 | 466 | 480 | 2.88e-55 | 182 | 94.2 | MSKVH...
150 | HC_01413 | MerA_WE | 30.660 | 424 | 283 | 4 | 26 | 443 | 114 | 532 | 455 | 554 | 7.74e-63 | 204 | 91.6 | MDFFD...
151 |
152 | ## 小技巧
153 |
154 | 如果你有很多fasta文件想要对一个数据库进行比对,不妨试试下面的命令调用bash来帮助你循环调用脚本(假设当前路径在存放fasta文件的路径中,且所有的fasta文件有统一的后缀`.faa`:
155 | ```bash
156 | $ for f in *.faa; do python3 blast_wrapper.py -q $f -df data.faa; done
157 | ```
158 | 你可以使用脚本`blastout2fasta.py`来将`blast_wrapper.py`的结果转换成对应的`fasta`格式:
159 | ```bash
160 | $ python3 blastout2fasta.py blast.out > blast_out.fa
161 | ```
--------------------------------------------------------------------------------
/blast-wrapper/blast_wrapper.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 |
3 | """
4 | Required: BLAST+ installed in $PATH
5 |
6 | Usage:
7 |
8 | ## Simplest:
9 | $ python blast_wrapper.py -q query.faa -df database.faa
10 | or if you already have an established database:
11 | $ python blast_warpper.py -q query.faa -db blast+_database
12 |
13 | ## Moderate:
14 | $ python blast_wrapper.py -b blastn -q query.fna -o output -df database.fna \
15 | -e 1e-10 -n 5
16 |
17 | ## Control freak:
18 | $ python blast_wrapper.py -b blastn -q query.fna -o output -df database.fna \
19 | -e 1e-10 -n 5 -ms 3 --no_qseq
20 |
21 | *Any change to output format by -f option may lead to errors when parsing output results.
22 | """
23 |
24 | import os
25 | import sys
26 | import argparse
27 | from collections import defaultdict
28 |
29 | __author__ = "Heyu Lin"
30 | __contact__ = "heyu.lin(AT)student.unimelb.edu.au"
31 |
# Command-line interface. Destination names are deliberately short
# (args.q, args.o, args.df, ...) and are used throughout the module.
parser = argparse.ArgumentParser()
parser.add_argument('-q', '--query', metavar='query_fasta', dest='q',
                    type=str, required=True)
parser.add_argument('-o', '--output', metavar='output', dest='o',
                    type=str)
parser.add_argument('-df', '--database_fasta', metavar='database_fasta',
                    dest='df', type=str,
                    help='fasta file to be used as database')
parser.add_argument('-db', '--database', metavar='database',
                    dest='db', type=str,
                    help='blast database which has already been made')
parser.add_argument('-e', '--evalue', metavar='max_e-value', dest='e',
                    type=float, default=1e-5,
                    help='threshod e-value for blast (default=1e-5)')
parser.add_argument('-ms', '--max_target_seqs', metavar='num_sequences',
                    dest='ms', type=int, default=1,
                    help='specify the max_number of target seqs for hits per query (default=1)')
parser.add_argument('-n', '--num_threads', metavar='num_cpu',
                    dest='n', type=int, default=3,
                    help='specify the number of threads used by blast (default=3)')
parser.add_argument('-b', '--blast_program', metavar='blast+ program',
                    dest='b', type=str, default='blastp',
                    help='specify the blast program (default=blastp)')
parser.add_argument('-id', '--identity', metavar='identity_threshold',
                    dest='idt', type=float, default=0,
                    help='specify the threshold of identity (default=0)')
parser.add_argument('-qc', '--qcov', metavar='coverage_threshold',
                    dest='qc', type=float, default=0,
                    help='specify the threshold of query coverage (default=0)')
parser.add_argument('--no_qseq', metavar='hide qseq column',
                    dest='nq', nargs="?", const=True, default=False,
                    help='no query sequences will be showed if this argument is added')
# You're not going to like to change this default output format.
# Any change to this outfmt argument may lead to exceptions for query coverage calculation
# (blast_Parser indexes columns by position: qstart=6, qend=7, qlen=10).
parser.add_argument('-f', '--outfmt', metavar='output_format*',
                    dest='f', type=str,
                    default='"6 qseqid sseqid pident length mismatch gapopen ' \
                    + 'qstart qend sstart send qlen slen evalue bitscore"',
                    help='outfmt defined by blast+, it is dangerous to change the default value')
args = parser.parse_args()
72 |
73 |
def input_type(b):
    """Map a BLAST program name to its database type ('prot' or 'nucl')."""
    db_types = {
        'blastp': 'prot',
        'blastx': 'prot',
        'blastn': 'nucl',
        'tblastn': 'nucl',
    }
    if b not in db_types:
        sys.exit("Error: -b argument should only be 'blastp/blastn/blastx/tblastn'!")
    return db_types[b]
86 |
87 |
def database_exist(db):
    """
    Return True if a protein (.phr) or nucleotide (.nhr) BLAST database
    with prefix *db* already exists, False otherwise.

    The previous version returned None implicitly in the negative case;
    an explicit boolean is clearer and equally falsy for callers.
    """
    prot_database = db + '.phr'
    nucl_database = db + '.nhr'
    return os.path.exists(prot_database) or os.path.exists(nucl_database)
93 |
94 |
def run_mkblastdb(fi, fo, tp):
    """
    Build a BLAST database from a fasta file.

    fi: input fasta file
    fo: output database name
    tp: 'prot' or 'nucl'
    """
    cmd_para = [
        'makeblastdb',
        '-in', fi,
        "-dbtype", tp,
        "-parse_seqids",
        "-out", fo
    ]
    # NOTE: the command runs through the shell; paths containing spaces or
    # shell metacharacters must be quoted by the caller.
    cmd = ' '.join(cmd_para)
    print("\n", 'Make Blast Database'.center(50, '*'))
    print(cmd, "\n")
    # The former try/except merely re-raised (`raise e`), truncating the
    # traceback; os.system never raises on a non-zero exit status anyway,
    # so report failures explicitly instead.
    status = os.system(cmd)
    if status != 0:
        print('Warning: makeblastdb exited with status', status)
115 |
116 |
def run_blast(q, o, db, e, f, n, b):
    """
    Run a BLAST search.

    q: query fasta file
    o: output file
    db: database prefix
    e: e-value threshold
    f: outfmt string (already shell-quoted by the argparse default)
    n: number of threads
    b: blast program name
    """
    cmd_para = [
        b,
        '-query', q,
        '-out', o,
        '-db', db,
        '-evalue', str(e),
        '-outfmt', f,
        '-num_threads', str(n)
    ]
    cmd = ' '.join(cmd_para)
    print("\n", 'BLAST Searching'.center(50, '*'))
    print(cmd, "\n")
    # os.system never raises on failure, so the old try/except that simply
    # re-raised was dead code; surface non-zero exit statuses instead.
    status = os.system(cmd)
    if status != 0:
        print('Warning: {0} exited with status {1}'.format(b, status))
143 |
144 |
def creat_dict(fa):
    """
    Build {sequence_id: sequence} from a fasta file.

    The id is the first whitespace-delimited token after '>'. The local
    mapping is renamed so it no longer shadows the builtin `dict`.
    """
    seqs = defaultdict(str)
    name = ''
    with open(fa, 'r') as f:
        for line in f:
            if line.startswith('>'):
                name = line[1:-1].split()[0]
                continue
            seqs[name] += line.strip()
    return seqs
155 |
156 |
def blast_Parser(fi, fo, header, idt, qc, ms, *seq_dicts):
    """
    Filter and annotate a tabular blast result.

    fi: raw blast output (outfmt as defined in this script)
    fo: final output file
    header: column names written as the first line
    idt: minimum identity to keep a hit
    qc: minimum query coverage (%) to keep a hit
    ms: maximum number of hits kept per query
    seq_dicts: optional single dict {query_id: sequence}; when given, the
        query sequence is appended as the last column.

    Renamed locals that shadowed the builtins `input` and `dict`.
    """
    seq_dict = {}  # index of query sequences (may stay empty)
    if seq_dicts:
        seq_dict = seq_dicts[0]

    with open(fi) as fin, open(fo, 'w') as fout:
        fout.write("\t".join(header) + "\n")
        hit_count = 0    # hits seen for the current query
        prev_query = ''  # query id of the previous line
        for line in fin:
            items = line.strip().split("\t")
            quer = items[0]
            if quer == prev_query:
                hit_count += 1
                if hit_count > ms:
                    # keep at most `ms` hits per query; blast output is
                    # grouped by query, so a simple counter suffices
                    continue
            else:
                prev_query = quer
                hit_count = 1
            # query coverage from qstart/qend/qlen (columns 7, 8, 11)
            qstart, qend, qlen = map(float, (items[6], items[7], items[10]))
            qcov = 100 * (qend - qstart) / qlen
            ident = float(items[2])
            if ident < idt or qcov < qc:
                continue
            items.append(str(round(qcov, 1)))
            if seq_dict:
                qid = items[0]
                items.append(seq_dict[qid])
            fout.write("\t".join(items) + "\n")
191 |
192 |
def review_output(file):
    """Empty the output file when it holds nothing but the header line."""
    with open(file, 'r+') as handle:
        lines = handle.readlines()
        if len(lines) == 1:
            handle.seek(0)
            handle.truncate()
198 |
def main():
    """Drive the whole pipeline: build the database if needed, run blast,
    then parse/filter the tabular result into the final output file."""
    tp = input_type(args.b)  # 'prot' or 'nucl', derived from the program name

    # Default output name: <query basename>_blast.out in the cwd.
    if not args.o:
        args.o = os.path.basename(args.q) + '_blast.out'

    # Make blast database
    if args.df:
        database_file = os.path.join(os.getcwd(), args.df) + '.db'
        if not database_exist(database_file):
            print("Starting to make blast database...")
            run_mkblastdb(args.df, database_file, tp)
        args.db = database_file
        print('DB: ', args.db)

    # Storing temporary blast result
    tempt_output = str(args.o) + '_blast.tmp'

    # => Run blast program
    run_blast(args.q, tempt_output, args.db, args.e, args.f, args.n, args.b)

    # Creat dict from query fasta, in order to extract sequencs later
    dict = creat_dict(args.q)

    # Parse blast output
    header = [
        'qid', 'sid', 'ident%', 'aln_len', 'miss',
        'gap', 'qstart', 'qend', 'sstart', 'send',
        'qlen', 'slen', 'evalue', 'bitscore', 'qcov%', 'qseq'
    ]
    # If the --no_qseq option was specified, there would be no qseq column.
    if args.nq:
        header.remove('qseq')
        blast_Parser(tempt_output, args.o, header, args.idt, args.qc, args.ms)
    else:
        blast_Parser(tempt_output, args.o, header, args.idt, args.qc, args.ms, dict)
    # Remove temp file
    os.remove(tempt_output)

    # Clear the lonely header line if no hit was found
    review_output(args.o)

    print("\n", 'OUTPUT'.center(50, '*'))
    print("Output File: {0}".format(args.o))


if __name__ == '__main__':
    main()
247 |
--------------------------------------------------------------------------------
/blast-wrapper/blastout2fasta.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 |
3 | """
4 | Used to convert the output from blast_wrapper.py to fasta format
5 |
6 | Usage:
7 | $ python3 blastout2fasta.py blast.out > blast_out.fa
8 | """
9 |
10 | import sys
11 | import textwrap
12 |
13 | __author__ = "Heyu Lin"
14 | __contact__ = "heyu.lin(AT)student.unimelb.edu.au"
15 |
# Convert blast_wrapper.py output to fasta: column 1 is the sequence
# header, column 16 the query sequence; the 'qid' header row is skipped.
blast_table = sys.argv[1]
with open(blast_table, 'r') as handle:
    for row in handle:
        fields = row.strip().split('\t')
        if fields[0] == 'qid':
            continue
        print('>{header}'.format(header=fields[0]))
        print(textwrap.fill(fields[15], 80))
23 |
--------------------------------------------------------------------------------
/cdhit-clstr2tbl/README.md:
--------------------------------------------------------------------------------
1 | # CD-HIT clstr2tbl
2 | Given a `clstr` file from `CD-HIT` program, this program will generate a table (tab separated) that contains the header of every sequence in the 1st column and the corresponding representative in the 2nd column.
3 |
4 | The output file is more friendly for further analysis.
5 |
6 | ## Usage
7 | ```bash
8 | $ python3 cdhit_clstr2tbl.py input.clstr > out.tab
9 | ```
10 | ## Input Sample
11 |
12 | ```
13 | >Cluster 0
14 | 0 14739aa, >gene1... *
15 | 1 656aa, >gene2... at 99.85%
16 | >Cluster 1
17 | 0 66aa, >gene3... at 100.00%
18 | 1 13708aa, >gene4... *
19 | 2 13708aa, >gene5... at 100.00%
20 | ```
21 |
22 | Output Sample
23 |
24 | | gene_id | representative |
25 | | ------- | -------------- |
26 | | gene1 | gene1 |
27 | | gene2 | gene4 |
28 | | gene3 | gene4 |
29 | | gene4 | gene4 |
30 |
31 | # Chinese Usage 中文使用说明
32 | 输入一个`CD-HIT`文件产出的`clstr`文件,此脚本可以将其转换为一个tab分隔的表格文件,第一列是每个序列的名称,第二列是每个序列对应的代表序列的名称。
33 |
34 | 经转换过的文件对下游分析更友好。
35 |
36 | ## 使用
37 | ```bash
38 | $ python3 cdhit_clstr2tbl.py input.clstr > out.tab
39 | ```
40 | ## 输入文件示例
41 |
42 | ```
43 | >Cluster 0
44 | 0 14739aa, >gene1... *
45 | 1 656aa, >gene2... at 99.85%
46 | >Cluster 1
47 | 0 66aa, >gene3... at 100.00%
48 | 1 13708aa, >gene4... *
49 | 2 13708aa, >gene5... at 100.00%
50 | ```
51 |
52 | ## 输出文件示例
53 |
54 | | gene_id | representative |
55 | | ------- | -------------- |
56 | | gene1 | gene1 |
57 | | gene2 | gene4 |
58 | | gene3 | gene4 |
59 | | gene4 | gene4 |
60 |
--------------------------------------------------------------------------------
/cdhit-clstr2tbl/cdhit_clstr2tbl.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 |
3 | """
4 | Given a clstr file from cd-hit program, this program will generate a table that
5 | contains the header of every sequence in the 1st column and the corresponding
6 | representative in the 2nd column.
7 | The output file is more friendly for further analysis.
8 |
9 | Usage: $ python3 cdhit_clstr2tbl.py input.clstr > out.tab
10 |
11 |
12 | The input .clstr file looks like:
13 | >Cluster 0
14 | 0 14739aa, >gene1... *
15 | 1 656aa, >gene2... at 99.85%
16 | >Cluster 1
17 | 0 13708aa, >gene3... *
18 | >Cluster 2
19 | 0 66aa, >gene4... at 100.00%
20 | 1 13708aa, >gene5... *
21 | 2 13708aa, >gene6... at 100.00%
22 |
23 |
24 | The output table file looks like:
25 | gene_id representative
26 | gene1 gene1
27 | gene2 gene1
28 | gene3 gene3
29 | gene4 gene5
30 | gene5 gene5
31 | gene6 gene5
32 | """
33 | import re
34 | import sys
35 |
36 | __author__ = "Heyu Lin"
37 | __contact__ = "heyu.lin(AT)student.unimelb.edu.au"
38 |
clstr_file = sys.argv[1]

# sequence ids look like ">gene1..." inside a member line
seq_id_re = re.compile(r'>(.*?)\.{3}')

members = []         # ids collected for the cluster currently being read
representative = ''  # id of the current cluster's representative

# table header
print('gene_id' + '\t' + 'representative')

with open(clstr_file) as handle:
    for line in handle:
        if not line.startswith('>'):
            seq_id = seq_id_re.findall(line)[0]
            members.append(seq_id)
            if line.strip().endswith('*'):
                # the trailing '*' marks this cluster's representative
                representative = seq_id
        else:
            # a new ">Cluster" line: flush the previous cluster
            for member in members:
                print(member + '\t' + representative)
            members = []

# flush the final cluster
for member in members:
    print(member + '\t' + representative)
65 |
--------------------------------------------------------------------------------
/cdhit-clstr2tbl/test.clstr:
--------------------------------------------------------------------------------
1 | >Cluster 0
2 | 0 14739aa, >gene1... *
3 | 1 656aa, >gene2... at 99.85%
4 | >Cluster 1
5 | 0 13708aa, >gene3... *
6 | >Cluster 2
7 | 0 66aa, >gene4... at 100.00%
8 | 1 13708aa, >gene5... *
9 | 2 13708aa, >gene6... at 100.00%
--------------------------------------------------------------------------------
/cdhit-clstr2tbl/test.clstr.tab:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SilentGene/Bio-py/33f9827114723c9db661c80f7e13564f2375417a/cdhit-clstr2tbl/test.clstr.tab
--------------------------------------------------------------------------------
/circular_genomes_from_gfa/README.md:
--------------------------------------------------------------------------------
1 | # Circular genomes from GFA
2 |
3 | This script is used for extracting circular DNA sequences (including genomes, plasmids, viruses, etc) from a GFA file
4 |
5 | ## Usage
6 |
7 | ```bash
8 | $ python circular_genomes_from_gfa.py [output_dir]
9 | ```
10 |
11 | ## Example
12 |
13 | Using the "assembly_graph.gfa" file generated by flye
14 |
15 | ```bash
16 | # Assembly
17 | $ flye --pacbio-hifi pacbio-css.fq.gz --out-dir flye_out --threads 16 --meta --scaffold
18 | # Get circular DNA
19 | $ cd flye_out
20 | $ python circular_genomes_from_gfa.py assembly_graph.gfa
21 | ```
22 |
23 | ### Result
24 |
25 | - Output folder: assembly_graph_circular
26 | - assembly_graph_circular_all.fna: A fasta file containing all circular sequences
27 | - assembly_graph_circular_all_info.tsv: A tab-separated file containing information about the circular sequences (ID, length)
28 | - edge_17343.fasta: Each *.fasta file contains an individual circular sequence
29 | - edge_129.fasta
30 | - edge_*.fasta
31 |
32 | ## Chinese Usage 中文使用说明
33 |
34 | 这个脚本用于从 GFA 文件中提取环形的DNA序列(包括基因组、质粒、病毒等)
35 |
36 | ## 使用
37 |
38 | ```bash
39 | $ python circular_genomes_from_gfa.py [output_dir]
40 | ```
41 |
42 | ## 示例
43 |
44 | 使用 flye 生成的 "assembly_graph.gfa" 文件
45 |
46 | ```bash
47 | # Assembly
48 | $ flye --pacbio-hifi pacbio-css.fq.gz --out-dir flye_out --threads 16 --meta --scaffold
49 | # Get circular DNA
50 | $ cd flye_out
51 | $ python circular_genomes_from_gfa.py assembly_graph.gfa
52 | ```
53 |
54 | ### 结果
55 |
56 | - 输出文件夹: assembly_graph_circular
57 | - assembly_graph_circular_all.fna: 包含所有环形序列的fasta文件
58 | - assembly_graph_circular_all_info.tsv: 包含环形序列信息的tab分隔文件(ID, 长度)
59 | - edge_17343.fasta: 每个 *.fasta 文件包含一个环形序列
60 | - edge_129.fasta
61 | - edge_*.fasta
62 |
--------------------------------------------------------------------------------
/circular_genomes_from_gfa/circular_genomes_from_gfa.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import os
3 |
4 | """
5 | Extract circular genomes from a GFA file
6 | Usage: python circular_genomes_from_gfa.py [output_dir]
7 | """
8 |
def usage():
    """Print a short help text and terminate with exit status 1."""
    help_lines = (
        "Extract circular genomes from a GFA file",
        "Usage: python circular_genomes_from_gfa.py [output_dir]",
        "\tOptional: output_dir - directory to write output files to (default: _circular)",
    )
    for help_line in help_lines:
        print(help_line)
    sys.exit(1)
14 |
def get_args():
    """
    Parse sys.argv: [1] input .gfa file, optional [2] output directory.

    Returns (input_file, output_dir, fasta_file, tsv_file).

    Bug fix: the combined-output file names are now built from the
    basename of the input, so an input like 'flye_out/assembly_graph.gfa'
    no longer produces a nested 'output_dir/flye_out/...' path.
    """
    if len(sys.argv) < 2:
        usage()

    input_file = sys.argv[1]
    if not input_file.endswith('.gfa'):
        print("Error: Input file must be a .gfa file")
        usage()

    base = os.path.splitext(input_file)[0]
    name = os.path.basename(base)

    # the default output dir still sits next to the input file
    output_dir = sys.argv[2] if len(sys.argv) > 2 else f"{base}_circular"
    fasta_file = os.path.join(output_dir, f'{name}_circular_all.fna')
    tsv_file = os.path.join(output_dir, f'{name}_circular_all_info.tsv')
    return input_file, output_dir, fasta_file, tsv_file
30 |
def get_seqs(gfa_file):
    """
    Return {segment_id: sequence} for all S (segment) records in a GFA file.
    """
    segments = {}  # id -> sequence

    with open(gfa_file) as f:
        for line in f:
            # match 'S\t' (not a bare 'S') so only true tab-delimited
            # segment records are parsed
            if line.startswith('S\t'):
                parts = line.strip().split('\t')
                seg_id, sequence = parts[1], parts[2]
                segments[seg_id] = sequence
    return segments
41 |
def find_circular_paths(gfa_file):
    """
    Return the set of segment ids that link back to themselves with the
    same orientation and zero overlap ('0M'), i.e. circular contigs.
    Exits with a warning when none are found.
    """
    circular_ids = set()
    with open(gfa_file) as handle:
        for record in handle:
            if not record.startswith('L'):  # only Link lines matter
                continue
            fields = record.strip().split('\t')
            src, src_orient = fields[1], fields[2]
            dst, dst_orient = fields[3], fields[4]
            overlap = fields[5]
            is_self_loop = src == dst and src_orient == dst_orient
            if is_self_loop and overlap == '0M':
                circular_ids.add(src)
    # no circular paths at all: warn and stop
    if not circular_ids:
        print("Warning: No circular paths found in the GFA file")
        sys.exit(1)
    return circular_ids
58 |
def write_output(seq_dict, ids, output_dir, output_fasta, output_tsv):
    """
    Write the circular sequences, longest first: one combined fasta,
    one TSV of (rank, id, length), and one fasta file per sequence.
    """
    os.makedirs(output_dir, exist_ok=True)

    lengths = {seq_id: len(seq_dict[seq_id]) for seq_id in ids}
    ranked = sorted(ids, key=lengths.get, reverse=True)

    with open(output_fasta, 'w') as fasta_out, open(output_tsv, 'w') as tsv_out:
        tsv_out.write('#id\tSeqID\tLength(bp)\n')
        for rank, seq_id in enumerate(ranked, start=1):
            fasta_out.write(f'>{seq_id}\n{seq_dict[seq_id]}\n')
            tsv_out.write(f'{rank}\t{seq_id}\t{lengths[seq_id]}\n')

    # one fasta file per circular sequence
    for seq_id in ranked:
        with open(os.path.join(output_dir, f'{seq_id}.fasta'), 'w') as single:
            single.write(f'>{seq_id}\n{seq_dict[seq_id]}\n')
81 |
82 |
83 |
84 |
def main():
    """Extract circular sequences from the GFA file named on the command line."""
    input_file, output_dir, fasta_file, tsv_file = get_args()

    # Parse GFA file
    segments_dict = get_seqs(input_file)

    # Find circular paths
    circular_edges = find_circular_paths(input_file)

    # Write output files
    write_output(segments_dict, circular_edges, output_dir, fasta_file, tsv_file)

if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
/download_uniprot_proteomes/README.md:
--------------------------------------------------------------------------------
1 | # Uniprot Proteome Downloader
2 |
3 | This script is used for batch retrieval of proteomes in faa.gz format according to a list of Proteome identifiers (UPIDs)
4 |
5 | ## Usage:
6 |
7 | ```bash
8 | $ python3 download_uniprot_proteomes_UPID.py input_list.txt output_dir
9 | ```
10 |
11 | ## Sample input_list.txt
12 |
13 | ```
14 | UP000000272
15 | UP000000391
16 | UP000000442
17 | ```
18 |
19 | ## Chinese Usage 中文使用说明
20 |
21 | 该脚本可以通过一个包含Proteome identifiers (UPIDs)列表的文件来批量下载基因组文件。下载格式为faa.gz。
22 |
23 | ## 使用
24 |
25 | ```bash
26 | $ python3 download_uniprot_proteomes_UPID.py input_list.txt output_dir
27 | ```
28 |
29 | ## 输入列表文件示例
30 |
31 | ```
32 | UP000000272
33 | UP000000391
34 | UP000000442
35 | ```
36 |
--------------------------------------------------------------------------------
/download_uniprot_proteomes/download_uniprot_proteomes_UPID.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python3
2 |
3 | """
4 | This script is used for batch retrieval proteomes in faa.gz format according
5 | to a list of Proteome identifiers (UPIDs)
6 |
7 | # Usage:
8 | $ python3 download_uniprot_proteomes_UPID.py input_list.txt output_dir
9 |
10 | input_list.txt sample:
11 | UP000000272
12 | UP000000391
13 | UP000000442
14 | """
15 |
16 | import sys
17 | import os
18 | import requests
19 |
20 | __author__ = "Heyu Lin"
21 | __contact__ = "heyu.lin@student.unimelb.edu.au"
22 |
23 | list_file = sys.argv[1]
24 | output_dir = sys.argv[2]
25 |
26 |
def request_proteome(upid, output_dir, num):
    """
    Download one proteome (gzipped fasta) from UniProt by proteome id.

    upid: proteome identifier, e.g. 'UP000000272'
    output_dir: directory the <upid>.faa.gz file is written into
    num: ordinal used only for progress reporting

    Exits the program via SystemExit on any request failure.
    """
    base_url = 'https://www.uniprot.org/uniprot/?include=false&format=fasta&compress=yes&force=true&query=proteome:'
    request_url = base_url + upid
    try:
        r = requests.get(request_url, allow_redirects=True)
        r.raise_for_status()
    except requests.exceptions.HTTPError as http_err:
        raise SystemExit(f'HTTP error occurred: {http_err}')
    except Exception as err:
        raise SystemExit(f'Other error occurred: {err}')
    else:
        print(f'[{num}] {upid} - OK')

    # context manager closes the handle promptly; the old
    # open(...).write(...) left it open until garbage collection
    with open(os.path.join(output_dir, upid + '.faa.gz'), 'wb') as fh:
        fh.write(r.content)
42 |
43 |
if __name__ == "__main__":
    # Ensure the destination directory exists before any download.
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # read input list
    with open(list_file) as f:
        upids = f.read().splitlines()
    print(str(len(upids)) + ' lines have been read. Request started...')

    # enumerate replaces the hand-rolled counter (idiomatic, same output)
    for num, upid in enumerate(upids, start=1):
        request_proteome(upid, output_dir, num)
58 |
--------------------------------------------------------------------------------
/fasta-splitter/fasta_splitter.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 |
3 | """
4 | A Python script for splitting a fasta format file into pieces by
5 | specifying the number of divided files or the number of sequences in divided files
6 |
7 | Inspired by Biopython wiki - https://biopython.org/wiki/Split_large_file
8 |
9 | ==Required: Biopython
10 |
11 | ==Options:
12 | -i, --input: input fasta file
13 | -o, --output: output directory
14 | -partn, --partnumber: number of files will be divided into
15 | -parts, --partseq: number of sequences will be put into every divided file
16 |
17 | ==Examples:
18 | 1. Divide a fasta file into <10> files, storing in
19 | python fasta_splitter.py -i input.fasta -o output_dir -partn 10
20 |
21 | 2. Divide a fasta file into files containing <1000> sequences in
22 | python fasta_splitter.py -i input.fasta -parts 1000
23 | """
24 |
25 | import sys
26 | import os
27 | from math import ceil
28 | from Bio import SeqIO
29 | import argparse
30 |
31 | #
32 | __author__ = "Heyu Lin"
33 | __contact__ = "heyu.lin(AT)student.unimelb.edu.au"
34 |
35 |
36 | """
37 | Arguments
38 | """
# Command-line interface: an input fasta, an optional output directory,
# and exactly one of -partn (how many output files) / -parts (how many
# sequences per output file).
parser = argparse.ArgumentParser()
parser.add_argument('-i', '--input', metavar='input_file', dest='i',
                    type=str, required=True)
parser.add_argument('-o', '--output', metavar='output_dir', dest='o',
                    type=str, default='.')

group = parser.add_mutually_exclusive_group(required=True)
group.add_argument('-partn', '--partnumber', metavar='number_of_parts', dest='p',
                   type=int)
# metavar typo fixed ("seqences" -> "sequences"); affects --help text only
group.add_argument('-parts', '--partseq', metavar='number_of_sequences_in_every_part', dest='s',
                   type=int)

args = parser.parse_args()
52 |
def batch_iterator(iterator, batch_size):
    """Yield lists of up to *batch_size* consecutive entries from *iterator*.

    Works on any iterator, for example SeqRecord objects from
    Bio.SeqIO.parse(...), Alignment objects from Bio.AlignIO.parse(...),
    or simply lines from a file handle.  Every yielded list has
    batch_size entries, except possibly the last, which may be shorter.

    Fix: the original reused the last fetched entry as the outer loop
    condition (``while entry:``), so iteration stopped early whenever a
    batch ended on a falsy item (0, '', None, ...).  This version only
    stops when the iterator is exhausted.
    """
    while True:
        batch = []
        while len(batch) < batch_size:
            try:
                batch.append(next(iterator))
            except StopIteration:
                break  # source exhausted; flush whatever we collected
        if not batch:
            return
        yield batch
79 |
80 |
def total_num_calc(fasta):
    """Return the number of sequences in the given fasta file.

    Counts header lines (those starting with '>').  The file is opened
    with a context manager so the handle is always closed — the original
    left the handle open until garbage collection.
    """
    with open(fasta) as fh:
        return sum(1 for line in fh if line.startswith(">"))
87 |
88 |
def splitter(input, num, outdir):
    """Split *input* fasta into files of *num* sequences each, in *outdir*.

    Output pieces are named <basename>.p-<k><ext>, k starting at 1.
    Fix: the original passed ``open(input)`` straight to SeqIO.parse and
    never closed it; a context manager now guarantees the handle closes.
    """
    fname = os.path.basename(input)
    fbname, fename = os.path.splitext(fname)
    with open(input) as fh:
        record_iter = SeqIO.parse(fh, "fasta")
        for i, batch in enumerate(batch_iterator(record_iter, num)):
            filename = "{0}.p-{1}{2}".format(fbname, i + 1, fename)
            output = os.path.join(outdir, filename)
            with open(output, "w") as handle:
                count = SeqIO.write(batch, handle, "fasta")
            print("Wrote %i records to %s" % (count, output))
102 |
103 |
def main():
    """Work out the per-file sequence count, then delegate to splitter()."""
    seqs_per_file = 0
    if args.p:
        # -partn given: convert the requested file count into a
        # sequences-per-file batch size (last file may be smaller).
        seqs_per_file = ceil(total_num_calc(args.i) / args.p)
    elif args.s:
        # -parts given: use the requested per-file count directly.
        seqs_per_file = args.s

    splitter(args.i, seqs_per_file, args.o)


if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
/prodigal-wrapper/prodigal_run.py:
--------------------------------------------------------------------------------
import os
import subprocess
from multiprocessing import Pool
3 |
def prodigal(fasta, basename, outdir):
    """Run Prodigal gene prediction (metagenome mode) on one fasta file.

    Writes <basename>.faa / .ffn / .gbk into *outdir*.  Failures are
    reported on stdout rather than raised, keeping the original
    best-effort behaviour for batch callers.

    Fixes: runs the command as an argument list via subprocess.run, so
    file names with spaces or shell metacharacters are safe (os.system
    was not), and actually checks the exit status — the old bare
    ``except`` could never fire because os.system does not raise on
    command failure.
    """
    cmd_para = [
        'prodigal', '-q',
        '-i', fasta,
        '-p', 'meta',
        '-a', os.path.join(outdir, basename + '.faa'),
        '-d', os.path.join(outdir, basename + '.ffn'),
        '-o', os.path.join(outdir, basename + '.gbk')
    ]
    print("\n" + 'ORFs prediction'.center(50, '*'))
    print(' '.join(cmd_para) + '\n')
    try:
        result = subprocess.run(cmd_para)
        if result.returncode != 0:
            print("\nSomething wrong with prodigal annotation!")
    except OSError:
        # e.g. the prodigal binary is not on PATH
        print("\nSomething wrong with prodigal annotation!")
20 |
21 |
22 |
--------------------------------------------------------------------------------
/prokka2kegg/README.md:
--------------------------------------------------------------------------------
1 | # Prokka2KEGG
2 | This script is used to assign KO entries (K numbers in KEGG annotation) according to UniProtKB ID in the *.gbk file generated by `Prokka`
3 |
4 | ## Usage
5 |
6 | ### ~~Step 1: Download and initialize the cross-reference database provided by UniProt~~
7 |
8 | ```bash
9 | $ wget ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/idmapping.dat.gz
10 | $ gzip -dc idmapping.dat.gz | awk '{if($2=="KO") print $1,$3}' OFS="\t" | gzip > idmapping_KO.tab.gz
11 | ```
12 | ~~You could choose to remove 'idmapping.dat.gz' now.~~
13 |
14 | ### Note
15 |
16 | UniProt has removed K numbers from their cross-reference database since early 2021. Now users have to download the formatted database (formatted in Jan 2019) from this repo ([idmapping_KO.tab.gz](https://github.com/SilentGene/Bio-py/blob/master/prokka2kegg/idmapping_KO.tab.gz)).
17 |
18 | ### Step 2: Retrieve K numbers according to the UniProtKB IDs of proteins
19 | ```bash
20 | $ python3 prokka2kegg.py -i input.gbk -d idmapping_KO.tab.gz -o output.txt
21 | ```
22 |
23 | *This script will produce a json format database in the same folder of idmapping_KO.tab.gz for reuse, which may speed up the program when running next time.*
24 |
25 | ## Options
26 |
27 | - `-i`: input gbk file generated by Prokka
- `-o`: output file with gene IDs and K entries in tab-separated format
- `-d`: formatted cross-reference database from step 1 (or downloaded directly from my repo)
30 |
31 | ## Require
32 |
33 | - Using **Python3**
34 | - Works both on Windows and unix-like systems
35 | - No 3rd party python modules required
36 |
37 | ## Sample Output:
38 |
39 | | | |
40 | | -------- | ------ |
41 | | ORF_0001 | |
42 | | ORF_0002 | K03152 |
43 | | ORF_0003 | |
44 | | ORF_0004 | K16331 |
45 | | ORF_0005 | K01997 |
46 |
47 | ## Tips
48 |
There is another script, `prokka2kegg_batch.py`, which can help you handle many gbk files in batch mode.
50 |
51 | # Chinese Usage 中文使用说明
52 |
53 | 这个脚本可以帮助你利用`Prokka`注释得到的gbk文件进行KEGG注释,得到每个ORF对应的KO号。
54 |
55 | ## 使用
56 |
57 | ### ~~第一步:下载和初始化Uniprot提供的数据库间对应的查询库~~
58 | ```bash
59 | $ wget ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/idmapping.dat.gz
60 | $ gzip -dc idmapping.dat.gz | awk '{if($2=="KO") print $1,$3}' OFS="\t" | gzip > idmapping_KO.tab.gz
61 | ```
62 | ## 说明
63 |
64 | 因为Uniprot在2021年上半年废弃了其数据库中的KO号信息,现在使用该脚本必须下载本github库中格式化好的数据库(下载于2019年1月)来运行脚本 ([idmapping_KO.tab.gz](https://github.com/SilentGene/Bio-py/blob/master/prokka2kegg/idmapping_KO.tab.gz))。
65 |
66 | ### 第二步: 通过每个ORF的UniProtKB IDs在数据库中查询对应的KO号
67 |
68 | ```bash
69 | $ python3 prokka2kegg.py -i input.gbk -d idmapping_KO.tab.gz -o output.txt
70 | ```
71 |
72 | *脚本会在idmapping_KO.tab.gz所在的文件夹下产生一个json文件来加快下一次调用数据库查询时的速度。*
73 |
74 | ## 选项
75 |
76 | - `-i`: 输入文件,Prokka注释产生的gbk文件
77 | - `-o`: 输出的带有每个ORF ID和其对应KO号的tab分隔的文本文件
78 | - `-d`: 由第一步产生(或直接从我库中下载)的数据库文件
79 |
80 | ## 要求
81 |
82 | - 使用**Python3**
83 | - 在Windows和类unix系统中均可运行
84 | - 无需第三方python模块
85 |
86 | ## 输出示例:
87 |
88 | | | |
89 | | -------- | ------ |
90 | | ORF_0001 | |
91 | | ORF_0002 | K03152 |
92 | | ORF_0003 | |
93 | | ORF_0004 | K16331 |
94 | | ORF_0005 | K01997 |
95 |
96 | ## 提示
97 |
98 | 另外提供一个脚本`prokka2kegg_batch.py`来同时完成多个gbk文件的转换。
--------------------------------------------------------------------------------
/prokka2kegg/idmapping_KO.tab.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SilentGene/Bio-py/33f9827114723c9db661c80f7e13564f2375417a/prokka2kegg/idmapping_KO.tab.gz
--------------------------------------------------------------------------------
/prokka2kegg/prokka2kegg.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 |
3 | """
4 | Description:
5 | KO entries (K numbers in KEGG annotation) assignment
6 | according to UniProtKB ID in `Prokka` *.gbk file
7 |
8 | Usage:
9 |
10 | Step1: Download and initialize the cross-reference database provided by UniProt
11 | $ wget ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/idmapping.dat.gz
12 | $ gzip -dc idmapping.dat.gz | awk '{if($2=="KO") print $1,$3}' OFS="\t" | gzip > idmapping_KO.tab.gz
13 | You could choose to remove 'idmapping.dat.gz' now.
14 |
15 | Step2: Retrieve K numbers according to the UniProtKB IDs of proteins
16 | $ python3 gbk2kegg.py -i input.gbk -d idmapping_KO.tab.gz -o output.txt
17 |
18 | This script will produce a json format database in the same folder of
19 | idmapping_KO.tab.gz for reuse, which may speed up the program when
20 | running next time.
21 | """
22 |
23 | import os
24 | import re
25 | import gzip
26 | import curses
27 | import argparse
28 | import json
29 |
30 | __author__ = "Heyu Lin"
31 | __contact__ = "heyu.lin(AT)student.unimelb.edu.au"
32 |
# Command-line interface: -i input Prokka gbk file, -o output tab table,
# -d the gzipped two-column UniProtKB->KO mapping ("idmapping_KO.tab.gz").
parser = argparse.ArgumentParser()
parser.add_argument('-i', '--input', metavar='input_gbk', dest='i',
                    type=str, required=True)
parser.add_argument('-o', '--output', metavar='output', dest='o',
                    type=str, required=True)
parser.add_argument('-d', '--data', metavar='idmapping.dat.gz',
                    dest='d', type=str,
                    help='UniProtKB cross-references database')
args = parser.parse_args()
42 |
43 |
def gbk_parser(gbk):
    """Extract [locus_tag, UniProtKB accession] pairs from a Prokka gbk file.

    Returns a list of two-element lists; the accession is '' for a CDS
    whose qualifiers carry no UniProtKB inference (detected when the
    /codon_start qualifier arrives without a preceding UniProtKB line).
    """
    results = []
    in_cds = False      # currently inside a CDS feature
    have_locus = False  # a /locus_tag has been captured for this CDS
    locus_re = re.compile('"(.*)"')
    uniprot_re = re.compile('UniProtKB:(.*)"')
    qual = ' ' * 21  # qualifier lines are indented 21 spaces in gbk output
    with open(gbk) as fh:
        for line in fh:
            if line.startswith(' ' * 5 + 'CDS'):
                in_cds = True
            elif line.startswith(qual + '/locus_tag=') and in_cds:
                locus_tag = locus_re.findall(line)[0]
                have_locus = True
            elif line.startswith(qual + '/inference="similar to AA sequence:UniProtKB') and have_locus:
                results.append([locus_tag, uniprot_re.findall(line)[0]])
                in_cds = False
                have_locus = False
            elif line.startswith(qual + '/codon_start') and have_locus:
                # codon_start reached without a UniProtKB inference line
                results.append([locus_tag, ''])
                in_cds = False
                have_locus = False
    return results
70 |
71 |
def dict_initialize(gzfile):
    """Build {UniProtKB_id: [KO, ...]} from the gzipped two-column table.

    Fixes: iterates the file lazily instead of readlines() (the mapping
    table is large), opens in text mode so no per-line decode is needed,
    and no longer shadows the built-in ``dict``.
    """
    mapping = {}
    with gzip.open(gzfile, 'rt', encoding='utf-8') as fi:
        for line in fi:
            fields = line.strip().split('\t')
            # one UniProtKB id may map to several KO numbers
            mapping.setdefault(fields[0], []).append(fields[1])
    return mapping
82 |
83 |
def dict_load(json_file):
    """Load the cached UniProtKB->KO mapping from *json_file*."""
    with open(json_file, 'r') as handle:
        return json.load(handle)
88 |
89 |
def retrieve_KO(arr, dict):
    """Append each entry's KO list (or '') and return the annotated list.

    Input entries look like ['AMLFNMKI_00004', 'Q24SP7']; each becomes
    ['AMLFNMKI_00004', 'Q24SP7', ['K...']].  Entries with no UniProtKB
    id, or whose id is absent from the mapping, get '' as the third
    element.  The input lists are annotated in place, mirroring the
    original behaviour.
    """
    annotated = []
    unmatched = []  # UniProtKB ids lacking any KO number (for debugging)
    for record in arr:
        uniprot_id = record[1]
        kos = dict.get(uniprot_id) if uniprot_id else None
        if kos is None:
            if uniprot_id:
                unmatched.append(uniprot_id)
            record.append('')
        else:
            record.append(kos)
        annotated.append(record)
    # Reporting of `unmatched` is intentionally disabled, as upstream.
    return annotated
124 |
125 |
def write_json(content, outfile):
    """Serialize *content* to *outfile* as JSON (used to cache the mapping)."""
    with open(outfile, 'w') as handle:
        json.dump(content, handle)
129 |
130 |
def output(arr, outfile):
    """Write the annotated table as 'locus<TAB>KO' lines.

    A CDS with several KO numbers produces one line per KO; a CDS with
    none (third element '') produces a bare locus_tag line.
    """
    lines = []
    for cds in arr:
        kos = cds[2]
        if kos == "":
            lines.append(cds[0] + "\n")
        else:
            lines.extend(cds[0] + "\t" + ko + "\n" for ko in kos)
    with open(outfile, 'w') as fo:
        fo.writelines(lines)
146 |
147 |
def main():
    """Load (or build and cache) the KO mapping, then annotate the gbk file."""
    cache = args.d + '.json'
    if os.path.exists(cache):
        db_dict = dict_load(cache)
    else:
        db_dict = dict_initialize(args.d)
        write_json(db_dict, cache)  # cache for faster startup next run
    final_arr = retrieve_KO(gbk_parser(args.i), db_dict)
    output(final_arr, args.o)


if __name__ == '__main__':
    main()
161 |
--------------------------------------------------------------------------------
/prokka2kegg/prokka2kegg_batch.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 |
3 | """
4 | Description:
5 | KO entries (K numbers in KEGG annotation) assignment *in batch mode*
6 | according to UniProtKB ID in `Prokka` *.gbk files
7 |
8 | Usage:
9 |
10 | Step1: Download and initialize the cross-reference database provided by UniProt
11 | $ wget ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/idmapping.dat.gz
12 | $ gzip -dc idmapping.dat.gz | awk '{if($2=="KO") print $1,$3}' OFS="\t" | gzip > idmapping_KO.tab.gz
13 | You could choose to remove 'idmapping.dat.gz' now.
14 |
15 | Step2: Retrieve K numbers according to the UniProtKB IDs of proteins
16 | $ python3 gbk2kegg_batch.py -i input_dir -d idmapping_KO.tab.gz -o output_dir
17 |
18 | This script will produce a json format database in the same folder of
19 | idmapping_KO.tab.gz for reuse, which may speed up the program when
20 | running next time.
21 | """
22 |
23 | import os
24 | import re
25 | import gzip
26 | import curses
27 | import argparse
28 | import json
29 |
30 | __author__ = "Heyu Lin"
31 | __contact__ = "heyu.lin(AT)student.unimelb.edu.au"
32 |
# Command-line interface for batch mode: a directory of *.gbk files in,
# one *.ko.out table per input file out.
parser = argparse.ArgumentParser()
parser.add_argument('-i', '--input', metavar='input_dir', dest='i',
                    type=str, required=True,
                    help='specify the directory containing *.gbk files')
parser.add_argument('-o', '--output', metavar='output_dir', dest='o',
                    type=str, required=True,
                    help='output tab files will be produced in this directory')
# help-text typo fixed: "accroding" -> "according"
parser.add_argument('-d', '--data', metavar='idmapping_KO.tab.gz',
                    dest='d', type=str,
                    help='database generated according to "step1" instruction')
args = parser.parse_args()
44 |
45 |
def gbk_parser(gbk):
    """Extract [locus_tag, UniProtKB accession] pairs from a Prokka gbk file.

    Returns a list of two-element lists; the accession is '' for a CDS
    whose qualifiers carry no UniProtKB inference (detected when the
    /codon_start qualifier arrives without a preceding UniProtKB line).
    """
    results = []
    in_cds = False      # currently inside a CDS feature
    have_locus = False  # a /locus_tag has been captured for this CDS
    locus_re = re.compile('"(.*)"')
    uniprot_re = re.compile('UniProtKB:(.*)"')
    qual = ' ' * 21  # qualifier lines are indented 21 spaces in gbk output
    with open(gbk) as fh:
        for line in fh:
            if line.startswith(' ' * 5 + 'CDS'):
                in_cds = True
            elif line.startswith(qual + '/locus_tag=') and in_cds:
                locus_tag = locus_re.findall(line)[0]
                have_locus = True
            elif line.startswith(qual + '/inference="similar to AA sequence:UniProtKB') and have_locus:
                results.append([locus_tag, uniprot_re.findall(line)[0]])
                in_cds = False
                have_locus = False
            elif line.startswith(qual + '/codon_start') and have_locus:
                # codon_start reached without a UniProtKB inference line
                results.append([locus_tag, ''])
                in_cds = False
                have_locus = False
    return results
72 |
73 |
def dict_initialize(gzfile):
    """Build {UniProtKB_id: [KO, ...]} from the gzipped two-column table.

    Fixes: iterates the file lazily instead of readlines() (the mapping
    table is large), opens in text mode so no per-line decode is needed,
    and no longer shadows the built-in ``dict``.
    """
    mapping = {}
    with gzip.open(gzfile, 'rt', encoding='utf-8') as fi:
        for line in fi:
            fields = line.strip().split('\t')
            # one UniProtKB id may map to several KO numbers
            mapping.setdefault(fields[0], []).append(fields[1])
    return mapping
84 |
85 |
def dict_load(json_file):
    """Load the cached UniProtKB->KO mapping from *json_file*."""
    with open(json_file, 'r') as handle:
        return json.load(handle)
90 |
91 |
def retrieve_KO(arr, dict):
    """Append each entry's KO list (or '') and return the annotated list.

    Input entries look like ['AMLFNMKI_00004', 'Q24SP7']; each becomes
    ['AMLFNMKI_00004', 'Q24SP7', ['K...']].  Entries with no UniProtKB
    id, or whose id is absent from the mapping, get '' as the third
    element.  The input lists are annotated in place, mirroring the
    original behaviour.
    """
    annotated = []
    unmatched = []  # UniProtKB ids lacking any KO number (for debugging)
    for record in arr:
        uniprot_id = record[1]
        kos = dict.get(uniprot_id) if uniprot_id else None
        if kos is None:
            if uniprot_id:
                unmatched.append(uniprot_id)
            record.append('')
        else:
            record.append(kos)
        annotated.append(record)
    # Reporting of `unmatched` is intentionally disabled, as upstream.
    return annotated
126 |
127 |
def write_json(content, outfile):
    """Serialize *content* to *outfile* as JSON (used to cache the mapping)."""
    with open(outfile, 'w') as handle:
        json.dump(content, handle)
131 |
132 |
def output(arr, outfile):
    """Write the annotated table as 'locus<TAB>KO' lines.

    A CDS with several KO numbers produces one line per KO; a CDS with
    none (third element '') produces a bare locus_tag line.
    """
    lines = []
    for cds in arr:
        kos = cds[2]
        if kos == "":
            lines.append(cds[0] + "\n")
        else:
            lines.extend(cds[0] + "\t" + ko + "\n" for ko in kos)
    with open(outfile, 'w') as fo:
        fo.writelines(lines)
148 |
149 |
def get_input_files(dir):
    """Return the names (not paths) of all *.gbk regular files in *dir*."""
    return [
        entry for entry in os.listdir(dir)
        if os.path.isfile(os.path.join(dir, entry))
        and os.path.splitext(entry)[1] == '.gbk'
    ]
157 |
158 |
def create_dir(dir):
    """Ensure the directory *dir* exists.

    Uses makedirs(exist_ok=True): unlike the original exists()+mkdir
    pair, this also creates missing parent directories and is race-free
    (no failure if the directory appears between check and creation).
    """
    os.makedirs(dir, exist_ok=True)
162 |
163 |
def main():
    """Annotate every *.gbk file in the input directory with KO numbers."""
    create_dir(args.o)
    cache = args.d + '.json'
    if os.path.exists(cache):
        db_dict = dict_load(cache)
    else:
        db_dict = dict_initialize(args.d)
        write_json(db_dict, cache)  # cache for faster startup next run
    gbks = get_input_files(args.i)
    print("{} gbk files have been read.".format(len(gbks)))
    for gbk in gbks:
        print("parsing {}...".format(gbk))
        in_path = os.path.join(args.i, gbk)
        out_path = os.path.join(args.o, gbk) + ".ko.out"
        output(retrieve_KO(gbk_parser(in_path), db_dict), out_path)


if __name__ == '__main__':
    main()
184 |
--------------------------------------------------------------------------------
/prokka2kegg/sample.kegg.out.txt:
--------------------------------------------------------------------------------
1 | AHCGDBLN_00001 K05685
2 | AHCGDBLN_00002 K10804
3 | AHCGDBLN_00003
4 | AHCGDBLN_00004
5 | AHCGDBLN_00005
6 | AHCGDBLN_00006
7 | AHCGDBLN_00007
8 | AHCGDBLN_00008
9 | AHCGDBLN_00009
10 | AHCGDBLN_00010
11 | AHCGDBLN_00011
12 | AHCGDBLN_00012
13 | AHCGDBLN_00013
14 | AHCGDBLN_00014
15 | AHCGDBLN_00015 K02410
16 | AHCGDBLN_00016 K02409
17 | AHCGDBLN_00017 K02388
18 | AHCGDBLN_00018 K07714
19 | AHCGDBLN_00019 K07710
20 | AHCGDBLN_00020
21 | AHCGDBLN_00021
22 | AHCGDBLN_00022
23 | AHCGDBLN_00023 K10012
24 | AHCGDBLN_00024
25 | AHCGDBLN_00025 K07264
26 | AHCGDBLN_00026 K07264
27 | AHCGDBLN_00027 K16148
28 | AHCGDBLN_00028
29 | AHCGDBLN_00029 K19569
30 | AHCGDBLN_00030
31 | AHCGDBLN_00031
32 | AHCGDBLN_00032 K15914
33 | AHCGDBLN_00033 K03274
34 | AHCGDBLN_00034
35 | AHCGDBLN_00035 K00754
36 | AHCGDBLN_00036 K02844
37 | AHCGDBLN_00037 K16870
38 | AHCGDBLN_00038
39 | AHCGDBLN_00039
40 | AHCGDBLN_00040
41 | AHCGDBLN_00041
42 | AHCGDBLN_00042 K01482
43 | AHCGDBLN_00043
44 | AHCGDBLN_00044 K03671
45 | AHCGDBLN_00045
46 | AHCGDBLN_00046
47 | AHCGDBLN_00047 K01012
48 | AHCGDBLN_00048
49 | AHCGDBLN_00049 K00800
50 | AHCGDBLN_00050
51 | AHCGDBLN_00051
52 | AHCGDBLN_00052
53 | AHCGDBLN_00053
54 | AHCGDBLN_00054 K00324
55 | AHCGDBLN_00055 K00325
56 | AHCGDBLN_00056
57 | AHCGDBLN_00057
58 | AHCGDBLN_00058 K00325
59 | AHCGDBLN_00059 K20932
60 | AHCGDBLN_00060 K07533
61 | AHCGDBLN_00061
62 | AHCGDBLN_00062 K04066
63 | AHCGDBLN_00063
64 | AHCGDBLN_00064 K03926
65 | AHCGDBLN_00065 K07147
66 | AHCGDBLN_00066
67 | AHCGDBLN_00067
68 | AHCGDBLN_00068
69 | AHCGDBLN_00069
70 | AHCGDBLN_00070
71 | AHCGDBLN_00071
72 | AHCGDBLN_00072 K01666
73 | AHCGDBLN_00073 K01625
74 | AHCGDBLN_00074
75 | AHCGDBLN_00075 K01299
76 | AHCGDBLN_00076
77 | AHCGDBLN_00077 K02203
78 | AHCGDBLN_00078 K03980
79 | AHCGDBLN_00079
80 | AHCGDBLN_00080
81 | AHCGDBLN_00081
82 | AHCGDBLN_00082
83 | AHCGDBLN_00083
84 | AHCGDBLN_00084 K06213
85 | AHCGDBLN_00085
86 | AHCGDBLN_00086
87 | AHCGDBLN_00087
88 | AHCGDBLN_00088
89 | AHCGDBLN_00089 K03685
90 | AHCGDBLN_00090 K09458
91 | AHCGDBLN_00091 K02078
92 | AHCGDBLN_00092 K00059
93 | AHCGDBLN_00093 K00645
94 | AHCGDBLN_00094 K00648
95 | AHCGDBLN_00095 K03621
96 | AHCGDBLN_00096 K02911
97 | AHCGDBLN_00097 K07040
98 | AHCGDBLN_00098 K04763
99 | AHCGDBLN_00099 K03841
100 | AHCGDBLN_00100 K01892
101 | AHCGDBLN_00101 K03711
102 | AHCGDBLN_00102
103 | AHCGDBLN_00103
104 | AHCGDBLN_00104 K01895
105 | AHCGDBLN_00105
106 | AHCGDBLN_00106
107 | AHCGDBLN_00107
108 | AHCGDBLN_00108 K02408
109 | AHCGDBLN_00109 K02387
110 | AHCGDBLN_00110
111 | AHCGDBLN_00111
112 | AHCGDBLN_00112 K01563
113 | AHCGDBLN_00113 K00648
114 | AHCGDBLN_00114 K11085
115 | AHCGDBLN_00115
116 | AHCGDBLN_00116
117 | AHCGDBLN_00117 K01407
118 | AHCGDBLN_00118
119 | AHCGDBLN_00119 K11210
120 | AHCGDBLN_00120 K03637
121 | AHCGDBLN_00121 K03636
122 | AHCGDBLN_00122 K03635
123 | AHCGDBLN_00123
124 | AHCGDBLN_00124 K16898
125 | AHCGDBLN_00125 K00384
126 | AHCGDBLN_00126
127 | AHCGDBLN_00127 K01733
128 | AHCGDBLN_00128 K00003
129 | AHCGDBLN_00129 K01423
130 | AHCGDBLN_00130 K01681
131 | AHCGDBLN_00131 K06889
132 | AHCGDBLN_00132
133 | AHCGDBLN_00133
134 | AHCGDBLN_00134
135 | AHCGDBLN_00136
136 | AHCGDBLN_00137
137 | AHCGDBLN_00138 K09121
138 | AHCGDBLN_00139
139 | AHCGDBLN_00140
140 | AHCGDBLN_00141
141 | AHCGDBLN_00142 K03118
142 | AHCGDBLN_00143
143 | AHCGDBLN_00144
144 | AHCGDBLN_00145 K01808
145 | AHCGDBLN_00146 K00600
146 | AHCGDBLN_00147 K07738
147 | AHCGDBLN_00148
148 | AHCGDBLN_00149 K02492
149 | AHCGDBLN_00150 K01749
150 | AHCGDBLN_00151 K00767
151 | AHCGDBLN_00152
152 | AHCGDBLN_00153
153 | AHCGDBLN_00154
154 | AHCGDBLN_00155
155 | AHCGDBLN_00156
156 | AHCGDBLN_00157 K02406
157 | AHCGDBLN_00158 K07213
158 | AHCGDBLN_00159
159 | AHCGDBLN_00160 K03183
160 | AHCGDBLN_00161 K03152
161 | AHCGDBLN_00162
162 | AHCGDBLN_00163 K06998
163 | AHCGDBLN_00164 K01953
164 | AHCGDBLN_00165 K01662
165 | AHCGDBLN_00166
166 | AHCGDBLN_00167
167 | AHCGDBLN_00168 K13888
168 | AHCGDBLN_00169 K02003
169 | AHCGDBLN_00170
170 | AHCGDBLN_00171 K01665
171 | AHCGDBLN_00172 K00824
172 | AHCGDBLN_00173
173 | AHCGDBLN_00174
174 | AHCGDBLN_00175
175 | AHCGDBLN_00176
176 | AHCGDBLN_00177 K02169
177 | AHCGDBLN_00178
178 | AHCGDBLN_00179 K21377
179 | AHCGDBLN_00180
180 | AHCGDBLN_00181 K03686
181 | AHCGDBLN_00182
182 | AHCGDBLN_00183
183 | AHCGDBLN_00184 K00609
184 | AHCGDBLN_00185 K01465
185 | AHCGDBLN_00186 K01956
186 | AHCGDBLN_00187
187 | AHCGDBLN_00188 K01955
188 | AHCGDBLN_00189
189 | AHCGDBLN_00190
190 | AHCGDBLN_00191 K10773
191 | AHCGDBLN_00192
192 | AHCGDBLN_00193
193 | AHCGDBLN_00194 K00330
194 | AHCGDBLN_00195 K00331
195 | AHCGDBLN_00196 K00332
196 | AHCGDBLN_00197 K00333
197 | AHCGDBLN_00198 K00334
198 | AHCGDBLN_00199 K00335
199 | AHCGDBLN_00200 K05299
200 | AHCGDBLN_00201 K00337
201 | AHCGDBLN_00202 K00338
202 | AHCGDBLN_00203 K00339
203 | AHCGDBLN_00204 K00340
204 | AHCGDBLN_00205 K00341
205 | AHCGDBLN_00206 K00342
206 | AHCGDBLN_00207 K06206
207 | AHCGDBLN_00208 K13307
208 | AHCGDBLN_00209
209 | AHCGDBLN_00210 K00773
210 | AHCGDBLN_00211 K00611
211 | AHCGDBLN_00212 K00821
212 | AHCGDBLN_00213
213 | AHCGDBLN_00214 K03316
214 | AHCGDBLN_00215
215 | AHCGDBLN_00216 K01662
216 | AHCGDBLN_00217 K01693
217 | AHCGDBLN_00218 K03564
218 | AHCGDBLN_00219 K01262
219 | AHCGDBLN_00220
220 | AHCGDBLN_00221
221 | AHCGDBLN_00222 K03702
222 | AHCGDBLN_00223 K03702
223 | AHCGDBLN_00224 K01693
224 | AHCGDBLN_00225 K02501
225 | AHCGDBLN_00226
226 | AHCGDBLN_00227 K05515
227 | AHCGDBLN_00228
228 | AHCGDBLN_00229 K03570
229 | AHCGDBLN_00230 K03569
230 | AHCGDBLN_00231 K03770
231 | AHCGDBLN_00232 K00941
232 | AHCGDBLN_00233 K00820
233 | AHCGDBLN_00234 K04042
234 | AHCGDBLN_00235 K04042
235 | AHCGDBLN_00236 K18707
236 | AHCGDBLN_00237 K03664
237 | AHCGDBLN_00238
238 | AHCGDBLN_00239
239 | AHCGDBLN_00240
240 | AHCGDBLN_00241
241 | AHCGDBLN_00242
242 | AHCGDBLN_00243
243 | AHCGDBLN_00244 K01963
244 | AHCGDBLN_00245
245 | AHCGDBLN_00246 K01613
246 | AHCGDBLN_00247 K00053
247 | AHCGDBLN_00248 K01653
248 | AHCGDBLN_00249 K10773
249 | AHCGDBLN_00250 K00989
250 | AHCGDBLN_00251
251 | AHCGDBLN_00252 K07301
252 | AHCGDBLN_00253
253 | AHCGDBLN_00254 K17828
254 | AHCGDBLN_00255 K01775
255 | AHCGDBLN_00256
256 | AHCGDBLN_00257 K16325
257 | AHCGDBLN_00258
258 | AHCGDBLN_00259
259 | AHCGDBLN_00260
260 | AHCGDBLN_00261
261 | AHCGDBLN_00262 K15352
262 | AHCGDBLN_00263
263 | AHCGDBLN_00264 K05851
264 | AHCGDBLN_00265 K02557
265 | AHCGDBLN_00266 K08218
266 | AHCGDBLN_00267
267 | AHCGDBLN_00268
268 | AHCGDBLN_00269
269 | AHCGDBLN_00270
270 | AHCGDBLN_00271
271 | AHCGDBLN_00272
272 | AHCGDBLN_00273
273 | AHCGDBLN_00274
274 | AHCGDBLN_00275
275 | AHCGDBLN_00276
276 | AHCGDBLN_00277 K03116
277 | AHCGDBLN_00278
278 | AHCGDBLN_00279 K00343
279 | AHCGDBLN_00280 K00342
280 | AHCGDBLN_00281 K00341
281 | AHCGDBLN_00282
282 | AHCGDBLN_00283 K00339
283 | AHCGDBLN_00284
284 | AHCGDBLN_00285 K00337
285 | AHCGDBLN_00286
286 | AHCGDBLN_00287 K00330
287 | AHCGDBLN_00289
288 | AHCGDBLN_00290 K07126
289 | AHCGDBLN_00291 K07126
290 | AHCGDBLN_00292
291 | AHCGDBLN_00293
292 | AHCGDBLN_00294 K07126
293 | AHCGDBLN_00295
294 | AHCGDBLN_00296
295 | AHCGDBLN_00297
296 | AHCGDBLN_00298 K14393
297 | AHCGDBLN_00299
298 | AHCGDBLN_00300
299 | AHCGDBLN_00301
300 | AHCGDBLN_00302
301 | AHCGDBLN_00303
302 | AHCGDBLN_00304
303 | AHCGDBLN_00305 K01953
304 | AHCGDBLN_00306
305 | AHCGDBLN_00307
306 | AHCGDBLN_00308 K19423
307 | AHCGDBLN_00309 K07126
308 | AHCGDBLN_00310 K00449
309 | AHCGDBLN_00311
310 | AHCGDBLN_00312
311 | AHCGDBLN_00313
312 | AHCGDBLN_00314
313 | AHCGDBLN_00315
314 | AHCGDBLN_00316
315 | AHCGDBLN_00317
316 | AHCGDBLN_00318 K16902
317 | AHCGDBLN_00319
318 | AHCGDBLN_00320
319 | AHCGDBLN_00321 K07126
320 | AHCGDBLN_00322
321 | AHCGDBLN_00323
322 | AHCGDBLN_00324
323 | AHCGDBLN_00325
324 | AHCGDBLN_00326
325 | AHCGDBLN_00327
326 | AHCGDBLN_00328
327 | AHCGDBLN_00329
328 | AHCGDBLN_00330
329 | AHCGDBLN_00331
330 | AHCGDBLN_00332
331 | AHCGDBLN_00333
332 | AHCGDBLN_00334
333 | AHCGDBLN_00335
334 | AHCGDBLN_00336
335 | AHCGDBLN_00337
336 | AHCGDBLN_00338
337 | AHCGDBLN_00339
338 | AHCGDBLN_00340
339 | AHCGDBLN_00341 K02835
340 | AHCGDBLN_00342 K02493
341 | AHCGDBLN_00343 K00790
342 | AHCGDBLN_00344 K02843
343 | AHCGDBLN_00345
344 | AHCGDBLN_00346
345 | AHCGDBLN_00347 K18979
346 | AHCGDBLN_00348 K01599
347 | AHCGDBLN_00349 K01772
348 | AHCGDBLN_00350
349 | AHCGDBLN_00351 K03657
350 | AHCGDBLN_00352 K03665
351 | AHCGDBLN_00353
352 | AHCGDBLN_00354 K03216
353 | AHCGDBLN_00355
354 | AHCGDBLN_00356 K21402
355 | AHCGDBLN_00357 K19422
356 | AHCGDBLN_00358
357 | AHCGDBLN_00359
358 | AHCGDBLN_00360 K01709
359 | AHCGDBLN_00361 K00978
360 | AHCGDBLN_00362 K01710
361 | AHCGDBLN_00363 K06148
362 | AHCGDBLN_00364 K02835
363 | AHCGDBLN_00365
364 | AHCGDBLN_00366 K03628
365 | AHCGDBLN_00367
366 | AHCGDBLN_00368
367 | AHCGDBLN_00369
368 | AHCGDBLN_00370 K02879
369 | AHCGDBLN_00371 K03040
370 | AHCGDBLN_00372 K02986
371 | AHCGDBLN_00373 K02948
372 | AHCGDBLN_00374 K02952
373 | AHCGDBLN_00375 K02518
374 | AHCGDBLN_00376 K00939
375 | AHCGDBLN_00377 K03076
376 | AHCGDBLN_00378 K02876
377 | AHCGDBLN_00379 K02988
378 | AHCGDBLN_00380 K02881
379 | AHCGDBLN_00381
380 | AHCGDBLN_00382 K02994
381 | AHCGDBLN_00383 K02931
382 | AHCGDBLN_00384
383 | AHCGDBLN_00385 K02874
384 | AHCGDBLN_00386 K02961
385 | AHCGDBLN_00387
386 | AHCGDBLN_00388 K02878
387 | AHCGDBLN_00389 K02982
388 | AHCGDBLN_00390 K02890
389 | AHCGDBLN_00391
390 | AHCGDBLN_00392 K02886
391 | AHCGDBLN_00393 K02892
392 | AHCGDBLN_00394
393 | AHCGDBLN_00395 K02906
394 | AHCGDBLN_00396 K02946
395 | AHCGDBLN_00397 K02358
396 | AHCGDBLN_00398
397 | AHCGDBLN_00399 K01358
398 | AHCGDBLN_00400 K03544
399 | AHCGDBLN_00401
400 | AHCGDBLN_00402
401 | AHCGDBLN_00403
402 | AHCGDBLN_00404 K06045
403 | AHCGDBLN_00405 K02871
404 | AHCGDBLN_00406 K02996
405 | AHCGDBLN_00407 K00145
406 | AHCGDBLN_00408 K00620
407 | AHCGDBLN_00409 K01462
408 | AHCGDBLN_00410 K00604
409 | AHCGDBLN_00411 K01784
410 | AHCGDBLN_00412 K03271
411 | AHCGDBLN_00413 K03272
412 | AHCGDBLN_00414 K03274
413 | AHCGDBLN_00415
414 | AHCGDBLN_00416
415 | AHCGDBLN_00417
416 | AHCGDBLN_00418 K00928
417 | AHCGDBLN_00419 K01649
418 | AHCGDBLN_00420 K01915
419 | AHCGDBLN_00421
420 | AHCGDBLN_00422
421 | AHCGDBLN_00423 K00748
422 | AHCGDBLN_00424 K09949
423 | AHCGDBLN_00425 K16043
424 | AHCGDBLN_00426 K00677
425 | AHCGDBLN_00427 K02372
426 | AHCGDBLN_00428
427 | AHCGDBLN_00429 K07277
428 | AHCGDBLN_00430 K03696
429 | AHCGDBLN_00431 K00548
430 | AHCGDBLN_00432
431 | AHCGDBLN_00433 K00605
432 | AHCGDBLN_00434 K02437
433 | AHCGDBLN_00435 K00282
434 | AHCGDBLN_00436
435 | AHCGDBLN_00437
436 | AHCGDBLN_00438
437 | AHCGDBLN_00439
438 | AHCGDBLN_00440
439 | AHCGDBLN_00441
440 | AHCGDBLN_00442
441 | AHCGDBLN_00443
442 | AHCGDBLN_00444
443 | AHCGDBLN_00445
444 | AHCGDBLN_00446 K07126
445 | AHCGDBLN_00447
446 | AHCGDBLN_00448 K15352
447 | AHCGDBLN_00449
448 | AHCGDBLN_00450
449 | AHCGDBLN_00451
450 | AHCGDBLN_00452
451 | AHCGDBLN_00453
452 | AHCGDBLN_00454
453 | AHCGDBLN_00455
454 | AHCGDBLN_00456 K01951
455 | AHCGDBLN_00457 K00088
456 | AHCGDBLN_00458 K06920
457 | AHCGDBLN_00459 K03106
458 | AHCGDBLN_00460 K02959
459 | AHCGDBLN_00461 K06960
460 | AHCGDBLN_00462 K02860
461 | AHCGDBLN_00463 K00554
462 | AHCGDBLN_00464
463 | AHCGDBLN_00465 K02884
464 | AHCGDBLN_00466
465 | AHCGDBLN_00467 K21402
466 | AHCGDBLN_00468 K01423
467 | AHCGDBLN_00469 K03470
468 | AHCGDBLN_00470 K07462
469 | AHCGDBLN_00471 K06942
470 | AHCGDBLN_00472 K02575
471 | AHCGDBLN_00473 K01999
472 | AHCGDBLN_00474 K01997
473 | AHCGDBLN_00475
474 | AHCGDBLN_00476 K06861
475 | AHCGDBLN_00477 K01996
476 | AHCGDBLN_00478
477 | AHCGDBLN_00479
478 | AHCGDBLN_00480 K00943
479 | AHCGDBLN_00481 K03186
480 | AHCGDBLN_00482 K04487
481 | AHCGDBLN_00483 K03151
482 | AHCGDBLN_00484
483 | AHCGDBLN_00485 K13940
484 | AHCGDBLN_00486 K10206
485 | AHCGDBLN_00487
486 | AHCGDBLN_00488
487 | AHCGDBLN_00489 K03832
488 | AHCGDBLN_00490
489 | AHCGDBLN_00491 K02015
490 | AHCGDBLN_00492 K02013
491 | AHCGDBLN_00493 K06858
492 | AHCGDBLN_00494
493 | AHCGDBLN_00495
494 | AHCGDBLN_00496 K00974
495 | AHCGDBLN_00497 K04562
496 | AHCGDBLN_00498
497 | AHCGDBLN_00499
498 | AHCGDBLN_00500
499 | AHCGDBLN_00501
500 | AHCGDBLN_00502 K01867
501 | AHCGDBLN_00503 K01870
502 | AHCGDBLN_00504 K01870
503 | AHCGDBLN_00505 K03101
504 | AHCGDBLN_00506 K13292
505 | AHCGDBLN_00507
506 | AHCGDBLN_00508
507 | AHCGDBLN_00509
508 | AHCGDBLN_00510
509 | AHCGDBLN_00511 K03769
510 | AHCGDBLN_00512
511 | AHCGDBLN_00513
512 | AHCGDBLN_00514
513 | AHCGDBLN_00515
514 | AHCGDBLN_00516 K08884
515 | AHCGDBLN_00517
516 | AHCGDBLN_00518
517 | AHCGDBLN_00519 K00058
518 | AHCGDBLN_00520 K04034
519 | AHCGDBLN_00521
520 | AHCGDBLN_00522
521 | AHCGDBLN_00523
522 | AHCGDBLN_00524 K03271
523 | AHCGDBLN_00525 K00616
524 | AHCGDBLN_00526 K03273
525 | AHCGDBLN_00527 K00966
526 | AHCGDBLN_00528 K01710
527 | AHCGDBLN_00529
528 | AHCGDBLN_00530 K07031
529 | AHCGDBLN_00531 K19427
530 | AHCGDBLN_00532 K03639
531 | AHCGDBLN_00533
532 | AHCGDBLN_00534
533 | AHCGDBLN_00535
534 | AHCGDBLN_00536
535 | AHCGDBLN_00537
536 | AHCGDBLN_00538 K03593
537 | AHCGDBLN_00539
538 | AHCGDBLN_00540
539 | AHCGDBLN_00541 K00974
540 | AHCGDBLN_00542
541 | AHCGDBLN_00543
542 | AHCGDBLN_00544
543 | AHCGDBLN_00545 K03797
544 | AHCGDBLN_00546 K03673
545 | AHCGDBLN_00547
546 | AHCGDBLN_00548 K07053
547 | AHCGDBLN_00549 K03551
548 | AHCGDBLN_00550 K03550
549 | AHCGDBLN_00551 K01159
550 | AHCGDBLN_00552
551 | AHCGDBLN_00553
552 | AHCGDBLN_00554 K00278
553 | AHCGDBLN_00555
554 | AHCGDBLN_00556
555 | AHCGDBLN_00557 K03210
556 | AHCGDBLN_00558 K03072
557 | AHCGDBLN_00559
558 | AHCGDBLN_00560
559 | AHCGDBLN_00561 K04567
560 | AHCGDBLN_00562 K18682
561 | AHCGDBLN_00563 K01934
562 | AHCGDBLN_00564
563 | AHCGDBLN_00565
564 | AHCGDBLN_00566
565 | AHCGDBLN_00567 K01890
566 | AHCGDBLN_00568
567 | AHCGDBLN_00569 K07387
568 | AHCGDBLN_00570
569 | AHCGDBLN_00571
570 | AHCGDBLN_00572
571 | AHCGDBLN_00573 K11927
572 | AHCGDBLN_00574 K07461
573 | AHCGDBLN_00575 K09858
574 | AHCGDBLN_00576 K06994
575 | AHCGDBLN_00577
576 | AHCGDBLN_00578
577 | AHCGDBLN_00579
578 | AHCGDBLN_00580 K21271
579 | AHCGDBLN_00581
580 | AHCGDBLN_00582
581 | AHCGDBLN_00583 K11940
582 | AHCGDBLN_00584
583 | AHCGDBLN_00585
584 | AHCGDBLN_00586 K03525
585 | AHCGDBLN_00587
586 | AHCGDBLN_00588
587 | AHCGDBLN_00589 K07126
588 | AHCGDBLN_00590 K00891
589 | AHCGDBLN_00591
590 | AHCGDBLN_00592 K01247
591 | AHCGDBLN_00593
592 | AHCGDBLN_00594
593 | AHCGDBLN_00595
594 | AHCGDBLN_00596 K06219
595 | AHCGDBLN_00597 K02888
596 | AHCGDBLN_00598 K02899
597 | AHCGDBLN_00599 K03979
598 | AHCGDBLN_00600
599 | AHCGDBLN_00601
600 | AHCGDBLN_00602 K00979
601 | AHCGDBLN_00603 K01297
602 | AHCGDBLN_00604 K02500
603 | AHCGDBLN_00605 K01814
604 | AHCGDBLN_00606 K19889
605 | AHCGDBLN_00607 K01790
606 | AHCGDBLN_00608
607 | AHCGDBLN_00609 K00059
608 | AHCGDBLN_00610
609 | AHCGDBLN_00611
610 | AHCGDBLN_00612
611 | AHCGDBLN_00613
612 | AHCGDBLN_00614
613 | AHCGDBLN_00615
614 | AHCGDBLN_00617
615 | AHCGDBLN_00618 K07126
616 | AHCGDBLN_00619 K19789
617 | AHCGDBLN_00620
618 | AHCGDBLN_00621 K03427
619 | AHCGDBLN_00622
620 | AHCGDBLN_00623
621 | AHCGDBLN_00624 K07126
622 | AHCGDBLN_00625 K07126
623 | AHCGDBLN_00626
624 | AHCGDBLN_00627
625 | AHCGDBLN_00628
626 | AHCGDBLN_00629 K03686
627 | AHCGDBLN_00630
628 | AHCGDBLN_00631
629 | AHCGDBLN_00632
630 | AHCGDBLN_00633
631 | AHCGDBLN_00634 K07126
632 | AHCGDBLN_00635
633 | AHCGDBLN_00636
634 | AHCGDBLN_00637
635 | AHCGDBLN_00638 K01872
636 | AHCGDBLN_00639 K06041
637 | AHCGDBLN_00640 K02335
638 | AHCGDBLN_00641
639 | AHCGDBLN_00642 K07305
640 | AHCGDBLN_00643
641 | AHCGDBLN_00644
642 | AHCGDBLN_00645 K15977
643 | AHCGDBLN_00646
644 | AHCGDBLN_00647
645 | AHCGDBLN_00648
646 | AHCGDBLN_00649
647 | AHCGDBLN_00650 K03704
648 | AHCGDBLN_00651
649 | AHCGDBLN_00652
650 | AHCGDBLN_00653
651 | AHCGDBLN_00654
652 | AHCGDBLN_00655 K09771
653 | AHCGDBLN_00656
654 | AHCGDBLN_00657
655 | AHCGDBLN_00658 K15352
656 | AHCGDBLN_00659
657 | AHCGDBLN_00660
658 | AHCGDBLN_00661
659 | AHCGDBLN_00662
660 | AHCGDBLN_00663 K15383
661 | AHCGDBLN_00664 K15352
662 | AHCGDBLN_00665
663 | AHCGDBLN_00666 K07783
664 | AHCGDBLN_00667 K18911
665 | AHCGDBLN_00668
666 | AHCGDBLN_00669
667 | AHCGDBLN_00670
668 | AHCGDBLN_00671 K19428
669 | AHCGDBLN_00672 K18235
670 | AHCGDBLN_00673
671 | AHCGDBLN_00674 K02469
672 | AHCGDBLN_00675
673 | AHCGDBLN_00678
674 | AHCGDBLN_00681 K02343
675 | AHCGDBLN_00682 K09747
676 | AHCGDBLN_00683 K06187
677 | AHCGDBLN_00685
678 | AHCGDBLN_00686 K07714
679 | AHCGDBLN_00687 K10914
680 | AHCGDBLN_00688 K10914
681 | AHCGDBLN_00689 K08303
682 | AHCGDBLN_00690
683 | AHCGDBLN_00691
684 | AHCGDBLN_00692 K15352
685 | AHCGDBLN_00693
686 | AHCGDBLN_00694 K01834
687 | AHCGDBLN_00695
688 | AHCGDBLN_00696 K06442
689 | AHCGDBLN_00697 K03602
690 | AHCGDBLN_00698 K03601
691 | AHCGDBLN_00699 K01491
692 | AHCGDBLN_00700
693 | AHCGDBLN_00701
694 | AHCGDBLN_00702
695 | AHCGDBLN_00703
696 | AHCGDBLN_00705
697 | AHCGDBLN_00706
698 | AHCGDBLN_00707 K03113
699 | AHCGDBLN_00708
700 | AHCGDBLN_00709
701 | AHCGDBLN_00710 K02111
702 | AHCGDBLN_00711 K02115
703 | AHCGDBLN_00712 K02112
704 | AHCGDBLN_00713 K02114
705 | AHCGDBLN_00714
706 | AHCGDBLN_00715
707 | AHCGDBLN_00716 K03701
708 | AHCGDBLN_00717 K03182
709 | AHCGDBLN_00718 K00971
710 | AHCGDBLN_00719
711 | AHCGDBLN_00720 K18889
712 | AHCGDBLN_00721 K06147
713 | AHCGDBLN_00722 K00147
714 | AHCGDBLN_00723
715 | AHCGDBLN_00724
716 | AHCGDBLN_00725 K03168
717 | AHCGDBLN_00726 K04094
718 | AHCGDBLN_00727 K02470
719 | AHCGDBLN_00728 K03593
720 | AHCGDBLN_00729 K04039
721 | AHCGDBLN_00730
722 | AHCGDBLN_00731
723 | AHCGDBLN_00732 K03593
724 | AHCGDBLN_00733 K00058
725 | AHCGDBLN_00734 K07533
726 | AHCGDBLN_00735
727 | AHCGDBLN_00736
728 | AHCGDBLN_00737
729 | AHCGDBLN_00738
730 | AHCGDBLN_00739
731 | AHCGDBLN_00740
732 | AHCGDBLN_00741
733 | AHCGDBLN_00742
734 | AHCGDBLN_00743
735 | AHCGDBLN_00744
736 | AHCGDBLN_00745
737 | AHCGDBLN_00746 K03439
738 | AHCGDBLN_00747
739 | AHCGDBLN_00748
740 | AHCGDBLN_00749
741 | AHCGDBLN_00750 K08884
742 | AHCGDBLN_00751
743 | AHCGDBLN_00752
744 | AHCGDBLN_00753
745 | AHCGDBLN_00754
746 | AHCGDBLN_00755
747 | AHCGDBLN_00756
748 | AHCGDBLN_00757
749 | AHCGDBLN_00758
750 | AHCGDBLN_00759
751 | AHCGDBLN_00760
752 | AHCGDBLN_00761
753 | AHCGDBLN_00762
754 | AHCGDBLN_00763
755 | AHCGDBLN_00764
756 | AHCGDBLN_00765
757 | AHCGDBLN_00767
758 | AHCGDBLN_00768
759 | AHCGDBLN_00769 K06179
760 | AHCGDBLN_00770 K07789
761 | AHCGDBLN_00771 K01993
762 | AHCGDBLN_00772 K06969
763 | AHCGDBLN_00773 K01897
764 | AHCGDBLN_00774
765 | AHCGDBLN_00775
766 | AHCGDBLN_00776
767 | AHCGDBLN_00777
768 | AHCGDBLN_00778 K04562
769 | AHCGDBLN_00779
770 | AHCGDBLN_00780 K02405
771 | AHCGDBLN_00781 K12974
772 | AHCGDBLN_00782 K00997
773 | AHCGDBLN_00783 K07566
774 | AHCGDBLN_00784 K01588
775 | AHCGDBLN_00785
776 | AHCGDBLN_00786 K03742
777 | AHCGDBLN_00787 K19225
778 | AHCGDBLN_00788
779 | AHCGDBLN_00789
780 | AHCGDBLN_00790 K11752
781 | AHCGDBLN_00791 K00793
782 | AHCGDBLN_00792 K14652
783 | AHCGDBLN_00793 K00794
784 | AHCGDBLN_00794 K03625
785 | AHCGDBLN_00795
786 | AHCGDBLN_00796
787 | AHCGDBLN_00797
788 | AHCGDBLN_00798 K04764
789 | AHCGDBLN_00799 K02355
790 | AHCGDBLN_00800 K03977
791 | AHCGDBLN_00801 K01448
792 | AHCGDBLN_00802
793 | AHCGDBLN_00803 K03722
794 | AHCGDBLN_00804
795 | AHCGDBLN_00805 K02042
796 | AHCGDBLN_00806
797 | AHCGDBLN_00807 K01591
798 | AHCGDBLN_00808
799 | AHCGDBLN_00809
800 | AHCGDBLN_00810 K00537
801 | AHCGDBLN_00811 K03529
802 | AHCGDBLN_00812
803 | AHCGDBLN_00813 K00568
804 | AHCGDBLN_00814
805 | AHCGDBLN_00815 K10012
806 | AHCGDBLN_00816 K12902
807 | AHCGDBLN_00817
808 | AHCGDBLN_00818
809 | AHCGDBLN_00819 K01710
810 | AHCGDBLN_00820
811 | AHCGDBLN_00821 K10816
812 | AHCGDBLN_00822
813 | AHCGDBLN_00823 K01737
814 | AHCGDBLN_00824 K11745
815 | AHCGDBLN_00825
816 | AHCGDBLN_00826
817 | AHCGDBLN_00827
818 | AHCGDBLN_00828
819 | AHCGDBLN_00829
820 | AHCGDBLN_00830
821 | AHCGDBLN_00831 K01243
822 | AHCGDBLN_00832 K03215
823 | AHCGDBLN_00833 K01893
824 | AHCGDBLN_00834
825 | AHCGDBLN_00835
826 | AHCGDBLN_00836 K09131
827 | AHCGDBLN_00837 K08963
828 | AHCGDBLN_00838 K00966
829 | AHCGDBLN_00839
830 | AHCGDBLN_00840 K07533
831 | AHCGDBLN_00841
832 | AHCGDBLN_00842
833 | AHCGDBLN_00843
834 | AHCGDBLN_00844 K06287
835 | AHCGDBLN_00845
836 | AHCGDBLN_00846
837 | AHCGDBLN_00847
838 | AHCGDBLN_00848
839 | AHCGDBLN_00849 K18430
840 | AHCGDBLN_00850
841 | AHCGDBLN_00851
842 | AHCGDBLN_00852
843 | AHCGDBLN_00853 K03453
844 | AHCGDBLN_00854 K16868
845 | AHCGDBLN_00855
846 | AHCGDBLN_00856
847 | AHCGDBLN_00857 K00428
848 | AHCGDBLN_00858
849 | AHCGDBLN_00859
850 | AHCGDBLN_00860
851 | AHCGDBLN_00861
852 | AHCGDBLN_00862 K02600
853 | AHCGDBLN_00863
854 | AHCGDBLN_00864
855 | AHCGDBLN_00865 K02834
856 | AHCGDBLN_00866 K03177
857 | AHCGDBLN_00867
858 | AHCGDBLN_00868 K00962
859 | AHCGDBLN_00869 K03787
860 | AHCGDBLN_00870 K03386
861 | AHCGDBLN_00871 K06167
862 | AHCGDBLN_00872
863 | AHCGDBLN_00873 K01935
864 | AHCGDBLN_00874 K07658
865 | AHCGDBLN_00875 K07636
866 | AHCGDBLN_00876
867 | AHCGDBLN_00877
868 | AHCGDBLN_00878
869 | AHCGDBLN_00879
870 | AHCGDBLN_00880 K00655
871 | AHCGDBLN_00881
872 | AHCGDBLN_00882 K03564
873 | AHCGDBLN_00883 K07390
874 | AHCGDBLN_00884
875 | AHCGDBLN_00885
876 | AHCGDBLN_00886
877 | AHCGDBLN_00887 K01657
878 | AHCGDBLN_00888 K01664
879 | AHCGDBLN_00889 K00766
880 | AHCGDBLN_00890 K01609
881 | AHCGDBLN_00891 K01817
882 | AHCGDBLN_00892 K01696
883 | AHCGDBLN_00893 K01695
884 | AHCGDBLN_00894
885 | AHCGDBLN_00895
886 | AHCGDBLN_00896 K08884
887 | AHCGDBLN_00897
888 | AHCGDBLN_00898 K12339
889 | AHCGDBLN_00899 K21029
890 | AHCGDBLN_00900 K03636
891 | AHCGDBLN_00901 K01733
892 | AHCGDBLN_00902 K01738
893 | AHCGDBLN_00903 K02453
894 | AHCGDBLN_00904
895 | AHCGDBLN_00905
896 | AHCGDBLN_00906 K01698
897 | AHCGDBLN_00907
898 | AHCGDBLN_00908 K21464
899 | AHCGDBLN_00909
900 | AHCGDBLN_00910 K01810
901 | AHCGDBLN_00911 K00616
902 | AHCGDBLN_00912 K01940
903 | AHCGDBLN_00913 K01999
904 | AHCGDBLN_00914 K03116
905 | AHCGDBLN_00915
906 | AHCGDBLN_00916
907 | AHCGDBLN_00917
908 | AHCGDBLN_00918 K05366
909 | AHCGDBLN_00919
910 | AHCGDBLN_00920 K01652
911 | AHCGDBLN_00921
912 | AHCGDBLN_00922
913 | AHCGDBLN_00923
914 | AHCGDBLN_00924
915 | AHCGDBLN_00925 K04034
916 | AHCGDBLN_00926
917 | AHCGDBLN_00927 K07126
918 | AHCGDBLN_00928
919 | AHCGDBLN_00929 K12902
920 | AHCGDBLN_00930
921 | AHCGDBLN_00931 K13309
922 | AHCGDBLN_00932 K12902
923 | AHCGDBLN_00933 K05527
924 | AHCGDBLN_00934
925 | AHCGDBLN_00935
926 | AHCGDBLN_00936
927 | AHCGDBLN_00937
928 | AHCGDBLN_00938 K04043
929 | AHCGDBLN_00939 K03687
930 | AHCGDBLN_00940 K03686
931 | AHCGDBLN_00941
932 | AHCGDBLN_00942
933 | AHCGDBLN_00943 K00721
934 | AHCGDBLN_00944 K00266
935 | AHCGDBLN_00945 K00284
936 | AHCGDBLN_00946
937 | AHCGDBLN_00947 K00602
938 | AHCGDBLN_00948 K04034
939 | AHCGDBLN_00949
940 | AHCGDBLN_00950 K22320
941 | AHCGDBLN_00951 K10780
942 | AHCGDBLN_00952
943 | AHCGDBLN_00953
944 | AHCGDBLN_00954
945 | AHCGDBLN_00955
946 | AHCGDBLN_00956
947 | AHCGDBLN_00957
948 | AHCGDBLN_00958 K01129
949 | AHCGDBLN_00959
950 | AHCGDBLN_00960 K06077
951 | AHCGDBLN_00961
952 | AHCGDBLN_00962
953 | AHCGDBLN_00963 K01520
954 | AHCGDBLN_00964
955 | AHCGDBLN_00965
956 | AHCGDBLN_00966
957 | AHCGDBLN_00967
958 | AHCGDBLN_00968
959 | AHCGDBLN_00969 K01990
960 | AHCGDBLN_00970 K00010
961 | AHCGDBLN_00971 K02406
962 | AHCGDBLN_00972
963 | AHCGDBLN_00973 K00796
964 | AHCGDBLN_00974 K03474
965 | AHCGDBLN_00975
966 | AHCGDBLN_00976
967 | AHCGDBLN_00977 K01803
968 | AHCGDBLN_00978 K03075
969 | AHCGDBLN_00980
970 | AHCGDBLN_00981
971 | AHCGDBLN_00982 K00382
972 | AHCGDBLN_00983
973 | AHCGDBLN_00984 K02635
974 | AHCGDBLN_00985 K03798
975 | AHCGDBLN_00986 K04075
976 | AHCGDBLN_00987 K07533
977 | AHCGDBLN_00988 K03530
978 | AHCGDBLN_00989 K07568
979 | AHCGDBLN_00990 K07568
980 | AHCGDBLN_00991
981 | AHCGDBLN_00992 K00012
982 | AHCGDBLN_00993
983 | AHCGDBLN_00994 K09022
984 | AHCGDBLN_00995 K03584
985 | AHCGDBLN_00996 K00567
986 | AHCGDBLN_00997
987 | AHCGDBLN_00998
988 | AHCGDBLN_00999
989 | AHCGDBLN_01000 K04773
990 | AHCGDBLN_01001
991 | AHCGDBLN_01002 K04034
992 | AHCGDBLN_01003
993 | AHCGDBLN_01004
994 | AHCGDBLN_01005
995 | AHCGDBLN_01006 K00754
996 | AHCGDBLN_01007 K01057
997 | AHCGDBLN_01008
998 | AHCGDBLN_01009 K17947
999 | AHCGDBLN_01010 K02988
1000 | AHCGDBLN_01011 K02876
1001 | AHCGDBLN_01012 K03076
1002 | AHCGDBLN_01013
1003 | AHCGDBLN_01014 K02518
1004 | AHCGDBLN_01015
1005 | AHCGDBLN_01016 K00790
1006 | AHCGDBLN_01017 K02493
1007 | AHCGDBLN_01018 K02835
1008 | AHCGDBLN_01019
1009 | AHCGDBLN_01020 K03628
1010 | AHCGDBLN_01021
1011 | AHCGDBLN_01022 K02879
1012 | AHCGDBLN_01023 K03040
1013 | AHCGDBLN_01024 K02986
1014 | AHCGDBLN_01025
1015 | AHCGDBLN_01026
1016 | AHCGDBLN_01027
1017 | AHCGDBLN_01028
1018 | AHCGDBLN_01029
1019 | AHCGDBLN_01030
1020 | AHCGDBLN_01031 K06871
1021 | AHCGDBLN_01032
1022 | AHCGDBLN_01033
1023 | AHCGDBLN_01034
1024 | AHCGDBLN_01035
1025 | AHCGDBLN_01036 K04771
1026 | AHCGDBLN_01037
1027 | AHCGDBLN_01038
1028 | AHCGDBLN_01039
1029 | AHCGDBLN_01040 K02236
1030 | AHCGDBLN_01041 K06147
1031 | AHCGDBLN_01042
1032 | AHCGDBLN_01043
1033 | AHCGDBLN_01044
1034 | AHCGDBLN_01045
1035 | AHCGDBLN_01046
1036 | AHCGDBLN_01047 K21464
1037 | AHCGDBLN_01048
1038 | AHCGDBLN_01049
1039 | AHCGDBLN_01051
1040 | AHCGDBLN_01052
1041 | AHCGDBLN_01053 K01872
1042 | AHCGDBLN_01054 K06041
1043 | AHCGDBLN_01055 K00012
1044 | AHCGDBLN_01056
1045 | AHCGDBLN_01057 K07568
1046 | AHCGDBLN_01058 K03530
1047 | AHCGDBLN_01059 K05589
1048 | AHCGDBLN_01060
1049 | AHCGDBLN_01061
1050 | AHCGDBLN_01062
1051 | AHCGDBLN_01063
1052 | AHCGDBLN_01064
1053 | AHCGDBLN_01065
1054 | AHCGDBLN_01066
1055 | AHCGDBLN_01067
1056 | AHCGDBLN_01068
1057 | AHCGDBLN_01069
1058 | AHCGDBLN_01070
1059 | AHCGDBLN_01071
1060 | AHCGDBLN_01072
1061 | AHCGDBLN_01073
1062 | AHCGDBLN_01074
1063 | AHCGDBLN_01075
1064 | AHCGDBLN_01076
1065 | AHCGDBLN_01077 K01939
1066 | AHCGDBLN_01078
1067 | AHCGDBLN_01079
1068 | AHCGDBLN_01080
1069 | AHCGDBLN_01081
1070 | AHCGDBLN_01082
1071 | AHCGDBLN_01083 K06889
1072 | AHCGDBLN_01084
1073 | AHCGDBLN_01085
1074 | AHCGDBLN_01086
1075 | AHCGDBLN_01087
1076 | AHCGDBLN_01088
1077 | AHCGDBLN_01089
1078 | AHCGDBLN_01090
1079 | AHCGDBLN_01091
1080 | AHCGDBLN_01092
1081 | AHCGDBLN_01093
1082 | AHCGDBLN_01094
1083 | AHCGDBLN_01095
1084 | AHCGDBLN_01096
1085 | AHCGDBLN_01097
1086 | AHCGDBLN_01098 K01150
1087 | AHCGDBLN_01099
1088 | AHCGDBLN_01100
1089 | AHCGDBLN_01101 K07400
1090 | AHCGDBLN_01102
1091 | AHCGDBLN_01103 K21140
1092 | AHCGDBLN_01104
1093 | AHCGDBLN_01105
1094 | AHCGDBLN_01106 K03569
1095 | AHCGDBLN_01107
1096 | AHCGDBLN_01108
1097 | AHCGDBLN_01111 K09936
1098 | AHCGDBLN_01112 K03217
1099 | AHCGDBLN_01113 K08998
1100 | AHCGDBLN_01114 K03536
1101 | AHCGDBLN_01115
1102 | AHCGDBLN_01116
1103 | AHCGDBLN_01117
1104 | AHCGDBLN_01118
1105 | AHCGDBLN_01119 K03100
1106 | AHCGDBLN_01120 K03596
1107 | AHCGDBLN_01121
1108 | AHCGDBLN_01122
1109 | AHCGDBLN_01123
1110 | AHCGDBLN_01124 K15521
1111 | AHCGDBLN_01125
1112 | AHCGDBLN_01126
1113 | AHCGDBLN_01127
1114 | AHCGDBLN_01128
1115 | AHCGDBLN_01129
1116 | AHCGDBLN_01130 K11936
1117 | AHCGDBLN_01131
1118 | AHCGDBLN_01132 K07126
1119 | AHCGDBLN_01133
1120 | AHCGDBLN_01134
1121 | AHCGDBLN_01135
1122 | AHCGDBLN_01136
1123 | AHCGDBLN_01137
1124 | AHCGDBLN_01138
1125 | AHCGDBLN_01139
1126 | AHCGDBLN_01140
1127 | AHCGDBLN_01141
1128 | AHCGDBLN_01142 K21464
1129 | AHCGDBLN_01143
1130 | AHCGDBLN_01144 K00931
1131 | AHCGDBLN_01145 K00147
1132 | AHCGDBLN_01146 K00969
1133 | AHCGDBLN_01147 K09710
1134 | AHCGDBLN_01148 K03797
1135 | AHCGDBLN_01149 K08311
1136 | AHCGDBLN_01150
1137 | AHCGDBLN_01151
1138 | AHCGDBLN_01152
1139 | AHCGDBLN_01153
1140 | AHCGDBLN_01154 K00010
1141 | AHCGDBLN_01155 K13019
1142 | AHCGDBLN_01156
1143 | AHCGDBLN_01157 K20573
1144 | AHCGDBLN_01158
1145 | AHCGDBLN_01159
1146 | AHCGDBLN_01160
1147 | AHCGDBLN_01161
1148 | AHCGDBLN_01162
1149 | AHCGDBLN_01163
1150 | AHCGDBLN_01164
1151 | AHCGDBLN_01165
1152 | AHCGDBLN_01166 K07533
1153 | AHCGDBLN_01167 K00058
1154 | AHCGDBLN_01168 K00831
1155 | AHCGDBLN_01169 K04771
1156 | AHCGDBLN_01170
1157 | AHCGDBLN_01171
1158 | AHCGDBLN_01172
1159 | AHCGDBLN_01173
1160 | AHCGDBLN_01174
1161 | AHCGDBLN_01175
1162 | AHCGDBLN_01176
1163 | AHCGDBLN_01177
1164 | AHCGDBLN_01178
1165 | AHCGDBLN_01179
1166 | AHCGDBLN_01180
1167 | AHCGDBLN_01181
1168 | AHCGDBLN_01182
1169 | AHCGDBLN_01183
1170 | AHCGDBLN_01184 K01520
1171 | AHCGDBLN_01185
1172 | AHCGDBLN_01186
1173 | AHCGDBLN_01187
1174 | AHCGDBLN_01188
1175 | AHCGDBLN_01189
1176 | AHCGDBLN_01190
1177 | AHCGDBLN_01191
1178 | AHCGDBLN_01192
1179 | AHCGDBLN_01193 K12420
1180 | AHCGDBLN_01194
1181 | AHCGDBLN_01195 K01790
1182 | AHCGDBLN_01196 K19889
1183 | AHCGDBLN_01197 K01814
1184 | AHCGDBLN_01198
1185 | AHCGDBLN_01199 K21131
1186 | AHCGDBLN_01200 K15669
1187 | AHCGDBLN_01201 K12454
1188 | AHCGDBLN_01202
1189 | AHCGDBLN_01203
1190 | AHCGDBLN_01204 K02469
1191 | AHCGDBLN_01205 K02470
1192 | AHCGDBLN_01206 K00052
1193 | AHCGDBLN_01207 K00133
1194 | AHCGDBLN_01208
1195 | AHCGDBLN_01209
1196 | AHCGDBLN_01210
1197 | AHCGDBLN_01211
1198 | AHCGDBLN_01212
1199 | AHCGDBLN_01213
1200 | AHCGDBLN_01214
1201 | AHCGDBLN_01215
1202 | AHCGDBLN_01216 K06190
1203 | AHCGDBLN_01217
1204 | AHCGDBLN_01218
1205 | AHCGDBLN_01219 K00382
1206 | AHCGDBLN_01220
1207 | AHCGDBLN_01221 K04773
1208 | AHCGDBLN_01222
1209 | AHCGDBLN_01223
1210 | AHCGDBLN_01224
1211 | AHCGDBLN_01225
1212 | AHCGDBLN_01226 K00567
1213 | AHCGDBLN_01227
1214 | AHCGDBLN_01228
1215 | AHCGDBLN_01229
1216 | AHCGDBLN_01230 K07126
1217 | AHCGDBLN_01231
1218 | AHCGDBLN_01232
1219 | AHCGDBLN_01233
1220 | AHCGDBLN_01234 K02453
1221 | AHCGDBLN_01235
1222 | AHCGDBLN_01237
1223 | AHCGDBLN_01238
1224 | AHCGDBLN_01239
1225 | AHCGDBLN_01240
1226 | AHCGDBLN_01241 K03526
1227 | AHCGDBLN_01242 K03545
1228 | AHCGDBLN_01243 K22360
1229 | AHCGDBLN_01244
1230 | AHCGDBLN_01245 K00789
1231 | AHCGDBLN_01246 K08483
1232 | AHCGDBLN_01247
1233 | AHCGDBLN_01248 K03979
1234 | AHCGDBLN_01249 K02899
1235 | AHCGDBLN_01250 K02888
1236 | AHCGDBLN_01251 K06219
1237 | AHCGDBLN_01252
1238 | AHCGDBLN_01253
1239 | AHCGDBLN_01254
1240 | AHCGDBLN_01255 K01247
1241 | AHCGDBLN_01256
1242 | AHCGDBLN_01257 K03559
1243 | AHCGDBLN_01258
1244 | AHCGDBLN_01259
1245 | AHCGDBLN_01260
1246 | AHCGDBLN_01261
1247 | AHCGDBLN_01262 K02013
1248 | AHCGDBLN_01263 K22305
1249 | AHCGDBLN_01264
1250 | AHCGDBLN_01265
1251 | AHCGDBLN_01266
1252 | AHCGDBLN_01267
1253 | AHCGDBLN_01268
1254 | AHCGDBLN_01269
1255 | AHCGDBLN_01270
1256 | AHCGDBLN_01271
1257 | AHCGDBLN_01272
1258 | AHCGDBLN_01273
1259 | AHCGDBLN_01274
1260 | AHCGDBLN_01275 K13894
1261 | AHCGDBLN_01276 K13895
1262 | AHCGDBLN_01277 K00616
1263 | AHCGDBLN_01278 K01940
1264 | AHCGDBLN_01279
1265 | AHCGDBLN_01280
1266 | AHCGDBLN_01281
1267 | AHCGDBLN_01282 K12902
1268 | AHCGDBLN_01283 K02503
1269 | AHCGDBLN_01284
1270 | AHCGDBLN_01285 K01057
1271 | AHCGDBLN_01286
1272 | AHCGDBLN_01287 K10823
1273 | AHCGDBLN_01288
1274 | AHCGDBLN_01289 K03644
1275 | AHCGDBLN_01290
1276 | AHCGDBLN_01291
1277 | AHCGDBLN_01292 K01937
1278 | AHCGDBLN_01293
1279 | AHCGDBLN_01294 K00059
1280 | AHCGDBLN_01295
1281 | AHCGDBLN_01296 K07806
1282 | AHCGDBLN_01297
1283 | AHCGDBLN_01298
1284 | AHCGDBLN_01299
1285 | AHCGDBLN_01300
1286 | AHCGDBLN_01301
1287 | AHCGDBLN_01302
1288 | AHCGDBLN_01304 K02401
1289 | AHCGDBLN_01305 K22509
1290 | AHCGDBLN_01306
1291 | AHCGDBLN_01307 K02419
1292 | AHCGDBLN_01308
1293 | AHCGDBLN_01309 K02417
1294 | AHCGDBLN_01310
1295 | AHCGDBLN_01311
1296 | AHCGDBLN_01312
1297 | AHCGDBLN_01313
1298 | AHCGDBLN_01314
1299 | AHCGDBLN_01315
1300 | AHCGDBLN_01316 K00721
1301 | AHCGDBLN_01317 K06173
1302 | AHCGDBLN_01318 K03657
1303 | AHCGDBLN_01319
1304 | AHCGDBLN_01320
1305 | AHCGDBLN_01321 K00573
1306 | AHCGDBLN_01322
1307 | AHCGDBLN_01323 K01126
1308 | AHCGDBLN_01325
1309 | AHCGDBLN_01326 K07708
1310 | AHCGDBLN_01327
1311 | AHCGDBLN_01328 K19699
1312 | AHCGDBLN_01329
1313 | AHCGDBLN_01330
1314 | AHCGDBLN_01331
1315 | AHCGDBLN_01332 K11065
1316 | AHCGDBLN_01333
1317 | AHCGDBLN_01334
1318 | AHCGDBLN_01335
1319 | AHCGDBLN_01336
1320 | AHCGDBLN_01337 K04772
1321 | AHCGDBLN_01338
1322 | AHCGDBLN_01339
1323 | AHCGDBLN_01340
1324 | AHCGDBLN_01341
1325 | AHCGDBLN_01342 K00772
1326 | AHCGDBLN_01343 K00772
1327 | AHCGDBLN_01344 K01845
1328 | AHCGDBLN_01345 K07281
1329 | AHCGDBLN_01345 K07291
1330 | AHCGDBLN_01346 K02041
1331 | AHCGDBLN_01347 K02044
1332 | AHCGDBLN_01348 K01129
1333 | AHCGDBLN_01349
1334 | AHCGDBLN_01350
1335 | AHCGDBLN_01351
1336 | AHCGDBLN_01352
1337 | AHCGDBLN_01353
1338 | AHCGDBLN_01354
1339 | AHCGDBLN_01355
1340 | AHCGDBLN_01356
1341 | AHCGDBLN_01357
1342 | AHCGDBLN_01358
1343 | AHCGDBLN_01359
1344 | AHCGDBLN_01360 K00343
1345 | AHCGDBLN_01361
1346 | AHCGDBLN_01362 K03430
1347 | AHCGDBLN_01363 K11936
1348 | AHCGDBLN_01364
1349 | AHCGDBLN_01365
1350 | AHCGDBLN_01366 K01990
1351 | AHCGDBLN_01367 K00010
1352 | AHCGDBLN_01368
1353 | AHCGDBLN_01369
1354 | AHCGDBLN_01370
1355 | AHCGDBLN_01371
1356 | AHCGDBLN_01372
1357 | AHCGDBLN_01373
1358 | AHCGDBLN_01374
1359 | AHCGDBLN_01375
1360 | AHCGDBLN_01376
1361 | AHCGDBLN_01377
1362 | AHCGDBLN_01378
1363 | AHCGDBLN_01379
1364 | AHCGDBLN_01380
1365 | AHCGDBLN_01381
1366 | AHCGDBLN_01382
1367 | AHCGDBLN_01383 K01885
1368 | AHCGDBLN_01384 K03775
1369 | AHCGDBLN_01385
1370 | AHCGDBLN_01386
1371 | AHCGDBLN_01387
1372 | AHCGDBLN_01388
1373 | AHCGDBLN_01389
1374 | AHCGDBLN_01390
1375 | AHCGDBLN_01391
1376 | AHCGDBLN_01392 K03553
1377 | AHCGDBLN_01393
1378 | AHCGDBLN_01394 K12573
1379 | AHCGDBLN_01395
1380 | AHCGDBLN_01396 K02435
1381 | AHCGDBLN_01397 K02433
1382 | AHCGDBLN_01398 K02401
1383 | AHCGDBLN_01399
1384 | AHCGDBLN_01400
1385 | AHCGDBLN_01401
1386 | AHCGDBLN_01402
1387 | AHCGDBLN_01403
1388 | AHCGDBLN_01404
1389 | AHCGDBLN_01405
1390 | AHCGDBLN_01406
1391 | AHCGDBLN_01407
1392 | AHCGDBLN_01408
1393 | AHCGDBLN_01409
1394 | AHCGDBLN_01410
1395 | AHCGDBLN_01411
1396 | AHCGDBLN_01412
1397 | AHCGDBLN_01413
1398 | AHCGDBLN_01414
1399 | AHCGDBLN_01415 K19889
1400 | AHCGDBLN_01416
1401 | AHCGDBLN_01417 K03190
1402 | AHCGDBLN_01418
1403 | AHCGDBLN_01419
1404 | AHCGDBLN_01420
1405 | AHCGDBLN_01421 K01627
1406 | AHCGDBLN_01422 K09767
1407 | AHCGDBLN_01423
1408 | AHCGDBLN_01424
1409 | AHCGDBLN_01425
1410 | AHCGDBLN_01426
1411 | AHCGDBLN_01427
1412 | AHCGDBLN_01428
1413 | AHCGDBLN_01429 K01714
1414 | AHCGDBLN_01430 K00215
1415 | AHCGDBLN_01431
1416 | AHCGDBLN_01432
1417 | AHCGDBLN_01433
1418 | AHCGDBLN_01434
1419 | AHCGDBLN_01436 K15343
1420 | AHCGDBLN_01437 K03567
1421 | AHCGDBLN_01438 K00382
1422 | AHCGDBLN_01439
1423 | AHCGDBLN_01440
1424 | AHCGDBLN_01441 K02406
1425 | AHCGDBLN_01442 K13668
1426 | AHCGDBLN_01443 K00712
1427 | AHCGDBLN_01444
1428 | AHCGDBLN_01445
1429 | AHCGDBLN_01446 K18429
1430 | AHCGDBLN_01447 K00966
1431 | AHCGDBLN_01448
1432 | AHCGDBLN_01449
1433 | AHCGDBLN_01450
1434 | AHCGDBLN_01451
1435 | AHCGDBLN_01452 K08306
1436 | AHCGDBLN_01453
1437 | AHCGDBLN_01454 K03281
1438 | AHCGDBLN_01455 K15256
1439 | AHCGDBLN_01456
1440 | AHCGDBLN_01457
1441 | AHCGDBLN_01458 K12944
1442 | AHCGDBLN_01459
1443 | AHCGDBLN_01460
1444 | AHCGDBLN_01461
1445 | AHCGDBLN_01462 K12713
1446 | AHCGDBLN_01463 K03671
1447 | AHCGDBLN_01464
1448 | AHCGDBLN_01465
1449 | AHCGDBLN_01466 K00605
1450 | AHCGDBLN_01467 K07277
1451 | AHCGDBLN_01468
1452 | AHCGDBLN_01469
1453 | AHCGDBLN_01470
1454 | AHCGDBLN_01471
1455 | AHCGDBLN_01472
1456 | AHCGDBLN_01473
1457 | AHCGDBLN_01474
1458 | AHCGDBLN_01475 K07806
1459 | AHCGDBLN_01476
1460 | AHCGDBLN_01477
1461 | AHCGDBLN_01478
1462 | AHCGDBLN_01479
1463 | AHCGDBLN_01480
1464 | AHCGDBLN_01481
1465 | AHCGDBLN_01482
1466 | AHCGDBLN_01483
1467 | AHCGDBLN_01484
1468 | AHCGDBLN_01485 K00853
1469 | AHCGDBLN_01486
1470 | AHCGDBLN_01487
1471 | AHCGDBLN_01488
1472 | AHCGDBLN_01489
1473 | AHCGDBLN_01490
1474 | AHCGDBLN_01491
1475 | AHCGDBLN_01492
1476 | AHCGDBLN_01493
1477 | AHCGDBLN_01494
1478 | AHCGDBLN_01495 K08591
1479 | AHCGDBLN_01498 K00167
1480 | AHCGDBLN_01499
1481 | AHCGDBLN_01500
1482 | AHCGDBLN_01501
1483 | AHCGDBLN_01502 K03734
1484 | AHCGDBLN_01503 K17686
1485 | AHCGDBLN_01504 K02109
1486 | AHCGDBLN_01505
1487 | AHCGDBLN_01506
1488 | AHCGDBLN_01507
1489 | AHCGDBLN_01508
1490 | AHCGDBLN_01509
1491 | AHCGDBLN_01510
1492 | AHCGDBLN_01511
1493 | AHCGDBLN_01512
1494 | AHCGDBLN_01513
1495 | AHCGDBLN_01514
1496 | AHCGDBLN_01515 K01886
1497 | AHCGDBLN_01516
1498 | AHCGDBLN_01517
1499 | AHCGDBLN_01518
1500 | AHCGDBLN_01519
1501 | AHCGDBLN_01520
1502 | AHCGDBLN_01521
1503 | AHCGDBLN_01522
1504 | AHCGDBLN_01523 K03924
1505 | AHCGDBLN_01524
1506 | AHCGDBLN_01525
1507 | AHCGDBLN_01526
1508 | AHCGDBLN_01527 K03650
1509 | AHCGDBLN_01528
1510 | AHCGDBLN_01529
1511 | AHCGDBLN_01530
1512 | AHCGDBLN_01531
1513 | AHCGDBLN_01532
1514 | AHCGDBLN_01533
1515 | AHCGDBLN_01534 K02500
1516 | AHCGDBLN_01535
1517 | AHCGDBLN_01536
1518 | AHCGDBLN_01537
1519 | AHCGDBLN_01538
1520 | AHCGDBLN_01539
1521 | AHCGDBLN_01540
1522 | AHCGDBLN_01541
1523 | AHCGDBLN_01542
1524 | AHCGDBLN_01543 K04771
1525 | AHCGDBLN_01544
1526 | AHCGDBLN_01545
1527 | AHCGDBLN_01546
1528 | AHCGDBLN_01547
1529 | AHCGDBLN_01548 K03687
1530 | AHCGDBLN_01549 K02355
1531 | AHCGDBLN_01550 K01881
1532 | AHCGDBLN_01551 K09748
1533 | AHCGDBLN_01552
1534 | AHCGDBLN_01553 K08884
1535 | AHCGDBLN_01554 K01710
1536 | AHCGDBLN_01555
1537 | AHCGDBLN_01556 K06898
1538 | AHCGDBLN_01557 K22320
1539 | AHCGDBLN_01558
1540 | AHCGDBLN_01559
1541 | AHCGDBLN_01560
1542 | AHCGDBLN_01561
1543 | AHCGDBLN_01562
1544 | AHCGDBLN_01563
1545 | AHCGDBLN_01564 K01783
1546 | AHCGDBLN_01565
1547 | AHCGDBLN_01566
1548 | AHCGDBLN_01567 K03667
1549 | AHCGDBLN_01568 K00930
1550 | AHCGDBLN_01569
1551 | AHCGDBLN_01570
1552 | AHCGDBLN_01571
1553 | AHCGDBLN_01572 K20534
1554 | AHCGDBLN_01573
1555 | AHCGDBLN_01574
1556 | AHCGDBLN_01575
1557 | AHCGDBLN_01576
1558 | AHCGDBLN_01577 K04034
1559 | AHCGDBLN_01578 K02902
1560 | AHCGDBLN_01579
1561 | AHCGDBLN_01580
1562 | AHCGDBLN_01581
1563 | AHCGDBLN_01582 K05807
1564 | AHCGDBLN_01583
1565 | AHCGDBLN_01584 K01778
1566 | AHCGDBLN_01585 K03310
1567 | AHCGDBLN_01586 K03110
1568 | AHCGDBLN_01587 K00615
1569 | AHCGDBLN_01589
1570 | AHCGDBLN_01590
1571 | AHCGDBLN_01591 K06194
1572 | AHCGDBLN_01592
1573 | AHCGDBLN_01593
1574 | AHCGDBLN_01594
1575 | AHCGDBLN_01595
1576 | AHCGDBLN_01596
1577 | AHCGDBLN_01597
1578 | AHCGDBLN_01598 K05589
1579 | AHCGDBLN_01599 K02416
1580 | AHCGDBLN_01600
1581 | AHCGDBLN_01601
1582 | AHCGDBLN_01602
1583 | AHCGDBLN_01603
1584 | AHCGDBLN_01604 K01709
1585 | AHCGDBLN_01605
1586 | AHCGDBLN_01606
1587 | AHCGDBLN_01607
1588 | AHCGDBLN_01608
1589 | AHCGDBLN_01609
1590 | AHCGDBLN_01610
1591 | AHCGDBLN_01611 K07533
1592 | AHCGDBLN_01612
1593 | AHCGDBLN_01613 K13888
1594 | AHCGDBLN_01614 K02404
1595 | AHCGDBLN_01615
1596 | AHCGDBLN_01616 K05299
1597 | AHCGDBLN_01617
1598 | AHCGDBLN_01618
1599 | AHCGDBLN_01619
1600 | AHCGDBLN_01620
1601 | AHCGDBLN_01621
1602 | AHCGDBLN_01622
1603 | AHCGDBLN_01623
1604 | AHCGDBLN_01624
1605 | AHCGDBLN_01625
1606 | AHCGDBLN_01626
1607 | AHCGDBLN_01627
1608 | AHCGDBLN_01628
1609 | AHCGDBLN_01629 K00979
1610 | AHCGDBLN_01630
1611 | AHCGDBLN_01631
1612 | AHCGDBLN_01632 K00101
1613 | AHCGDBLN_01633 K02400
1614 | AHCGDBLN_01634 K18430
1615 | AHCGDBLN_01635
1616 | AHCGDBLN_01636
1617 | AHCGDBLN_01637
1618 | AHCGDBLN_01638
1619 | AHCGDBLN_01639 K02314
1620 | AHCGDBLN_01640
1621 | AHCGDBLN_01642
1622 | AHCGDBLN_01643
1623 | AHCGDBLN_01644
1624 | AHCGDBLN_01645
1625 | AHCGDBLN_01646
1626 | AHCGDBLN_01647
1627 | AHCGDBLN_01648 K02598
1628 | AHCGDBLN_01649
1629 | AHCGDBLN_01650
1630 | AHCGDBLN_01651 K06890
1631 | AHCGDBLN_01652
1632 | AHCGDBLN_01653
1633 | AHCGDBLN_01654 K02954
1634 | AHCGDBLN_01655
1635 | AHCGDBLN_01656
1636 | AHCGDBLN_01657
1637 | AHCGDBLN_01658
1638 | AHCGDBLN_01659
1639 | AHCGDBLN_01660
1640 | AHCGDBLN_01661
1641 | AHCGDBLN_01663
1642 | AHCGDBLN_01664
1643 | AHCGDBLN_01665 K13038
1644 | AHCGDBLN_01666
1645 | AHCGDBLN_01667
1646 | AHCGDBLN_01668 K07668
1647 | AHCGDBLN_01669
1648 | AHCGDBLN_01670
1649 | AHCGDBLN_01671
1650 | AHCGDBLN_01672
1651 | AHCGDBLN_01673
1652 | AHCGDBLN_01674
1653 |
--------------------------------------------------------------------------------
/remove_duplicate_seqs/README.md:
--------------------------------------------------------------------------------
1 | # Remove duplicate sequences
2 | Remove duplicate sequences from one or several multifasta files.
Duplicates can be identified either by the **id** in the header or by the **sequence** itself.
4 |
5 | ## Require
6 | - `Biopython` module required
7 | - Using **Python3**
8 | - Works both on Windows and Unix-like systems
9 |
10 | ## Usage
11 |
12 | Filter according to the sequence id:
13 |
14 | ```bash
15 | $python3 remove_duplicate_seqs.py --id input.fa [input2.fa ...] > output.fa
16 | ```
17 | or filter according to the sequence itself:
18 | ```bash
19 | $python3 remove_duplicate_seqs.py --seq input.fa [input2.fa ...] > output.fa
20 | ```
21 |
22 | ## Note
- Either `--id` or `--seq` is required, and it must be placed immediately after the script name
24 | - The result will be sent to *stdout* by default, so please use `>` if you want to redirect the output.
25 |
26 | # Chinese Usage 中文使用说明
27 |
28 | 本脚本能够在一个或多个fasta格式的文本文件中清除重复的序列
29 | 可以根据序列的id或者根据序列本身来去除这种冗余
30 |
31 | ## 要求
32 |
33 | - 使用**Python3**
34 | - 需要调用`Biopython`
35 | - 在Windows和类Unix系统中均可运行
36 |
37 | ## 使用
38 |
39 | 通过序列的id号来过滤:
40 | ```bash
41 | $python3 remove_duplicate_seqs.py --id input.fa [input2.fa ...] > output.fa
42 | ```
43 |
44 | 通过序列本身来过滤:
45 | ```bash
46 | $python3 remove_duplicate_seqs.py --seq input.fa [input2.fa ...] > output.fa
47 | ```
48 |
49 | ## 注意
50 |
51 | - 参数`--id` 或者 `--seq`必须指定其一,并且它只能被置于第一个参数的位置(脚本名之后)
52 | - 结果默认打印输出到`stdout`,请使用`>`来重定向结果。
--------------------------------------------------------------------------------
/remove_duplicate_seqs/remove_duplicate_seqs.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 |
3 | """
4 | Description:
5 | Remove duplicate sequences from one or several multifasta files.
6 | According to the id in the header or the sequence itself.
7 |
8 | Usage:
9 | Filter according to the sequence id:
10 | $python3 remove_duplicate_seqs.py --id input.fa [input2.fa ...] > output.fa
11 |
12 | or filter according to the sequence itself:
13 | $python3 remove_duplicate_seqs.py --seq input.fa [input2.fa ...] > output.fa
14 | """
15 | import sys
16 | import textwrap
17 | from Bio import SeqIO
18 |
19 | __author__ = "Heyu Lin"
20 | __contact__ = "heyu.lin@student.unimelb.edu.au"
21 |
22 |
def arg_parser(arr):
    """Parse the command-line argument vector.

    Args:
        arr: argument list, normally ``sys.argv``. ``arr[1]`` must be
            ``'--id'`` or ``'--seq'``; ``arr[2:]`` are input fasta paths.

    Returns:
        tuple: ``(ref, inputs)`` — the filter flag and the list of
        input fasta file paths.

    Raises:
        Exception: if the filter flag is missing/invalid, or if no
            input file is given.
    """
    # Check the length before indexing so that a short argv raises the
    # intended, helpful message instead of a bare IndexError (the
    # original `arr[1]` / `if not arr[2]` crashed on missing arguments).
    if len(arr) < 2 or arr[1] not in ('--id', '--seq'):
        raise Exception('Please indicate the filter method by --id or --seq')
    if len(arr) < 3:
        raise Exception('Please indicate the input fasta file(s)')
    ref = arr[1]
    inputs = arr[2:]
    return ref, inputs
31 |
32 |
def seqs_parser(filter, files):
    """Read fasta files and collapse duplicate records into a dict.

    With ``--id`` the record description becomes the key (a later
    record with the same description overwrites an earlier one); with
    ``--seq`` the sequence string becomes the key, so records with
    identical sequences collapse to a single entry.

    Args:
        filter: ``'--id'`` or ``'--seq'``, selecting the dedup key.
        files: iterable of fasta file paths.

    Returns:
        dict: description -> sequence (``--id``) or
        sequence -> description (``--seq``), insertion-ordered.
    """
    unique = {}
    for path in files:
        for record in SeqIO.parse(path, "fasta"):
            desc = str(record.description)
            seq = str(record.seq)
            if filter == '--id':
                unique[desc] = seq
            elif filter == '--seq':
                unique[seq] = desc
    return unique
42 |
43 |
def main():
    """Deduplicate the input fasta files and print the result to stdout."""
    ref, inputs = arg_parser(sys.argv)
    records = seqs_parser(ref, inputs)
    for key, value in records.items():
        # seqs_parser maps description -> sequence under --id and
        # sequence -> description under --seq, so pick header/sequence
        # according to the active filter (arg_parser guarantees it is
        # one of the two).
        header, sequence = (key, value) if ref == '--id' else (value, key)
        print('>' + header)
        print(textwrap.fill(sequence))
55 |
56 |
# Run only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()
59 |
--------------------------------------------------------------------------------