├── .gitignore ├── LICENSE ├── Minimus2-pipeline ├── Minimus2_pipeline.py └── README.md ├── POCP-calculator ├── POCP-matrix.py └── README.md ├── README.md ├── blast-matrix ├── README.md └── blast_identity_matrix.py ├── blast-wrapper ├── README.md ├── blast_wrapper.py └── blastout2fasta.py ├── cdhit-clstr2tbl ├── README.md ├── cdhit_clstr2tbl.py ├── test.clstr └── test.clstr.tab ├── circular_genomes_from_gfa ├── README.md └── circular_genomes_from_gfa.py ├── download_uniprot_proteomes ├── README.md └── download_uniprot_proteomes_UPID.py ├── fasta-splitter └── fasta_splitter.py ├── prodigal-wrapper └── prodigal_run.py ├── prokka2kegg ├── README.md ├── idmapping_KO.tab.gz ├── prokka2kegg.py ├── prokka2kegg_batch.py ├── sample.gbk └── sample.kegg.out.txt └── remove_duplicate_seqs ├── README.md └── remove_duplicate_seqs.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | Preamble 9 | 10 | The GNU General Public License is a free, copyleft license for 11 | software and other kinds of works. 12 | 13 | The licenses for most software and other practical works are designed 14 | to take away your freedom to share and change the works. 
By contrast, 15 | the GNU General Public License is intended to guarantee your freedom to 16 | share and change all versions of a program--to make sure it remains free 17 | software for all its users. We, the Free Software Foundation, use the 18 | GNU General Public License for most of our software; it applies also to 19 | any other work released this way by its authors. You can apply it to 20 | your programs, too. 21 | 22 | When we speak of free software, we are referring to freedom, not 23 | price. Our General Public Licenses are designed to make sure that you 24 | have the freedom to distribute copies of free software (and charge for 25 | them if you wish), that you receive source code or can get it if you 26 | want it, that you can change the software or use pieces of it in new 27 | free programs, and that you know you can do these things. 28 | 29 | To protect your rights, we need to prevent others from denying you 30 | these rights or asking you to surrender the rights. Therefore, you have 31 | certain responsibilities if you distribute copies of the software, or if 32 | you modify it: responsibilities to respect the freedom of others. 33 | 34 | For example, if you distribute copies of such a program, whether 35 | gratis or for a fee, you must pass on to the recipients the same 36 | freedoms that you received. You must make sure that they, too, receive 37 | or can get the source code. And you must show them these terms so they 38 | know their rights. 39 | 40 | Developers that use the GNU GPL protect your rights with two steps: 41 | (1) assert copyright on the software, and (2) offer you this License 42 | giving you legal permission to copy, distribute and/or modify it. 43 | 44 | For the developers' and authors' protection, the GPL clearly explains 45 | that there is no warranty for this free software. 
For both users' and 46 | authors' sake, the GPL requires that modified versions be marked as 47 | changed, so that their problems will not be attributed erroneously to 48 | authors of previous versions. 49 | 50 | Some devices are designed to deny users access to install or run 51 | modified versions of the software inside them, although the manufacturer 52 | can do so. This is fundamentally incompatible with the aim of 53 | protecting users' freedom to change the software. The systematic 54 | pattern of such abuse occurs in the area of products for individuals to 55 | use, which is precisely where it is most unacceptable. Therefore, we 56 | have designed this version of the GPL to prohibit the practice for those 57 | products. If such problems arise substantially in other domains, we 58 | stand ready to extend this provision to those domains in future versions 59 | of the GPL, as needed to protect the freedom of users. 60 | 61 | Finally, every program is threatened constantly by software patents. 62 | States should not allow patents to restrict development and use of 63 | software on general-purpose computers, but in those that do, we wish to 64 | avoid the special danger that patents applied to a free program could 65 | make it effectively proprietary. To prevent this, the GPL assures that 66 | patents cannot be used to render the program non-free. 67 | 68 | The precise terms and conditions for copying, distribution and 69 | modification follow. 70 | 71 | TERMS AND CONDITIONS 72 | 73 | 0. Definitions. 74 | 75 | "This License" refers to version 3 of the GNU General Public License. 76 | 77 | "Copyright" also means copyright-like laws that apply to other kinds of 78 | works, such as semiconductor masks. 79 | 80 | "The Program" refers to any copyrightable work licensed under this 81 | License. Each licensee is addressed as "you". "Licensees" and 82 | "recipients" may be individuals or organizations. 
83 | 84 | To "modify" a work means to copy from or adapt all or part of the work 85 | in a fashion requiring copyright permission, other than the making of an 86 | exact copy. The resulting work is called a "modified version" of the 87 | earlier work or a work "based on" the earlier work. 88 | 89 | A "covered work" means either the unmodified Program or a work based 90 | on the Program. 91 | 92 | To "propagate" a work means to do anything with it that, without 93 | permission, would make you directly or secondarily liable for 94 | infringement under applicable copyright law, except executing it on a 95 | computer or modifying a private copy. Propagation includes copying, 96 | distribution (with or without modification), making available to the 97 | public, and in some countries other activities as well. 98 | 99 | To "convey" a work means any kind of propagation that enables other 100 | parties to make or receive copies. Mere interaction with a user through 101 | a computer network, with no transfer of a copy, is not conveying. 102 | 103 | An interactive user interface displays "Appropriate Legal Notices" 104 | to the extent that it includes a convenient and prominently visible 105 | feature that (1) displays an appropriate copyright notice, and (2) 106 | tells the user that there is no warranty for the work (except to the 107 | extent that warranties are provided), that licensees may convey the 108 | work under this License, and how to view a copy of this License. If 109 | the interface presents a list of user commands or options, such as a 110 | menu, a prominent item in the list meets this criterion. 111 | 112 | 1. Source Code. 113 | 114 | The "source code" for a work means the preferred form of the work 115 | for making modifications to it. "Object code" means any non-source 116 | form of a work. 
117 | 118 | A "Standard Interface" means an interface that either is an official 119 | standard defined by a recognized standards body, or, in the case of 120 | interfaces specified for a particular programming language, one that 121 | is widely used among developers working in that language. 122 | 123 | The "System Libraries" of an executable work include anything, other 124 | than the work as a whole, that (a) is included in the normal form of 125 | packaging a Major Component, but which is not part of that Major 126 | Component, and (b) serves only to enable use of the work with that 127 | Major Component, or to implement a Standard Interface for which an 128 | implementation is available to the public in source code form. A 129 | "Major Component", in this context, means a major essential component 130 | (kernel, window system, and so on) of the specific operating system 131 | (if any) on which the executable work runs, or a compiler used to 132 | produce the work, or an object code interpreter used to run it. 133 | 134 | The "Corresponding Source" for a work in object code form means all 135 | the source code needed to generate, install, and (for an executable 136 | work) run the object code and to modify the work, including scripts to 137 | control those activities. However, it does not include the work's 138 | System Libraries, or general-purpose tools or generally available free 139 | programs which are used unmodified in performing those activities but 140 | which are not part of the work. For example, Corresponding Source 141 | includes interface definition files associated with source files for 142 | the work, and the source code for shared libraries and dynamically 143 | linked subprograms that the work is specifically designed to require, 144 | such as by intimate data communication or control flow between those 145 | subprograms and other parts of the work. 
146 | 147 | The Corresponding Source need not include anything that users 148 | can regenerate automatically from other parts of the Corresponding 149 | Source. 150 | 151 | The Corresponding Source for a work in source code form is that 152 | same work. 153 | 154 | 2. Basic Permissions. 155 | 156 | All rights granted under this License are granted for the term of 157 | copyright on the Program, and are irrevocable provided the stated 158 | conditions are met. This License explicitly affirms your unlimited 159 | permission to run the unmodified Program. The output from running a 160 | covered work is covered by this License only if the output, given its 161 | content, constitutes a covered work. This License acknowledges your 162 | rights of fair use or other equivalent, as provided by copyright law. 163 | 164 | You may make, run and propagate covered works that you do not 165 | convey, without conditions so long as your license otherwise remains 166 | in force. You may convey covered works to others for the sole purpose 167 | of having them make modifications exclusively for you, or provide you 168 | with facilities for running those works, provided that you comply with 169 | the terms of this License in conveying all material for which you do 170 | not control copyright. Those thus making or running the covered works 171 | for you must do so exclusively on your behalf, under your direction 172 | and control, on terms that prohibit them from making any copies of 173 | your copyrighted material outside their relationship with you. 174 | 175 | Conveying under any other circumstances is permitted solely under 176 | the conditions stated below. Sublicensing is not allowed; section 10 177 | makes it unnecessary. 178 | 179 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 
180 | 181 | No covered work shall be deemed part of an effective technological 182 | measure under any applicable law fulfilling obligations under article 183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 184 | similar laws prohibiting or restricting circumvention of such 185 | measures. 186 | 187 | When you convey a covered work, you waive any legal power to forbid 188 | circumvention of technological measures to the extent such circumvention 189 | is effected by exercising rights under this License with respect to 190 | the covered work, and you disclaim any intention to limit operation or 191 | modification of the work as a means of enforcing, against the work's 192 | users, your or third parties' legal rights to forbid circumvention of 193 | technological measures. 194 | 195 | 4. Conveying Verbatim Copies. 196 | 197 | You may convey verbatim copies of the Program's source code as you 198 | receive it, in any medium, provided that you conspicuously and 199 | appropriately publish on each copy an appropriate copyright notice; 200 | keep intact all notices stating that this License and any 201 | non-permissive terms added in accord with section 7 apply to the code; 202 | keep intact all notices of the absence of any warranty; and give all 203 | recipients a copy of this License along with the Program. 204 | 205 | You may charge any price or no price for each copy that you convey, 206 | and you may offer support or warranty protection for a fee. 207 | 208 | 5. Conveying Modified Source Versions. 209 | 210 | You may convey a work based on the Program, or the modifications to 211 | produce it from the Program, in the form of source code under the 212 | terms of section 4, provided that you also meet all of these conditions: 213 | 214 | a) The work must carry prominent notices stating that you modified 215 | it, and giving a relevant date. 
216 | 217 | b) The work must carry prominent notices stating that it is 218 | released under this License and any conditions added under section 219 | 7. This requirement modifies the requirement in section 4 to 220 | "keep intact all notices". 221 | 222 | c) You must license the entire work, as a whole, under this 223 | License to anyone who comes into possession of a copy. This 224 | License will therefore apply, along with any applicable section 7 225 | additional terms, to the whole of the work, and all its parts, 226 | regardless of how they are packaged. This License gives no 227 | permission to license the work in any other way, but it does not 228 | invalidate such permission if you have separately received it. 229 | 230 | d) If the work has interactive user interfaces, each must display 231 | Appropriate Legal Notices; however, if the Program has interactive 232 | interfaces that do not display Appropriate Legal Notices, your 233 | work need not make them do so. 234 | 235 | A compilation of a covered work with other separate and independent 236 | works, which are not by their nature extensions of the covered work, 237 | and which are not combined with it such as to form a larger program, 238 | in or on a volume of a storage or distribution medium, is called an 239 | "aggregate" if the compilation and its resulting copyright are not 240 | used to limit the access or legal rights of the compilation's users 241 | beyond what the individual works permit. Inclusion of a covered work 242 | in an aggregate does not cause this License to apply to the other 243 | parts of the aggregate. 244 | 245 | 6. Conveying Non-Source Forms. 
246 | 247 | You may convey a covered work in object code form under the terms 248 | of sections 4 and 5, provided that you also convey the 249 | machine-readable Corresponding Source under the terms of this License, 250 | in one of these ways: 251 | 252 | a) Convey the object code in, or embodied in, a physical product 253 | (including a physical distribution medium), accompanied by the 254 | Corresponding Source fixed on a durable physical medium 255 | customarily used for software interchange. 256 | 257 | b) Convey the object code in, or embodied in, a physical product 258 | (including a physical distribution medium), accompanied by a 259 | written offer, valid for at least three years and valid for as 260 | long as you offer spare parts or customer support for that product 261 | model, to give anyone who possesses the object code either (1) a 262 | copy of the Corresponding Source for all the software in the 263 | product that is covered by this License, on a durable physical 264 | medium customarily used for software interchange, for a price no 265 | more than your reasonable cost of physically performing this 266 | conveying of source, or (2) access to copy the 267 | Corresponding Source from a network server at no charge. 268 | 269 | c) Convey individual copies of the object code with a copy of the 270 | written offer to provide the Corresponding Source. This 271 | alternative is allowed only occasionally and noncommercially, and 272 | only if you received the object code with such an offer, in accord 273 | with subsection 6b. 274 | 275 | d) Convey the object code by offering access from a designated 276 | place (gratis or for a charge), and offer equivalent access to the 277 | Corresponding Source in the same way through the same place at no 278 | further charge. You need not require recipients to copy the 279 | Corresponding Source along with the object code. 
If the place to 280 | copy the object code is a network server, the Corresponding Source 281 | may be on a different server (operated by you or a third party) 282 | that supports equivalent copying facilities, provided you maintain 283 | clear directions next to the object code saying where to find the 284 | Corresponding Source. Regardless of what server hosts the 285 | Corresponding Source, you remain obligated to ensure that it is 286 | available for as long as needed to satisfy these requirements. 287 | 288 | e) Convey the object code using peer-to-peer transmission, provided 289 | you inform other peers where the object code and Corresponding 290 | Source of the work are being offered to the general public at no 291 | charge under subsection 6d. 292 | 293 | A separable portion of the object code, whose source code is excluded 294 | from the Corresponding Source as a System Library, need not be 295 | included in conveying the object code work. 296 | 297 | A "User Product" is either (1) a "consumer product", which means any 298 | tangible personal property which is normally used for personal, family, 299 | or household purposes, or (2) anything designed or sold for incorporation 300 | into a dwelling. In determining whether a product is a consumer product, 301 | doubtful cases shall be resolved in favor of coverage. For a particular 302 | product received by a particular user, "normally used" refers to a 303 | typical or common use of that class of product, regardless of the status 304 | of the particular user or of the way in which the particular user 305 | actually uses, or expects or is expected to use, the product. A product 306 | is a consumer product regardless of whether the product has substantial 307 | commercial, industrial or non-consumer uses, unless such uses represent 308 | the only significant mode of use of the product. 
309 | 310 | "Installation Information" for a User Product means any methods, 311 | procedures, authorization keys, or other information required to install 312 | and execute modified versions of a covered work in that User Product from 313 | a modified version of its Corresponding Source. The information must 314 | suffice to ensure that the continued functioning of the modified object 315 | code is in no case prevented or interfered with solely because 316 | modification has been made. 317 | 318 | If you convey an object code work under this section in, or with, or 319 | specifically for use in, a User Product, and the conveying occurs as 320 | part of a transaction in which the right of possession and use of the 321 | User Product is transferred to the recipient in perpetuity or for a 322 | fixed term (regardless of how the transaction is characterized), the 323 | Corresponding Source conveyed under this section must be accompanied 324 | by the Installation Information. But this requirement does not apply 325 | if neither you nor any third party retains the ability to install 326 | modified object code on the User Product (for example, the work has 327 | been installed in ROM). 328 | 329 | The requirement to provide Installation Information does not include a 330 | requirement to continue to provide support service, warranty, or updates 331 | for a work that has been modified or installed by the recipient, or for 332 | the User Product in which it has been modified or installed. Access to a 333 | network may be denied when the modification itself materially and 334 | adversely affects the operation of the network or violates the rules and 335 | protocols for communication across the network. 
336 | 337 | Corresponding Source conveyed, and Installation Information provided, 338 | in accord with this section must be in a format that is publicly 339 | documented (and with an implementation available to the public in 340 | source code form), and must require no special password or key for 341 | unpacking, reading or copying. 342 | 343 | 7. Additional Terms. 344 | 345 | "Additional permissions" are terms that supplement the terms of this 346 | License by making exceptions from one or more of its conditions. 347 | Additional permissions that are applicable to the entire Program shall 348 | be treated as though they were included in this License, to the extent 349 | that they are valid under applicable law. If additional permissions 350 | apply only to part of the Program, that part may be used separately 351 | under those permissions, but the entire Program remains governed by 352 | this License without regard to the additional permissions. 353 | 354 | When you convey a copy of a covered work, you may at your option 355 | remove any additional permissions from that copy, or from any part of 356 | it. (Additional permissions may be written to require their own 357 | removal in certain cases when you modify the work.) You may place 358 | additional permissions on material, added by you to a covered work, 359 | for which you have or can give appropriate copyright permission. 
360 | 361 | Notwithstanding any other provision of this License, for material you 362 | add to a covered work, you may (if authorized by the copyright holders of 363 | that material) supplement the terms of this License with terms: 364 | 365 | a) Disclaiming warranty or limiting liability differently from the 366 | terms of sections 15 and 16 of this License; or 367 | 368 | b) Requiring preservation of specified reasonable legal notices or 369 | author attributions in that material or in the Appropriate Legal 370 | Notices displayed by works containing it; or 371 | 372 | c) Prohibiting misrepresentation of the origin of that material, or 373 | requiring that modified versions of such material be marked in 374 | reasonable ways as different from the original version; or 375 | 376 | d) Limiting the use for publicity purposes of names of licensors or 377 | authors of the material; or 378 | 379 | e) Declining to grant rights under trademark law for use of some 380 | trade names, trademarks, or service marks; or 381 | 382 | f) Requiring indemnification of licensors and authors of that 383 | material by anyone who conveys the material (or modified versions of 384 | it) with contractual assumptions of liability to the recipient, for 385 | any liability that these contractual assumptions directly impose on 386 | those licensors and authors. 387 | 388 | All other non-permissive additional terms are considered "further 389 | restrictions" within the meaning of section 10. If the Program as you 390 | received it, or any part of it, contains a notice stating that it is 391 | governed by this License along with a term that is a further 392 | restriction, you may remove that term. 
If a license document contains 393 | a further restriction but permits relicensing or conveying under this 394 | License, you may add to a covered work material governed by the terms 395 | of that license document, provided that the further restriction does 396 | not survive such relicensing or conveying. 397 | 398 | If you add terms to a covered work in accord with this section, you 399 | must place, in the relevant source files, a statement of the 400 | additional terms that apply to those files, or a notice indicating 401 | where to find the applicable terms. 402 | 403 | Additional terms, permissive or non-permissive, may be stated in the 404 | form of a separately written license, or stated as exceptions; 405 | the above requirements apply either way. 406 | 407 | 8. Termination. 408 | 409 | You may not propagate or modify a covered work except as expressly 410 | provided under this License. Any attempt otherwise to propagate or 411 | modify it is void, and will automatically terminate your rights under 412 | this License (including any patent licenses granted under the third 413 | paragraph of section 11). 414 | 415 | However, if you cease all violation of this License, then your 416 | license from a particular copyright holder is reinstated (a) 417 | provisionally, unless and until the copyright holder explicitly and 418 | finally terminates your license, and (b) permanently, if the copyright 419 | holder fails to notify you of the violation by some reasonable means 420 | prior to 60 days after the cessation. 421 | 422 | Moreover, your license from a particular copyright holder is 423 | reinstated permanently if the copyright holder notifies you of the 424 | violation by some reasonable means, this is the first time you have 425 | received notice of violation of this License (for any work) from that 426 | copyright holder, and you cure the violation prior to 30 days after 427 | your receipt of the notice. 
428 | 429 | Termination of your rights under this section does not terminate the 430 | licenses of parties who have received copies or rights from you under 431 | this License. If your rights have been terminated and not permanently 432 | reinstated, you do not qualify to receive new licenses for the same 433 | material under section 10. 434 | 435 | 9. Acceptance Not Required for Having Copies. 436 | 437 | You are not required to accept this License in order to receive or 438 | run a copy of the Program. Ancillary propagation of a covered work 439 | occurring solely as a consequence of using peer-to-peer transmission 440 | to receive a copy likewise does not require acceptance. However, 441 | nothing other than this License grants you permission to propagate or 442 | modify any covered work. These actions infringe copyright if you do 443 | not accept this License. Therefore, by modifying or propagating a 444 | covered work, you indicate your acceptance of this License to do so. 445 | 446 | 10. Automatic Licensing of Downstream Recipients. 447 | 448 | Each time you convey a covered work, the recipient automatically 449 | receives a license from the original licensors, to run, modify and 450 | propagate that work, subject to this License. You are not responsible 451 | for enforcing compliance by third parties with this License. 452 | 453 | An "entity transaction" is a transaction transferring control of an 454 | organization, or substantially all assets of one, or subdividing an 455 | organization, or merging organizations. 
If propagation of a covered 456 | work results from an entity transaction, each party to that 457 | transaction who receives a copy of the work also receives whatever 458 | licenses to the work the party's predecessor in interest had or could 459 | give under the previous paragraph, plus a right to possession of the 460 | Corresponding Source of the work from the predecessor in interest, if 461 | the predecessor has it or can get it with reasonable efforts. 462 | 463 | You may not impose any further restrictions on the exercise of the 464 | rights granted or affirmed under this License. For example, you may 465 | not impose a license fee, royalty, or other charge for exercise of 466 | rights granted under this License, and you may not initiate litigation 467 | (including a cross-claim or counterclaim in a lawsuit) alleging that 468 | any patent claim is infringed by making, using, selling, offering for 469 | sale, or importing the Program or any portion of it. 470 | 471 | 11. Patents. 472 | 473 | A "contributor" is a copyright holder who authorizes use under this 474 | License of the Program or a work on which the Program is based. The 475 | work thus licensed is called the contributor's "contributor version". 476 | 477 | A contributor's "essential patent claims" are all patent claims 478 | owned or controlled by the contributor, whether already acquired or 479 | hereafter acquired, that would be infringed by some manner, permitted 480 | by this License, of making, using, or selling its contributor version, 481 | but do not include claims that would be infringed only as a 482 | consequence of further modification of the contributor version. For 483 | purposes of this definition, "control" includes the right to grant 484 | patent sublicenses in a manner consistent with the requirements of 485 | this License. 
486 | 487 | Each contributor grants you a non-exclusive, worldwide, royalty-free 488 | patent license under the contributor's essential patent claims, to 489 | make, use, sell, offer for sale, import and otherwise run, modify and 490 | propagate the contents of its contributor version. 491 | 492 | In the following three paragraphs, a "patent license" is any express 493 | agreement or commitment, however denominated, not to enforce a patent 494 | (such as an express permission to practice a patent or covenant not to 495 | sue for patent infringement). To "grant" such a patent license to a 496 | party means to make such an agreement or commitment not to enforce a 497 | patent against the party. 498 | 499 | If you convey a covered work, knowingly relying on a patent license, 500 | and the Corresponding Source of the work is not available for anyone 501 | to copy, free of charge and under the terms of this License, through a 502 | publicly available network server or other readily accessible means, 503 | then you must either (1) cause the Corresponding Source to be so 504 | available, or (2) arrange to deprive yourself of the benefit of the 505 | patent license for this particular work, or (3) arrange, in a manner 506 | consistent with the requirements of this License, to extend the patent 507 | license to downstream recipients. "Knowingly relying" means you have 508 | actual knowledge that, but for the patent license, your conveying the 509 | covered work in a country, or your recipient's use of the covered work 510 | in a country, would infringe one or more identifiable patents in that 511 | country that you have reason to believe are valid. 
512 | 513 | If, pursuant to or in connection with a single transaction or 514 | arrangement, you convey, or propagate by procuring conveyance of, a 515 | covered work, and grant a patent license to some of the parties 516 | receiving the covered work authorizing them to use, propagate, modify 517 | or convey a specific copy of the covered work, then the patent license 518 | you grant is automatically extended to all recipients of the covered 519 | work and works based on it. 520 | 521 | A patent license is "discriminatory" if it does not include within 522 | the scope of its coverage, prohibits the exercise of, or is 523 | conditioned on the non-exercise of one or more of the rights that are 524 | specifically granted under this License. You may not convey a covered 525 | work if you are a party to an arrangement with a third party that is 526 | in the business of distributing software, under which you make payment 527 | to the third party based on the extent of your activity of conveying 528 | the work, and under which the third party grants, to any of the 529 | parties who would receive the covered work from you, a discriminatory 530 | patent license (a) in connection with copies of the covered work 531 | conveyed by you (or copies made from those copies), or (b) primarily 532 | for and in connection with specific products or compilations that 533 | contain the covered work, unless you entered into that arrangement, 534 | or that patent license was granted, prior to 28 March 2007. 535 | 536 | Nothing in this License shall be construed as excluding or limiting 537 | any implied license or other defenses to infringement that may 538 | otherwise be available to you under applicable patent law. 539 | 540 | 12. No Surrender of Others' Freedom. 541 | 542 | If conditions are imposed on you (whether by court order, agreement or 543 | otherwise) that contradict the conditions of this License, they do not 544 | excuse you from the conditions of this License. 
If you cannot convey a 545 | covered work so as to satisfy simultaneously your obligations under this 546 | License and any other pertinent obligations, then as a consequence you may 547 | not convey it at all. For example, if you agree to terms that obligate you 548 | to collect a royalty for further conveying from those to whom you convey 549 | the Program, the only way you could satisfy both those terms and this 550 | License would be to refrain entirely from conveying the Program. 551 | 552 | 13. Use with the GNU Affero General Public License. 553 | 554 | Notwithstanding any other provision of this License, you have 555 | permission to link or combine any covered work with a work licensed 556 | under version 3 of the GNU Affero General Public License into a single 557 | combined work, and to convey the resulting work. The terms of this 558 | License will continue to apply to the part which is the covered work, 559 | but the special requirements of the GNU Affero General Public License, 560 | section 13, concerning interaction through a network will apply to the 561 | combination as such. 562 | 563 | 14. Revised Versions of this License. 564 | 565 | The Free Software Foundation may publish revised and/or new versions of 566 | the GNU General Public License from time to time. Such new versions will 567 | be similar in spirit to the present version, but may differ in detail to 568 | address new problems or concerns. 569 | 570 | Each version is given a distinguishing version number. If the 571 | Program specifies that a certain numbered version of the GNU General 572 | Public License "or any later version" applies to it, you have the 573 | option of following the terms and conditions either of that numbered 574 | version or of any later version published by the Free Software 575 | Foundation. If the Program does not specify a version number of the 576 | GNU General Public License, you may choose any version ever published 577 | by the Free Software Foundation. 
578 | 579 | If the Program specifies that a proxy can decide which future 580 | versions of the GNU General Public License can be used, that proxy's 581 | public statement of acceptance of a version permanently authorizes you 582 | to choose that version for the Program. 583 | 584 | Later license versions may give you additional or different 585 | permissions. However, no additional obligations are imposed on any 586 | author or copyright holder as a result of your choosing to follow a 587 | later version. 588 | 589 | 15. Disclaimer of Warranty. 590 | 591 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY 592 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT 593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY 594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, 595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 596 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM 597 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF 598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 599 | 600 | 16. Limitation of Liability. 601 | 602 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS 604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY 605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF 607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD 608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), 609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF 610 | SUCH DAMAGES. 611 | 612 | 17. Interpretation of Sections 15 and 16. 
613 | 614 | If the disclaimer of warranty and limitation of liability provided 615 | above cannot be given local legal effect according to their terms, 616 | reviewing courts shall apply local law that most closely approximates 617 | an absolute waiver of all civil liability in connection with the 618 | Program, unless a warranty or assumption of liability accompanies a 619 | copy of the Program in return for a fee. 620 | 621 | END OF TERMS AND CONDITIONS 622 | 623 | How to Apply These Terms to Your New Programs 624 | 625 | If you develop a new program, and you want it to be of the greatest 626 | possible use to the public, the best way to achieve this is to make it 627 | free software which everyone can redistribute and change under these terms. 628 | 629 | To do so, attach the following notices to the program. It is safest 630 | to attach them to the start of each source file to most effectively 631 | state the exclusion of warranty; and each file should have at least 632 | the "copyright" line and a pointer to where the full notice is found. 633 | 634 | 635 | Copyright (C) 636 | 637 | This program is free software: you can redistribute it and/or modify 638 | it under the terms of the GNU General Public License as published by 639 | the Free Software Foundation, either version 3 of the License, or 640 | (at your option) any later version. 641 | 642 | This program is distributed in the hope that it will be useful, 643 | but WITHOUT ANY WARRANTY; without even the implied warranty of 644 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 645 | GNU General Public License for more details. 646 | 647 | You should have received a copy of the GNU General Public License 648 | along with this program. If not, see . 649 | 650 | Also add information on how to contact you by electronic and paper mail. 
651 | 652 | If the program does terminal interaction, make it output a short 653 | notice like this when it starts in an interactive mode: 654 | 655 | Copyright (C) 656 | This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 657 | This is free software, and you are welcome to redistribute it 658 | under certain conditions; type `show c' for details. 659 | 660 | The hypothetical commands `show w' and `show c' should show the appropriate 661 | parts of the General Public License. Of course, your program's commands 662 | might be different; for a GUI interface, you would use an "about box". 663 | 664 | You should also get your employer (if you work as a programmer) or school, 665 | if any, to sign a "copyright disclaimer" for the program, if necessary. 666 | For more information on this, and how to apply and follow the GNU GPL, see 667 | . 668 | 669 | The GNU General Public License does not permit incorporating your program 670 | into proprietary programs. If your program is a subroutine library, you 671 | may consider it more useful to permit linking proprietary applications with 672 | the library. If this is what you want to do, use the GNU Lesser General 673 | Public License instead of this License. But first, please read 674 | . 
#!/usr/bin/python3
# /Minimus2-pipeline/Minimus2_pipeline.py

"""
Merge two sets of genome contigs with Minimus2 (a component of AMOS).

This pipeline is described in the Amos official website:
http://amos.sourceforge.net/wiki/index.php/Minimus2
All the parameters are as default.

Usage:
    $ python3 Minimus2_pipeline.py -s1 S1.fas -s2 S2.fas -o output_prefix

Sample:
    $ python Minimus2_pipeline.py -s1 seq1.fas -s2 seq2.fas -o Minimus2_out/seq1-2
"""
import os
import argparse

__author__ = "Heyu Lin"
__contact__ = "heyu.lin(AT)student.unimelb.edu.au"

parser = argparse.ArgumentParser()
parser.add_argument('-s1', metavar='seq_1', dest='s1',
                    type=str, required=True)
parser.add_argument('-s2', metavar='seq_2', dest='s2',
                    type=str, required=True)
parser.add_argument('-o', metavar='output', dest='o',
                    type=str, required=True)
# Parse CLI options only when run as a script: the original parsed at import
# time, which made the module impossible to import (argparse exits when the
# required options are absent from sys.argv).
if __name__ == '__main__':
    args = parser.parse_args()


def create_dir(directory):
    """Create the parent directory of *directory* (an output prefix) if needed."""
    dirnm = os.path.dirname(directory)
    if dirnm != '':
        # exist_ok avoids a race between the existence check and creation.
        os.makedirs(dirnm, exist_ok=True)


def seq_num(fasta_file):
    """Return the number of sequences (i.e. '>' header lines) in a FASTA file."""
    # A context manager closes the handle deterministically; the original
    # left the file object for the garbage collector to clean up.
    with open(fasta_file) as fh:
        return sum(1 for line in fh if line.startswith(">"))


def cat_files(file_list, outfile):
    """Concatenate every file in *file_list* into *outfile*, in order."""
    with open(outfile, 'w') as fo:
        for fname in file_list:
            with open(fname) as infile:
                for line in infile:
                    fo.write(line)


def _run_cmd(title, cmd):
    """Echo and execute a shell command, warning if it exits non-zero."""
    print("\n", title.center(50, '*'))
    print(cmd, "\n")
    status = os.system(cmd)
    if status != 0:
        # The original silently ignored failures (its try/except merely
        # re-raised nothing); at least surface the exit status to the user.
        print('Warning: command exited with status', status)


def run_toAmos(in_fas, out_afg):
    """Convert FASTA *in_fas* to AMOS .afg format with `toAmos`."""
    _run_cmd('RUN toAmos', ' '.join(['toAmos', '-s', in_fas, '-o', out_afg]))


def run_minimus2(in_afg, refcount):
    """Run `minimus2` on *in_afg*; the first *refcount* sequences form the reference set."""
    _run_cmd('RUN Minimus2',
             ' '.join(['minimus2', in_afg, '-D', 'REFCOUNT=' + str(refcount)]))
] 72 | cmd = ' '.join(cmd_para) 73 | try: 74 | print("\n", 'RUN Minimus2'.center(50, '*')) 75 | print(cmd, "\n") 76 | os.system(cmd) 77 | except Exception as e: 78 | raise e 79 | 80 | 81 | def main(): 82 | create_dir(args.o) 83 | seq_1_num = seq_num(args.s1) 84 | cat_fas = args.o + '.cat.seq' 85 | cat_files([args.s1, args.s2], cat_fas) 86 | run_toAmos(cat_fas, args.o + '.cat.afg') 87 | run_minimus2(args.o + '.cat', seq_1_num) 88 | 89 | 90 | if __name__ == '__main__': 91 | main() 92 | -------------------------------------------------------------------------------- /Minimus2-pipeline/README.md: -------------------------------------------------------------------------------- 1 | # Minimus2 Pipeline 2 | Using `Minimus2` (a component of `Amos`) to merge two sets of genome contigs. 3 | 4 | This pipeline is described in the Amos official website: http://amos.sourceforge.net/wiki/index.php/Minimus2 5 | 6 | All the parameters are as default. 7 | 8 | ## Usage 9 | ```bash 10 | $ python Minimus2_pipleline.py -s1 S1.fas -s2 S2.fas -o output_prefix 11 | ``` 12 | ## Sample 13 | 14 | ```bash 15 | $ python Minimus2_pipeline.py -s1 seq1.fas -s2 seq2.fas -o Minimus2_out/seq1-2 16 | ``` 17 | 18 | ## Options 19 | 20 | - `-s1`: genome set 1 (fasta format; used as reference) 21 | - `-s2`: genome set 2 (fasta format) 22 | - `-o`: prefix of output (directory is allowed to involve and will be create if not exists) 23 | 24 | ## Require 25 | - Using **Python3** 26 | - Amos was installed, including which `toAmos` and `minimus2` was already in the $PATH 27 | - No 3rd party python modules required 28 | 29 | ## Output 30 | 31 | The following two files are the most important output: 32 | 33 | - prefix.fasta : merged contig sequences 34 | - prefix.singletons.seq : singleton sequences 35 | 36 | Consider to use `cat` command to combine these two files, in order to do downstream analysis. 
37 | 38 | # Chinese Usage 中文使用说明 39 | Minimus2是Amos套件中的一个程序,主要用于进行两个基因组文件的合并与再拼接。 40 | 41 | ## 使用 42 | ```bash 43 | $ python Minimus2_pipleline.py -s1 S1.fas -s2 S2.fas -o output_prefix 44 | ``` 45 | ## 示例 46 | 47 | ```bash 48 | $ python Minimus2_pipeline.py -s1 seq1.fas -s2 seq2.fas -o Minimus2_out/seq1-2 49 | ``` 50 | 51 | ## 选项 52 | 53 | - `-s1`: 基因组1(fasta格式,将会用做参考序列) 54 | - `-s2`: 基因组2(fasta格式) 55 | - `-o`: 输出文件的前缀(可以包含前置路径名,路径若不存在则会被新建) 56 | ## 要求 57 | - 使用**Python3** 58 | - 无需第三方python模块 59 | - Amos已安装,并至少将`toAmos` 和 `minimus2` 两个组件放进$PATH中以便调用 60 | 61 | ## 输出 62 | 63 | 下面两个文件是所有输出文件中最重要的: 64 | 65 | - prefix.fasta : 合并的contigs文件 66 | - prefix.singletons.seq : 未合并的contigs 67 | 68 | 可以考虑使用 `cat` 命令将这两个文件合并进行下游分析。 -------------------------------------------------------------------------------- /POCP-calculator/POCP-matrix.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | """ 4 | Calculate the percentage of conserved proteins (POCP) between two or 5 | more genomes to estimate their evolutionary and phenotypic distance. 6 | An elegant matrix table will be created after the calculation. 7 | 8 | The program was written based on (Qin et al. 
# /POCP-calculator/POCP-matrix.py (continued)
# Calculate the percentage of conserved proteins (POCP) between two or more
# genomes to estimate their evolutionary and phenotypic distance, following
# Qin et al. 2014 (doi: 10.1128/JB.01688-14).  A matrix table is written out.
#
# Required: BLAST+ installed in $PATH
#
# Usage:
#   $ python POCP-matrix.py -i input_dir -o output_matrix.tab [-n 8] [--clean]
#
# Options:
#   -i: input directory containing more than 2 translated genome files (suffix: .faa)
#   -o: output POCP matrix file
#   -n: number of threads (optional, default: 3)
#   --clean: blast output and databases created by this program will be removed (optional)

import sys
import os
import re  # retained: part of the module's historical import surface
import glob
import itertools
from math import factorial  # used to compute the progress denominator
import subprocess
import argparse

__author__ = "Heyu Lin"
__contact__ = "heyu.lin@student.unimelb.edu.au"


def parse_args():
    """Parse the command-line options.

    Parsing happens at call time rather than import time, so the helper
    functions in this module can be imported without argparse exiting.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input', metavar='input_directory', dest='i',
                        type=str, required=True)
    parser.add_argument('-o', '--output', metavar='output_filename', dest='o',
                        type=str, required=True)
    parser.add_argument('-n', '--num_threads', metavar='num_cpu',
                        dest='n', type=int, default=3,
                        help='specify the number of threads used by blast (default=3)')
    parser.add_argument('--clean', metavar='clean_blast_db_output',
                        dest='c', nargs="?", const=True, default=False,
                        help='redundant files created by this program will be removed if this argument is added')
    return parser.parse_args()


def run_mkblastdb(fi, fo):
    """Create a protein BLAST database named *fo* from FASTA file *fi*."""
    cmd_para = [
        'makeblastdb',
        '-in', fi,
        '-dbtype', 'prot',
        '-parse_seqids',
        '-out', fo
    ]
    # DEVNULL instead of PIPE: subprocess.call() never drains a pipe, so a
    # chatty makeblastdb could block once the pipe buffer filled up.
    subprocess.call(cmd_para, stdout=subprocess.DEVNULL)


def run_blastp(q, db, o, n):
    """blastp query *q* against database *db* with *n* threads.

    Output *o* is tabular format 6 with the standard columns plus qlen
    (needed downstream to compute query coverage).
    """
    cmd_para = [
        'blastp',
        '-query', q,
        '-out', o,
        '-db', db,
        '-evalue', '1e-5',
        '-outfmt', "6 std qlen",
        '-max_target_seqs', '1',
        '-num_threads', str(n),
    ]
    process = subprocess.Popen(cmd_para, stderr=subprocess.PIPE)
    _, stderr = process.communicate()
    if stderr:
        # The '-max_target_seqs 1' advisory is expected and harmless;
        # report every other warning blastp emits.
        for warning in stderr.decode("utf-8").split('\n'):
            if ("Warning: [blastp] Examining 5 or more matches is recommended"
                    not in warning and warning != ''):
                print("Warning:", warning)


def num_sequnces(fasta):
    """Return the number of sequences ('>' header lines) in a FASTA file."""
    # Streams the file line by line instead of regex-scanning the whole
    # file in memory, which matters for large proteomes.
    with open(fasta, "r") as fh:
        return sum(1 for line in fh if line.startswith(">"))


def comb(n, r):
    """Binomial coefficient C(n, r)."""
    return factorial(n) // factorial(r) // factorial(n - r)


def POCP_calculator(pair, num_cpu):
    """Return the POCP value (a percentage) for a *pair* of .faa files.

    A protein is 'conserved' when it has at least one blastp hit with
    identity >= 40% covering >= 50% of the query length (Qin et al. 2014).
    Each query is counted at most once per direction even when blastp
    reports several eligible HSPs.  (The original flag-based scan never
    reset its 'already counted' flag between queries, so an eligible query
    immediately following a counted one could be missed; collecting query
    IDs in a set fixes that.)
    """
    T1 = num_sequnces(pair[0])
    T2 = num_sequnces(pair[1])
    hit_sum = 0  # conserved proteins, summed over both blast directions
    for query, target in ((pair[0], pair[1]), (pair[1], pair[0])):
        blastout = query + '--' + os.path.basename(target) + '.POCPout'
        if not os.path.exists(blastout):  # reuse results from earlier runs
            run_blastp(query, target + '_POCP', blastout, num_cpu)
        eligible = set()
        with open(blastout, 'r') as f:
            for line in f:
                items = line.split()
                iden = float(items[2])
                qcov = float(items[3]) / float(items[12])  # aln_len / qlen
                if iden >= 40 and qcov >= 0.5:
                    eligible.add(items[0])
        hit_sum += len(eligible)
    return hit_sum / (T1 + T2) * 100


def output_table(pocp, items, out):
    """Write the POCP values as a lower-triangular matrix table.

    pocp:  {(genome_a, genome_b): POCP value} keyed in combination order
    items: genome basenames, defining row/column order
    out:   output file path
    """
    with open(out, 'w') as fo:
        fo.write('POCP' + "\t" + "\t".join(items) + "\n")
        for i in range(len(items)):
            row = [os.path.basename(items[i])]
            for j in range(len(items)):
                if items[i] == items[j]:
                    row.append('100')  # a genome against itself
                else:
                    # only the lower triangle holds values; '~' elsewhere
                    row.append(str(pocp.get((items[j], items[i]), '~')))
            fo.write("\t".join(row) + "\n")


def clean(pth):
    """Remove the BLAST databases and outputs this program created in *pth*."""
    for file in glob.iglob(os.path.join(pth, '*_POCP.p??')):
        os.remove(file)  # blast databases
    for file in glob.iglob(os.path.join(pth, '*.POCPout')):
        os.remove(file)  # blast output files


def main():
    """Build databases, blastp every genome pair, and write the matrix."""
    args = parse_args()
    genomes = glob.glob(os.path.join(args.i, '*.faa'))
    genomes_bn = list(map(os.path.basename, genomes))
    num_genomes = len(genomes)
    print(num_genomes, 'genomes have been read.')
    if num_genomes < 2:  # comb() below needs at least one pair
        sys.exit('Error: at least 2 .faa files are required in the input directory.')
    num_blastp = comb(num_genomes, 2) * 2  # blastp runs (both directions)
    # Make blast database for all the genomes.
    for genome in genomes:
        run_mkblastdb(genome, genome + '_POCP')
    # Run blastp between every two genomes.
    results = {}
    processed = 0
    for genome_pair in itertools.combinations(genomes, 2):
        genome_pair_bn = tuple(map(os.path.basename, genome_pair))
        results[genome_pair_bn] = POCP_calculator(genome_pair, args.n)
        processed += 2
        processed_perc = round(processed / num_blastp * 30)
        print("\r" + "[" + ">" * processed_perc + "]",
              "{}/{}".format(processed, num_blastp), end='')  # progress bar
        sys.stdout.flush()
    output_table(results, genomes_bn, args.o)
    if args.c is True:
        clean(args.i)
    print("\ndone.")


if __name__ == '__main__':
    main()
190 | print("\r"+"["+">"*processed_perc+"]", 191 | "{}/{}".format(processed, num_blastp),end='') # print progress bar 192 | sys.stdout.flush() 193 | output_table(dict, genomes_bn, args.o) 194 | if args.c == True: 195 | clean(args.i) 196 | print("\ndone.") 197 | 198 | if __name__ == '__main__': 199 | main() 200 | -------------------------------------------------------------------------------- /POCP-calculator/README.md: -------------------------------------------------------------------------------- 1 | # POCP Calculator 2 | 3 | Calculate the percentage of conserved proteins **(POCP)** between two or 4 | more genomes to estimate their evolutionary and phenotypic distance. 5 | 6 | POCP value could be used as a robust genomic index for establishing the **genus boundary** for prokaryotic groups. Generally, a POCP value of 50% could be used as a genus boundary for prokaryotic lineages according to [Qin et al (2014)](https://journals.asm.org/doi/10.1128/JB.01688-14) 7 | 8 | An elegant matrix table will be created after the calculation. 9 | 10 | The program was written based on the paper (*Qin et al. 
2014; doi: [10.1128/JB.01688-14](https://journals.asm.org/doi/10.1128/JB.01688-14)*) 11 | 12 | ## Usage 13 | 14 | ```bash 15 | $ python POCP-matrix.py -i input_dir -o output_matrix.tab [-n 8] [--clean] 16 | ``` 17 | 18 | ## Options 19 | 20 | - `-i`: input directory contained more than 2 translated genome files (suffix: .faa) 21 | - `-o`: output POCP matrix file 22 | - `-n`: number of threads (optional, default: 3) 23 | - `--clean`: blast output and databases created by this program will be removed (optional) 24 | 25 | ## Require 26 | 27 | - BLAST+ installed in `$PATH` 28 | - Using **Python3** 29 | - Works both on Windows and unix-like systems 30 | - No 3rd party python modules required 31 | 32 | ## Sample Output: 33 | 34 | | POCP | Genome1.faa | Genome2.faa | Genome3.faa | Genome4.faa | 35 | | ----------- | ----------- | ----------- | ----------- | ----------- | 36 | | Genome1.faa | 100 | ~ | ~ | ~ | 37 | | Genome2.faa | 77.25376031 | 100 | ~ | ~ | 38 | | Genome3.faa | 92.18714253 | 59.14082 | 100 | ~ | 39 | | Genome4.faa | 41.25224685 | 57.19096 | 66.48514 | 100 | 40 | 41 | > Please ensure that the length of every sequence header is less than 50 characters. Otherwise, Blast will be unable to create the database and will produce an error. 42 | 43 | # Chinese Usage 中文使用说明 44 | 45 | POCP_matrix.py脚本能够计算多个基因组之间的**POCP值**(保守蛋白百分比),用来判断原核生物在**属水平**上的遗传距离。POCP值在50%以上可以被认为是一个属的边界[Qin et al (2014)](https://journals.asm.org/doi/10.1128/JB.01688-14)。 46 | 47 | 该程序基于文献:(*Qin et al. 
2014; doi: [10.1128/JB.01688-14](https://journals.asm.org/doi/10.1128/JB.01688-14)*) 48 | 49 | ## 使用 50 | 51 | ```bash 52 | $ python POCP-matrix.py -i input_dir -o output_matrix.tab [-n 8] [--clean] 53 | ``` 54 | 55 | ## 选项 56 | 57 | - `-i`: 输入文件夹,至少含有两个基因组的蛋白质文件(后缀为.faa) 58 | - `-o`: 输出POCP表格的文件名 59 | - `-n`: 使用cpu核心数 (可选, 默认: 3) 60 | - `--clean`: 该程序计算过程中产生的blast数据库与结果将会被清除 (可选) 61 | 62 | ## 要求 63 | - Blast+已安装并存在环境变量`$PATH`中 64 | - 使用**Python3** 65 | - 在Windows和类unix系统中均可运行 66 | - 无需第三方python模块 67 | 68 | ## 输出示例: 69 | 70 | | POCP | Genome1.faa | Genome2.faa | Genome3.faa | Genome4.faa | 71 | | ----------- | ----------- | ----------- | ----------- | ----------- | 72 | | Genome1.faa | 100 | ~ | ~ | ~ | 73 | | Genome2.faa | 77.25376031 | 100 | ~ | ~ | 74 | | Genome3.faa | 92.18714253 | 59.14082 | 100 | ~ | 75 | | Genome4.faa | 41.25224685 | 57.19096 | 66.48514 | 100 | 76 | 77 | > 注意:faa文件中的header必须都小于50个字符,否则blast无法建库,会报错 78 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.4954426.svg)](https://doi.org/10.5281/zenodo.4954426) [![language](https://img.shields.io/badge/language-python%203.6-yellow)]() 2 | 3 | # Bio-py 4 | Some useful python scripts for biologists. 5 | 6 | All the scripts in this repo are developed and tested on python >= 3.6. 7 | 8 | I would be glad if you submit issues or email me at (heyu.lin🅰️qut.edu.au) for questions, suggestions or other feedback. 9 | 10 | ## How to cite 11 | If you find these scripts useful for your scientific research, please consider citing this repo to make it easier for other peers to find it. This repo has been published with [Zenodo](https://doi.org/10.5281/zenodo.4954426), so you are encouraged to cite the tools as follows: 12 | > Lin, H. (2021). _SilentGene/Bio-py: Bio-py_. Zenodo. 
http://doi.org/10.5281/zenodo.4954426 13 | -------------------------------------------------------------------------------- /blast-matrix/README.md: -------------------------------------------------------------------------------- 1 | # BLAST Matrix 2 | 3 | This script calculates pair-wise sequence identities for all sequences in a multifasta format file. 4 | A matrix table will be generated after the calculation, and a clustered heatmap will be drawn if required. 5 | 6 | ## Require 7 | 8 | - BLAST+ installed in $PATH 9 | - Biopython (with pandas > 0.21) 10 | - seaborn & scipy (for drawing clustered heatmap) 11 | 12 | ## Usage 13 | 14 | ```bash 15 | $ python blast_identity_matrix.py -i input_seqs.fasta [-o output_matrix.tsv] [--thread 4] [--program blastp] [--heatmap output_heatmap.pdf] [--clean] 16 | ``` 17 | 18 | ## Options 19 | 20 | - `-i`: Input file in multi-sequence FASTA format 21 | - `-o`: Output matrix table in tab-delimited format [default: (input file name) + '_ident.tsv'] 22 | - `-t`: Threads that would be used for makeblastdb and blast [default: 2] 23 | - `-p`: blast program that would be used (blastp or blastn) [default: blastp] 24 | - `--heatmap`: Draw clustered heatmap. 25 | - `--clean`: Clean temporary files. [default: False] 26 | 27 | 28 | 29 | # Chinese Usage 中文使用说明 30 | 31 | 此脚本会进行两两blast比较并计算一致性(identity)。输入一个含有多条fasta序列的文件,生成一个一致性数值矩阵。 32 | 33 | ## 要求 34 | 35 | - BLAST+ 安装在 `$PATH` 36 | - Python3.x 37 | - Biopython (包含pandas > 0.21) 38 | - seaborn & scipy (如果绘制聚类热图需要安装) 39 | 40 | ## 使用命令 41 | 42 | ```bash 43 | $ python blast_identity_matrix.py -i input_seqs.fasta [-o output_matrix.tsv] [--thread 4] [--program blastp] [--heatmap] [--clean]] 44 | ``` 45 | 46 | ## 可选项 47 | 48 | - `-i`: 输入文件。含有多条fasta序列的文件。 49 | - `-o`: 输出文件。tab分割的数值矩阵。[默认文件名: (输入文件名) + '_ident.tsv'] 50 | - `-t`: makeblastdb和blast过程会调用的线程数。 [默认: 2] 51 | - `-p`: blast程序 (可选blastp或blastn) [默认: blastp] 52 | - `--heatmap`: 绘制聚簇热图. 
53 | - `--clean`: 清除中间文件 [默认: False] 54 | -------------------------------------------------------------------------------- /blast-matrix/blast_identity_matrix.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python3 2 | 3 | """ 4 | This script calculates pair-wise sequence identities for all sequences in a multifasta format file. 5 | A matrix table will be generated after the calculation, and a clustered heatmap will be drawn if required. 6 | 7 | # Required: 8 | - BLAST+ installed in $PATH 9 | - Biopython (with pandas > 0.21) 10 | - seaborn & scipy (for drawing clustered heatmap) 11 | 12 | # Usage: 13 | $ python blast_identity_matrix.py -i input_seqs.fasta [-o output_matrix.tsv] [--heatmap output_heatmap.pdf] [--thread 4] [--program blastp] [--clean] 14 | 15 | # Options: 16 | -i: Input file in multi-sequence FASTA format 17 | -o: Output matrix table in tab-delimited format [default: (input file name) + '_ident.tsv'] 18 | -t: Threads that would be used for makeblastdb and blast [default: 2] 19 | -p: blast program that would be used (blastp or blastn) [default: blastp] 20 | --heatmap: Draw clustered heatmap. 21 | --clean: Clean temporary files. 
# /blast-matrix/blast_identity_matrix.py (continued)
# Pair-wise sequence identities for every sequence in a multi-FASTA file;
# writes a matrix table and, optionally, a clustered heatmap.
# (--clean: Clean temporary files. [default: False])

import os
import argparse
import random
import shutil
import subprocess
from itertools import permutations
from multiprocessing import Pool

import pandas as pd

__author__ = "Heyu Lin"
__contact__ = "heyu.lin@student.unimelb.edu.au"

parser = argparse.ArgumentParser()

parser.add_argument('-i', '--input', metavar='input_fasta_file', dest='i',
                    type=str, required=True,
                    help='Input file in multi-sequence FASTA format')
parser.add_argument('-o', '--output', metavar='output_table', dest='o',
                    type=str, required=False,
                    help='Output matrix table in tab-delimited format')
parser.add_argument('-t', '--threads', metavar='threads', dest='t',
                    type=int, required=False, default=2,
                    help='Threads that would be used for makeblastdb and blast')
parser.add_argument('-p', '--program', metavar='blast_program', dest='p',
                    type=str, required=False, default='blastp',
                    help='blast program that would be used (blastp or blastn)')
parser.add_argument('-m', '--heatmap', metavar='heatmap', dest='m',
                    type=str, required=False,
                    help='Draw clustered heatmap.')
# NOTE: a 'store_true' action does not accept a metavar; passing one (as the
# original did) makes argparse raise TypeError when the argument is added.
parser.add_argument('--clean', dest='c',
                    action='store_true', required=False,
                    help='Clean temporary files. Default: False')

# Per-run scratch directory name.  The directory itself is created under
# __main__ below so that importing this module has no filesystem side effects.
tmp_folder = 'blast_matrix_tmp_' + str(random.randint(0, 999999)).zfill(6)


def run_mkblastdb(fi, tp):
    """Build a BLAST database from FASTA file *fi*.

    fi: input fasta file (the database is written to fi + '.db')
    tp: dbtype, 'prot' or 'nucl'
    """
    fo = fi + '.db'
    cmd_para = [
        'makeblastdb',
        '-in', fi,
        '-dbtype', tp,
        '-parse_seqids',
        '-out', fo
    ]
    try:
        # DEVNULL avoids the leaked open(os.devnull, 'wb') handle of the
        # original while still silencing makeblastdb's stdout.
        subprocess.check_call(cmd_para,
                              stdout=subprocess.DEVNULL,
                              stderr=subprocess.STDOUT,
                              )
    except subprocess.CalledProcessError as exc:
        print('cmd:', exc.cmd)
        print("Status : FAIL", exc.returncode, exc.output)


def run_blast(q, o, db, e, b):
    """Run blast program *b* (blastp/blastn) single-threaded.

    q: query fasta
    o: output (tabular format 6)
    db: database
    e: e-value threshold
    b: blast program name
    """
    cmd_para = [
        b,
        '-query', q,
        '-out', o,
        '-db', db,
        '-evalue', str(e),
        '-outfmt', '6',
        '-num_threads', '1'  # parallelism comes from the process pool
    ]
    try:
        subprocess.check_call(cmd_para,
                              stdout=subprocess.DEVNULL,
                              stderr=subprocess.STDOUT,
                              )
    except subprocess.CalledProcessError as exc:
        print('cmd:', exc.cmd)
        print('output:', exc.output)


def blast_Parser(fi):
    """Return the percent identity of the first (top) hit in the blast
    outfmt-6 file *fi*, or 0 when the file is empty (no hit)."""
    if not os.path.getsize(fi):
        return 0
    with open(fi) as fh:  # 'fh' instead of shadowing the builtin 'input'
        top_hit = fh.readline()
    return float(top_hit.strip().split("\t")[2])


def include_outputdir(s):
    """Prefix *s* with the per-run temporary folder."""
    return os.path.join(tmp_folder, s)


def draw_heatmap(df, out_pdf):
    """Render a clustered heatmap of *df* and save it to *out_pdf*."""
    import seaborn as sns
    import scipy  # noqa: F401 -- clustermap requires scipy for the clustering

    # Draw clustered heatmap
    cmap = sns.clustermap(df)

    # Save plot to a PDF file
    cmap.savefig(out_pdf)


if __name__ == "__main__":
    # CLI parsing and filesystem side effects happen only when run as a
    # script, so the helpers above stay importable (and pool workers that
    # re-import this module on spawn-based platforms do not re-parse argv).
    args = parser.parse_args()
    input_faa = args.i
    output_table = input_faa + '_ident.tsv' if args.o is None else args.o
    if args.p == 'blastp':
        blast_program = 'blastp'
        data_type = 'prot'
    elif args.p == 'blastn':
        blast_program = 'blastn'
        data_type = 'nucl'
    else:
        raise AttributeError('Only blastp or blastn is supported!')

    if not os.path.exists(tmp_folder):
        os.makedirs(tmp_folder)
    else:
        raise IOError(f"Sorry, the temporary folder could not be created. Please remove the {tmp_folder} folder.")

    from Bio import SeqIO  # Biopython is only required to run the pipeline

    pool = Pool(args.t)

    # Split the multi-FASTA input into one file per sequence.
    seq_ids = []
    for seq_record in SeqIO.parse(input_faa, "fasta"):
        single_seq = include_outputdir(seq_record.id) + ".faa"
        SeqIO.write(seq_record, single_seq, "fasta")
        seq_ids.append(seq_record.id)

    # Build one database per sequence, in parallel.
    mkblastdb_para = [(include_outputdir(i + '.faa'), data_type) for i in seq_ids]
    pool.starmap(run_mkblastdb, mkblastdb_para)

    # All ordered pairs: identities are directional (query vs target).
    blast_para = []
    for query, targ in permutations(seq_ids, 2):
        blast_out = include_outputdir(query + '+' + targ + '_blast')
        blast_query = include_outputdir(query + '.faa')
        blast_targ = include_outputdir(targ + '.faa.db')
        blast_para.append((blast_query, blast_out, blast_targ, '1e-5', blast_program))
    pool.starmap(run_blast, blast_para)

    data = {}
    for query, targ in permutations(seq_ids, 2):
        blast_out = include_outputdir(query + '+' + targ + '_blast')
        data.setdefault(query, {})[targ] = blast_Parser(blast_out)

    df = pd.DataFrame(data).sort_index().sort_index(axis=1)

    mean_ident = df.mean(skipna=True).mean()  # used by the statistics printout that follows

    max_qur_tar = df.stack().idxmax()
df.loc[max_qur_tar] 201 | min_qur_tar = df.stack().idxmin() 202 | min_ident = df.loc[min_qur_tar] 203 | 204 | print('\n***** Statistics *****') 205 | print(f'Maximum Identity:\n{max_ident}%: {max_qur_tar[0]} -> {max_qur_tar[1]}') 206 | print(f'Mimimum Identity:\n{min_ident}%: {min_qur_tar[0]} -> {min_qur_tar[1]}') 207 | print(f'Average Identity: {mean_ident}%') 208 | 209 | df = df.fillna(100) # Fill NaN values with 100 210 | df.to_csv(output_table, sep='\t') 211 | 212 | if args.c: 213 | shutil.rmtree(tmp_folder) 214 | 215 | ######## ~ draw clustered heatmap ~ ######## 216 | if args.m: 217 | draw_heatmap(df, args.m) 218 | 219 | 220 | 221 | -------------------------------------------------------------------------------- /blast-wrapper/README.md: -------------------------------------------------------------------------------- 1 | # blast-wrapper 2 | Pipeline for conducting **makeblastdb** and **blastp/blastn/blastx/tblastn** using one simple command. 3 | 4 | Show blast results in a **more elegant way**. Not only table headers, but also **query coverages** and the **original query sequences** were calculated and showed in the results. 5 | 6 | This script can also parse and filter the blast result by setting threshold of identity and coverage! 
7 | 8 | ## Require 9 | - BLAST+ installed in `$PATH` 10 | - Using **Python3** 11 | - Works both on Windows and unix-like systems 12 | ## Usage 13 | ``` 14 | $ python3 blast_wrapper.py -h 15 | usage: blast_wrapper.py [-h] -q query_fasta [-o output] [-df database_fasta] 16 | [-db database] [-e max_e-value] [-ms num_sequences] 17 | [-n num_cpu] [-b blast+ program] 18 | [-id identity_threshold] [-qc coverage_threshold] 19 | [--no_qseq [hide qseq column]] [-f output_format*] 20 | 21 | optional arguments: 22 | -h, --help show this help message and exit 23 | -q query_fasta, --query query_fasta 24 | -o output, --output output 25 | -df database_fasta, --database_fasta database_fasta 26 | fasta file to be used as database 27 | -db database, --database database 28 | blast database which has already been made 29 | -e max_e-value, --evalue max_e-value 30 | threshod e-value for blast (default=1e-5) 31 | -ms num_sequences, --max_target_seqs num_sequences 32 | specify the max_number of target seqs for hits per 33 | query (default=1) 34 | -n num_cpu, --num_threads num_cpu 35 | specify the number of threads used by blast 36 | (default=3) 37 | -b blast+ program, --blast_program blast+ program 38 | specify the blast program (default=blastp) 39 | -id identity_threshold, --identity identity_threshold 40 | specify the threshold of identity (default=0) 41 | -qc coverage_threshold, --qcov coverage_threshold 42 | specify the threshold of query coverage (default=0) 43 | --no_qseq [hide qseq column] 44 | no query sequences will be showed if this argument is 45 | added 46 | -f output_format*, --outfmt output_format* 47 | outfmt defined by blast+, it is dangerous to change 48 | the default value 49 | ``` 50 | ## Sample Output 51 | qid | sid | ident% | aln_len | miss | gap | qstart | qend | sstart | send | qlen | slen | evalue | bitscore | qcov% | qseq 52 | --- | --- | ------ | ------- | ---- | --- | ------ | ---- | ------ | ---- | ---- | ---- | ------ | -------- | ----- | ---- 53 | HC_02247 | 
HgcA_ND132 | 34.483 | 58 | 37 | 1 | 550 | 607 | 9 | 65 | 608 | 95 | 1.42e-08 | 43.1 | 9.4 | MEAVE... 54 | HC_00217 | HgcB_ND132 | 28.049 | 82 | 42 | 3 | 104 | 176 | 18 | 91 | 220 | 95 | 8.56e-06 | 33.5 | 32.7 | METVE... 55 | HC_01133 | MerA_RS | 31.567 | 453 | 286 | 12 | 6 | 445 | 9 | 450 | 466 | 480 | 2.88e-55 | 182 | 94.2 | MSKVH... 56 | HC_01413 | MerA_WE | 30.660 | 424 | 283 | 4 | 26 | 443 | 114 | 532 | 455 | 554 | 7.74e-63 | 204 | 91.6 | MDFFD... 57 | ## Simplest 58 | ```bash 59 | $ python blast_wrapper.py -q query.faa -df database.faa 60 | ``` 61 | or if you already have an established database: 62 | ```bash 63 | $ python blast_wrapper.py -q query.faa -db database 64 | ``` 65 | ## Moderate 66 | ```bash 67 | $ python blast_wrapper.py -b blastn -q query.fna -o output -df database.fna -e 1e-10 -n 5 68 | ``` 69 | 70 | ## Control freak 71 | ```bash 72 | $ python blast_wrapper.py -b blastx -q query.fna -o output -df database.faa -e 1e-10 -id 30 -qc 50 -n 5 -ms 3 --no_qseq 73 | ``` 74 | *Any change to output format by -f option may lead to errors when parsing output results, although it's up to you to make any change* 75 | 76 | ## Note 77 | - blastp would be used if no algorithm is specified by the `-b` option (e.g. `-b blastn`). 78 | - The option `-q` is required to specify the query fasta file. The option `-df` or `-db` is required to specify the target database, either in fasta format or as a database that has already been made by the makeblastdb command in the blast+ software. 79 | - If no output is specified by `-o`, the result would be created in the current directory following the pattern `QueryFileName_blast.out`. 80 | - If `-df` is specified, the database would be created in the same directory as the argument specified using the name `DatabaseFasta.db`. And if such a database already exists, the script would skip the makeblastdb step. 81 | - Use `-id` and `-qc` to set the thresholds of **identity** and **query coverage**, respectively. 
82 | - `--no_qseq` could be used when you don't want the original query sequences to appear in the final result. This may speed up the program to some extent. 83 | - 3 threads would be used by default, which could be modified by the `-n` option. 84 | - A custom function has been developed to take the place of the original `-max_target_seqs` option, since the latter one has been found to only generate the first hit, not the best hit. 85 | 86 | ## Tips 87 | If you happen to have a bunch of fasta files waiting for blast against a single database, try out the following bash command to make your life simpler: (eg. you are in the fasta files directory, and all the query files have a suffix `.faa`) 88 | ```bash 89 | $ for f in *.faa; do python3 blast_wrapper.py -q $f -df data.faa; done 90 | ``` 91 | 92 | You can use the script `blastout2fasta.py` provided along with this blast wrapper to convert the output to `fasta` format. 93 | 94 | ```bash 95 | $ python3 blastout2fasta.py blast.out > blast_out.fa 96 | ``` 97 | 98 | 99 | 100 | # Chinese Usage 中文使用说明 101 | 102 | blast-wrapper.py脚本能够通过简单的一行命令实现**建库**和**blast搜索**两个本地blast步骤。 103 | 104 | 使用该脚本还可以帮助我们以更优雅的方式阅读blast的结果。得到的表格不仅具有清晰的表头信息,且经过计算的**覆盖度**和**原查询序列**均可以显示在结果中,便于进一步分析解读。 105 | 106 | ## 要求 107 | - Blast+已安装并存在环境变量`$PATH`中 108 | - 使用**Python3** 109 | - 在Windows和类unix系统中均可运行 110 | ## 初级 111 | 112 | 大多数情况下,你只需要用如下的命令进行blastp: 113 | 114 | ```bash 115 | $ python blast_wrapper.py -q query.faa -df database.faa 116 | ``` 117 | 如果你已经有一个通过blast+的makeblastdb建立的数据库,则: 118 | ```bash 119 | $ python blast_wrapper.py -q query.faa -db database 120 | ``` 121 | ## 中级 122 | ```bash 123 | $ python blast_wrapper.py -b blastn -q query.fna -o output -df database.fna -e 1e-10 -n 5 124 | ``` 125 | 126 | ## 高级 127 | ```bash 128 | $ python blast_wrapper.py -b blastn -q query.fna -o output -df database.fna -e 1e-10 -id 30 -qc 50 -n 5 -ms 3 --no_qseq 129 | ``` 130 | *虽然脚本支持通过选项-f来更改输出样式,但任何样式的更改都可能会导致后续分析结果呈现的错误* 131 | 132 | ## 注意 133 | - 
默认使用blastp运行程序,可通过`-b blastn`来指定使用blastn。 134 | - 选项 `-q`是必选项,用来指定查询序列的文件位置。选项`-df`或者 `-db` 必须指定其一,分别可以指定用来建库的fasta文件或者已经建立的数据库位置。 135 | - 如果`-o`选项为缺省状态,则程序会在当前路径下新建文件名为 `QueryFileName_blast.out`格式的文件存放结果。 136 | - 如果指定了`-df`选项,则程序会在指定的fasta库相同路径下新建`DatabaseFasta.db`名称格式的数据库文件,如果该数据库被程序发现已经存在,则程序会自动跳过建库步骤,直接使用存在的数据库进行搜索。 137 | - 通过`-id`和`-qc`分别指定**一致性**和**覆盖度**的最小值以实现对结果的过滤 138 | - 可以使用`--no_qseq`选项来取消在结果中显示查询序列的原序列,这可能会在一定程度上加快程序运行的速度。 139 | - 程序默认的线程数是3个,可以使用`-n`选项来更改。 140 | - 编写了自定义的函数来代替原生`-max_target_seqs` 参数来筛选出最优的结果。因为原生参数实际只产出数据库中第一个匹配序列,而不是最优的序列。 141 | 142 | 143 | ## 输出示例 144 | 145 | qid | sid | ident% | aln_len | miss | gap | qstart | qend | sstart | send | qlen | slen | evalue | bitscore | qcov% | qseq 146 | --- | --- | ------ | ------- | ---- | --- | ------ | ---- | ------ | ---- | ---- | ---- | ------ | -------- | ----- | ---- 147 | HC_02247 | HgcA_ND132 | 34.483 | 58 | 37 | 1 | 550 | 607 | 9 | 65 | 608 | 95 | 1.42e-08 | 43.1 | 9.4 | MEAVE... 148 | HC_00217 | HgcB_ND132 | 28.049 | 82 | 42 | 3 | 104 | 176 | 18 | 91 | 220 | 95 | 8.56e-06 | 33.5 | 32.7 | METVE... 149 | HC_01133 | MerA_RS | 31.567 | 453 | 286 | 12 | 6 | 445 | 9 | 450 | 466 | 480 | 2.88e-55 | 182 | 94.2 | MSKVH... 150 | HC_01413 | MerA_WE | 30.660 | 424 | 283 | 4 | 26 | 443 | 114 | 532 | 455 | 554 | 7.74e-63 | 204 | 91.6 | MDFFD... 
151 | 152 | ## 小技巧 153 | 154 | 如果你有很多fasta文件想要对一个数据库进行比对,不妨试试下面的命令调用bash来帮助你循环调用脚本(假设当前路径在存放fasta文件的路径中,且所有的fasta文件有统一的后缀`.faa`: 155 | ```bash 156 | $ for f in *.faa; do python3 blast_wrapper.py -q $f -df data.faa; done 157 | ``` 158 | 你可以使用脚本`blastout2fasta.py`来将`blast_wrapper.py`的结果转换成对应的`fasta`格式: 159 | ```bash 160 | $ python3 blastout2fasta.py blast.out > blast_out.fa 161 | ``` -------------------------------------------------------------------------------- /blast-wrapper/blast_wrapper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | """ 4 | Required: BLAST+ installed in $PATH 5 | 6 | Usage: 7 | 8 | ## Simplest: 9 | $ python blast_wrapper.py -q query.faa -df database.faa 10 | or if you already have an established database: 11 | $ python blast_warpper.py -q query.faa -db blast+_database 12 | 13 | ## Moderate: 14 | $ python blast_wrapper.py -b blastn -q query.fna -o output -df database.fna \ 15 | -e 1e-10 -n 5 16 | 17 | ## Control freak: 18 | $ python blast_wrapper.py -b blastn -q query.fna -o output -df database.fna \ 19 | -e 1e-10 -n 5 -ms 3 --no_qseq 20 | 21 | *Any change to output format by -f option may lead to errors when parsing output results. 
22 | """ 23 | 24 | import os 25 | import sys 26 | import argparse 27 | from collections import defaultdict 28 | 29 | __author__ = "Heyu Lin" 30 | __contact__ = "heyu.lin(AT)student.unimelb.edu.au" 31 | 32 | parser = argparse.ArgumentParser() 33 | parser.add_argument('-q', '--query', metavar='query_fasta', dest='q', 34 | type=str, required=True) 35 | parser.add_argument('-o', '--output', metavar='output', dest='o', 36 | type=str) 37 | parser.add_argument('-df', '--database_fasta', metavar='database_fasta', 38 | dest='df', type=str, 39 | help='fasta file to be used as database') 40 | parser.add_argument('-db', '--database', metavar='database', 41 | dest='db', type=str, 42 | help='blast database which has already been made') 43 | parser.add_argument('-e', '--evalue', metavar='max_e-value', dest='e', 44 | type=float, default=1e-5, 45 | help='threshod e-value for blast (default=1e-5)') 46 | parser.add_argument('-ms', '--max_target_seqs', metavar='num_sequences', 47 | dest='ms', type=int, default=1, 48 | help='specify the max_number of target seqs for hits per query (default=1)') 49 | parser.add_argument('-n', '--num_threads', metavar='num_cpu', 50 | dest='n', type=int, default=3, 51 | help='specify the number of threads used by blast (default=3)') 52 | parser.add_argument('-b', '--blast_program', metavar='blast+ program', 53 | dest='b', type=str, default='blastp', 54 | help='specify the blast program (default=blastp)') 55 | parser.add_argument('-id', '--identity', metavar='identity_threshold', 56 | dest='idt', type=float, default=0, 57 | help='specify the threshold of identity (default=0)') 58 | parser.add_argument('-qc', '--qcov', metavar='coverage_threshold', 59 | dest='qc', type=float, default=0, 60 | help='specify the threshold of query coverage (default=0)') 61 | parser.add_argument('--no_qseq', metavar='hide qseq column', 62 | dest='nq', nargs="?", const=True, default=False, 63 | help='no query sequences will be showed if this argument is added') 64 | # You're not 
going to like to change this default output format. 65 | # Any change to this outfmt argument may lead to exceptions for query coverage calculation 66 | parser.add_argument('-f', '--outfmt', metavar='output_format*', 67 | dest='f', type=str, 68 | default='"6 qseqid sseqid pident length mismatch gapopen ' \ 69 | + 'qstart qend sstart send qlen slen evalue bitscore"', 70 | help='outfmt defined by blast+, it is dangerous to change the default value') 71 | args = parser.parse_args() 72 | 73 | 74 | def input_type(b): 75 | ''' 76 | return blast database type (prot or nucl) 77 | ''' 78 | if b == 'blastp' or b == 'blastx': 79 | tp = 'prot' 80 | return tp 81 | elif b == 'blastn' or b == 'tblastn': 82 | tp = 'nucl' 83 | return tp 84 | else: 85 | sys.exit("Error: -b argument should only be 'blastp/blastn/blastx/tblastn'!") 86 | 87 | 88 | def database_exist(db): 89 | prot_databases = db + '.phr' 90 | nucl_databases = db + '.nhr' 91 | if os.path.exists(prot_databases) or os.path.exists(nucl_databases): 92 | return True 93 | 94 | 95 | def run_mkblastdb(fi, fo, tp): 96 | ''' 97 | fi: input fasta file 98 | fo: output database name 99 | tp: prot or nucl 100 | ''' 101 | cmd_para = [ 102 | 'makeblastdb', 103 | '-in', fi, 104 | "-dbtype", tp, 105 | "-parse_seqids", 106 | "-out", fo 107 | ] 108 | cmd = ' '.join(cmd_para) 109 | try: 110 | print("\n", 'Make Blast Database'.center(50, '*')) 111 | print(cmd, "\n") 112 | os.system(cmd) 113 | except Exception as e: 114 | raise e 115 | 116 | 117 | def run_blast(q, o, db, e, f, n, b): 118 | ''' 119 | q: query 120 | o: output 121 | db: database 122 | e: evalue 123 | f: outfmt 124 | n: num_threads 125 | b: blast program 126 | ''' 127 | cmd_para = [ 128 | b, 129 | '-query', q, 130 | '-out', o, 131 | '-db', db, 132 | '-evalue', str(e), 133 | '-outfmt', f, 134 | '-num_threads', str(n) 135 | ] 136 | cmd = ' '.join(cmd_para) 137 | try: 138 | print("\n", 'BLAST Searching'.center(50, '*')) 139 | print(cmd, "\n") 140 | os.system(cmd) 141 | except 
Exception as e: 142 | raise e 143 | 144 | 145 | def creat_dict(fa): 146 | with open(fa, 'r') as f: 147 | dict = defaultdict(str) 148 | name = '' 149 | for line in f: 150 | if line.startswith('>'): 151 | name = line[1:-1].split()[0] 152 | continue 153 | dict[name] += line.strip() 154 | return dict 155 | 156 | 157 | def blast_Parser(fi, fo, header, idt, qc, ms, *dict): 158 | ''' 159 | fi: blast output (format as defined in this script) 160 | fo: final output 161 | dict: dictionary created from query fasta (used to extract hit sequences) 162 | ''' 163 | seq_dict = {} # initialize a dict to index query sequences 164 | if dict: 165 | seq_dict = dict[0] 166 | 167 | with open(fi) as input, open(fo, 'w') as output: 168 | output.write("\t".join(header) + "\n") 169 | times = 0 # initialize the hit number 170 | quer_last = '' # initialize the hit sequence 171 | for line in input.readlines(): 172 | items = line.strip().split("\t") 173 | quer = items[0] 174 | if quer == quer_last: 175 | times += 1 176 | if times > ms: 177 | continue 178 | else: 179 | quer_last = quer 180 | times = 1 181 | qstart, qend, qlen = map(float, (items[6], items[7], items[10])) 182 | qcov = 100 * (qend - qstart) / qlen 183 | ident = float(items[2]) 184 | if ident < idt or qcov < qc: 185 | continue 186 | items.append(str(round(qcov, 1))) 187 | if seq_dict: 188 | qid = items[0] 189 | items.append(seq_dict[qid]) 190 | output.write("\t".join(items) + "\n") 191 | 192 | 193 | def review_output(file): 194 | with open(file, 'r+') as fi: 195 | if len(fi.readlines()) == 1: 196 | fi.seek(0) 197 | fi.truncate() 198 | 199 | def main(): 200 | tp = input_type(args.b) 201 | 202 | if not args.o: 203 | args.o = os.path.basename(args.q) + '_blast.out' 204 | 205 | # Make blast database 206 | if args.df: 207 | database_file = os.path.join(os.getcwd(), args.df) + '.db' 208 | if not database_exist(database_file): 209 | print("Starting to make blast database...") 210 | run_mkblastdb(args.df, database_file, tp) 211 | args.db = 
database_file 212 | print('DB: ', args.db) 213 | 214 | # Storing temporary blast result 215 | tempt_output = str(args.o) + '_blast.tmp' 216 | 217 | # => Run blast program 218 | run_blast(args.q, tempt_output, args.db, args.e, args.f, args.n, args.b) 219 | 220 | # Creat dict from query fasta, in order to extract sequencs later 221 | dict = creat_dict(args.q) 222 | 223 | # Parse blast output 224 | header = [ 225 | 'qid', 'sid', 'ident%', 'aln_len', 'miss', 226 | 'gap', 'qstart', 'qend', 'sstart', 'send', 227 | 'qlen', 'slen', 'evalue', 'bitscore', 'qcov%', 'qseq' 228 | ] 229 | # If the --no_qseq option was specified, there would be no qseq column. 230 | if args.nq: 231 | header.remove('qseq') 232 | blast_Parser(tempt_output, args.o, header, args.idt, args.qc, args.ms) 233 | else: 234 | blast_Parser(tempt_output, args.o, header, args.idt, args.qc, args.ms, dict) 235 | # Remove temp file 236 | os.remove(tempt_output) 237 | 238 | # Clear the lonely header line if no hit was found 239 | review_output(args.o) 240 | 241 | print("\n", 'OUTPUT'.center(50, '*')) 242 | print("Output File: {0}".format(args.o)) 243 | 244 | 245 | if __name__ == '__main__': 246 | main() 247 | -------------------------------------------------------------------------------- /blast-wrapper/blastout2fasta.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | """ 4 | Used to convert the output from blast_wrapper.py to fasta format 5 | 6 | Usage: 7 | $ python3 blastout2fasta.py blast.out > blast_out.fa 8 | """ 9 | 10 | import sys 11 | import textwrap 12 | 13 | __author__ = "Heyu Lin" 14 | __contact__ = "heyu.lin(AT)student.unimelb.edu.au" 15 | 16 | in_file = sys.argv[1] 17 | with open(in_file, 'r') as fi: 18 | for line in fi.readlines(): 19 | fields = line.strip().split('\t') 20 | if fields[0] != 'qid': 21 | print('>{header}'.format(header=fields[0])) 22 | print(textwrap.fill(fields[15], 80)) 23 | 
-------------------------------------------------------------------------------- /cdhit-clstr2tbl/README.md: -------------------------------------------------------------------------------- 1 | # CD-HIT clstr2tbl 2 | Given a `clstr` file from `CD-HIT` program, this program will generate a table (tab separated) that contains the header of every sequence in the 1st column and the corresponding representative in the 2nd column. 3 | 4 | The output file is more friendly for further analysis. 5 | 6 | ## Usage 7 | ```bash 8 | $ python3 cdhit_clstr2tbl.py input.clstr > out.tab 9 | ``` 10 | ## Input Sample 11 | 12 | ``` 13 | >Cluster 0 14 | 0 14739aa, >gene1... * 15 | 1 656aa, >gene2... at 99.85% 16 | >Cluster 1 17 | 0 66aa, >gene3... at 100.00% 18 | 1 13708aa, >gene4... * 19 | 2 13708aa, >gene5... at 100.00% 20 | ``` 21 | 22 | Output Sample 23 | 24 | | gene_id | representative | 25 | | ------- | -------------- | 26 | | gene1 | gene1 | 27 | | gene2 | gene4 | 28 | | gene3 | gene4 | 29 | | gene4 | gene4 | 30 | 31 | # Chinese Usage 中文使用说明 32 | 输入一个`CD-HIT`文件产出的`clstr`文件,此脚本可以将其转换为一个tab分隔的表格文件,第一列是每个序列的名称,第二列是每个序列对应的代表序列的名称。 33 | 34 | 经转换过的文件对下游分析更友好。 35 | 36 | ## 使用 37 | ```bash 38 | $ python3 cdhit_clstr2tbl.py input.clstr > out.tab 39 | ``` 40 | ## 输入文件示例 41 | 42 | ``` 43 | >Cluster 0 44 | 0 14739aa, >gene1... * 45 | 1 656aa, >gene2... at 99.85% 46 | >Cluster 1 47 | 0 66aa, >gene3... at 100.00% 48 | 1 13708aa, >gene4... * 49 | 2 13708aa, >gene5... 
at 100.00% 50 | ``` 51 | 52 | ## 输出文件示例 53 | 54 | | gene_id | representative | 55 | | ------- | -------------- | 56 | | gene1 | gene1 | 57 | | gene2 | gene4 | 58 | | gene3 | gene4 | 59 | | gene4 | gene4 | 60 | -------------------------------------------------------------------------------- /cdhit-clstr2tbl/cdhit_clstr2tbl.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | """ 4 | Given a clstr file from cd-hit program, this program will generate a table that 5 | contains the header of every sequence in the 1st column and the corresponding 6 | representative in the 2nd column. 7 | The output file is more friendly for further analysis. 8 | 9 | Usage: $ python3 cdhit_clstr2tbl.py input.clstr > out.tab 10 | 11 | 12 | The input .clstr file looks like: 13 | >Cluster 0 14 | 0 14739aa, >gene1... * 15 | 1 656aa, >gene2... at 99.85% 16 | >Cluster 1 17 | 0 13708aa, >gene3... * 18 | >Cluster 2 19 | 0 66aa, >gene4... at 100.00% 20 | 1 13708aa, >gene5... * 21 | 2 13708aa, >gene6... 
at 100.00% 22 | 23 | 24 | The output table file looks like: 25 | gene_id representative 26 | gene1 gene1 27 | gene2 gene1 28 | gene3 gene3 29 | gene4 gene5 30 | gene5 gene5 31 | gene6 gene5 32 | """ 33 | import re 34 | import sys 35 | 36 | __author__ = "Heyu Lin" 37 | __contact__ = "heyu.lin(AT)student.unimelb.edu.au" 38 | 39 | in_file = sys.argv[1] 40 | 41 | match_header = re.compile(r'>(.*?)\.{3}') 42 | 43 | header_list = [] 44 | repre = '' 45 | 46 | # print the header 47 | print('gene_id' + '\t' + 'representative') 48 | 49 | with open(in_file) as input: 50 | for line in input.readlines(): 51 | if line.startswith('>'): 52 | for name in header_list: 53 | print(name + '\t' + repre) 54 | header_list = [] 55 | else: 56 | if line.strip().endswith('*'): 57 | repre = match_header.findall(line)[0] 58 | header_list.append(repre) 59 | else: 60 | header_list.append(match_header.findall(line)[0]) 61 | 62 | # patch for the last cluster 63 | for name in header_list: 64 | print(name + '\t' + repre) 65 | -------------------------------------------------------------------------------- /cdhit-clstr2tbl/test.clstr: -------------------------------------------------------------------------------- 1 | >Cluster 0 2 | 0 14739aa, >gene1... * 3 | 1 656aa, >gene2... at 99.85% 4 | >Cluster 1 5 | 0 13708aa, >gene3... * 6 | >Cluster 2 7 | 0 66aa, >gene4... at 100.00% 8 | 1 13708aa, >gene5... * 9 | 2 13708aa, >gene6... 
at 100.00% -------------------------------------------------------------------------------- /cdhit-clstr2tbl/test.clstr.tab: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SilentGene/Bio-py/33f9827114723c9db661c80f7e13564f2375417a/cdhit-clstr2tbl/test.clstr.tab -------------------------------------------------------------------------------- /circular_genomes_from_gfa/README.md: -------------------------------------------------------------------------------- 1 | # Circular genomes from GFA 2 | 3 | This script is used for extracting circular DNA sequences (including genomes, plasmids, viruses, etc) from a GFA file 4 | 5 | ## Usage 6 | 7 | ```bash 8 | $ python circular_genomes_from_gfa.py [output_dir] 9 | ``` 10 | 11 | ## Example 12 | 13 | Using the "assembly_graph.gfa" file generated by flye 14 | 15 | ```bash 16 | # Assembly 17 | $ flye --pacbio-hifi pacbio-css.fq.gz --out-dir flye_out --threads 16 --meta --scaffold 18 | # Get circular DNA 19 | $ cd flye_out 20 | $ python circular_genomes_from_gfa.py assembly_graph.gfa 21 | ``` 22 | 23 | ### Result 24 | 25 | - Output folder: assembly_graph_circular 26 | - assembly_graph_circular_all.fna: A fasta file containing all circular sequences 27 | - assembly_graph_circular_all_info.tsv: A tab-separated file containing information about the circular sequences (ID, length) 28 | - edge_17343.fasta: Each *.fasta file contains an individule circular sequence 29 | - edge_129.fasta 30 | - edge_*.fasta 31 | 32 | ## Chinese Usage 中文使用说明 33 | 34 | 这个脚本用于从 GFA 文件中提取环形的DNA序列(包括基因组、质粒、病毒等) 35 | 36 | ## 使用 37 | 38 | ```bash 39 | $ python circular_genomes_from_gfa.py [output_dir] 40 | ``` 41 | 42 | ## 示例 43 | 44 | 使用 flye 生成的 "assembly_graph.gfa" 文件 45 | 46 | ```bash 47 | # Assembly 48 | $ flye --pacbio-hifi pacbio-css.fq.gz --out-dir flye_out --threads 16 --meta --scaffold 49 | # Get circular DNA 50 | $ cd flye_out 51 | $ python circular_genomes_from_gfa.py 
assembly_graph.gfa 52 | ``` 53 | 54 | ### 结果 55 | 56 | - 输入文件夹: assembly_graph_circular 57 | - assembly_graph_circular_all.fna: 包含所有环形序列的fasta文件 58 | - assembly_graph_circular_all_info.tsv: 包含环形序列信息的tab分隔文件(ID, 长度) 59 | - edge_17343.fasta: 每个 *.fasta 文件包含一个环形序列 60 | - edge_129.fasta 61 | - edge_*.fasta 62 | -------------------------------------------------------------------------------- /circular_genomes_from_gfa/circular_genomes_from_gfa.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | """ 5 | Extract circular genomes from a GFA file 6 | Usage: python circular_genomes_from_gfa.py [output_dir] 7 | """ 8 | 9 | def usage(): 10 | print("Extract circular genomes from a GFA file") 11 | print("Usage: python circular_genomes_from_gfa.py [output_dir]") 12 | print("\tOptional: output_dir - directory to write output files to (default: _circular)") 13 | sys.exit(1) 14 | 15 | def get_args(): 16 | if len(sys.argv) < 2: 17 | usage() 18 | 19 | input_file = sys.argv[1] 20 | if not input_file.endswith('.gfa'): 21 | print("Error: Input file must be a .gfa file") 22 | usage() 23 | 24 | base = os.path.splitext(input_file)[0] 25 | 26 | output_dir = sys.argv[2] if len(sys.argv) > 2 else f"{base}_circular" 27 | fasta_file = os.path.join(output_dir, f'{base}_circular_all.fna') 28 | tsv_file = os.path.join(output_dir, f'{base}_circular_all_info.tsv') 29 | return input_file, output_dir, fasta_file, tsv_file 30 | 31 | def get_seqs(gfa_file): 32 | segments = {} # id -> sequence 33 | 34 | with open(gfa_file) as f: 35 | for line in f: 36 | if line.startswith('S'): # Segment line 37 | parts = line.strip().split('\t') 38 | seg_id, sequence = parts[1], parts[2] 39 | segments[seg_id] = sequence 40 | return segments 41 | 42 | def find_circular_paths(gfa_file): 43 | circular_paths = set() 44 | with open(gfa_file) as f: 45 | for line in f: 46 | if line.startswith('L'): # Link line 47 | parts = line.strip().split('\t') 48 | 
from_id, from_orient = parts[1], parts[2] 49 | to_id, to_orient = parts[3], parts[4] 50 | overlap = parts[5] 51 | if from_id == to_id and from_orient == to_orient and overlap == '0M': 52 | circular_paths.add(from_id) 53 | # if no circular paths found, exit with warning 54 | if not circular_paths: 55 | print("Warning: No circular paths found in the GFA file") 56 | sys.exit(1) 57 | return circular_paths 58 | 59 | def write_output(seq_dict, ids, output_dir, output_fasta, output_tsv): 60 | if not os.path.exists(output_dir): 61 | os.makedirs(output_dir) 62 | 63 | count = 1 64 | seq_len = {} 65 | for id in ids: 66 | sequence = seq_dict[id] 67 | seq_len[id] = len(sequence) 68 | # order by length 69 | sorted_ids = sorted(ids, key=lambda x: seq_len[x], reverse=True) 70 | with open(output_fasta, 'w') as ff, open(output_tsv, 'w') as tf: 71 | tf.write('#id\tSeqID\tLength(bp)\n') 72 | for id in sorted_ids: 73 | sequence = seq_dict[id] 74 | ff.write(f'>{id}\n{sequence}\n') 75 | tf.write(f'{count}\t{id}\t{seq_len[id]}\n') 76 | count += 1 77 | # write sequences to individual files 78 | for id in sorted_ids: 79 | with open(os.path.join(output_dir, f'{id}.fasta'), 'w') as f: 80 | f.write(f'>{id}\n{seq_dict[id]}\n') 81 | 82 | 83 | 84 | 85 | def main(): 86 | input_file, output_dir, fasta_file, tsv_file = get_args() 87 | 88 | # Parse GFA file 89 | segments_dict = get_seqs(input_file) 90 | 91 | # Find circular paths 92 | circular_edges = find_circular_paths(input_file) 93 | 94 | # Write output files 95 | write_output(segments_dict, circular_edges, output_dir, fasta_file, tsv_file) 96 | 97 | if __name__ == '__main__': 98 | main() -------------------------------------------------------------------------------- /download_uniprot_proteomes/README.md: -------------------------------------------------------------------------------- 1 | # Uniprot Proteome Downloader 2 | 3 | This script is used for batch retrieval proteomes in faa.gz format according to a list of Proteome identifiers (UPIDs) 4 
| 5 | ## Usage: 6 | 7 | ```bash 8 | $ python3 download_uniprot_proteomes_UPID.py input_list.txt output_dir 9 | ``` 10 | 11 | ## Sample input_list.txt 12 | 13 | ``` 14 | UP000000272 15 | UP000000391 16 | UP000000442 17 | ``` 18 | 19 | ## Chinese Usage 中文使用说明 20 | 21 | 该脚本可以通过一个包含Proteome identifiers (UPIDs)列表的文件来批量下载基因组文件。下载格式为faa.gz。 22 | 23 | ## 使用 24 | 25 | ```bash 26 | $ python3 download_uniprot_proteomes_UPID.py input_list.txt output_dir 27 | ``` 28 | 29 | ## 输入列表文件示例 30 | 31 | ``` 32 | UP000000272 33 | UP000000391 34 | UP000000442 35 | ``` 36 | -------------------------------------------------------------------------------- /download_uniprot_proteomes/download_uniprot_proteomes_UPID.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python3 2 | 3 | """ 4 | This script is used for batch retrieval proteomes in faa.gz format according 5 | to a list of Proteome identifiers (UPIDs) 6 | 7 | # Usage: 8 | $ python3 download_uniprot_proteomes_UPID.py input_list.txt output_dir 9 | 10 | input_list.txt sample: 11 | UP000000272 12 | UP000000391 13 | UP000000442 14 | """ 15 | 16 | import sys 17 | import os 18 | import requests 19 | 20 | __author__ = "Heyu Lin" 21 | __contact__ = "heyu.lin@student.unimelb.edu.au" 22 | 23 | list_file = sys.argv[1] 24 | output_dir = sys.argv[2] 25 | 26 | 27 | def request_proteome(upid, output_dir, num): 28 | base_url = 'https://www.uniprot.org/uniprot/?include=false&format=fasta&compress=yes&force=true&query=proteome:' 29 | request_url = base_url + upid 30 | try: 31 | r = requests.get(request_url, allow_redirects=True) 32 | r.raise_for_status() 33 | except requests.exceptions.HTTPError as http_err: 34 | raise SystemExit(f'HTTP error occurred: {http_err}') 35 | except Exception as err: 36 | raise SystemExit(f'Other error occurred: {err}') 37 | else: 38 | print(f'[{num}] {upid} - OK') 39 | 40 | # save the content with name 41 | open(os.path.join(output_dir, upid + '.faa.gz'), 
'wb').write(r.content) 42 | 43 | 44 | if __name__ == "__main__": 45 | if not os.path.exists(output_dir): 46 | os.makedirs(output_dir) 47 | 48 | # read input list 49 | with open(list_file) as f: 50 | upids = f.read().splitlines() 51 | print(str(len(upids)) + ' lines have been read. Request started...') 52 | 53 | # retreival 54 | num = 1 55 | for upid in upids: 56 | request_proteome(upid, output_dir, num) 57 | num += 1 58 | -------------------------------------------------------------------------------- /fasta-splitter/fasta_splitter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | """ 4 | A Python script for splitting a fasta format file into pieces by 5 | specifying the number of divided files or the number of sequences in divided files 6 | 7 | Inspired by Biopython wiki - https://biopython.org/wiki/Split_large_file 8 | 9 | ==Required: Biopython 10 | 11 | ==Options: 12 | -i, --input: input fasta file 13 | -o, --output: output directory 14 | -partn, --partnumber: number of files will be divided into 15 | -parts, --partseq: number of sequences will be put into every divided file 16 | 17 | ==Examples: 18 | 1. Divide a fasta file into <10> files, storing in 19 | python fasta_splitter.py -i input.fasta -o output_dir -partn 10 20 | 21 | 2. 
Divide a fasta file into files containing <1000> sequences in 22 | python fasta_splitter.py -i input.fasta -parts 1000 23 | """ 24 | 25 | import sys 26 | import os 27 | from math import ceil 28 | from Bio import SeqIO 29 | import argparse 30 | 31 | # 32 | __author__ = "Heyu Lin" 33 | __contact__ = "heyu.lin(AT)student.unimelb.edu.au" 34 | 35 | 36 | """ 37 | Arguments 38 | """ 39 | parser = argparse.ArgumentParser() 40 | parser.add_argument('-i', '--input', metavar='input_file', dest='i', 41 | type=str, required=True) 42 | parser.add_argument('-o', '--output', metavar='output_dir', dest='o', 43 | type=str, default='.') 44 | 45 | group = parser.add_mutually_exclusive_group(required=True) 46 | group.add_argument('-partn', '--partnumber', metavar='number_of_parts', dest='p', 47 | type=int) 48 | group.add_argument('-parts', '--partseq', metavar='number_of_seqences_in_every_part', dest='s', 49 | type=int) 50 | 51 | args = parser.parse_args() 52 | 53 | def batch_iterator(iterator, batch_size): 54 | """Returns lists of length batch_size. 55 | 56 | This can be used on any iterator, for example to batch up 57 | SeqRecord objects from Bio.SeqIO.parse(...), or to batch 58 | Alignment objects from Bio.AlignIO.parse(...), or simply 59 | lines from a file handle. 60 | 61 | This is a generator function, and it returns lists of the 62 | entries from the supplied iterator. Each list will have 63 | batch_size entries, although the final list may be shorter. 
64 | """ 65 | entry = True # Make sure we loop once 66 | while entry: 67 | batch = [] 68 | while len(batch) < batch_size: 69 | try: 70 | entry = next(iterator) 71 | except StopIteration: 72 | entry = None 73 | if entry is None: 74 | # End of file 75 | break 76 | batch.append(entry) 77 | if batch: 78 | yield batch 79 | 80 | 81 | def total_num_calc(fasta): 82 | """ 83 | Calculate total number of the given fasta file 84 | """ 85 | total_num = len([1 for line in open(fasta) if line.startswith(">")]) 86 | return total_num 87 | 88 | 89 | def splitter(input, num, outdir): 90 | """ 91 | split fasta sequences into pieces 92 | """ 93 | fname = os.path.basename(input) 94 | fbname, fename = os.path.splitext(fname) 95 | record_iter = SeqIO.parse(open(input),"fasta") 96 | for i, batch in enumerate(batch_iterator(record_iter, num)): 97 | filename = "{0}.p-{1}{2}".format(fbname, i + 1, fename) 98 | output = os.path.join(outdir, filename) 99 | with open(output, "w") as handle: 100 | count = SeqIO.write(batch, handle, "fasta") 101 | print("Wrote %i records to %s" % (count, output)) 102 | 103 | 104 | def main(): 105 | n_seq = 0 # Number of sequences in every divided files 106 | 107 | if args.p: 108 | total_num = total_num_calc(args.i) 109 | n_seq = ceil(total_num / args.p) 110 | elif args.s: 111 | n_seq = args.s 112 | 113 | splitter(args.i, n_seq, args.o) 114 | 115 | 116 | if __name__ == '__main__': 117 | main() -------------------------------------------------------------------------------- /prodigal-wrapper/prodigal_run.py: -------------------------------------------------------------------------------- 1 | from multiprocessing import Pool 2 | import os 3 | 4 | def prodigal(fasta, basename, outdir): 5 | cmd_para = [ 6 | 'prodigal', '-q', 7 | '-i', fasta, 8 | '-p', 'meta', 9 | '-a', os.path.join(outdir, basename + '.faa'), 10 | '-d', os.path.join(outdir, basename + '.ffn'), 11 | '-o', os.path.join(outdir, basename + '.gbk') 12 | ] 13 | cmd = ' '.join(cmd_para) 14 | try: 15 | 
print("\n" + 'ORFs prediction'.center(50, '*')) 16 | print(cmd + '\n') 17 | os.system(cmd) 18 | except: 19 | print("\nSomething wrong with prodigal annotation!") 20 | 21 | 22 | -------------------------------------------------------------------------------- /prokka2kegg/README.md: -------------------------------------------------------------------------------- 1 | # Prokka2KEGG 2 | This script is used to assign KO entries (K numbers in KEGG annotation) according to UniProtKB ID in the *.gbk file generated by `Prokka` 3 | 4 | ## Usage 5 | 6 | ### ~~Step 1: Download and initialize the cross-reference database provided by UniProt~~ 7 | 8 | ```bash 9 | $ wget ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/idmapping.dat.gz 10 | $ gzip -dc idmapping.dat.gz | awk '{if($2=="KO") print ​$1,$3}' OFS="\t" | gzip > idmapping_KO.tab.gz 11 | ``` 12 | ~~You could choose to remove 'idmapping.dat.gz' now.~~ 13 | 14 | ### Note 15 | 16 | UniProt has removed K numbers from their cross-reference database since early 2021. Now users have to download the formatted database (formatted in Jan 2019) from this repo ([idmapping_KO.tab.gz](https://github.com/SilentGene/Bio-py/blob/master/prokka2kegg/idmapping_KO.tab.gz)). 

### Step 2: Retrieve K numbers according to the UniProtKB IDs of proteins
```bash
$ python3 prokka2kegg.py -i input.gbk -d idmapping_KO.tab.gz -o output.txt
```

*This script will produce a json format database in the same folder of idmapping_KO.tab.gz for reuse, which may speed up the program when running next time.*

## Options

- `-i`: input gbk file generated by Prokka
- `-o`: output file with gene ids and K entries in tab-separated format
- `-d`: formatted cross-reference database from step 1 (or downloaded directly from my repo)

## Require

- Using **Python3**
- Works both on Windows and unix-like systems
- No 3rd party python modules required

## Sample Output:

| | |
| -------- | ------ |
| ORF_0001 | |
| ORF_0002 | K03152 |
| ORF_0003 | |
| ORF_0004 | K16331 |
| ORF_0005 | K01997 |

## Tips

There is another script `prokka2kegg_batch.py` which can help you handle many gbk files in a batch mode.
50 | 51 | # Chinese Usage 中文使用说明 52 | 53 | 这个脚本可以帮助你利用`Prokka`注释得到的gbk文件进行KEGG注释,得到每个ORF对应的KO号。 54 | 55 | ## 使用 56 | 57 | ### ~~第一步:下载和初始化Uniprot提供的数据库间对应的查询库~~ 58 | ```bash 59 | $ wget ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/idmapping.dat.gz 60 | $ gzip -dc idmapping.dat.gz | awk '{if($2=="KO") print ​$1,$3}' OFS="\t" | gzip > idmapping_KO.tab.gz 61 | ``` 62 | ## 说明 63 | 64 | 因为Uniprot在2021年上半年废弃了其数据库中的KO号信息,现在使用该脚本必须下载本github库中格式化好的数据库(下载于2019年1月)来运行脚本 ([idmapping_KO.tab.gz](https://github.com/SilentGene/Bio-py/blob/master/prokka2kegg/idmapping_KO.tab.gz))。 65 | 66 | ### 第二步: 通过每个ORF的UniProtKB IDs在数据库中查询对应的KO号 67 | 68 | ```bash 69 | $ python3 prokka2kegg.py -i input.gbk -d idmapping_KO.tab.gz -o output.txt 70 | ``` 71 | 72 | *脚本会在idmapping_KO.tab.gz所在的文件夹下产生一个json文件来加快下一次调用数据库查询时的速度。* 73 | 74 | ## 选项 75 | 76 | - `-i`: 输入文件,Prokka注释产生的gbk文件 77 | - `-o`: 输出的带有每个ORF ID和其对应KO号的tab分隔的文本文件 78 | - `-d`: 由第一步产生(或直接从我库中下载)的数据库文件 79 | 80 | ## 要求 81 | 82 | - 使用**Python3** 83 | - 在Windows和类unix系统中均可运行 84 | - 无需第三方python模块 85 | 86 | ## 输出示例: 87 | 88 | | | | 89 | | -------- | ------ | 90 | | ORF_0001 | | 91 | | ORF_0002 | K03152 | 92 | | ORF_0003 | | 93 | | ORF_0004 | K16331 | 94 | | ORF_0005 | K01997 | 95 | 96 | ## 提示 97 | 98 | 另外提供一个脚本`prokka2kegg_batch.py`来同时完成多个gbk文件的转换。 -------------------------------------------------------------------------------- /prokka2kegg/idmapping_KO.tab.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SilentGene/Bio-py/33f9827114723c9db661c80f7e13564f2375417a/prokka2kegg/idmapping_KO.tab.gz -------------------------------------------------------------------------------- /prokka2kegg/prokka2kegg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | """ 4 | Description: 5 | KO entries (K numbers in KEGG annotation) assignment 6 | according to UniProtKB ID in `Prokka` *.gbk 
file 7 | 8 | Usage: 9 | 10 | Step1: Download and initialize the cross-reference database provided by UniProt 11 | $ wget ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/idmapping.dat.gz 12 | $ gzip -dc idmapping.dat.gz | awk '{if($2=="KO") print $1,$3}' OFS="\t" | gzip > idmapping_KO.tab.gz 13 | You could choose to remove 'idmapping.dat.gz' now. 14 | 15 | Step2: Retrieve K numbers according to the UniProtKB IDs of proteins 16 | $ python3 gbk2kegg.py -i input.gbk -d idmapping_KO.tab.gz -o output.txt 17 | 18 | This script will produce a json format database in the same folder of 19 | idmapping_KO.tab.gz for reuse, which may speed up the program when 20 | running next time. 21 | """ 22 | 23 | import os 24 | import re 25 | import gzip 26 | import curses 27 | import argparse 28 | import json 29 | 30 | __author__ = "Heyu Lin" 31 | __contact__ = "heyu.lin(AT)student.unimelb.edu.au" 32 | 33 | parser = argparse.ArgumentParser() 34 | parser.add_argument('-i', '--input', metavar='input_gbk', dest='i', 35 | type=str, required=True) 36 | parser.add_argument('-o', '--output', metavar='output', dest='o', 37 | type=str, required=True) 38 | parser.add_argument('-d', '--data', metavar='idmapping.dat.gz', 39 | dest='d', type=str, 40 | help='UniProtKB cross-references database') 41 | args = parser.parse_args() 42 | 43 | 44 | def gbk_parser(gbk): 45 | """ 46 | gbk: gbk genome file generated by Prokka 47 | """ 48 | arr = [] # output array containing locus_tag and UniProtKB 49 | with open(gbk) as input: 50 | cds = 0 51 | locus = 0 52 | pattern_locus = re.compile('"(.*)"') 53 | pattern_uniprotkb = re.compile('UniProtKB:(.*)"') 54 | for line in input.readlines(): 55 | if line.startswith(' ' * 5 + 'CDS'): 56 | cds = 1 # This is a CDS 57 | if line.startswith(' ' * 21 + '/locus_tag=') and cds == 1: 58 | locus_tag = pattern_locus.findall(line)[0] 59 | locus = 1 # locus_tag was read 60 | if line.startswith(' ' * 21 + '/inference="similar to AA 
sequence:UniProtKB') and locus == 1: 61 | uniprotkb = pattern_uniprotkb.findall(line)[0] 62 | arr.append([locus_tag, uniprotkb]) 63 | cds = 0 64 | locus = 0 65 | if line.startswith(' ' * 21 + '/codon_start') and locus == 1: 66 | arr.append([locus_tag, '']) 67 | cds = 0 68 | locus = 0 69 | return arr 70 | 71 | 72 | def dict_initialize(gzfile): 73 | dict = {} 74 | with gzip.open(gzfile) as fi: 75 | for line in fi.readlines(): 76 | fields = line.decode('utf-8').strip().split('\t') 77 | if fields[0] not in dict: 78 | dict[fields[0]] = [fields[1]] 79 | else: 80 | dict[fields[0]].append(fields[1]) 81 | return dict 82 | 83 | 84 | def dict_load(json_file): 85 | with open(json_file, 'r') as f: 86 | r = json.load(f) 87 | return r 88 | 89 | 90 | def retrieve_KO(arr, dict): 91 | """ 92 | arr = [ 93 | ['AMLFNMKI_00003', ''], 94 | ['AMLFNMKI_00004', 'Q24SP7'] 95 | ] 96 | new_arr = [ 97 | ['AMLFNMKI_00025', 'Q01465', ['K03569']], 98 | ['AMLFNMKI_00026', 'P15639', ['K00602','K00604']] 99 | ['AMLFNMKI_00027', '', ''] 100 | ] 101 | """ 102 | new_arr = [] 103 | id_no_match = [] # record UniProtKB IDs have no corresponding KO numbers 104 | for cds in arr: 105 | if cds[1] == '': 106 | cds.append('') 107 | new_arr.append(cds) 108 | else: 109 | ko = dict.get(cds[1], None) 110 | if ko is None: 111 | id_no_match.append(cds[1]) 112 | cds.append('') 113 | new_arr.append(cds) 114 | else: 115 | cds.append(ko) 116 | new_arr.append(cds) 117 | # print(json.dumps(new_arr)) 118 | """ Report failure in search K numbers according to UniProtKB IDs 119 | print("Warning: The following " + str(len(id_no_match)) 120 | + " UniProtKB IDs don't have corresponding K numbers") 121 | print(' '.join(id_no_match)) 122 | """ 123 | return new_arr 124 | 125 | 126 | def write_json(content, outfile): 127 | with open(outfile, 'w') as fo: 128 | json.dump(content, fo) 129 | 130 | 131 | def output(arr, outfile): 132 | """ 133 | arr = [ 134 | ["AMLFNMKI_00025", Q01465, ["K03569"]], 135 | ["AMLFNMKI_00026", P15639, 
["K00602","K00604"]] 136 | ["AMLFNMKI_00027", "", ""] 137 | ] 138 | """ 139 | with open(outfile, 'w') as fo: 140 | for cds in arr: 141 | if cds[2] != "": 142 | for ko in cds[2]: 143 | fo.write(cds[0] + "\t" + ko + "\n") 144 | else: 145 | fo.write(cds[0] + "\n") 146 | 147 | 148 | def main(): 149 | if os.path.exists(args.d + '.json'): 150 | db_dict = dict_load(args.d + '.json') 151 | else: 152 | db_dict = dict_initialize(args.d) 153 | write_json(db_dict, args.d + '.json') 154 | mapping_array = gbk_parser(args.i) 155 | final_arr = retrieve_KO(mapping_array, db_dict) 156 | output(final_arr, args.o) 157 | 158 | 159 | if __name__ == '__main__': 160 | main() 161 | -------------------------------------------------------------------------------- /prokka2kegg/prokka2kegg_batch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | """ 4 | Description: 5 | KO entries (K numbers in KEGG annotation) assignment *in batch mode* 6 | according to UniProtKB ID in `Prokka` *.gbk files 7 | 8 | Usage: 9 | 10 | Step1: Download and initialize the cross-reference database provided by UniProt 11 | $ wget ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/idmapping.dat.gz 12 | $ gzip -dc idmapping.dat.gz | awk '{if($2=="KO") print $1,$3}' OFS="\t" | gzip > idmapping_KO.tab.gz 13 | You could choose to remove 'idmapping.dat.gz' now. 14 | 15 | Step2: Retrieve K numbers according to the UniProtKB IDs of proteins 16 | $ python3 gbk2kegg_batch.py -i input_dir -d idmapping_KO.tab.gz -o output_dir 17 | 18 | This script will produce a json format database in the same folder of 19 | idmapping_KO.tab.gz for reuse, which may speed up the program when 20 | running next time. 
21 | """ 22 | 23 | import os 24 | import re 25 | import gzip 26 | import curses 27 | import argparse 28 | import json 29 | 30 | __author__ = "Heyu Lin" 31 | __contact__ = "heyu.lin(AT)student.unimelb.edu.au" 32 | 33 | parser = argparse.ArgumentParser() 34 | parser.add_argument('-i', '--input', metavar='input_dir', dest='i', 35 | type=str, required=True, 36 | help='specify the directory containing *.gbk files') 37 | parser.add_argument('-o', '--output', metavar='output_dir', dest='o', 38 | type=str, required=True, 39 | help='output tab files will be produced in this directory') 40 | parser.add_argument('-d', '--data', metavar='idmapping_KO.tab.gz', 41 | dest='d', type=str, 42 | help='database generated accroding to "step1" instruction') 43 | args = parser.parse_args() 44 | 45 | 46 | def gbk_parser(gbk): 47 | """ 48 | gbk: gbk genome file generated by Prokka 49 | """ 50 | arr = [] # output array containing locus_tag and UniProtKB 51 | with open(gbk) as input: 52 | cds = 0 53 | locus = 0 54 | pattern_locus = re.compile('"(.*)"') 55 | pattern_uniprotkb = re.compile('UniProtKB:(.*)"') 56 | for line in input.readlines(): 57 | if line.startswith(' ' * 5 + 'CDS'): 58 | cds = 1 # This is a CDS 59 | if line.startswith(' ' * 21 + '/locus_tag=') and cds == 1: 60 | locus_tag = pattern_locus.findall(line)[0] 61 | locus = 1 # locus_tag was read 62 | if line.startswith(' ' * 21 + '/inference="similar to AA sequence:UniProtKB') and locus == 1: 63 | uniprotkb = pattern_uniprotkb.findall(line)[0] 64 | arr.append([locus_tag, uniprotkb]) 65 | cds = 0 66 | locus = 0 67 | if line.startswith(' ' * 21 + '/codon_start') and locus == 1: 68 | arr.append([locus_tag, '']) 69 | cds = 0 70 | locus = 0 71 | return arr 72 | 73 | 74 | def dict_initialize(gzfile): 75 | dict = {} 76 | with gzip.open(gzfile) as fi: 77 | for line in fi.readlines(): 78 | fields = line.decode('utf-8').strip().split('\t') 79 | if fields[0] not in dict: 80 | dict[fields[0]] = [fields[1]] 81 | else: 82 | 
dict[fields[0]].append(fields[1]) 83 | return dict 84 | 85 | 86 | def dict_load(json_file): 87 | with open(json_file, 'r') as f: 88 | r = json.load(f) 89 | return r 90 | 91 | 92 | def retrieve_KO(arr, dict): 93 | """ 94 | arr = [ 95 | ['AMLFNMKI_00003', ''], 96 | ['AMLFNMKI_00004', 'Q24SP7'] 97 | ] 98 | new_arr = [ 99 | ['AMLFNMKI_00025', 'Q01465', ['K03569']], 100 | ['AMLFNMKI_00026', 'P15639', ['K00602','K00604']] 101 | ['AMLFNMKI_00027', '', ''] 102 | ] 103 | """ 104 | new_arr = [] 105 | id_no_match = [] # record UniProtKB IDs have no corresponding KO numbers 106 | for cds in arr: 107 | if cds[1] == '': 108 | cds.append('') 109 | new_arr.append(cds) 110 | else: 111 | ko = dict.get(cds[1], None) 112 | if ko is None: 113 | id_no_match.append(cds[1]) 114 | cds.append('') 115 | new_arr.append(cds) 116 | else: 117 | cds.append(ko) 118 | new_arr.append(cds) 119 | # print(json.dumps(new_arr)) 120 | """ Report failure in search K numbers according to UniProtKB IDs 121 | print("Warning: The following " + str(len(id_no_match)) 122 | + " UniProtKB IDs don't have corresponding K numbers") 123 | print(' '.join(id_no_match)) 124 | """ 125 | return new_arr 126 | 127 | 128 | def write_json(content, outfile): 129 | with open(outfile, 'w') as fo: 130 | json.dump(content, fo) 131 | 132 | 133 | def output(arr, outfile): 134 | """ 135 | arr = [ 136 | ["AMLFNMKI_00025", Q01465, ["K03569"]], 137 | ["AMLFNMKI_00026", P15639, ["K00602","K00604"]] 138 | ["AMLFNMKI_00027", "", ""] 139 | ] 140 | """ 141 | with open(outfile, 'w') as fo: 142 | for cds in arr: 143 | if cds[2] != "": 144 | for ko in cds[2]: 145 | fo.write(cds[0] + "\t" + ko + "\n") 146 | else: 147 | fo.write(cds[0] + "\n") 148 | 149 | 150 | def get_input_files(dir): 151 | files = [] 152 | for fi in os.listdir(dir): 153 | fi_path = os.path.join(dir, fi) 154 | if os.path.isfile(fi_path) and os.path.splitext(fi)[1] == '.gbk': 155 | files.append(fi) 156 | return files 157 | 158 | 159 | def create_dir(dir): 160 | if not 
os.path.exists(dir): 161 | os.mkdir(dir) 162 | 163 | 164 | def main(): 165 | create_dir(args.o) 166 | if os.path.exists(args.d + '.json'): 167 | db_dict = dict_load(args.d + '.json') 168 | else: 169 | db_dict = dict_initialize(args.d) 170 | write_json(db_dict, args.d + '.json') 171 | gbks = get_input_files(args.i) 172 | print("{} gbk files have been read.".format(len(gbks))) 173 | for gbk in gbks: 174 | print("parsing {}...".format(gbk)) 175 | in_path = os.path.join(args.i, gbk) 176 | out_path = os.path.join(args.o, gbk) + ".ko.out" 177 | mapping_array = gbk_parser(in_path) 178 | final_arr = retrieve_KO(mapping_array, db_dict) 179 | output(final_arr, out_path) 180 | 181 | 182 | if __name__ == '__main__': 183 | main() 184 | -------------------------------------------------------------------------------- /prokka2kegg/sample.kegg.out.txt: -------------------------------------------------------------------------------- 1 | AHCGDBLN_00001 K05685 2 | AHCGDBLN_00002 K10804 3 | AHCGDBLN_00003 4 | AHCGDBLN_00004 5 | AHCGDBLN_00005 6 | AHCGDBLN_00006 7 | AHCGDBLN_00007 8 | AHCGDBLN_00008 9 | AHCGDBLN_00009 10 | AHCGDBLN_00010 11 | AHCGDBLN_00011 12 | AHCGDBLN_00012 13 | AHCGDBLN_00013 14 | AHCGDBLN_00014 15 | AHCGDBLN_00015 K02410 16 | AHCGDBLN_00016 K02409 17 | AHCGDBLN_00017 K02388 18 | AHCGDBLN_00018 K07714 19 | AHCGDBLN_00019 K07710 20 | AHCGDBLN_00020 21 | AHCGDBLN_00021 22 | AHCGDBLN_00022 23 | AHCGDBLN_00023 K10012 24 | AHCGDBLN_00024 25 | AHCGDBLN_00025 K07264 26 | AHCGDBLN_00026 K07264 27 | AHCGDBLN_00027 K16148 28 | AHCGDBLN_00028 29 | AHCGDBLN_00029 K19569 30 | AHCGDBLN_00030 31 | AHCGDBLN_00031 32 | AHCGDBLN_00032 K15914 33 | AHCGDBLN_00033 K03274 34 | AHCGDBLN_00034 35 | AHCGDBLN_00035 K00754 36 | AHCGDBLN_00036 K02844 37 | AHCGDBLN_00037 K16870 38 | AHCGDBLN_00038 39 | AHCGDBLN_00039 40 | AHCGDBLN_00040 41 | AHCGDBLN_00041 42 | AHCGDBLN_00042 K01482 43 | AHCGDBLN_00043 44 | AHCGDBLN_00044 K03671 45 | AHCGDBLN_00045 46 | AHCGDBLN_00046 47 | AHCGDBLN_00047 K01012 
48 | AHCGDBLN_00048 49 | AHCGDBLN_00049 K00800 50 | AHCGDBLN_00050 51 | AHCGDBLN_00051 52 | AHCGDBLN_00052 53 | AHCGDBLN_00053 54 | AHCGDBLN_00054 K00324 55 | AHCGDBLN_00055 K00325 56 | AHCGDBLN_00056 57 | AHCGDBLN_00057 58 | AHCGDBLN_00058 K00325 59 | AHCGDBLN_00059 K20932 60 | AHCGDBLN_00060 K07533 61 | AHCGDBLN_00061 62 | AHCGDBLN_00062 K04066 63 | AHCGDBLN_00063 64 | AHCGDBLN_00064 K03926 65 | AHCGDBLN_00065 K07147 66 | AHCGDBLN_00066 67 | AHCGDBLN_00067 68 | AHCGDBLN_00068 69 | AHCGDBLN_00069 70 | AHCGDBLN_00070 71 | AHCGDBLN_00071 72 | AHCGDBLN_00072 K01666 73 | AHCGDBLN_00073 K01625 74 | AHCGDBLN_00074 75 | AHCGDBLN_00075 K01299 76 | AHCGDBLN_00076 77 | AHCGDBLN_00077 K02203 78 | AHCGDBLN_00078 K03980 79 | AHCGDBLN_00079 80 | AHCGDBLN_00080 81 | AHCGDBLN_00081 82 | AHCGDBLN_00082 83 | AHCGDBLN_00083 84 | AHCGDBLN_00084 K06213 85 | AHCGDBLN_00085 86 | AHCGDBLN_00086 87 | AHCGDBLN_00087 88 | AHCGDBLN_00088 89 | AHCGDBLN_00089 K03685 90 | AHCGDBLN_00090 K09458 91 | AHCGDBLN_00091 K02078 92 | AHCGDBLN_00092 K00059 93 | AHCGDBLN_00093 K00645 94 | AHCGDBLN_00094 K00648 95 | AHCGDBLN_00095 K03621 96 | AHCGDBLN_00096 K02911 97 | AHCGDBLN_00097 K07040 98 | AHCGDBLN_00098 K04763 99 | AHCGDBLN_00099 K03841 100 | AHCGDBLN_00100 K01892 101 | AHCGDBLN_00101 K03711 102 | AHCGDBLN_00102 103 | AHCGDBLN_00103 104 | AHCGDBLN_00104 K01895 105 | AHCGDBLN_00105 106 | AHCGDBLN_00106 107 | AHCGDBLN_00107 108 | AHCGDBLN_00108 K02408 109 | AHCGDBLN_00109 K02387 110 | AHCGDBLN_00110 111 | AHCGDBLN_00111 112 | AHCGDBLN_00112 K01563 113 | AHCGDBLN_00113 K00648 114 | AHCGDBLN_00114 K11085 115 | AHCGDBLN_00115 116 | AHCGDBLN_00116 117 | AHCGDBLN_00117 K01407 118 | AHCGDBLN_00118 119 | AHCGDBLN_00119 K11210 120 | AHCGDBLN_00120 K03637 121 | AHCGDBLN_00121 K03636 122 | AHCGDBLN_00122 K03635 123 | AHCGDBLN_00123 124 | AHCGDBLN_00124 K16898 125 | AHCGDBLN_00125 K00384 126 | AHCGDBLN_00126 127 | AHCGDBLN_00127 K01733 128 | AHCGDBLN_00128 K00003 129 | AHCGDBLN_00129 K01423 130 | AHCGDBLN_00130 
K01681 131 | AHCGDBLN_00131 K06889 132 | AHCGDBLN_00132 133 | AHCGDBLN_00133 134 | AHCGDBLN_00134 135 | AHCGDBLN_00136 136 | AHCGDBLN_00137 137 | AHCGDBLN_00138 K09121 138 | AHCGDBLN_00139 139 | AHCGDBLN_00140 140 | AHCGDBLN_00141 141 | AHCGDBLN_00142 K03118 142 | AHCGDBLN_00143 143 | AHCGDBLN_00144 144 | AHCGDBLN_00145 K01808 145 | AHCGDBLN_00146 K00600 146 | AHCGDBLN_00147 K07738 147 | AHCGDBLN_00148 148 | AHCGDBLN_00149 K02492 149 | AHCGDBLN_00150 K01749 150 | AHCGDBLN_00151 K00767 151 | AHCGDBLN_00152 152 | AHCGDBLN_00153 153 | AHCGDBLN_00154 154 | AHCGDBLN_00155 155 | AHCGDBLN_00156 156 | AHCGDBLN_00157 K02406 157 | AHCGDBLN_00158 K07213 158 | AHCGDBLN_00159 159 | AHCGDBLN_00160 K03183 160 | AHCGDBLN_00161 K03152 161 | AHCGDBLN_00162 162 | AHCGDBLN_00163 K06998 163 | AHCGDBLN_00164 K01953 164 | AHCGDBLN_00165 K01662 165 | AHCGDBLN_00166 166 | AHCGDBLN_00167 167 | AHCGDBLN_00168 K13888 168 | AHCGDBLN_00169 K02003 169 | AHCGDBLN_00170 170 | AHCGDBLN_00171 K01665 171 | AHCGDBLN_00172 K00824 172 | AHCGDBLN_00173 173 | AHCGDBLN_00174 174 | AHCGDBLN_00175 175 | AHCGDBLN_00176 176 | AHCGDBLN_00177 K02169 177 | AHCGDBLN_00178 178 | AHCGDBLN_00179 K21377 179 | AHCGDBLN_00180 180 | AHCGDBLN_00181 K03686 181 | AHCGDBLN_00182 182 | AHCGDBLN_00183 183 | AHCGDBLN_00184 K00609 184 | AHCGDBLN_00185 K01465 185 | AHCGDBLN_00186 K01956 186 | AHCGDBLN_00187 187 | AHCGDBLN_00188 K01955 188 | AHCGDBLN_00189 189 | AHCGDBLN_00190 190 | AHCGDBLN_00191 K10773 191 | AHCGDBLN_00192 192 | AHCGDBLN_00193 193 | AHCGDBLN_00194 K00330 194 | AHCGDBLN_00195 K00331 195 | AHCGDBLN_00196 K00332 196 | AHCGDBLN_00197 K00333 197 | AHCGDBLN_00198 K00334 198 | AHCGDBLN_00199 K00335 199 | AHCGDBLN_00200 K05299 200 | AHCGDBLN_00201 K00337 201 | AHCGDBLN_00202 K00338 202 | AHCGDBLN_00203 K00339 203 | AHCGDBLN_00204 K00340 204 | AHCGDBLN_00205 K00341 205 | AHCGDBLN_00206 K00342 206 | AHCGDBLN_00207 K06206 207 | AHCGDBLN_00208 K13307 208 | AHCGDBLN_00209 209 | AHCGDBLN_00210 K00773 210 | AHCGDBLN_00211 
K00611 211 | AHCGDBLN_00212 K00821 212 | AHCGDBLN_00213 213 | AHCGDBLN_00214 K03316 214 | AHCGDBLN_00215 215 | AHCGDBLN_00216 K01662 216 | AHCGDBLN_00217 K01693 217 | AHCGDBLN_00218 K03564 218 | AHCGDBLN_00219 K01262 219 | AHCGDBLN_00220 220 | AHCGDBLN_00221 221 | AHCGDBLN_00222 K03702 222 | AHCGDBLN_00223 K03702 223 | AHCGDBLN_00224 K01693 224 | AHCGDBLN_00225 K02501 225 | AHCGDBLN_00226 226 | AHCGDBLN_00227 K05515 227 | AHCGDBLN_00228 228 | AHCGDBLN_00229 K03570 229 | AHCGDBLN_00230 K03569 230 | AHCGDBLN_00231 K03770 231 | AHCGDBLN_00232 K00941 232 | AHCGDBLN_00233 K00820 233 | AHCGDBLN_00234 K04042 234 | AHCGDBLN_00235 K04042 235 | AHCGDBLN_00236 K18707 236 | AHCGDBLN_00237 K03664 237 | AHCGDBLN_00238 238 | AHCGDBLN_00239 239 | AHCGDBLN_00240 240 | AHCGDBLN_00241 241 | AHCGDBLN_00242 242 | AHCGDBLN_00243 243 | AHCGDBLN_00244 K01963 244 | AHCGDBLN_00245 245 | AHCGDBLN_00246 K01613 246 | AHCGDBLN_00247 K00053 247 | AHCGDBLN_00248 K01653 248 | AHCGDBLN_00249 K10773 249 | AHCGDBLN_00250 K00989 250 | AHCGDBLN_00251 251 | AHCGDBLN_00252 K07301 252 | AHCGDBLN_00253 253 | AHCGDBLN_00254 K17828 254 | AHCGDBLN_00255 K01775 255 | AHCGDBLN_00256 256 | AHCGDBLN_00257 K16325 257 | AHCGDBLN_00258 258 | AHCGDBLN_00259 259 | AHCGDBLN_00260 260 | AHCGDBLN_00261 261 | AHCGDBLN_00262 K15352 262 | AHCGDBLN_00263 263 | AHCGDBLN_00264 K05851 264 | AHCGDBLN_00265 K02557 265 | AHCGDBLN_00266 K08218 266 | AHCGDBLN_00267 267 | AHCGDBLN_00268 268 | AHCGDBLN_00269 269 | AHCGDBLN_00270 270 | AHCGDBLN_00271 271 | AHCGDBLN_00272 272 | AHCGDBLN_00273 273 | AHCGDBLN_00274 274 | AHCGDBLN_00275 275 | AHCGDBLN_00276 276 | AHCGDBLN_00277 K03116 277 | AHCGDBLN_00278 278 | AHCGDBLN_00279 K00343 279 | AHCGDBLN_00280 K00342 280 | AHCGDBLN_00281 K00341 281 | AHCGDBLN_00282 282 | AHCGDBLN_00283 K00339 283 | AHCGDBLN_00284 284 | AHCGDBLN_00285 K00337 285 | AHCGDBLN_00286 286 | AHCGDBLN_00287 K00330 287 | AHCGDBLN_00289 288 | AHCGDBLN_00290 K07126 289 | AHCGDBLN_00291 K07126 290 | AHCGDBLN_00292 291 | 
AHCGDBLN_00293 292 | AHCGDBLN_00294 K07126 293 | AHCGDBLN_00295 294 | AHCGDBLN_00296 295 | AHCGDBLN_00297 296 | AHCGDBLN_00298 K14393 297 | AHCGDBLN_00299 298 | AHCGDBLN_00300 299 | AHCGDBLN_00301 300 | AHCGDBLN_00302 301 | AHCGDBLN_00303 302 | AHCGDBLN_00304 303 | AHCGDBLN_00305 K01953 304 | AHCGDBLN_00306 305 | AHCGDBLN_00307 306 | AHCGDBLN_00308 K19423 307 | AHCGDBLN_00309 K07126 308 | AHCGDBLN_00310 K00449 309 | AHCGDBLN_00311 310 | AHCGDBLN_00312 311 | AHCGDBLN_00313 312 | AHCGDBLN_00314 313 | AHCGDBLN_00315 314 | AHCGDBLN_00316 315 | AHCGDBLN_00317 316 | AHCGDBLN_00318 K16902 317 | AHCGDBLN_00319 318 | AHCGDBLN_00320 319 | AHCGDBLN_00321 K07126 320 | AHCGDBLN_00322 321 | AHCGDBLN_00323 322 | AHCGDBLN_00324 323 | AHCGDBLN_00325 324 | AHCGDBLN_00326 325 | AHCGDBLN_00327 326 | AHCGDBLN_00328 327 | AHCGDBLN_00329 328 | AHCGDBLN_00330 329 | AHCGDBLN_00331 330 | AHCGDBLN_00332 331 | AHCGDBLN_00333 332 | AHCGDBLN_00334 333 | AHCGDBLN_00335 334 | AHCGDBLN_00336 335 | AHCGDBLN_00337 336 | AHCGDBLN_00338 337 | AHCGDBLN_00339 338 | AHCGDBLN_00340 339 | AHCGDBLN_00341 K02835 340 | AHCGDBLN_00342 K02493 341 | AHCGDBLN_00343 K00790 342 | AHCGDBLN_00344 K02843 343 | AHCGDBLN_00345 344 | AHCGDBLN_00346 345 | AHCGDBLN_00347 K18979 346 | AHCGDBLN_00348 K01599 347 | AHCGDBLN_00349 K01772 348 | AHCGDBLN_00350 349 | AHCGDBLN_00351 K03657 350 | AHCGDBLN_00352 K03665 351 | AHCGDBLN_00353 352 | AHCGDBLN_00354 K03216 353 | AHCGDBLN_00355 354 | AHCGDBLN_00356 K21402 355 | AHCGDBLN_00357 K19422 356 | AHCGDBLN_00358 357 | AHCGDBLN_00359 358 | AHCGDBLN_00360 K01709 359 | AHCGDBLN_00361 K00978 360 | AHCGDBLN_00362 K01710 361 | AHCGDBLN_00363 K06148 362 | AHCGDBLN_00364 K02835 363 | AHCGDBLN_00365 364 | AHCGDBLN_00366 K03628 365 | AHCGDBLN_00367 366 | AHCGDBLN_00368 367 | AHCGDBLN_00369 368 | AHCGDBLN_00370 K02879 369 | AHCGDBLN_00371 K03040 370 | AHCGDBLN_00372 K02986 371 | AHCGDBLN_00373 K02948 372 | AHCGDBLN_00374 K02952 373 | AHCGDBLN_00375 K02518 374 | AHCGDBLN_00376 K00939 375 | 
AHCGDBLN_00377 K03076 376 | AHCGDBLN_00378 K02876 377 | AHCGDBLN_00379 K02988 378 | AHCGDBLN_00380 K02881 379 | AHCGDBLN_00381 380 | AHCGDBLN_00382 K02994 381 | AHCGDBLN_00383 K02931 382 | AHCGDBLN_00384 383 | AHCGDBLN_00385 K02874 384 | AHCGDBLN_00386 K02961 385 | AHCGDBLN_00387 386 | AHCGDBLN_00388 K02878 387 | AHCGDBLN_00389 K02982 388 | AHCGDBLN_00390 K02890 389 | AHCGDBLN_00391 390 | AHCGDBLN_00392 K02886 391 | AHCGDBLN_00393 K02892 392 | AHCGDBLN_00394 393 | AHCGDBLN_00395 K02906 394 | AHCGDBLN_00396 K02946 395 | AHCGDBLN_00397 K02358 396 | AHCGDBLN_00398 397 | AHCGDBLN_00399 K01358 398 | AHCGDBLN_00400 K03544 399 | AHCGDBLN_00401 400 | AHCGDBLN_00402 401 | AHCGDBLN_00403 402 | AHCGDBLN_00404 K06045 403 | AHCGDBLN_00405 K02871 404 | AHCGDBLN_00406 K02996 405 | AHCGDBLN_00407 K00145 406 | AHCGDBLN_00408 K00620 407 | AHCGDBLN_00409 K01462 408 | AHCGDBLN_00410 K00604 409 | AHCGDBLN_00411 K01784 410 | AHCGDBLN_00412 K03271 411 | AHCGDBLN_00413 K03272 412 | AHCGDBLN_00414 K03274 413 | AHCGDBLN_00415 414 | AHCGDBLN_00416 415 | AHCGDBLN_00417 416 | AHCGDBLN_00418 K00928 417 | AHCGDBLN_00419 K01649 418 | AHCGDBLN_00420 K01915 419 | AHCGDBLN_00421 420 | AHCGDBLN_00422 421 | AHCGDBLN_00423 K00748 422 | AHCGDBLN_00424 K09949 423 | AHCGDBLN_00425 K16043 424 | AHCGDBLN_00426 K00677 425 | AHCGDBLN_00427 K02372 426 | AHCGDBLN_00428 427 | AHCGDBLN_00429 K07277 428 | AHCGDBLN_00430 K03696 429 | AHCGDBLN_00431 K00548 430 | AHCGDBLN_00432 431 | AHCGDBLN_00433 K00605 432 | AHCGDBLN_00434 K02437 433 | AHCGDBLN_00435 K00282 434 | AHCGDBLN_00436 435 | AHCGDBLN_00437 436 | AHCGDBLN_00438 437 | AHCGDBLN_00439 438 | AHCGDBLN_00440 439 | AHCGDBLN_00441 440 | AHCGDBLN_00442 441 | AHCGDBLN_00443 442 | AHCGDBLN_00444 443 | AHCGDBLN_00445 444 | AHCGDBLN_00446 K07126 445 | AHCGDBLN_00447 446 | AHCGDBLN_00448 K15352 447 | AHCGDBLN_00449 448 | AHCGDBLN_00450 449 | AHCGDBLN_00451 450 | AHCGDBLN_00452 451 | AHCGDBLN_00453 452 | AHCGDBLN_00454 453 | AHCGDBLN_00455 454 | AHCGDBLN_00456 K01951 455 
| AHCGDBLN_00457 K00088 456 | AHCGDBLN_00458 K06920 457 | AHCGDBLN_00459 K03106 458 | AHCGDBLN_00460 K02959 459 | AHCGDBLN_00461 K06960 460 | AHCGDBLN_00462 K02860 461 | AHCGDBLN_00463 K00554 462 | AHCGDBLN_00464 463 | AHCGDBLN_00465 K02884 464 | AHCGDBLN_00466 465 | AHCGDBLN_00467 K21402 466 | AHCGDBLN_00468 K01423 467 | AHCGDBLN_00469 K03470 468 | AHCGDBLN_00470 K07462 469 | AHCGDBLN_00471 K06942 470 | AHCGDBLN_00472 K02575 471 | AHCGDBLN_00473 K01999 472 | AHCGDBLN_00474 K01997 473 | AHCGDBLN_00475 474 | AHCGDBLN_00476 K06861 475 | AHCGDBLN_00477 K01996 476 | AHCGDBLN_00478 477 | AHCGDBLN_00479 478 | AHCGDBLN_00480 K00943 479 | AHCGDBLN_00481 K03186 480 | AHCGDBLN_00482 K04487 481 | AHCGDBLN_00483 K03151 482 | AHCGDBLN_00484 483 | AHCGDBLN_00485 K13940 484 | AHCGDBLN_00486 K10206 485 | AHCGDBLN_00487 486 | AHCGDBLN_00488 487 | AHCGDBLN_00489 K03832 488 | AHCGDBLN_00490 489 | AHCGDBLN_00491 K02015 490 | AHCGDBLN_00492 K02013 491 | AHCGDBLN_00493 K06858 492 | AHCGDBLN_00494 493 | AHCGDBLN_00495 494 | AHCGDBLN_00496 K00974 495 | AHCGDBLN_00497 K04562 496 | AHCGDBLN_00498 497 | AHCGDBLN_00499 498 | AHCGDBLN_00500 499 | AHCGDBLN_00501 500 | AHCGDBLN_00502 K01867 501 | AHCGDBLN_00503 K01870 502 | AHCGDBLN_00504 K01870 503 | AHCGDBLN_00505 K03101 504 | AHCGDBLN_00506 K13292 505 | AHCGDBLN_00507 506 | AHCGDBLN_00508 507 | AHCGDBLN_00509 508 | AHCGDBLN_00510 509 | AHCGDBLN_00511 K03769 510 | AHCGDBLN_00512 511 | AHCGDBLN_00513 512 | AHCGDBLN_00514 513 | AHCGDBLN_00515 514 | AHCGDBLN_00516 K08884 515 | AHCGDBLN_00517 516 | AHCGDBLN_00518 517 | AHCGDBLN_00519 K00058 518 | AHCGDBLN_00520 K04034 519 | AHCGDBLN_00521 520 | AHCGDBLN_00522 521 | AHCGDBLN_00523 522 | AHCGDBLN_00524 K03271 523 | AHCGDBLN_00525 K00616 524 | AHCGDBLN_00526 K03273 525 | AHCGDBLN_00527 K00966 526 | AHCGDBLN_00528 K01710 527 | AHCGDBLN_00529 528 | AHCGDBLN_00530 K07031 529 | AHCGDBLN_00531 K19427 530 | AHCGDBLN_00532 K03639 531 | AHCGDBLN_00533 532 | AHCGDBLN_00534 533 | AHCGDBLN_00535 534 | 
AHCGDBLN_00536 535 | AHCGDBLN_00537 536 | AHCGDBLN_00538 K03593 537 | AHCGDBLN_00539 538 | AHCGDBLN_00540 539 | AHCGDBLN_00541 K00974 540 | AHCGDBLN_00542 541 | AHCGDBLN_00543 542 | AHCGDBLN_00544 543 | AHCGDBLN_00545 K03797 544 | AHCGDBLN_00546 K03673 545 | AHCGDBLN_00547 546 | AHCGDBLN_00548 K07053 547 | AHCGDBLN_00549 K03551 548 | AHCGDBLN_00550 K03550 549 | AHCGDBLN_00551 K01159 550 | AHCGDBLN_00552 551 | AHCGDBLN_00553 552 | AHCGDBLN_00554 K00278 553 | AHCGDBLN_00555 554 | AHCGDBLN_00556 555 | AHCGDBLN_00557 K03210 556 | AHCGDBLN_00558 K03072 557 | AHCGDBLN_00559 558 | AHCGDBLN_00560 559 | AHCGDBLN_00561 K04567 560 | AHCGDBLN_00562 K18682 561 | AHCGDBLN_00563 K01934 562 | AHCGDBLN_00564 563 | AHCGDBLN_00565 564 | AHCGDBLN_00566 565 | AHCGDBLN_00567 K01890 566 | AHCGDBLN_00568 567 | AHCGDBLN_00569 K07387 568 | AHCGDBLN_00570 569 | AHCGDBLN_00571 570 | AHCGDBLN_00572 571 | AHCGDBLN_00573 K11927 572 | AHCGDBLN_00574 K07461 573 | AHCGDBLN_00575 K09858 574 | AHCGDBLN_00576 K06994 575 | AHCGDBLN_00577 576 | AHCGDBLN_00578 577 | AHCGDBLN_00579 578 | AHCGDBLN_00580 K21271 579 | AHCGDBLN_00581 580 | AHCGDBLN_00582 581 | AHCGDBLN_00583 K11940 582 | AHCGDBLN_00584 583 | AHCGDBLN_00585 584 | AHCGDBLN_00586 K03525 585 | AHCGDBLN_00587 586 | AHCGDBLN_00588 587 | AHCGDBLN_00589 K07126 588 | AHCGDBLN_00590 K00891 589 | AHCGDBLN_00591 590 | AHCGDBLN_00592 K01247 591 | AHCGDBLN_00593 592 | AHCGDBLN_00594 593 | AHCGDBLN_00595 594 | AHCGDBLN_00596 K06219 595 | AHCGDBLN_00597 K02888 596 | AHCGDBLN_00598 K02899 597 | AHCGDBLN_00599 K03979 598 | AHCGDBLN_00600 599 | AHCGDBLN_00601 600 | AHCGDBLN_00602 K00979 601 | AHCGDBLN_00603 K01297 602 | AHCGDBLN_00604 K02500 603 | AHCGDBLN_00605 K01814 604 | AHCGDBLN_00606 K19889 605 | AHCGDBLN_00607 K01790 606 | AHCGDBLN_00608 607 | AHCGDBLN_00609 K00059 608 | AHCGDBLN_00610 609 | AHCGDBLN_00611 610 | AHCGDBLN_00612 611 | AHCGDBLN_00613 612 | AHCGDBLN_00614 613 | AHCGDBLN_00615 614 | AHCGDBLN_00617 615 | AHCGDBLN_00618 K07126 616 | 
AHCGDBLN_00619 K19789 617 | AHCGDBLN_00620 618 | AHCGDBLN_00621 K03427 619 | AHCGDBLN_00622 620 | AHCGDBLN_00623 621 | AHCGDBLN_00624 K07126 622 | AHCGDBLN_00625 K07126 623 | AHCGDBLN_00626 624 | AHCGDBLN_00627 625 | AHCGDBLN_00628 626 | AHCGDBLN_00629 K03686 627 | AHCGDBLN_00630 628 | AHCGDBLN_00631 629 | AHCGDBLN_00632 630 | AHCGDBLN_00633 631 | AHCGDBLN_00634 K07126 632 | AHCGDBLN_00635 633 | AHCGDBLN_00636 634 | AHCGDBLN_00637 635 | AHCGDBLN_00638 K01872 636 | AHCGDBLN_00639 K06041 637 | AHCGDBLN_00640 K02335 638 | AHCGDBLN_00641 639 | AHCGDBLN_00642 K07305 640 | AHCGDBLN_00643 641 | AHCGDBLN_00644 642 | AHCGDBLN_00645 K15977 643 | AHCGDBLN_00646 644 | AHCGDBLN_00647 645 | AHCGDBLN_00648 646 | AHCGDBLN_00649 647 | AHCGDBLN_00650 K03704 648 | AHCGDBLN_00651 649 | AHCGDBLN_00652 650 | AHCGDBLN_00653 651 | AHCGDBLN_00654 652 | AHCGDBLN_00655 K09771 653 | AHCGDBLN_00656 654 | AHCGDBLN_00657 655 | AHCGDBLN_00658 K15352 656 | AHCGDBLN_00659 657 | AHCGDBLN_00660 658 | AHCGDBLN_00661 659 | AHCGDBLN_00662 660 | AHCGDBLN_00663 K15383 661 | AHCGDBLN_00664 K15352 662 | AHCGDBLN_00665 663 | AHCGDBLN_00666 K07783 664 | AHCGDBLN_00667 K18911 665 | AHCGDBLN_00668 666 | AHCGDBLN_00669 667 | AHCGDBLN_00670 668 | AHCGDBLN_00671 K19428 669 | AHCGDBLN_00672 K18235 670 | AHCGDBLN_00673 671 | AHCGDBLN_00674 K02469 672 | AHCGDBLN_00675 673 | AHCGDBLN_00678 674 | AHCGDBLN_00681 K02343 675 | AHCGDBLN_00682 K09747 676 | AHCGDBLN_00683 K06187 677 | AHCGDBLN_00685 678 | AHCGDBLN_00686 K07714 679 | AHCGDBLN_00687 K10914 680 | AHCGDBLN_00688 K10914 681 | AHCGDBLN_00689 K08303 682 | AHCGDBLN_00690 683 | AHCGDBLN_00691 684 | AHCGDBLN_00692 K15352 685 | AHCGDBLN_00693 686 | AHCGDBLN_00694 K01834 687 | AHCGDBLN_00695 688 | AHCGDBLN_00696 K06442 689 | AHCGDBLN_00697 K03602 690 | AHCGDBLN_00698 K03601 691 | AHCGDBLN_00699 K01491 692 | AHCGDBLN_00700 693 | AHCGDBLN_00701 694 | AHCGDBLN_00702 695 | AHCGDBLN_00703 696 | AHCGDBLN_00705 697 | AHCGDBLN_00706 698 | AHCGDBLN_00707 K03113 699 | 
AHCGDBLN_00708 700 | AHCGDBLN_00709 701 | AHCGDBLN_00710 K02111 702 | AHCGDBLN_00711 K02115 703 | AHCGDBLN_00712 K02112 704 | AHCGDBLN_00713 K02114 705 | AHCGDBLN_00714 706 | AHCGDBLN_00715 707 | AHCGDBLN_00716 K03701 708 | AHCGDBLN_00717 K03182 709 | AHCGDBLN_00718 K00971 710 | AHCGDBLN_00719 711 | AHCGDBLN_00720 K18889 712 | AHCGDBLN_00721 K06147 713 | AHCGDBLN_00722 K00147 714 | AHCGDBLN_00723 715 | AHCGDBLN_00724 716 | AHCGDBLN_00725 K03168 717 | AHCGDBLN_00726 K04094 718 | AHCGDBLN_00727 K02470 719 | AHCGDBLN_00728 K03593 720 | AHCGDBLN_00729 K04039 721 | AHCGDBLN_00730 722 | AHCGDBLN_00731 723 | AHCGDBLN_00732 K03593 724 | AHCGDBLN_00733 K00058 725 | AHCGDBLN_00734 K07533 726 | AHCGDBLN_00735 727 | AHCGDBLN_00736 728 | AHCGDBLN_00737 729 | AHCGDBLN_00738 730 | AHCGDBLN_00739 731 | AHCGDBLN_00740 732 | AHCGDBLN_00741 733 | AHCGDBLN_00742 734 | AHCGDBLN_00743 735 | AHCGDBLN_00744 736 | AHCGDBLN_00745 737 | AHCGDBLN_00746 K03439 738 | AHCGDBLN_00747 739 | AHCGDBLN_00748 740 | AHCGDBLN_00749 741 | AHCGDBLN_00750 K08884 742 | AHCGDBLN_00751 743 | AHCGDBLN_00752 744 | AHCGDBLN_00753 745 | AHCGDBLN_00754 746 | AHCGDBLN_00755 747 | AHCGDBLN_00756 748 | AHCGDBLN_00757 749 | AHCGDBLN_00758 750 | AHCGDBLN_00759 751 | AHCGDBLN_00760 752 | AHCGDBLN_00761 753 | AHCGDBLN_00762 754 | AHCGDBLN_00763 755 | AHCGDBLN_00764 756 | AHCGDBLN_00765 757 | AHCGDBLN_00767 758 | AHCGDBLN_00768 759 | AHCGDBLN_00769 K06179 760 | AHCGDBLN_00770 K07789 761 | AHCGDBLN_00771 K01993 762 | AHCGDBLN_00772 K06969 763 | AHCGDBLN_00773 K01897 764 | AHCGDBLN_00774 765 | AHCGDBLN_00775 766 | AHCGDBLN_00776 767 | AHCGDBLN_00777 768 | AHCGDBLN_00778 K04562 769 | AHCGDBLN_00779 770 | AHCGDBLN_00780 K02405 771 | AHCGDBLN_00781 K12974 772 | AHCGDBLN_00782 K00997 773 | AHCGDBLN_00783 K07566 774 | AHCGDBLN_00784 K01588 775 | AHCGDBLN_00785 776 | AHCGDBLN_00786 K03742 777 | AHCGDBLN_00787 K19225 778 | AHCGDBLN_00788 779 | AHCGDBLN_00789 780 | AHCGDBLN_00790 K11752 781 | AHCGDBLN_00791 K00793 782 | 
AHCGDBLN_00792 K14652 783 | AHCGDBLN_00793 K00794 784 | AHCGDBLN_00794 K03625 785 | AHCGDBLN_00795 786 | AHCGDBLN_00796 787 | AHCGDBLN_00797 788 | AHCGDBLN_00798 K04764 789 | AHCGDBLN_00799 K02355 790 | AHCGDBLN_00800 K03977 791 | AHCGDBLN_00801 K01448 792 | AHCGDBLN_00802 793 | AHCGDBLN_00803 K03722 794 | AHCGDBLN_00804 795 | AHCGDBLN_00805 K02042 796 | AHCGDBLN_00806 797 | AHCGDBLN_00807 K01591 798 | AHCGDBLN_00808 799 | AHCGDBLN_00809 800 | AHCGDBLN_00810 K00537 801 | AHCGDBLN_00811 K03529 802 | AHCGDBLN_00812 803 | AHCGDBLN_00813 K00568 804 | AHCGDBLN_00814 805 | AHCGDBLN_00815 K10012 806 | AHCGDBLN_00816 K12902 807 | AHCGDBLN_00817 808 | AHCGDBLN_00818 809 | AHCGDBLN_00819 K01710 810 | AHCGDBLN_00820 811 | AHCGDBLN_00821 K10816 812 | AHCGDBLN_00822 813 | AHCGDBLN_00823 K01737 814 | AHCGDBLN_00824 K11745 815 | AHCGDBLN_00825 816 | AHCGDBLN_00826 817 | AHCGDBLN_00827 818 | AHCGDBLN_00828 819 | AHCGDBLN_00829 820 | AHCGDBLN_00830 821 | AHCGDBLN_00831 K01243 822 | AHCGDBLN_00832 K03215 823 | AHCGDBLN_00833 K01893 824 | AHCGDBLN_00834 825 | AHCGDBLN_00835 826 | AHCGDBLN_00836 K09131 827 | AHCGDBLN_00837 K08963 828 | AHCGDBLN_00838 K00966 829 | AHCGDBLN_00839 830 | AHCGDBLN_00840 K07533 831 | AHCGDBLN_00841 832 | AHCGDBLN_00842 833 | AHCGDBLN_00843 834 | AHCGDBLN_00844 K06287 835 | AHCGDBLN_00845 836 | AHCGDBLN_00846 837 | AHCGDBLN_00847 838 | AHCGDBLN_00848 839 | AHCGDBLN_00849 K18430 840 | AHCGDBLN_00850 841 | AHCGDBLN_00851 842 | AHCGDBLN_00852 843 | AHCGDBLN_00853 K03453 844 | AHCGDBLN_00854 K16868 845 | AHCGDBLN_00855 846 | AHCGDBLN_00856 847 | AHCGDBLN_00857 K00428 848 | AHCGDBLN_00858 849 | AHCGDBLN_00859 850 | AHCGDBLN_00860 851 | AHCGDBLN_00861 852 | AHCGDBLN_00862 K02600 853 | AHCGDBLN_00863 854 | AHCGDBLN_00864 855 | AHCGDBLN_00865 K02834 856 | AHCGDBLN_00866 K03177 857 | AHCGDBLN_00867 858 | AHCGDBLN_00868 K00962 859 | AHCGDBLN_00869 K03787 860 | AHCGDBLN_00870 K03386 861 | AHCGDBLN_00871 K06167 862 | AHCGDBLN_00872 863 | AHCGDBLN_00873 K01935 864 | 
AHCGDBLN_00874 K07658 865 | AHCGDBLN_00875 K07636 866 | AHCGDBLN_00876 867 | AHCGDBLN_00877 868 | AHCGDBLN_00878 869 | AHCGDBLN_00879 870 | AHCGDBLN_00880 K00655 871 | AHCGDBLN_00881 872 | AHCGDBLN_00882 K03564 873 | AHCGDBLN_00883 K07390 874 | AHCGDBLN_00884 875 | AHCGDBLN_00885 876 | AHCGDBLN_00886 877 | AHCGDBLN_00887 K01657 878 | AHCGDBLN_00888 K01664 879 | AHCGDBLN_00889 K00766 880 | AHCGDBLN_00890 K01609 881 | AHCGDBLN_00891 K01817 882 | AHCGDBLN_00892 K01696 883 | AHCGDBLN_00893 K01695 884 | AHCGDBLN_00894 885 | AHCGDBLN_00895 886 | AHCGDBLN_00896 K08884 887 | AHCGDBLN_00897 888 | AHCGDBLN_00898 K12339 889 | AHCGDBLN_00899 K21029 890 | AHCGDBLN_00900 K03636 891 | AHCGDBLN_00901 K01733 892 | AHCGDBLN_00902 K01738 893 | AHCGDBLN_00903 K02453 894 | AHCGDBLN_00904 895 | AHCGDBLN_00905 896 | AHCGDBLN_00906 K01698 897 | AHCGDBLN_00907 898 | AHCGDBLN_00908 K21464 899 | AHCGDBLN_00909 900 | AHCGDBLN_00910 K01810 901 | AHCGDBLN_00911 K00616 902 | AHCGDBLN_00912 K01940 903 | AHCGDBLN_00913 K01999 904 | AHCGDBLN_00914 K03116 905 | AHCGDBLN_00915 906 | AHCGDBLN_00916 907 | AHCGDBLN_00917 908 | AHCGDBLN_00918 K05366 909 | AHCGDBLN_00919 910 | AHCGDBLN_00920 K01652 911 | AHCGDBLN_00921 912 | AHCGDBLN_00922 913 | AHCGDBLN_00923 914 | AHCGDBLN_00924 915 | AHCGDBLN_00925 K04034 916 | AHCGDBLN_00926 917 | AHCGDBLN_00927 K07126 918 | AHCGDBLN_00928 919 | AHCGDBLN_00929 K12902 920 | AHCGDBLN_00930 921 | AHCGDBLN_00931 K13309 922 | AHCGDBLN_00932 K12902 923 | AHCGDBLN_00933 K05527 924 | AHCGDBLN_00934 925 | AHCGDBLN_00935 926 | AHCGDBLN_00936 927 | AHCGDBLN_00937 928 | AHCGDBLN_00938 K04043 929 | AHCGDBLN_00939 K03687 930 | AHCGDBLN_00940 K03686 931 | AHCGDBLN_00941 932 | AHCGDBLN_00942 933 | AHCGDBLN_00943 K00721 934 | AHCGDBLN_00944 K00266 935 | AHCGDBLN_00945 K00284 936 | AHCGDBLN_00946 937 | AHCGDBLN_00947 K00602 938 | AHCGDBLN_00948 K04034 939 | AHCGDBLN_00949 940 | AHCGDBLN_00950 K22320 941 | AHCGDBLN_00951 K10780 942 | AHCGDBLN_00952 943 | AHCGDBLN_00953 944 | 
AHCGDBLN_00954 945 | AHCGDBLN_00955 946 | AHCGDBLN_00956 947 | AHCGDBLN_00957 948 | AHCGDBLN_00958 K01129 949 | AHCGDBLN_00959 950 | AHCGDBLN_00960 K06077 951 | AHCGDBLN_00961 952 | AHCGDBLN_00962 953 | AHCGDBLN_00963 K01520 954 | AHCGDBLN_00964 955 | AHCGDBLN_00965 956 | AHCGDBLN_00966 957 | AHCGDBLN_00967 958 | AHCGDBLN_00968 959 | AHCGDBLN_00969 K01990 960 | AHCGDBLN_00970 K00010 961 | AHCGDBLN_00971 K02406 962 | AHCGDBLN_00972 963 | AHCGDBLN_00973 K00796 964 | AHCGDBLN_00974 K03474 965 | AHCGDBLN_00975 966 | AHCGDBLN_00976 967 | AHCGDBLN_00977 K01803 968 | AHCGDBLN_00978 K03075 969 | AHCGDBLN_00980 970 | AHCGDBLN_00981 971 | AHCGDBLN_00982 K00382 972 | AHCGDBLN_00983 973 | AHCGDBLN_00984 K02635 974 | AHCGDBLN_00985 K03798 975 | AHCGDBLN_00986 K04075 976 | AHCGDBLN_00987 K07533 977 | AHCGDBLN_00988 K03530 978 | AHCGDBLN_00989 K07568 979 | AHCGDBLN_00990 K07568 980 | AHCGDBLN_00991 981 | AHCGDBLN_00992 K00012 982 | AHCGDBLN_00993 983 | AHCGDBLN_00994 K09022 984 | AHCGDBLN_00995 K03584 985 | AHCGDBLN_00996 K00567 986 | AHCGDBLN_00997 987 | AHCGDBLN_00998 988 | AHCGDBLN_00999 989 | AHCGDBLN_01000 K04773 990 | AHCGDBLN_01001 991 | AHCGDBLN_01002 K04034 992 | AHCGDBLN_01003 993 | AHCGDBLN_01004 994 | AHCGDBLN_01005 995 | AHCGDBLN_01006 K00754 996 | AHCGDBLN_01007 K01057 997 | AHCGDBLN_01008 998 | AHCGDBLN_01009 K17947 999 | AHCGDBLN_01010 K02988 1000 | AHCGDBLN_01011 K02876 1001 | AHCGDBLN_01012 K03076 1002 | AHCGDBLN_01013 1003 | AHCGDBLN_01014 K02518 1004 | AHCGDBLN_01015 1005 | AHCGDBLN_01016 K00790 1006 | AHCGDBLN_01017 K02493 1007 | AHCGDBLN_01018 K02835 1008 | AHCGDBLN_01019 1009 | AHCGDBLN_01020 K03628 1010 | AHCGDBLN_01021 1011 | AHCGDBLN_01022 K02879 1012 | AHCGDBLN_01023 K03040 1013 | AHCGDBLN_01024 K02986 1014 | AHCGDBLN_01025 1015 | AHCGDBLN_01026 1016 | AHCGDBLN_01027 1017 | AHCGDBLN_01028 1018 | AHCGDBLN_01029 1019 | AHCGDBLN_01030 1020 | AHCGDBLN_01031 K06871 1021 | AHCGDBLN_01032 1022 | AHCGDBLN_01033 1023 | AHCGDBLN_01034 1024 | AHCGDBLN_01035 1025 | 
AHCGDBLN_01036 K04771 1026 | AHCGDBLN_01037 1027 | AHCGDBLN_01038 1028 | AHCGDBLN_01039 1029 | AHCGDBLN_01040 K02236 1030 | AHCGDBLN_01041 K06147 1031 | AHCGDBLN_01042 1032 | AHCGDBLN_01043 1033 | AHCGDBLN_01044 1034 | AHCGDBLN_01045 1035 | AHCGDBLN_01046 1036 | AHCGDBLN_01047 K21464 1037 | AHCGDBLN_01048 1038 | AHCGDBLN_01049 1039 | AHCGDBLN_01051 1040 | AHCGDBLN_01052 1041 | AHCGDBLN_01053 K01872 1042 | AHCGDBLN_01054 K06041 1043 | AHCGDBLN_01055 K00012 1044 | AHCGDBLN_01056 1045 | AHCGDBLN_01057 K07568 1046 | AHCGDBLN_01058 K03530 1047 | AHCGDBLN_01059 K05589 1048 | AHCGDBLN_01060 1049 | AHCGDBLN_01061 1050 | AHCGDBLN_01062 1051 | AHCGDBLN_01063 1052 | AHCGDBLN_01064 1053 | AHCGDBLN_01065 1054 | AHCGDBLN_01066 1055 | AHCGDBLN_01067 1056 | AHCGDBLN_01068 1057 | AHCGDBLN_01069 1058 | AHCGDBLN_01070 1059 | AHCGDBLN_01071 1060 | AHCGDBLN_01072 1061 | AHCGDBLN_01073 1062 | AHCGDBLN_01074 1063 | AHCGDBLN_01075 1064 | AHCGDBLN_01076 1065 | AHCGDBLN_01077 K01939 1066 | AHCGDBLN_01078 1067 | AHCGDBLN_01079 1068 | AHCGDBLN_01080 1069 | AHCGDBLN_01081 1070 | AHCGDBLN_01082 1071 | AHCGDBLN_01083 K06889 1072 | AHCGDBLN_01084 1073 | AHCGDBLN_01085 1074 | AHCGDBLN_01086 1075 | AHCGDBLN_01087 1076 | AHCGDBLN_01088 1077 | AHCGDBLN_01089 1078 | AHCGDBLN_01090 1079 | AHCGDBLN_01091 1080 | AHCGDBLN_01092 1081 | AHCGDBLN_01093 1082 | AHCGDBLN_01094 1083 | AHCGDBLN_01095 1084 | AHCGDBLN_01096 1085 | AHCGDBLN_01097 1086 | AHCGDBLN_01098 K01150 1087 | AHCGDBLN_01099 1088 | AHCGDBLN_01100 1089 | AHCGDBLN_01101 K07400 1090 | AHCGDBLN_01102 1091 | AHCGDBLN_01103 K21140 1092 | AHCGDBLN_01104 1093 | AHCGDBLN_01105 1094 | AHCGDBLN_01106 K03569 1095 | AHCGDBLN_01107 1096 | AHCGDBLN_01108 1097 | AHCGDBLN_01111 K09936 1098 | AHCGDBLN_01112 K03217 1099 | AHCGDBLN_01113 K08998 1100 | AHCGDBLN_01114 K03536 1101 | AHCGDBLN_01115 1102 | AHCGDBLN_01116 1103 | AHCGDBLN_01117 1104 | AHCGDBLN_01118 1105 | AHCGDBLN_01119 K03100 1106 | AHCGDBLN_01120 K03596 1107 | AHCGDBLN_01121 1108 | AHCGDBLN_01122 1109 
| AHCGDBLN_01123 1110 | AHCGDBLN_01124 K15521 1111 | AHCGDBLN_01125 1112 | AHCGDBLN_01126 1113 | AHCGDBLN_01127 1114 | AHCGDBLN_01128 1115 | AHCGDBLN_01129 1116 | AHCGDBLN_01130 K11936 1117 | AHCGDBLN_01131 1118 | AHCGDBLN_01132 K07126 1119 | AHCGDBLN_01133 1120 | AHCGDBLN_01134 1121 | AHCGDBLN_01135 1122 | AHCGDBLN_01136 1123 | AHCGDBLN_01137 1124 | AHCGDBLN_01138 1125 | AHCGDBLN_01139 1126 | AHCGDBLN_01140 1127 | AHCGDBLN_01141 1128 | AHCGDBLN_01142 K21464 1129 | AHCGDBLN_01143 1130 | AHCGDBLN_01144 K00931 1131 | AHCGDBLN_01145 K00147 1132 | AHCGDBLN_01146 K00969 1133 | AHCGDBLN_01147 K09710 1134 | AHCGDBLN_01148 K03797 1135 | AHCGDBLN_01149 K08311 1136 | AHCGDBLN_01150 1137 | AHCGDBLN_01151 1138 | AHCGDBLN_01152 1139 | AHCGDBLN_01153 1140 | AHCGDBLN_01154 K00010 1141 | AHCGDBLN_01155 K13019 1142 | AHCGDBLN_01156 1143 | AHCGDBLN_01157 K20573 1144 | AHCGDBLN_01158 1145 | AHCGDBLN_01159 1146 | AHCGDBLN_01160 1147 | AHCGDBLN_01161 1148 | AHCGDBLN_01162 1149 | AHCGDBLN_01163 1150 | AHCGDBLN_01164 1151 | AHCGDBLN_01165 1152 | AHCGDBLN_01166 K07533 1153 | AHCGDBLN_01167 K00058 1154 | AHCGDBLN_01168 K00831 1155 | AHCGDBLN_01169 K04771 1156 | AHCGDBLN_01170 1157 | AHCGDBLN_01171 1158 | AHCGDBLN_01172 1159 | AHCGDBLN_01173 1160 | AHCGDBLN_01174 1161 | AHCGDBLN_01175 1162 | AHCGDBLN_01176 1163 | AHCGDBLN_01177 1164 | AHCGDBLN_01178 1165 | AHCGDBLN_01179 1166 | AHCGDBLN_01180 1167 | AHCGDBLN_01181 1168 | AHCGDBLN_01182 1169 | AHCGDBLN_01183 1170 | AHCGDBLN_01184 K01520 1171 | AHCGDBLN_01185 1172 | AHCGDBLN_01186 1173 | AHCGDBLN_01187 1174 | AHCGDBLN_01188 1175 | AHCGDBLN_01189 1176 | AHCGDBLN_01190 1177 | AHCGDBLN_01191 1178 | AHCGDBLN_01192 1179 | AHCGDBLN_01193 K12420 1180 | AHCGDBLN_01194 1181 | AHCGDBLN_01195 K01790 1182 | AHCGDBLN_01196 K19889 1183 | AHCGDBLN_01197 K01814 1184 | AHCGDBLN_01198 1185 | AHCGDBLN_01199 K21131 1186 | AHCGDBLN_01200 K15669 1187 | AHCGDBLN_01201 K12454 1188 | AHCGDBLN_01202 1189 | AHCGDBLN_01203 1190 | AHCGDBLN_01204 K02469 1191 | 
AHCGDBLN_01205 K02470 1192 | AHCGDBLN_01206 K00052 1193 | AHCGDBLN_01207 K00133 1194 | AHCGDBLN_01208 1195 | AHCGDBLN_01209 1196 | AHCGDBLN_01210 1197 | AHCGDBLN_01211 1198 | AHCGDBLN_01212 1199 | AHCGDBLN_01213 1200 | AHCGDBLN_01214 1201 | AHCGDBLN_01215 1202 | AHCGDBLN_01216 K06190 1203 | AHCGDBLN_01217 1204 | AHCGDBLN_01218 1205 | AHCGDBLN_01219 K00382 1206 | AHCGDBLN_01220 1207 | AHCGDBLN_01221 K04773 1208 | AHCGDBLN_01222 1209 | AHCGDBLN_01223 1210 | AHCGDBLN_01224 1211 | AHCGDBLN_01225 1212 | AHCGDBLN_01226 K00567 1213 | AHCGDBLN_01227 1214 | AHCGDBLN_01228 1215 | AHCGDBLN_01229 1216 | AHCGDBLN_01230 K07126 1217 | AHCGDBLN_01231 1218 | AHCGDBLN_01232 1219 | AHCGDBLN_01233 1220 | AHCGDBLN_01234 K02453 1221 | AHCGDBLN_01235 1222 | AHCGDBLN_01237 1223 | AHCGDBLN_01238 1224 | AHCGDBLN_01239 1225 | AHCGDBLN_01240 1226 | AHCGDBLN_01241 K03526 1227 | AHCGDBLN_01242 K03545 1228 | AHCGDBLN_01243 K22360 1229 | AHCGDBLN_01244 1230 | AHCGDBLN_01245 K00789 1231 | AHCGDBLN_01246 K08483 1232 | AHCGDBLN_01247 1233 | AHCGDBLN_01248 K03979 1234 | AHCGDBLN_01249 K02899 1235 | AHCGDBLN_01250 K02888 1236 | AHCGDBLN_01251 K06219 1237 | AHCGDBLN_01252 1238 | AHCGDBLN_01253 1239 | AHCGDBLN_01254 1240 | AHCGDBLN_01255 K01247 1241 | AHCGDBLN_01256 1242 | AHCGDBLN_01257 K03559 1243 | AHCGDBLN_01258 1244 | AHCGDBLN_01259 1245 | AHCGDBLN_01260 1246 | AHCGDBLN_01261 1247 | AHCGDBLN_01262 K02013 1248 | AHCGDBLN_01263 K22305 1249 | AHCGDBLN_01264 1250 | AHCGDBLN_01265 1251 | AHCGDBLN_01266 1252 | AHCGDBLN_01267 1253 | AHCGDBLN_01268 1254 | AHCGDBLN_01269 1255 | AHCGDBLN_01270 1256 | AHCGDBLN_01271 1257 | AHCGDBLN_01272 1258 | AHCGDBLN_01273 1259 | AHCGDBLN_01274 1260 | AHCGDBLN_01275 K13894 1261 | AHCGDBLN_01276 K13895 1262 | AHCGDBLN_01277 K00616 1263 | AHCGDBLN_01278 K01940 1264 | AHCGDBLN_01279 1265 | AHCGDBLN_01280 1266 | AHCGDBLN_01281 1267 | AHCGDBLN_01282 K12902 1268 | AHCGDBLN_01283 K02503 1269 | AHCGDBLN_01284 1270 | AHCGDBLN_01285 K01057 1271 | AHCGDBLN_01286 1272 | AHCGDBLN_01287 
K10823 1273 | AHCGDBLN_01288 1274 | AHCGDBLN_01289 K03644 1275 | AHCGDBLN_01290 1276 | AHCGDBLN_01291 1277 | AHCGDBLN_01292 K01937 1278 | AHCGDBLN_01293 1279 | AHCGDBLN_01294 K00059 1280 | AHCGDBLN_01295 1281 | AHCGDBLN_01296 K07806 1282 | AHCGDBLN_01297 1283 | AHCGDBLN_01298 1284 | AHCGDBLN_01299 1285 | AHCGDBLN_01300 1286 | AHCGDBLN_01301 1287 | AHCGDBLN_01302 1288 | AHCGDBLN_01304 K02401 1289 | AHCGDBLN_01305 K22509 1290 | AHCGDBLN_01306 1291 | AHCGDBLN_01307 K02419 1292 | AHCGDBLN_01308 1293 | AHCGDBLN_01309 K02417 1294 | AHCGDBLN_01310 1295 | AHCGDBLN_01311 1296 | AHCGDBLN_01312 1297 | AHCGDBLN_01313 1298 | AHCGDBLN_01314 1299 | AHCGDBLN_01315 1300 | AHCGDBLN_01316 K00721 1301 | AHCGDBLN_01317 K06173 1302 | AHCGDBLN_01318 K03657 1303 | AHCGDBLN_01319 1304 | AHCGDBLN_01320 1305 | AHCGDBLN_01321 K00573 1306 | AHCGDBLN_01322 1307 | AHCGDBLN_01323 K01126 1308 | AHCGDBLN_01325 1309 | AHCGDBLN_01326 K07708 1310 | AHCGDBLN_01327 1311 | AHCGDBLN_01328 K19699 1312 | AHCGDBLN_01329 1313 | AHCGDBLN_01330 1314 | AHCGDBLN_01331 1315 | AHCGDBLN_01332 K11065 1316 | AHCGDBLN_01333 1317 | AHCGDBLN_01334 1318 | AHCGDBLN_01335 1319 | AHCGDBLN_01336 1320 | AHCGDBLN_01337 K04772 1321 | AHCGDBLN_01338 1322 | AHCGDBLN_01339 1323 | AHCGDBLN_01340 1324 | AHCGDBLN_01341 1325 | AHCGDBLN_01342 K00772 1326 | AHCGDBLN_01343 K00772 1327 | AHCGDBLN_01344 K01845 1328 | AHCGDBLN_01345 K07281 1329 | AHCGDBLN_01345 K07291 1330 | AHCGDBLN_01346 K02041 1331 | AHCGDBLN_01347 K02044 1332 | AHCGDBLN_01348 K01129 1333 | AHCGDBLN_01349 1334 | AHCGDBLN_01350 1335 | AHCGDBLN_01351 1336 | AHCGDBLN_01352 1337 | AHCGDBLN_01353 1338 | AHCGDBLN_01354 1339 | AHCGDBLN_01355 1340 | AHCGDBLN_01356 1341 | AHCGDBLN_01357 1342 | AHCGDBLN_01358 1343 | AHCGDBLN_01359 1344 | AHCGDBLN_01360 K00343 1345 | AHCGDBLN_01361 1346 | AHCGDBLN_01362 K03430 1347 | AHCGDBLN_01363 K11936 1348 | AHCGDBLN_01364 1349 | AHCGDBLN_01365 1350 | AHCGDBLN_01366 K01990 1351 | AHCGDBLN_01367 K00010 1352 | AHCGDBLN_01368 1353 | AHCGDBLN_01369 
1354 | AHCGDBLN_01370 1355 | AHCGDBLN_01371 1356 | AHCGDBLN_01372 1357 | AHCGDBLN_01373 1358 | AHCGDBLN_01374 1359 | AHCGDBLN_01375 1360 | AHCGDBLN_01376 1361 | AHCGDBLN_01377 1362 | AHCGDBLN_01378 1363 | AHCGDBLN_01379 1364 | AHCGDBLN_01380 1365 | AHCGDBLN_01381 1366 | AHCGDBLN_01382 1367 | AHCGDBLN_01383 K01885 1368 | AHCGDBLN_01384 K03775 1369 | AHCGDBLN_01385 1370 | AHCGDBLN_01386 1371 | AHCGDBLN_01387 1372 | AHCGDBLN_01388 1373 | AHCGDBLN_01389 1374 | AHCGDBLN_01390 1375 | AHCGDBLN_01391 1376 | AHCGDBLN_01392 K03553 1377 | AHCGDBLN_01393 1378 | AHCGDBLN_01394 K12573 1379 | AHCGDBLN_01395 1380 | AHCGDBLN_01396 K02435 1381 | AHCGDBLN_01397 K02433 1382 | AHCGDBLN_01398 K02401 1383 | AHCGDBLN_01399 1384 | AHCGDBLN_01400 1385 | AHCGDBLN_01401 1386 | AHCGDBLN_01402 1387 | AHCGDBLN_01403 1388 | AHCGDBLN_01404 1389 | AHCGDBLN_01405 1390 | AHCGDBLN_01406 1391 | AHCGDBLN_01407 1392 | AHCGDBLN_01408 1393 | AHCGDBLN_01409 1394 | AHCGDBLN_01410 1395 | AHCGDBLN_01411 1396 | AHCGDBLN_01412 1397 | AHCGDBLN_01413 1398 | AHCGDBLN_01414 1399 | AHCGDBLN_01415 K19889 1400 | AHCGDBLN_01416 1401 | AHCGDBLN_01417 K03190 1402 | AHCGDBLN_01418 1403 | AHCGDBLN_01419 1404 | AHCGDBLN_01420 1405 | AHCGDBLN_01421 K01627 1406 | AHCGDBLN_01422 K09767 1407 | AHCGDBLN_01423 1408 | AHCGDBLN_01424 1409 | AHCGDBLN_01425 1410 | AHCGDBLN_01426 1411 | AHCGDBLN_01427 1412 | AHCGDBLN_01428 1413 | AHCGDBLN_01429 K01714 1414 | AHCGDBLN_01430 K00215 1415 | AHCGDBLN_01431 1416 | AHCGDBLN_01432 1417 | AHCGDBLN_01433 1418 | AHCGDBLN_01434 1419 | AHCGDBLN_01436 K15343 1420 | AHCGDBLN_01437 K03567 1421 | AHCGDBLN_01438 K00382 1422 | AHCGDBLN_01439 1423 | AHCGDBLN_01440 1424 | AHCGDBLN_01441 K02406 1425 | AHCGDBLN_01442 K13668 1426 | AHCGDBLN_01443 K00712 1427 | AHCGDBLN_01444 1428 | AHCGDBLN_01445 1429 | AHCGDBLN_01446 K18429 1430 | AHCGDBLN_01447 K00966 1431 | AHCGDBLN_01448 1432 | AHCGDBLN_01449 1433 | AHCGDBLN_01450 1434 | AHCGDBLN_01451 1435 | AHCGDBLN_01452 K08306 1436 | AHCGDBLN_01453 1437 | 
AHCGDBLN_01454 K03281 1438 | AHCGDBLN_01455 K15256 1439 | AHCGDBLN_01456 1440 | AHCGDBLN_01457 1441 | AHCGDBLN_01458 K12944 1442 | AHCGDBLN_01459 1443 | AHCGDBLN_01460 1444 | AHCGDBLN_01461 1445 | AHCGDBLN_01462 K12713 1446 | AHCGDBLN_01463 K03671 1447 | AHCGDBLN_01464 1448 | AHCGDBLN_01465 1449 | AHCGDBLN_01466 K00605 1450 | AHCGDBLN_01467 K07277 1451 | AHCGDBLN_01468 1452 | AHCGDBLN_01469 1453 | AHCGDBLN_01470 1454 | AHCGDBLN_01471 1455 | AHCGDBLN_01472 1456 | AHCGDBLN_01473 1457 | AHCGDBLN_01474 1458 | AHCGDBLN_01475 K07806 1459 | AHCGDBLN_01476 1460 | AHCGDBLN_01477 1461 | AHCGDBLN_01478 1462 | AHCGDBLN_01479 1463 | AHCGDBLN_01480 1464 | AHCGDBLN_01481 1465 | AHCGDBLN_01482 1466 | AHCGDBLN_01483 1467 | AHCGDBLN_01484 1468 | AHCGDBLN_01485 K00853 1469 | AHCGDBLN_01486 1470 | AHCGDBLN_01487 1471 | AHCGDBLN_01488 1472 | AHCGDBLN_01489 1473 | AHCGDBLN_01490 1474 | AHCGDBLN_01491 1475 | AHCGDBLN_01492 1476 | AHCGDBLN_01493 1477 | AHCGDBLN_01494 1478 | AHCGDBLN_01495 K08591 1479 | AHCGDBLN_01498 K00167 1480 | AHCGDBLN_01499 1481 | AHCGDBLN_01500 1482 | AHCGDBLN_01501 1483 | AHCGDBLN_01502 K03734 1484 | AHCGDBLN_01503 K17686 1485 | AHCGDBLN_01504 K02109 1486 | AHCGDBLN_01505 1487 | AHCGDBLN_01506 1488 | AHCGDBLN_01507 1489 | AHCGDBLN_01508 1490 | AHCGDBLN_01509 1491 | AHCGDBLN_01510 1492 | AHCGDBLN_01511 1493 | AHCGDBLN_01512 1494 | AHCGDBLN_01513 1495 | AHCGDBLN_01514 1496 | AHCGDBLN_01515 K01886 1497 | AHCGDBLN_01516 1498 | AHCGDBLN_01517 1499 | AHCGDBLN_01518 1500 | AHCGDBLN_01519 1501 | AHCGDBLN_01520 1502 | AHCGDBLN_01521 1503 | AHCGDBLN_01522 1504 | AHCGDBLN_01523 K03924 1505 | AHCGDBLN_01524 1506 | AHCGDBLN_01525 1507 | AHCGDBLN_01526 1508 | AHCGDBLN_01527 K03650 1509 | AHCGDBLN_01528 1510 | AHCGDBLN_01529 1511 | AHCGDBLN_01530 1512 | AHCGDBLN_01531 1513 | AHCGDBLN_01532 1514 | AHCGDBLN_01533 1515 | AHCGDBLN_01534 K02500 1516 | AHCGDBLN_01535 1517 | AHCGDBLN_01536 1518 | AHCGDBLN_01537 1519 | AHCGDBLN_01538 1520 | AHCGDBLN_01539 1521 | AHCGDBLN_01540 1522 | 
AHCGDBLN_01541 1523 | AHCGDBLN_01542 1524 | AHCGDBLN_01543 K04771 1525 | AHCGDBLN_01544 1526 | AHCGDBLN_01545 1527 | AHCGDBLN_01546 1528 | AHCGDBLN_01547 1529 | AHCGDBLN_01548 K03687 1530 | AHCGDBLN_01549 K02355 1531 | AHCGDBLN_01550 K01881 1532 | AHCGDBLN_01551 K09748 1533 | AHCGDBLN_01552 1534 | AHCGDBLN_01553 K08884 1535 | AHCGDBLN_01554 K01710 1536 | AHCGDBLN_01555 1537 | AHCGDBLN_01556 K06898 1538 | AHCGDBLN_01557 K22320 1539 | AHCGDBLN_01558 1540 | AHCGDBLN_01559 1541 | AHCGDBLN_01560 1542 | AHCGDBLN_01561 1543 | AHCGDBLN_01562 1544 | AHCGDBLN_01563 1545 | AHCGDBLN_01564 K01783 1546 | AHCGDBLN_01565 1547 | AHCGDBLN_01566 1548 | AHCGDBLN_01567 K03667 1549 | AHCGDBLN_01568 K00930 1550 | AHCGDBLN_01569 1551 | AHCGDBLN_01570 1552 | AHCGDBLN_01571 1553 | AHCGDBLN_01572 K20534 1554 | AHCGDBLN_01573 1555 | AHCGDBLN_01574 1556 | AHCGDBLN_01575 1557 | AHCGDBLN_01576 1558 | AHCGDBLN_01577 K04034 1559 | AHCGDBLN_01578 K02902 1560 | AHCGDBLN_01579 1561 | AHCGDBLN_01580 1562 | AHCGDBLN_01581 1563 | AHCGDBLN_01582 K05807 1564 | AHCGDBLN_01583 1565 | AHCGDBLN_01584 K01778 1566 | AHCGDBLN_01585 K03310 1567 | AHCGDBLN_01586 K03110 1568 | AHCGDBLN_01587 K00615 1569 | AHCGDBLN_01589 1570 | AHCGDBLN_01590 1571 | AHCGDBLN_01591 K06194 1572 | AHCGDBLN_01592 1573 | AHCGDBLN_01593 1574 | AHCGDBLN_01594 1575 | AHCGDBLN_01595 1576 | AHCGDBLN_01596 1577 | AHCGDBLN_01597 1578 | AHCGDBLN_01598 K05589 1579 | AHCGDBLN_01599 K02416 1580 | AHCGDBLN_01600 1581 | AHCGDBLN_01601 1582 | AHCGDBLN_01602 1583 | AHCGDBLN_01603 1584 | AHCGDBLN_01604 K01709 1585 | AHCGDBLN_01605 1586 | AHCGDBLN_01606 1587 | AHCGDBLN_01607 1588 | AHCGDBLN_01608 1589 | AHCGDBLN_01609 1590 | AHCGDBLN_01610 1591 | AHCGDBLN_01611 K07533 1592 | AHCGDBLN_01612 1593 | AHCGDBLN_01613 K13888 1594 | AHCGDBLN_01614 K02404 1595 | AHCGDBLN_01615 1596 | AHCGDBLN_01616 K05299 1597 | AHCGDBLN_01617 1598 | AHCGDBLN_01618 1599 | AHCGDBLN_01619 1600 | AHCGDBLN_01620 1601 | AHCGDBLN_01621 1602 | AHCGDBLN_01622 1603 | AHCGDBLN_01623 1604 | 
AHCGDBLN_01624 1605 | AHCGDBLN_01625 1606 | AHCGDBLN_01626 1607 | AHCGDBLN_01627 1608 | AHCGDBLN_01628 1609 | AHCGDBLN_01629 K00979 1610 | AHCGDBLN_01630 1611 | AHCGDBLN_01631 1612 | AHCGDBLN_01632 K00101 1613 | AHCGDBLN_01633 K02400 1614 | AHCGDBLN_01634 K18430 1615 | AHCGDBLN_01635 1616 | AHCGDBLN_01636 1617 | AHCGDBLN_01637 1618 | AHCGDBLN_01638 1619 | AHCGDBLN_01639 K02314 1620 | AHCGDBLN_01640 1621 | AHCGDBLN_01642 1622 | AHCGDBLN_01643 1623 | AHCGDBLN_01644 1624 | AHCGDBLN_01645 1625 | AHCGDBLN_01646 1626 | AHCGDBLN_01647 1627 | AHCGDBLN_01648 K02598 1628 | AHCGDBLN_01649 1629 | AHCGDBLN_01650 1630 | AHCGDBLN_01651 K06890 1631 | AHCGDBLN_01652 1632 | AHCGDBLN_01653 1633 | AHCGDBLN_01654 K02954 1634 | AHCGDBLN_01655 1635 | AHCGDBLN_01656 1636 | AHCGDBLN_01657 1637 | AHCGDBLN_01658 1638 | AHCGDBLN_01659 1639 | AHCGDBLN_01660 1640 | AHCGDBLN_01661 1641 | AHCGDBLN_01663 1642 | AHCGDBLN_01664 1643 | AHCGDBLN_01665 K13038 1644 | AHCGDBLN_01666 1645 | AHCGDBLN_01667 1646 | AHCGDBLN_01668 K07668 1647 | AHCGDBLN_01669 1648 | AHCGDBLN_01670 1649 | AHCGDBLN_01671 1650 | AHCGDBLN_01672 1651 | AHCGDBLN_01673 1652 | AHCGDBLN_01674 1653 | -------------------------------------------------------------------------------- /remove_duplicate_seqs/README.md: -------------------------------------------------------------------------------- 1 | # Remove duplicate sequences 2 | Remove duplicate sequences from one or several multifasta files. 3 | According to the **id** in the header or the **sequence** itself. 4 | 5 | ## Require 6 | - `Biopython` module required 7 | - Using **Python3** 8 | - Works both on Windows and Unix-like systems 9 | 10 | ## Usage 11 | 12 | Filter according to the sequence id: 13 | 14 | ```bash 15 | $python3 remove_duplicate_seqs.py --id input.fa [input2.fa ...] > output.fa 16 | ``` 17 | or filter according to the sequence itself: 18 | ```bash 19 | $python3 remove_duplicate_seqs.py --seq input.fa [input2.fa ...] 
> output.fa 20 | ``` 21 | 22 | ## Note 23 | - `--id` or `--seq` are necessary and should be right put following the name of the script 24 | - The result will be sent to *stdout* by default, so please use `>` if you want to redirect the output. 25 | 26 | # Chinese Usage 中文使用说明 27 | 28 | 本脚本能够在一个或多个fasta格式的文本文件中清除重复的序列 29 | 可以根据序列的id或者根据序列本身来去除这种冗余 30 | 31 | ## 要求 32 | 33 | - 使用**Python3** 34 | - 需要调用`Biopython` 35 | - 在Windows和类Unix系统中均可运行 36 | 37 | ## 使用 38 | 39 | 通过序列的id号来过滤: 40 | ```bash 41 | $python3 remove_duplicate_seqs.py --id input.fa [input2.fa ...] > output.fa 42 | ``` 43 | 44 | 通过序列本身来过滤: 45 | ```bash 46 | $python3 remove_duplicate_seqs.py --seq input.fa [input2.fa ...] > output.fa 47 | ``` 48 | 49 | ## 注意 50 | 51 | - 参数`--id` 或者 `--seq`必须指定其一,并且它只能被置于第一个参数的位置(脚本名之后) 52 | - 结果默认打印输出到`stdout`,请使用`>`来重定向结果。 -------------------------------------------------------------------------------- /remove_duplicate_seqs/remove_duplicate_seqs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | """ 4 | Description: 5 | Remove duplicate sequences from one or several multifasta files. 6 | According to the id in the header or the sequence itself. 7 | 8 | Usage: 9 | Filter according to the sequence id: 10 | $python3 remove_duplicate_seqs.py --id input.fa [input2.fa ...] > output.fa 11 | 12 | or filter according to the sequence itself: 13 | $python3 remove_duplicate_seqs.py --seq input.fa [input2.fa ...] 
#!/usr/bin/python3

"""
Remove duplicate sequences from one or several multifasta files,
deduplicating either by the record id/description (--id) or by the
sequence string itself (--seq).

Usage:
    python3 remove_duplicate_seqs.py --id  input.fa [input2.fa ...] > output.fa
    python3 remove_duplicate_seqs.py --seq input.fa [input2.fa ...] > output.fa
"""
import sys
import textwrap

__author__ = "Heyu Lin"
__contact__ = "heyu.lin@student.unimelb.edu.au"


def arg_parser(arr):
    """Validate an argv-style list and return ``(method, input_files)``.

    Parameters:
        arr: list of strings shaped like ``sys.argv`` — ``arr[1]`` must be
            ``'--id'`` or ``'--seq'``, ``arr[2:]`` the input fasta paths.

    Raises:
        Exception: if the method flag is missing/unknown, or if no input
            file is given.  (Previously, calling the script with too few
            arguments crashed with a bare IndexError because ``arr[1]`` /
            ``arr[2]`` were read before any length check.)
    """
    if len(arr) < 2 or arr[1] not in ('--id', '--seq'):
        raise Exception('Please indicate the filter method by --id or --seq')
    if len(arr) < 3:
        raise Exception('Please indicate the input fasta file(s)')
    return arr[1], arr[2:]


def seqs_parser(method, files):
    """Parse fasta ``files`` into a dict keyed so duplicates collapse.

    method == '--id':  returns {description: sequence} — records sharing a
        description are deduplicated (the last occurrence wins).
    method == '--seq': returns {sequence: description} — records sharing a
        sequence are deduplicated (the last occurrence wins).
    """
    # Imported lazily so that argument errors are still reported cleanly
    # on systems without Biopython installed.
    from Bio import SeqIO

    rec_dic = {}
    for fasfile in files:
        for seq_record in SeqIO.parse(fasfile, "fasta"):
            if method == '--id':
                rec_dic[str(seq_record.description)] = str(seq_record.seq)
            else:  # method == '--seq'
                rec_dic[str(seq_record.seq)] = str(seq_record.description)
    return rec_dic


def main():
    """Entry point: parse args, deduplicate, print fasta records to stdout."""
    method, inputs = arg_parser(sys.argv)
    rec_dic = seqs_parser(method, inputs)
    for key, value in rec_dic.items():
        # Dict orientation depends on the dedup method (see seqs_parser).
        header, seq = (key, value) if method == '--id' else (value, key)
        print('>' + header)
        # textwrap.fill wraps the sequence at the default width of 70 chars.
        print(textwrap.fill(seq))


if __name__ == '__main__':
    main()