├── .gitignore ├── Dockerfile ├── LICENSE.md ├── MANIFEST.in ├── README.md ├── __init__.py ├── blobtools ├── data └── __init__.py ├── example ├── .DS_Store ├── assembly.fna ├── assembly.list.txt ├── blast.out ├── blobDB.json ├── blobDB.table.txt ├── blobplot.png ├── catcolour.txt ├── colours.txt ├── diamond.out ├── mapping_1.bam ├── mapping_1.sorted.bam ├── mapping_1.sorted.bam.bai ├── mapping_2.bam ├── mapping_2.sorted.bam ├── mapping_2.sorted.bam.bai └── refcov.txt ├── lib ├── BtCore.py ├── BtIO.py ├── BtLog.py ├── BtPlot.py ├── BtTax.py ├── __init__.py ├── bamfilter.py ├── blobplot.py ├── covplot.py ├── create.py ├── interface.py ├── map2cov.py ├── nodesdb.py ├── seqfilter.py ├── taxify.py └── view.py ├── requirements.txt ├── setup.cfg ├── setup.py └── test └── meta.json /.gitignore: -------------------------------------------------------------------------------- 1 | !*.md 2 | !*.py 3 | !*install 4 | !lib/* 5 | !data/ 6 | !example/* 7 | example/*.stats.txt 8 | example/a* 9 | example/test* 10 | .DS_Store 11 | !setup* 12 | !requirements.txt 13 | !blobtools 14 | !MANIFEST.in 15 | data/n* 16 | samtools/ 17 | *.pyc 18 | *.gz 19 | *.fq 20 | *.png 21 | !blobplot.png 22 | *.sam 23 | 24 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM continuumio/miniconda3 2 | MAINTAINER Nick Waters 3 | RUN conda install -c anaconda matplotlib docopt tqdm wget pyyaml git 4 | RUN conda install -c bioconda pysam --update-deps 5 | RUN git clone https://github.com/DRL/blobtools.git 6 | WORKDIR blobtools 7 | 8 | RUN ./blobtools -h 9 | # RUN ./blobtools create -i example/assembly.fna -b example/mapping_1.bam -t example/blast.out -o example/test 10 | RUN wget ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz -P data/ 11 | RUN tar zxf data/taxdump.tar.gz -C data/ nodes.dmp names.dmp 12 | RUN ./blobtools nodesdb --nodes data/nodes.dmp --names 
data/names.dmp 13 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | Preamble 9 | 10 | The GNU General Public License is a free, copyleft license for 11 | software and other kinds of works. 12 | 13 | The licenses for most software and other practical works are designed 14 | to take away your freedom to share and change the works. By contrast, 15 | the GNU General Public License is intended to guarantee your freedom to 16 | share and change all versions of a program--to make sure it remains free 17 | software for all its users. We, the Free Software Foundation, use the 18 | GNU General Public License for most of our software; it applies also to 19 | any other work released this way by its authors. You can apply it to 20 | your programs, too. 21 | 22 | When we speak of free software, we are referring to freedom, not 23 | price. Our General Public Licenses are designed to make sure that you 24 | have the freedom to distribute copies of free software (and charge for 25 | them if you wish), that you receive source code or can get it if you 26 | want it, that you can change the software or use pieces of it in new 27 | free programs, and that you know you can do these things. 28 | 29 | To protect your rights, we need to prevent others from denying you 30 | these rights or asking you to surrender the rights. Therefore, you have 31 | certain responsibilities if you distribute copies of the software, or if 32 | you modify it: responsibilities to respect the freedom of others. 
33 | 34 | For example, if you distribute copies of such a program, whether 35 | gratis or for a fee, you must pass on to the recipients the same 36 | freedoms that you received. You must make sure that they, too, receive 37 | or can get the source code. And you must show them these terms so they 38 | know their rights. 39 | 40 | Developers that use the GNU GPL protect your rights with two steps: 41 | (1) assert copyright on the software, and (2) offer you this License 42 | giving you legal permission to copy, distribute and/or modify it. 43 | 44 | For the developers' and authors' protection, the GPL clearly explains 45 | that there is no warranty for this free software. For both users' and 46 | authors' sake, the GPL requires that modified versions be marked as 47 | changed, so that their problems will not be attributed erroneously to 48 | authors of previous versions. 49 | 50 | Some devices are designed to deny users access to install or run 51 | modified versions of the software inside them, although the manufacturer 52 | can do so. This is fundamentally incompatible with the aim of 53 | protecting users' freedom to change the software. The systematic 54 | pattern of such abuse occurs in the area of products for individuals to 55 | use, which is precisely where it is most unacceptable. Therefore, we 56 | have designed this version of the GPL to prohibit the practice for those 57 | products. If such problems arise substantially in other domains, we 58 | stand ready to extend this provision to those domains in future versions 59 | of the GPL, as needed to protect the freedom of users. 60 | 61 | Finally, every program is threatened constantly by software patents. 62 | States should not allow patents to restrict development and use of 63 | software on general-purpose computers, but in those that do, we wish to 64 | avoid the special danger that patents applied to a free program could 65 | make it effectively proprietary. 
To prevent this, the GPL assures that 66 | patents cannot be used to render the program non-free. 67 | 68 | The precise terms and conditions for copying, distribution and 69 | modification follow. 70 | 71 | TERMS AND CONDITIONS 72 | 73 | 0. Definitions. 74 | 75 | "This License" refers to version 3 of the GNU General Public License. 76 | 77 | "Copyright" also means copyright-like laws that apply to other kinds of 78 | works, such as semiconductor masks. 79 | 80 | "The Program" refers to any copyrightable work licensed under this 81 | License. Each licensee is addressed as "you". "Licensees" and 82 | "recipients" may be individuals or organizations. 83 | 84 | To "modify" a work means to copy from or adapt all or part of the work 85 | in a fashion requiring copyright permission, other than the making of an 86 | exact copy. The resulting work is called a "modified version" of the 87 | earlier work or a work "based on" the earlier work. 88 | 89 | A "covered work" means either the unmodified Program or a work based 90 | on the Program. 91 | 92 | To "propagate" a work means to do anything with it that, without 93 | permission, would make you directly or secondarily liable for 94 | infringement under applicable copyright law, except executing it on a 95 | computer or modifying a private copy. Propagation includes copying, 96 | distribution (with or without modification), making available to the 97 | public, and in some countries other activities as well. 98 | 99 | To "convey" a work means any kind of propagation that enables other 100 | parties to make or receive copies. Mere interaction with a user through 101 | a computer network, with no transfer of a copy, is not conveying. 
102 | 103 | An interactive user interface displays "Appropriate Legal Notices" 104 | to the extent that it includes a convenient and prominently visible 105 | feature that (1) displays an appropriate copyright notice, and (2) 106 | tells the user that there is no warranty for the work (except to the 107 | extent that warranties are provided), that licensees may convey the 108 | work under this License, and how to view a copy of this License. If 109 | the interface presents a list of user commands or options, such as a 110 | menu, a prominent item in the list meets this criterion. 111 | 112 | 1. Source Code. 113 | 114 | The "source code" for a work means the preferred form of the work 115 | for making modifications to it. "Object code" means any non-source 116 | form of a work. 117 | 118 | A "Standard Interface" means an interface that either is an official 119 | standard defined by a recognized standards body, or, in the case of 120 | interfaces specified for a particular programming language, one that 121 | is widely used among developers working in that language. 122 | 123 | The "System Libraries" of an executable work include anything, other 124 | than the work as a whole, that (a) is included in the normal form of 125 | packaging a Major Component, but which is not part of that Major 126 | Component, and (b) serves only to enable use of the work with that 127 | Major Component, or to implement a Standard Interface for which an 128 | implementation is available to the public in source code form. A 129 | "Major Component", in this context, means a major essential component 130 | (kernel, window system, and so on) of the specific operating system 131 | (if any) on which the executable work runs, or a compiler used to 132 | produce the work, or an object code interpreter used to run it. 
133 | 134 | The "Corresponding Source" for a work in object code form means all 135 | the source code needed to generate, install, and (for an executable 136 | work) run the object code and to modify the work, including scripts to 137 | control those activities. However, it does not include the work's 138 | System Libraries, or general-purpose tools or generally available free 139 | programs which are used unmodified in performing those activities but 140 | which are not part of the work. For example, Corresponding Source 141 | includes interface definition files associated with source files for 142 | the work, and the source code for shared libraries and dynamically 143 | linked subprograms that the work is specifically designed to require, 144 | such as by intimate data communication or control flow between those 145 | subprograms and other parts of the work. 146 | 147 | The Corresponding Source need not include anything that users 148 | can regenerate automatically from other parts of the Corresponding 149 | Source. 150 | 151 | The Corresponding Source for a work in source code form is that 152 | same work. 153 | 154 | 2. Basic Permissions. 155 | 156 | All rights granted under this License are granted for the term of 157 | copyright on the Program, and are irrevocable provided the stated 158 | conditions are met. This License explicitly affirms your unlimited 159 | permission to run the unmodified Program. The output from running a 160 | covered work is covered by this License only if the output, given its 161 | content, constitutes a covered work. This License acknowledges your 162 | rights of fair use or other equivalent, as provided by copyright law. 163 | 164 | You may make, run and propagate covered works that you do not 165 | convey, without conditions so long as your license otherwise remains 166 | in force. 
You may convey covered works to others for the sole purpose 167 | of having them make modifications exclusively for you, or provide you 168 | with facilities for running those works, provided that you comply with 169 | the terms of this License in conveying all material for which you do 170 | not control copyright. Those thus making or running the covered works 171 | for you must do so exclusively on your behalf, under your direction 172 | and control, on terms that prohibit them from making any copies of 173 | your copyrighted material outside their relationship with you. 174 | 175 | Conveying under any other circumstances is permitted solely under 176 | the conditions stated below. Sublicensing is not allowed; section 10 177 | makes it unnecessary. 178 | 179 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 180 | 181 | No covered work shall be deemed part of an effective technological 182 | measure under any applicable law fulfilling obligations under article 183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 184 | similar laws prohibiting or restricting circumvention of such 185 | measures. 186 | 187 | When you convey a covered work, you waive any legal power to forbid 188 | circumvention of technological measures to the extent such circumvention 189 | is effected by exercising rights under this License with respect to 190 | the covered work, and you disclaim any intention to limit operation or 191 | modification of the work as a means of enforcing, against the work's 192 | users, your or third parties' legal rights to forbid circumvention of 193 | technological measures. 194 | 195 | 4. Conveying Verbatim Copies. 
196 | 197 | You may convey verbatim copies of the Program's source code as you 198 | receive it, in any medium, provided that you conspicuously and 199 | appropriately publish on each copy an appropriate copyright notice; 200 | keep intact all notices stating that this License and any 201 | non-permissive terms added in accord with section 7 apply to the code; 202 | keep intact all notices of the absence of any warranty; and give all 203 | recipients a copy of this License along with the Program. 204 | 205 | You may charge any price or no price for each copy that you convey, 206 | and you may offer support or warranty protection for a fee. 207 | 208 | 5. Conveying Modified Source Versions. 209 | 210 | You may convey a work based on the Program, or the modifications to 211 | produce it from the Program, in the form of source code under the 212 | terms of section 4, provided that you also meet all of these conditions: 213 | 214 | a) The work must carry prominent notices stating that you modified 215 | it, and giving a relevant date. 216 | 217 | b) The work must carry prominent notices stating that it is 218 | released under this License and any conditions added under section 219 | 7. This requirement modifies the requirement in section 4 to 220 | "keep intact all notices". 221 | 222 | c) You must license the entire work, as a whole, under this 223 | License to anyone who comes into possession of a copy. This 224 | License will therefore apply, along with any applicable section 7 225 | additional terms, to the whole of the work, and all its parts, 226 | regardless of how they are packaged. This License gives no 227 | permission to license the work in any other way, but it does not 228 | invalidate such permission if you have separately received it. 
229 | 230 | d) If the work has interactive user interfaces, each must display 231 | Appropriate Legal Notices; however, if the Program has interactive 232 | interfaces that do not display Appropriate Legal Notices, your 233 | work need not make them do so. 234 | 235 | A compilation of a covered work with other separate and independent 236 | works, which are not by their nature extensions of the covered work, 237 | and which are not combined with it such as to form a larger program, 238 | in or on a volume of a storage or distribution medium, is called an 239 | "aggregate" if the compilation and its resulting copyright are not 240 | used to limit the access or legal rights of the compilation's users 241 | beyond what the individual works permit. Inclusion of a covered work 242 | in an aggregate does not cause this License to apply to the other 243 | parts of the aggregate. 244 | 245 | 6. Conveying Non-Source Forms. 246 | 247 | You may convey a covered work in object code form under the terms 248 | of sections 4 and 5, provided that you also convey the 249 | machine-readable Corresponding Source under the terms of this License, 250 | in one of these ways: 251 | 252 | a) Convey the object code in, or embodied in, a physical product 253 | (including a physical distribution medium), accompanied by the 254 | Corresponding Source fixed on a durable physical medium 255 | customarily used for software interchange. 
256 | 257 | b) Convey the object code in, or embodied in, a physical product 258 | (including a physical distribution medium), accompanied by a 259 | written offer, valid for at least three years and valid for as 260 | long as you offer spare parts or customer support for that product 261 | model, to give anyone who possesses the object code either (1) a 262 | copy of the Corresponding Source for all the software in the 263 | product that is covered by this License, on a durable physical 264 | medium customarily used for software interchange, for a price no 265 | more than your reasonable cost of physically performing this 266 | conveying of source, or (2) access to copy the 267 | Corresponding Source from a network server at no charge. 268 | 269 | c) Convey individual copies of the object code with a copy of the 270 | written offer to provide the Corresponding Source. This 271 | alternative is allowed only occasionally and noncommercially, and 272 | only if you received the object code with such an offer, in accord 273 | with subsection 6b. 274 | 275 | d) Convey the object code by offering access from a designated 276 | place (gratis or for a charge), and offer equivalent access to the 277 | Corresponding Source in the same way through the same place at no 278 | further charge. You need not require recipients to copy the 279 | Corresponding Source along with the object code. If the place to 280 | copy the object code is a network server, the Corresponding Source 281 | may be on a different server (operated by you or a third party) 282 | that supports equivalent copying facilities, provided you maintain 283 | clear directions next to the object code saying where to find the 284 | Corresponding Source. Regardless of what server hosts the 285 | Corresponding Source, you remain obligated to ensure that it is 286 | available for as long as needed to satisfy these requirements. 
287 | 288 | e) Convey the object code using peer-to-peer transmission, provided 289 | you inform other peers where the object code and Corresponding 290 | Source of the work are being offered to the general public at no 291 | charge under subsection 6d. 292 | 293 | A separable portion of the object code, whose source code is excluded 294 | from the Corresponding Source as a System Library, need not be 295 | included in conveying the object code work. 296 | 297 | A "User Product" is either (1) a "consumer product", which means any 298 | tangible personal property which is normally used for personal, family, 299 | or household purposes, or (2) anything designed or sold for incorporation 300 | into a dwelling. In determining whether a product is a consumer product, 301 | doubtful cases shall be resolved in favor of coverage. For a particular 302 | product received by a particular user, "normally used" refers to a 303 | typical or common use of that class of product, regardless of the status 304 | of the particular user or of the way in which the particular user 305 | actually uses, or expects or is expected to use, the product. A product 306 | is a consumer product regardless of whether the product has substantial 307 | commercial, industrial or non-consumer uses, unless such uses represent 308 | the only significant mode of use of the product. 309 | 310 | "Installation Information" for a User Product means any methods, 311 | procedures, authorization keys, or other information required to install 312 | and execute modified versions of a covered work in that User Product from 313 | a modified version of its Corresponding Source. The information must 314 | suffice to ensure that the continued functioning of the modified object 315 | code is in no case prevented or interfered with solely because 316 | modification has been made. 
317 | 318 | If you convey an object code work under this section in, or with, or 319 | specifically for use in, a User Product, and the conveying occurs as 320 | part of a transaction in which the right of possession and use of the 321 | User Product is transferred to the recipient in perpetuity or for a 322 | fixed term (regardless of how the transaction is characterized), the 323 | Corresponding Source conveyed under this section must be accompanied 324 | by the Installation Information. But this requirement does not apply 325 | if neither you nor any third party retains the ability to install 326 | modified object code on the User Product (for example, the work has 327 | been installed in ROM). 328 | 329 | The requirement to provide Installation Information does not include a 330 | requirement to continue to provide support service, warranty, or updates 331 | for a work that has been modified or installed by the recipient, or for 332 | the User Product in which it has been modified or installed. Access to a 333 | network may be denied when the modification itself materially and 334 | adversely affects the operation of the network or violates the rules and 335 | protocols for communication across the network. 336 | 337 | Corresponding Source conveyed, and Installation Information provided, 338 | in accord with this section must be in a format that is publicly 339 | documented (and with an implementation available to the public in 340 | source code form), and must require no special password or key for 341 | unpacking, reading or copying. 342 | 343 | 7. Additional Terms. 344 | 345 | "Additional permissions" are terms that supplement the terms of this 346 | License by making exceptions from one or more of its conditions. 347 | Additional permissions that are applicable to the entire Program shall 348 | be treated as though they were included in this License, to the extent 349 | that they are valid under applicable law. 
If additional permissions 350 | apply only to part of the Program, that part may be used separately 351 | under those permissions, but the entire Program remains governed by 352 | this License without regard to the additional permissions. 353 | 354 | When you convey a copy of a covered work, you may at your option 355 | remove any additional permissions from that copy, or from any part of 356 | it. (Additional permissions may be written to require their own 357 | removal in certain cases when you modify the work.) You may place 358 | additional permissions on material, added by you to a covered work, 359 | for which you have or can give appropriate copyright permission. 360 | 361 | Notwithstanding any other provision of this License, for material you 362 | add to a covered work, you may (if authorized by the copyright holders of 363 | that material) supplement the terms of this License with terms: 364 | 365 | a) Disclaiming warranty or limiting liability differently from the 366 | terms of sections 15 and 16 of this License; or 367 | 368 | b) Requiring preservation of specified reasonable legal notices or 369 | author attributions in that material or in the Appropriate Legal 370 | Notices displayed by works containing it; or 371 | 372 | c) Prohibiting misrepresentation of the origin of that material, or 373 | requiring that modified versions of such material be marked in 374 | reasonable ways as different from the original version; or 375 | 376 | d) Limiting the use for publicity purposes of names of licensors or 377 | authors of the material; or 378 | 379 | e) Declining to grant rights under trademark law for use of some 380 | trade names, trademarks, or service marks; or 381 | 382 | f) Requiring indemnification of licensors and authors of that 383 | material by anyone who conveys the material (or modified versions of 384 | it) with contractual assumptions of liability to the recipient, for 385 | any liability that these contractual assumptions directly impose on 
386 | those licensors and authors. 387 | 388 | All other non-permissive additional terms are considered "further 389 | restrictions" within the meaning of section 10. If the Program as you 390 | received it, or any part of it, contains a notice stating that it is 391 | governed by this License along with a term that is a further 392 | restriction, you may remove that term. If a license document contains 393 | a further restriction but permits relicensing or conveying under this 394 | License, you may add to a covered work material governed by the terms 395 | of that license document, provided that the further restriction does 396 | not survive such relicensing or conveying. 397 | 398 | If you add terms to a covered work in accord with this section, you 399 | must place, in the relevant source files, a statement of the 400 | additional terms that apply to those files, or a notice indicating 401 | where to find the applicable terms. 402 | 403 | Additional terms, permissive or non-permissive, may be stated in the 404 | form of a separately written license, or stated as exceptions; 405 | the above requirements apply either way. 406 | 407 | 8. Termination. 408 | 409 | You may not propagate or modify a covered work except as expressly 410 | provided under this License. Any attempt otherwise to propagate or 411 | modify it is void, and will automatically terminate your rights under 412 | this License (including any patent licenses granted under the third 413 | paragraph of section 11). 414 | 415 | However, if you cease all violation of this License, then your 416 | license from a particular copyright holder is reinstated (a) 417 | provisionally, unless and until the copyright holder explicitly and 418 | finally terminates your license, and (b) permanently, if the copyright 419 | holder fails to notify you of the violation by some reasonable means 420 | prior to 60 days after the cessation. 
421 | 422 | Moreover, your license from a particular copyright holder is 423 | reinstated permanently if the copyright holder notifies you of the 424 | violation by some reasonable means, this is the first time you have 425 | received notice of violation of this License (for any work) from that 426 | copyright holder, and you cure the violation prior to 30 days after 427 | your receipt of the notice. 428 | 429 | Termination of your rights under this section does not terminate the 430 | licenses of parties who have received copies or rights from you under 431 | this License. If your rights have been terminated and not permanently 432 | reinstated, you do not qualify to receive new licenses for the same 433 | material under section 10. 434 | 435 | 9. Acceptance Not Required for Having Copies. 436 | 437 | You are not required to accept this License in order to receive or 438 | run a copy of the Program. Ancillary propagation of a covered work 439 | occurring solely as a consequence of using peer-to-peer transmission 440 | to receive a copy likewise does not require acceptance. However, 441 | nothing other than this License grants you permission to propagate or 442 | modify any covered work. These actions infringe copyright if you do 443 | not accept this License. Therefore, by modifying or propagating a 444 | covered work, you indicate your acceptance of this License to do so. 445 | 446 | 10. Automatic Licensing of Downstream Recipients. 447 | 448 | Each time you convey a covered work, the recipient automatically 449 | receives a license from the original licensors, to run, modify and 450 | propagate that work, subject to this License. You are not responsible 451 | for enforcing compliance by third parties with this License. 452 | 453 | An "entity transaction" is a transaction transferring control of an 454 | organization, or substantially all assets of one, or subdividing an 455 | organization, or merging organizations. 
If propagation of a covered 456 | work results from an entity transaction, each party to that 457 | transaction who receives a copy of the work also receives whatever 458 | licenses to the work the party's predecessor in interest had or could 459 | give under the previous paragraph, plus a right to possession of the 460 | Corresponding Source of the work from the predecessor in interest, if 461 | the predecessor has it or can get it with reasonable efforts. 462 | 463 | You may not impose any further restrictions on the exercise of the 464 | rights granted or affirmed under this License. For example, you may 465 | not impose a license fee, royalty, or other charge for exercise of 466 | rights granted under this License, and you may not initiate litigation 467 | (including a cross-claim or counterclaim in a lawsuit) alleging that 468 | any patent claim is infringed by making, using, selling, offering for 469 | sale, or importing the Program or any portion of it. 470 | 471 | 11. Patents. 472 | 473 | A "contributor" is a copyright holder who authorizes use under this 474 | License of the Program or a work on which the Program is based. The 475 | work thus licensed is called the contributor's "contributor version". 476 | 477 | A contributor's "essential patent claims" are all patent claims 478 | owned or controlled by the contributor, whether already acquired or 479 | hereafter acquired, that would be infringed by some manner, permitted 480 | by this License, of making, using, or selling its contributor version, 481 | but do not include claims that would be infringed only as a 482 | consequence of further modification of the contributor version. For 483 | purposes of this definition, "control" includes the right to grant 484 | patent sublicenses in a manner consistent with the requirements of 485 | this License. 
486 | 487 | Each contributor grants you a non-exclusive, worldwide, royalty-free 488 | patent license under the contributor's essential patent claims, to 489 | make, use, sell, offer for sale, import and otherwise run, modify and 490 | propagate the contents of its contributor version. 491 | 492 | In the following three paragraphs, a "patent license" is any express 493 | agreement or commitment, however denominated, not to enforce a patent 494 | (such as an express permission to practice a patent or covenant not to 495 | sue for patent infringement). To "grant" such a patent license to a 496 | party means to make such an agreement or commitment not to enforce a 497 | patent against the party. 498 | 499 | If you convey a covered work, knowingly relying on a patent license, 500 | and the Corresponding Source of the work is not available for anyone 501 | to copy, free of charge and under the terms of this License, through a 502 | publicly available network server or other readily accessible means, 503 | then you must either (1) cause the Corresponding Source to be so 504 | available, or (2) arrange to deprive yourself of the benefit of the 505 | patent license for this particular work, or (3) arrange, in a manner 506 | consistent with the requirements of this License, to extend the patent 507 | license to downstream recipients. "Knowingly relying" means you have 508 | actual knowledge that, but for the patent license, your conveying the 509 | covered work in a country, or your recipient's use of the covered work 510 | in a country, would infringe one or more identifiable patents in that 511 | country that you have reason to believe are valid. 
512 | 513 | If, pursuant to or in connection with a single transaction or 514 | arrangement, you convey, or propagate by procuring conveyance of, a 515 | covered work, and grant a patent license to some of the parties 516 | receiving the covered work authorizing them to use, propagate, modify 517 | or convey a specific copy of the covered work, then the patent license 518 | you grant is automatically extended to all recipients of the covered 519 | work and works based on it. 520 | 521 | A patent license is "discriminatory" if it does not include within 522 | the scope of its coverage, prohibits the exercise of, or is 523 | conditioned on the non-exercise of one or more of the rights that are 524 | specifically granted under this License. You may not convey a covered 525 | work if you are a party to an arrangement with a third party that is 526 | in the business of distributing software, under which you make payment 527 | to the third party based on the extent of your activity of conveying 528 | the work, and under which the third party grants, to any of the 529 | parties who would receive the covered work from you, a discriminatory 530 | patent license (a) in connection with copies of the covered work 531 | conveyed by you (or copies made from those copies), or (b) primarily 532 | for and in connection with specific products or compilations that 533 | contain the covered work, unless you entered into that arrangement, 534 | or that patent license was granted, prior to 28 March 2007. 535 | 536 | Nothing in this License shall be construed as excluding or limiting 537 | any implied license or other defenses to infringement that may 538 | otherwise be available to you under applicable patent law. 539 | 540 | 12. No Surrender of Others' Freedom. 541 | 542 | If conditions are imposed on you (whether by court order, agreement or 543 | otherwise) that contradict the conditions of this License, they do not 544 | excuse you from the conditions of this License. 
If you cannot convey a 545 | covered work so as to satisfy simultaneously your obligations under this 546 | License and any other pertinent obligations, then as a consequence you may 547 | not convey it at all. For example, if you agree to terms that obligate you 548 | to collect a royalty for further conveying from those to whom you convey 549 | the Program, the only way you could satisfy both those terms and this 550 | License would be to refrain entirely from conveying the Program. 551 | 552 | 13. Use with the GNU Affero General Public License. 553 | 554 | Notwithstanding any other provision of this License, you have 555 | permission to link or combine any covered work with a work licensed 556 | under version 3 of the GNU Affero General Public License into a single 557 | combined work, and to convey the resulting work. The terms of this 558 | License will continue to apply to the part which is the covered work, 559 | but the special requirements of the GNU Affero General Public License, 560 | section 13, concerning interaction through a network will apply to the 561 | combination as such. 562 | 563 | 14. Revised Versions of this License. 564 | 565 | The Free Software Foundation may publish revised and/or new versions of 566 | the GNU General Public License from time to time. Such new versions will 567 | be similar in spirit to the present version, but may differ in detail to 568 | address new problems or concerns. 569 | 570 | Each version is given a distinguishing version number. If the 571 | Program specifies that a certain numbered version of the GNU General 572 | Public License "or any later version" applies to it, you have the 573 | option of following the terms and conditions either of that numbered 574 | version or of any later version published by the Free Software 575 | Foundation. If the Program does not specify a version number of the 576 | GNU General Public License, you may choose any version ever published 577 | by the Free Software Foundation. 
578 | 579 | If the Program specifies that a proxy can decide which future 580 | versions of the GNU General Public License can be used, that proxy's 581 | public statement of acceptance of a version permanently authorizes you 582 | to choose that version for the Program. 583 | 584 | Later license versions may give you additional or different 585 | permissions. However, no additional obligations are imposed on any 586 | author or copyright holder as a result of your choosing to follow a 587 | later version. 588 | 589 | 15. Disclaimer of Warranty. 590 | 591 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY 592 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT 593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY 594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, 595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 596 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM 597 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF 598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 599 | 600 | 16. Limitation of Liability. 601 | 602 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS 604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY 605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF 607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD 608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), 609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF 610 | SUCH DAMAGES. 611 | 612 | 17. Interpretation of Sections 15 and 16. 
613 | 614 | If the disclaimer of warranty and limitation of liability provided 615 | above cannot be given local legal effect according to their terms, 616 | reviewing courts shall apply local law that most closely approximates 617 | an absolute waiver of all civil liability in connection with the 618 | Program, unless a warranty or assumption of liability accompanies a 619 | copy of the Program in return for a fee. 620 | 621 | END OF TERMS AND CONDITIONS 622 | 623 | How to Apply These Terms to Your New Programs 624 | 625 | If you develop a new program, and you want it to be of the greatest 626 | possible use to the public, the best way to achieve this is to make it 627 | free software which everyone can redistribute and change under these terms. 628 | 629 | To do so, attach the following notices to the program. It is safest 630 | to attach them to the start of each source file to most effectively 631 | state the exclusion of warranty; and each file should have at least 632 | the "copyright" line and a pointer to where the full notice is found. 633 | 634 | {one line to give the program's name and a brief idea of what it does.} 635 | Copyright (C) {year} {name of author} 636 | 637 | This program is free software: you can redistribute it and/or modify 638 | it under the terms of the GNU General Public License as published by 639 | the Free Software Foundation, either version 3 of the License, or 640 | (at your option) any later version. 641 | 642 | This program is distributed in the hope that it will be useful, 643 | but WITHOUT ANY WARRANTY; without even the implied warranty of 644 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 645 | GNU General Public License for more details. 646 | 647 | You should have received a copy of the GNU General Public License 648 | along with this program. If not, see . 649 | 650 | Also add information on how to contact you by electronic and paper mail. 
651 | 652 | If the program does terminal interaction, make it output a short 653 | notice like this when it starts in an interactive mode: 654 | 655 | {project} Copyright (C) {year} {fullname} 656 | This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 657 | This is free software, and you are welcome to redistribute it 658 | under certain conditions; type `show c' for details. 659 | 660 | The hypothetical commands `show w' and `show c' should show the appropriate 661 | parts of the General Public License. Of course, your program's commands 662 | might be different; for a GUI interface, you would use an "about box". 663 | 664 | You should also get your employer (if you work as a programmer) or school, 665 | if any, to sign a "copyright disclaimer" for the program, if necessary. 666 | For more information on this, and how to apply and follow the GNU GPL, see 667 | . 668 | 669 | The GNU General Public License does not permit incorporating your program 670 | into proprietary programs. If your program is a subroutine library, you 671 | may consider it more useful to permit linking proprietary applications with 672 | the library. If this is what you want to do, use the GNU Lesser General 673 | Public License instead of this License. But first, please read 674 | . 
675 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include requirements.txt -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | BlobTools v1.1 2 | =============================== 3 | A modular command-line solution for visualisation, quality control and taxonomic partitioning of genome datasets 4 | 5 | - Discussions, questions and answers: [BlobTools GoogleGroup](https://groups.google.com/forum/#!forum/blobtools) 6 | - Issues, bug reports and feature requests: [GitHub issues](https://github.com/DRL/blobtools/issues) 7 | - Documentation: [blobtools.readme.io](https://blobtools.readme.io) 8 | - Citation: [Laetsch DR and Blaxter ML, 2017](https://f1000research.com/articles/6-1287/v1) 9 | 10 | ![](https://github.com/DRL/blobtools/blob/master/example/blobplot.png) 11 | 12 | Obtaining BlobTools 13 | ------------ 14 | - **Option A**: Download latest [release](https://github.com/DRL/blobtools/releases/latest) 15 | - **Option B**: Clone repository 16 | ``` 17 | git clone https://github.com/DRL/blobtools.git 18 | ``` 19 | 20 | Enter directory 21 | ------------ 22 | ``` 23 | cd blobtools 24 | ``` 25 | 26 | Install dependencies 27 | ------------ 28 | - Create [Conda](https://conda.io/en/latest/miniconda.html) environment and install dependencies 29 | 30 | ``` 31 | conda create -n blobtools 32 | conda activate blobtools 33 | conda install -c anaconda -c bioconda matplotlib docopt tqdm wget pyyaml git pysam 34 | ``` 35 | 36 | Download NCBI taxdump and create nodesdb 37 | ------------ 38 | ``` 39 | wget ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz -P data/ 40 | tar zxf data/taxdump.tar.gz -C data/ nodes.dmp names.dmp 41 | ./blobtools nodesdb --nodes data/nodes.dmp --names data/names.dmp 42 | 
``` 43 | 44 | Create blobplot 45 | ------------ 46 | ``` 47 | ./blobtools create -i example/assembly.fna -b example/mapping_1.sorted.bam -t example/blast.out -o example/test && \ 48 | ./blobtools view -i example/test.blobDB.json && \ 49 | ./blobtools plot -i example/test.blobDB.json 50 | ``` 51 | Usage 52 | ----- 53 | ``` 54 | ./blobtools --help 55 | ``` 56 | 57 | Docker 58 | ------ 59 | 60 | A docker container can be built using the following command: 61 | ``` 62 | docker build -t drl/blobtools . 63 | ``` 64 | This docker image can be run with sample data as follows: 65 | ``` 66 | docker run -v $PWD/example:/example/ -t drl/blobtools ./blobtools create -i /example/assembly.fna -b /example/mapping_1.sorted.bam -t /example/blast.out -o /example/test 67 | ``` 68 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DRL/blobtools/9426ca4a6dd0e0a2a67841d92b60b101fc52b921/__init__.py -------------------------------------------------------------------------------- /blobtools: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | from lib.interface import main 5 | 6 | if __name__ == '__main__': 7 | main() -------------------------------------------------------------------------------- /data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DRL/blobtools/9426ca4a6dd0e0a2a67841d92b60b101fc52b921/data/__init__.py -------------------------------------------------------------------------------- /example/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DRL/blobtools/9426ca4a6dd0e0a2a67841d92b60b101fc52b921/example/.DS_Store
-------------------------------------------------------------------------------- /example/assembly.fna: -------------------------------------------------------------------------------- 1 | >contig_1 2 | TTTTTTATTTTTTGTTTAAAAGACGCATTTAATTGTCTTAGTTTTTTCTTAACGTTTCGC 3 | ATTCCCAATCAATTGTATCCAGAATTTAAATGTGCTTTTTCACACGGCGATTTTATGGCA 4 | CATAAATTAAGGGAATCAAAACTCATAATCATCGATGAAGTCAGCATGCTTCATAAAAAT 5 | GTAATCTTTGCTATCGACTCAACTTTAAGGTTTTTTTTATTTAATTTTTTTAATTAAAAA 6 | CTTTTAACTATAAAACATAATCAAAAATTTTTTTATATCTTATACACATATTATTATATT 7 | CATCAATTTACATTCTAAAATTAGATAAATACTATTTAAATTTTAAAATAAAATTTTTAA 8 | AAATAAAATTAAAAATTTTAGTAGCTTTAAATTTTTTTATTTTAGATACTTATATTCTGC 9 | AAGCGACATTAAACGAAAAATCCCATTTGCAAGAAAAACAATTATATTTGGAGGAGATTG 10 | GAAACAGCTGCTCCCTGTTATTCCAGGAAGCAATATTAATCAACAAGCACTTGCAAGCTT 11 | CAAATTTACTAAGTTCTTCAACCTCTGTCAACCAATAACATTGCAAAAAAACATGAAAGT 12 | TAATAAAGATGACGGCGATTTTGCACTGTTTCTTATATTAATTGGAAATGGATTAATTGG 13 | AACAGAGGATAAAAATGGTGTCTTTGAACTTCCAACTCAAATGATAGCAAATAAGCTTAA 14 | AGAATTAGTAGATTTTTGCTTCGACACCGATTGCTT 15 | >contig_2 16 | TAGCAGAAAGAAAAATAAAAAATTTTAATGAAGATCAAAAAAACGCATTTGATAAAGTAA 17 | TTTATATTTAATTAAAAAAAAAATATATTAAAATAATTTAGATTATGGAATCAGTAAGAA 18 | GTAAAGAAATTGACCATAATGGCAAACTATTTATGTTAACAGGAGATGGAGGCACAGGTA 19 | AATTTCATATATTTAATTAATATTTTTAAATTAATTTTTAGGAAAATCCTATTTATATAA 20 | TGGATTGATAGCTCAGTGCAAAGCTCATCAATTTGAAGTTATTCCATGCGCATCAACTGG 21 | AATAGCTTCAATATTATTATTGTGCAATTTTGGTACAGCACACAGAACGTTTCGCATTCC 22 | CAATCAATTGTATCCAGAATTTAAATGTGCTTTTTCACACGGCGATTTTATGGCACATAA 23 | ATTAAGGGAATCAAAACTTATAATCATCGATGAAGTCAGCATGCTTCATAAAAATGTAAT 24 | CTTTGCTATCGACTCAACTTTAAGGTTTTTTTTATTTAATTTTTTTAATTAAAAACTTTC 25 | AACTATAAAACATAATCAAAAATTTTTTTATATCTTATACACATGTTCATTAATTTACAT 26 | TCTAAAATTAGATAAATACTATTTAAATTTTAAAATAAAATTTTTAAAAATAAAATTAAA 27 | AATTTTAGTAGCTTTAAATTTTTTTATTTTAGATACTTATATTCTGCAAGCGACATTAAA 28 | CGAAAAATCCCATTTGCAGGAAAAACAATTATATTTGGAGGAGATTGGAAACAGCTGCTC 29 | CCTGTTGTTCCAGGAAGCAATATTAATCAACAAGCACTTGCAAGCTTCAAATTTACTAAG 30 | 
TTCTTCAACCTCTGTCAACCAATAACATTGCAAAAAAACATGAGAGTTAATAAAGATGAA 31 | GGCGTTTTTGCAATGTTTCTTAAATTGAATGGAAATGGATTAATTGGAACAGAGGATAAA 32 | AATGGTGTCTTTGAAATACCACCTAAAATGATAGCTAAAAATCTTATAGAATTGGTAGAT 33 | TTTTGCTTTGACACCGATTGTTTACGCAATCATACCGTAT 34 | >contig_3 35 | AATCATTGATGAAGTCAGCATGCTTTATAAAAATGTAATCTTTGCTATCGACTCAACTTT 36 | AAGGTTTTTTTATTTAATTTTTTTAATTAAAAACTTTCAACTATAAAACATAATCAAAAA 37 | TTTTTTTATATCTTATACACATGTTCATTAATTTACATCCTAAAATTATATAAATACTAT 38 | TTAAATTTTAAAATAAAATTTTTAAAAATAAAATTTTTAAAAATAAAATTAAAAATTTTA 39 | GTAGCTTTAAATTTTTTTATTTTAGATACTTATATTTTGCAAGCGACATAAAACGAAAAA 40 | TCCCGTTTGCAAGAAAAATAATTATATTTGGAGGAGATTGGAAACAGCTGCTTACTGTTG 41 | TTCCAGGATTAATCAACAAGCACTTGCAAGCTTCAAATTTACTAAGTTCTTCAATCTCTG 42 | TCAACCAATAACATTGCAAACAAACATAAGAGTTAATAAAGATGAAGGCGATTTTGCAAT 43 | GTTTTTTAAATCGATTGAAAATGGATTAATTGGAACAGAGGATAAAAATGGTGTCTTTGA 44 | AATACTATTTCAAATGATAAGAAAAAATCTTACAGAATTAGTAGATTTTTGCTTTGACAC 45 | CG 46 | >contig_4 47 | GATTTTGCAATGTTTCTTAAATCGATTGGAAATGGATTAATTGGAACAGAGGATAAAAAT 48 | GGTGTCTTTGAAATACCACCTAAAATGATAGCAAAAAATCTTACAGAATTGGTAGATTTT 49 | TGCTTTGACACCGATTGTTTACGCAATCCTATCGTAAACACTGAAGCGATAAGTCAAAAT 50 | GCAATTTTATGCCCAACAAATGACAGCGTCAACATAATAAATGATGCAATAAATAAAAAA 51 | ATACCAAGTGAAGAGCATTTAAAAACATCAATTGATAAAGTCATCAACAGAGCCGCAATA 52 | GATGATTTAAGCGTGCACGTAGCCGACTTCTGCATGGAAAACATCCATCGGCAAACGCCA 53 | AGTGGTTTCCCACCACATATTCTGAAGATTAAATTGGGAACAATTGTAATGCTCATTAAA 54 | AACATAGACATAAAAAATGGCTTATGCAATGGCACTCGACTACAAATTGTCGACATTAAA 55 | ACAGATTTAATTAACTGTGCCATTTTAACGGGCAAAGGAAAAAAAGACACAAAAGATACA 56 | ACTATTTGGCTGCCCAGAATAAAATTCCAATATGGAAATGAATCTACACAACTGGGAATT 57 | CAATGGGAACGACTACAATTTCCCATTAGAGTAGCCTTCGCAATGACTATTAACAAGAGT 58 | CAAGGGCAAACATTAAACAGAGTTGGCCTATATTTATTAAATAAAGATGTATTTAGCCAT 59 | GGCCATTTATATACTGCTTTTTCAAGAGTTAAAAGTTCAGAATCAATAAAAATCCTCTGC 60 | AATATCCATTATAAAAATAACTCTGTTAGAAACATTATTTTTGACAAAATTTTGGATAAA 61 | AATGAATTAGGCAACAACGATATGAAAAAATTTTTGGTGGAGTATGCAAAAATTAATATC 62 | AAAGTTGCAGGTAAATAACTTTAAATTAATTTTTAAAAACAAATTATTAAT 63 | >contig_5 
64 | GATTTTGCAATGTTTCTTAAATTAATTGGAAATGGATTAATTGGAACAGAGGATAAAAAT 65 | GGTGTCTTTGAAATACCAACTCAAATGATAGCAAAAAATCTTACAGAATTAGTAGATTTT 66 | TGCTTTGACACCGATTGTTTACGCAATCCTATCGTAAACACTGAAGCGATAAGTCAAAAT 67 | GCAATTTTATGCCTAACAAATGACAGCGTCAACATAATAAATGATGCAATAAATAAAAAA 68 | ATACCAAGTGAAGAGCATTTAAAAACATCAATTGATAAAGTCATCAACAAAGCCGCAATA 69 | GATGATTTAAGCGTGCACGTAGCCGACTTTTGCATGAAAAACATCCATCGGCAAACGCCA 70 | AGTGGTTTCCCACCACATATTCTGAAGATTAAATTGGGAACAATAGTAATGCTCATTAAA 71 | AACATAGACATAAAAAATGGCTTATGCAATGGCACTCGACTACAAATTGTCGACATTAAA 72 | ACAGATTTAATTAACTGTGCTATTTTAACGAGCAAAGGAAAAAAAGACACAAAAGATACA 73 | ACTATTTGGCTGCCCAGAATAAAATTCCAATATGGAAATGAATCTACACAACTAGGAATT 74 | CAATGGGAACGACT 75 | >contig_6 76 | TTTTTTTATTAATTTATTATTAAATTACATATTTTTATATTTTAGAACCTACATATCAAA 77 | TCTTTACCAATTTTAAAACGACAAATTTTTATTCTTCGTATTTTTGGTCTTTATTCTGAT 78 | GAAACTTCAATAATAGTGCATGCTGAATCTGTTAGTACTGTTCAAAATGTTATTCAAACA 79 | GTAATTTTTGTTTGTAGTTAAAATAATTTAAAAAAA 80 | >contig_7 81 | TTTTTTATATAATATTTTAAATTTTTTTAGCATAATAATTAACATTTTTTAAGTAGTTTT 82 | TTTTAATCATTATTTTTAAAATATATCTATTTTTTAAGGTCCACTTGCGCGCAGTGGACA 83 | TAGAATATTTGCAACTGATAATTATATTTATTTAATTGGAGGATATAATCCCGATTCTTC 84 | TTCTAAAACACTTATTGATATTTGGCGTTTTAATCTTGCAACGGAAAGATGGCAAGAAGG 85 | TTTAATGGAAAATGGAAAAATGCCAACATGTTTAGCGTCTTTTTCATGTCTGTAATTTTT 86 | TTATAAATTAAATTTTTTAAAATTATTGTAGTGTCTCAAACTAAATTAAAAAATGAAACA 87 | TATATTTTTGGTGGAACAGGTTTTCCATTTGGTGTTAAAGCATCAAATGATATCTATAAA 88 | TTGACGGTTGGACCAAGTGGTTCAGTTAATATTGAGCCTCAAATTATTGAATCACTTTTT 89 | GATCATTATCCTCCTAAAATTTATGGTCATGCTATGACATATGTTAAACGTTTTAATCCA 90 | ATTACAAATTCTGAGGAAGTAATTTAAATTATTTTGTTAATTTTTAATAATTTAAATAAA 91 | AATTAAGGAAATTATTTATTTGGTTGGTGGTACAACAGGTCATACATATAATATGAATGT 92 | TTGGAAATTACAAAAAAAGAAAGAAACAAAAAACGTTTGGGACTGCTTTTTGTTAACAGT 93 | AATAATTTTTTTTTTAAATAAAAAAATAATTTTTTAGGACAGAATTTTTGAATCTGAATT 94 | AGGTCGTTATCGTTTAGAAATAATTGTACATAAAAATTATATAATTACATTTGGTGGTGG 95 | ATCACCAATTTTTTGTGCTGAATTTGATAACGTTTAATAACTATTTTTTATGTTTTAAAT 96 | 
TTTATTTTTTTAGAGCGCAGGAAAAAGTTCTGTTATTGAAGGAATTGTTGGAAGAGACTT 97 | TTTGCCTCGTGGAGTAGGAATTGTTACAAGGCGACCACTTTTACTCCATTTAATTTATGT 98 | ACCATTAGACAGTCAACTGAGAAAAGAAACTTCAAGTTATTTTAAATAAATTTTTTAATA 99 | TTTAGTTTATTGTTTTTAGTCAGTGTAGATTCGGATTGGGCTGTTTTTGAGCATAAACCT 100 | AATCAAATTTTTTCCGATTTTGAAAAAGTACGTCAGGAAGTTGAAGATGAAACATATCGT 101 | GTCACTGGCTCAAATAAAGGAATTTCTACAATTCCAATCAATTTAAAAATTTATTCACAT 102 | AAAGTTGTAAATTTATCACTTATTGATTTGCCTGGTATAACTAAAGTTCCAGTGGGCGAT 103 | CAGCCTCCTGATATTGAAGTATAATGTTAAATTTTTCTTGAATCATAGAATTATTTTAGG 104 | TGCAAATTAGAGAAATGATTTGCAACTATATATTAAATCCAAATTCGTTAATTCTTGCTG 105 | TGACTCCCGCTAATCAGGATTTTGCAACATCTGAACCTCTTAAATTAGCTAAAGAGTTTG 106 | ATCATGAGGGTAATTATAATTTTTTATTTATTAAATTTTATATTTGCAAGTTTTTTAAAA 107 | TTAAAAATAATATTAAATGATATAAAATAATGTATAACTTAAAATTTTTTTTTAAGGTAA 108 | TAGAACTTTGGCAGTTTTAACTAAACTTGATTTGATGGATCATGGCACAGATGCTATGGA 109 | TGTTCTTACCGGTAAAATTGTGCCTGTTAAATTGGGTATTATTGGTGTTGTTAATCGTTC 110 | TCAGGCTGACATTAAAAGTGAAAAATCAATTGAAGATTGTTTAAAAGATGAAATGAAATT 111 | TTTACACAAAAAATATCCAACGCTTGCATCAACAAACGGAGTTTATTATTTATCAAAAAC 112 | TTTAAATCGGGTATTTTTAATTAAATAAAAATTATTTTTAATTTGAAATAGTATAATAAA 113 | TTTTTTAGCTATTAATGCATCATATTTGCTCATGCTTGCCACAATTAAAAATTCGTATTA 114 | GCACAATGATTCAACAAAGTCAAGCATTGCTTTATTCATATGGTGAAAATGTAGTCGATA 115 | AAAATCGCACACTTCTTCATATTATAACTAATTTTGCAAACGCTTACACATCTACAATTG 116 | AGGGCACTTCAAAAAATATTGACACAACAGAAATGTATACATTTTAGTTAATTAAATATT 117 | TTTTTTAATAATTAATTATTAATTAATTACTTTTTTAAATTTTAGTTGTGGTGGAGCTCG 118 | TATTTGCTATATTTTTTATGAAACACTCAGGAATGCATTAGAAAAAGTTAATCCAATGGA 119 | AAATTTAACAAAATGGGAAATTTTAACTGCAATTAGAAATGCAACAGTAAATTTTAGTTA 120 | GAAAAAAAAAAATTATAAAAAATTTTTTAAGGGTCCTAAAACTGCTATTTTTATACCGGA 121 | GGTTAGTTTTGAACTTTTAGTTAAACGCCAAATCCGTCGTTTAGAAGAACCTAGCCTTAA 122 | ATGCGTTGAGTTAGTATATGAAGAACTGTTACGAATTGTTCAAAATTGTGGTTATGAAAT 123 | ACAAGTAAACACATTTAAAAATAATAAAATATAATTTTAAAAATAAATTTAGCAAGAAAT 124 | GCAACGTTTTCCAAAATTGTATGATCGAGTTAGTGAAGTTGTAACTGGTGTTCTTGTTAA 125 | 
TAGATTAGCACCTACAAAAGAATTTGTTTCTAATTTAATAGCAATTCAACTTGCTTACAT 126 | TAATACTAGACATCCAGAATTTAATGAAGTCAATATTAGTATGTTTAAAGAGTCAACTTT 127 | AGGCCCGATAGTAAATTCGGAGGTTTTTAATAATTTTCAATTATTTTATATTTATAAAAA 128 | AAATAAAATTTTTAAATCAAATAACGATTTATGATTTAAAAAAAATTTTTAAATATTTAA 129 | AATTTAATTTAGAGTAATAAAACAAAATTTTTATTTAAATATATAATAATAACTTACTCG 130 | TATAGTCAATGGTGTTAGCGTTGTTGCAAGAATAGTAGGAGTTTCCAAATTTAATTGTTT 131 | TGTAAATGGTGAAAAGTTAACTAATTTATCGTTTATTATTTGAGATAAAATTTTATTGTC 132 | AAAATTATTTACATCAGATGTTGTAGCTGATGAAACGCCAAATGCACGATCATTATCAAA 133 | TTCAAATTTTTCATTATTCTTTAAATTTGGTAATATTGTTGGTTGTGAATTAAGAAATGT 134 | GGACTGTGAATGTTTTTTTACAAACGGCTGCTCATTTGCGGAAAGCAAAGAAGAATGTTT 135 | TTGTAAAACAGATGACTTTTGTGTATTTGATGAATGAATTAATGGAACAAGGGACATAAC 136 | TGGTACAGGAATTTGTTCTGTTGGCACAGATGAATGAACTCCTAATAGTTAAAATAATTT 137 | AAAAAAATTTTTATCAAAATTTCTAACAAAAAAAAATTTACAAAAAATATAAAAAAAAAA 138 | TTTATTTTTAATAAATAAATGAAATACCTTTACATGTATTTGTAATATTATTATATTCAA 139 | AACAAAGACATAAATATGCACCAGGTGTATTAACACAATCTGGCTGTTCATTTGGACAGT 140 | TAATTTCTTTGCTGGAACATTCATCAATATCTTAAATTTAAATAATTATAATATTAAAAT 141 | ATGCAACTTATTAATAAAAAGCTATCACATAGTTTTTATTTTAAAAAATTTTTATTTTAA 142 | AATTAATTTTTGATTAAATAATATAAACCTTGATCACATATAGGACTTCCTTCCCAACCT 143 | GGATTACATTTTCCGTTAGGACATTCGCCAGATTCTGAGTTACAAGCGCTTCCATCAGCA 144 | CAATGACAAAAGCGCGTACAATCTAAGTACCATTGATTTGGCGCACAAGAATCAACGCAT 145 | TCTAAGCCTTGAAAACCGGCACCGCACAAATAAATTTTTTCTTCATCAATTCGAAATAGC 146 | CACTCACCTGGAATGCCAATATTACTTTTTTCAATCAATTGTGTAGAATGTATTGTGCCA 147 | GAACCAGGTAATGAAAAATGTACTGGCATTACAGCCATTTCTTCATCGGCAATGCCATTA 148 | AATCCAGCCTATTTTTTAAGTTTATTAATAAAAAAATTAA 149 | >contig_8 150 | TTAAAATTTTTTTTTGTTTATTTTTTATGTTAAGCAATTCTGGGTTTTTTAACTAATAAA 151 | ACTTATTTATTATTTTGTCTACCTATTCCTATTAATTACAACGCTATTCGATACATAAAG 152 | TTAGTTTATTTTTTATTTAAACTGTAATTTAAAAATTTTATTTTTAGAAAACTTGTTGTC 153 | AATAGGAGTGTTATGAAAAGAGTTTTATGCGTAGCCGAGAAAAATGATGCGGCAAAAAAT 154 | ATTGCTTCAATTTTATCTAAGAATCGAATGATTCGACGTGAAGGCCGTAGTCGTTTTAAT 155 | AAGTTATATTGTTTTGAAACTGATAATTGCCTCGGTTTTAAAGCGCAAATTGTTTTTACT 
156 | AGCGTGAGTGGTCATTTGTTACAATTAGAATTTAGTACGGAGTATCGCGAGTGGGATAAA 157 | TTTACAACTGAAGCACTATTTTCGGCTCAAGTTTTTAGGAATGTGCCTTCAAATATGAAA 158 | GAAATTGAAATGACTTTAAAGTATTTTTTAGTTTTTAGCACACTTTAAAATTTTTTCAAG 159 | AGAACAAAGTCGTCTAGCTCAAATACTTATTATATGGACTGATTGTGATCGAGAAGGCGA 160 | AAATATTGGTTCAGAAATAGAATTTGTTTGTCGAACACAGAATTCAAATCTTGATGTTTA 161 | TAGAGCTAAGTTTTCAGAAATTACACCTCGTTCAATATTTGCAGCAATGCTAAATTTAAC 162 | ACGGCTAGACGAACGTATCATTAATGCAGTTGATTGCCGAACTGAATTGGATTTAAGAAT 163 | AGGTTTTAAATTTTATATTATTTTTTTAAACTTTAAATAATTTAGGCGCAGCCTTTACAC 164 | GTCTACAAACTTTACATTTAAAATCAAAATTTTCAACTTTGATTCCTGAGAAAGTTATAA 165 | GCTATGGCAGTTGTCAATTTCCAACGTTAGGCTTTATTGTTGAGCGCTATAAATCAATTA 166 | AAGAATTTATTGTAGAAGATTTTTGGAAATTAGTTGGCAAAGATCAAAATATAGAGTTTT 167 | TATGGGAGCGTCACAAAGTTTTTGATGAGCAAGTAGCAAAGGTATTTTTTTAATAAATAT 168 | TTTATCATTATTAATAGAATTTTAAGGCATTTTTGGAGCTTTGTACTGAAGTTCCTGGCA 169 | CAGCAGTTGTGCAAATGGTTGATAAACATCCTAAAACTAAGTACAGACCTATCGCTTTGG 170 | ATACTATTGAATTAGAAAAATTGGGCATTCGTAAATTACGCATGACAGCGAAGCGAGTAA 171 | TGGCAGCTGCTGAAAGATTATACAGCAATGGTTTTATTTCTTATCCAAGGTTTTAAATTA 172 | AAATTAAAATTTTTTTTTAAATTTTTTTATTTAGAACAGAAACAAATAAATATCCAAAAA 173 | ATTTAAATTTATCAGAGCTTGTAGAAATACAGCGTTCTCATGCTCAATGGGGTGATTTTG 174 | CTACAGAAGTGATTAGTAATGATGGTCCGAATCCACGTAATGGTAGCAAATCTGATGAAG 175 | CACATCCACCTATTCATCCGTTAAAAATTGCTACTAAGTTTATTAATAAATTTTTTAAAT 176 | TTTTTAATTTTAATTTTATCTTAGAGATATTATTTCATCTCCAGACGAATGGTCAGTTTA 177 | TGAATTGGTTGTTAGACACTTTTTGGCATGCGTTTCTTTTGACGCAAAAGGTCAAGAGAC 178 | AAAAGTTAAAGTAATATTAAATAAAAAAAATCTTTATTTTTTTTTAATTTTAAAATTTAG 179 | ATAGAAATAGGTGGTGAAATATTTAGCACTACTGGTCTTATTATTCAAGATCTTGGTTAT 180 | ATGCGTGTTTATCCTTATGATAAATGGTCGAATAAAAATTTACCAAATTATAATCTTCAT 181 | CAAAAACTTCCTAATTTTGTTGTTACAATAGATGCAGGAAAAACAACTCCACCATTACTT 182 | CTTAATGAAGCGGATTTAATTGCTTTAATGGATAAATATGGTATTGGAACAGACGCTACG 183 | CATGCAGAGCATATTGAAAAAATAAAACAGCGTTGTTATGTAGCATTAAATAATGAAAAT 184 | AGATTTATACCTAGCTTTTTGGGTTTGGCACTTGTGGATGCTTATAATAAAATTGGTTAT 185 | 
GAAATGAGTAAACCAAATTTAAGATCAAATCTTGAATCACAACTTGTTGATGTTTGTAAC 186 | GGTAACCTTATTTGTTTTTTTTATTTATAATAAAATACAAAATTTTAGGTACAAAAACAA 187 | AAGATTTTGTACTAGAAGATCAACTTGGTAAATATAAACGTATTTTTCAAAAAACTGAAG 188 | ATAAAATTTCTACGTTTTCTGATGTTTTTAATCAATATATGAATTCTAATAATAATAATA 189 | ATAATA 190 | >contig_9 191 | TTTTTTTTATTATTACTTAAAACACAGCAATTAATTATCAATTTATTTAAAAAAAATTGT 192 | TAGAATTATTTAAATTATATTGATAAATTTTTAAGGACAAAGCTAGATTGCATTCAAGTA 193 | GTCGCTCCGATTTAGCCGATGAAGTGACAAAAGAAAGTATGGCTTATTTAAATCGGCCAT 194 | TATCAAAAATGGATATTTTTTATACCGGTTCAATTAGTTCATTAGCTTTAAAAGATAAAA 195 | TATCTAATACTTCAAAAAAACAGGAGACAACACCAAGAATTAAAAATGGAAATTTAGCTA 196 | ATCCTTTGTTTGAAACAACTAGCAAATCAGCGCTTTATTTAAGTACTGCCGGATTACCTA 197 | ACTTGAACAATGAATACGAATCATCATCAAAATGGACACAAAACATTGCAGCAGTAATTT 198 | TTTAAAATTTAATAATAATATTATAATTCTTTTTTAATTTTTTTAGTCATTACGTTCTTT 199 | ACTCGATGTTTCATTGTTAAAAAGCCCCAGTTATATGGTTTTAGCACTTAGCGGTTTTTT 200 | AACACTTTCCTGTTTTTTTGTGCCATTTATGTAAAATTTTTATTTAACGTAAAAAAATAA 201 | ATTAATAAACATTTTTTCAAGGTTTATTGGAACTTTAGCAAAACAAAATGGAATTGATGA 202 | ATCACTTTCTAAATATTTAGTTGTAATTCTTGGACTTGTAAATTTAGCTGGACGTATTAT 203 | CTGCGGGTAAATTTTCAACTTTGTTAAATATCAAAAAATTTTTTATTTAGTTTAATTTCT 204 | GATCATCCGATGGTTGATCCATTGGTTGTATCAAATATTGCAGTTATTTTTGGTGGACTA 205 | GCTACAGTATTAATACCACTAGGTACAGAGTTTTGGATGTTTGTTTTGTATTGTGTACCA 206 | TTTGCATTGGGTGTTGCGTGTTTTGCTGCACTTCGTTCTATAATTTGTGTTGAGCTTTTA 207 | GGAATAGAAAAATTAACGAACGCATATGGAATGTTAATGCTTTTTATGGGTATCGCAGCA 208 | TTAATTGGTCCACCATTTGCAGGTGCATTTAATAATTTTTTTTTATTGATTTAATTATAA 209 | ATTTAAGCATTGTTGAAAAATTTAACAAACAGTTTTAATATGTCATTTCATGTGATGGGT 210 | GGATTAATGATGCTTAGTGGCGTTATAAGTTTACCACTACGCGCTATTAGTGCCTATGAA 211 | ATTAGAAAAAATAGTAAGAGTGATAAAAACGATTGTGTCTCTGTTCTTGAACTTGAACCA 212 | TTAAAATTGGCTTTATAATTTTTATTTAATATTTTTTATAGAACATTAATTTTATAATAA 213 | AAAATAATTTTTTAATTTTATATACATTTAATTTTTGGTAAAAGTTAACACTTGATTAAG 214 | TTAAAATTTATTTAAAATAATAAAAAATGTATAAAACAATAAGTGAATTTAATAAAATAA 215 | TTTAATTTTGGTATTTAAAGATTTAATAAATTGTAATTTGCTTTTTTGTCTTTTGATAAT 216 | 
AATGTATCATTTTTATTATCTTTAGAATTTCTAAAAACTTAATATTTTTAATGTTAAAGT 217 | GCTTAAAAAGTAAATTTTTATTTTATTTTGTTTATATTT 218 | >contig_10 219 | TTTTCTCATTTTTTTTTTCTTAATGGAAAAATATCAAGACAACCAGGTTAATGTGAGTGC 220 | TTTAGAAGAACCCAAAAAAAAAGAGCGACCGGCTGGTTCATCCATGGCGGCCGATTTAAA 221 | AGAGTTAAAAGAGATGATGAGTAGCATGTTATTTGAAATTAAAAGTAATTCAGAGAGATT 222 | AAGTAAATTGGAATCCTTTTCAGATTGTAAAGAGATACATGAAGATGAGAGAACCAAATT 223 | TAATAATGAGAGTATAGTAGTTAATGAGAGTAATGAATATAGTAAAAATAACGAGATAGC 224 | TATCAACCAAATTTTAATATTTCAATTAACTGCGAGCTTTGCAAATATAAGGGGTACCGA 225 | GAGTTTTAAGACTTTAGAAATCTATTTTCGTAAATTTGAGATAAGCGCGTATGGGTTAAC 226 | AGAACCTGAAAAAGTAAGATTTATTGTAAGTAAGCTTGAAGATCGAGCGTTTGCGTCATA 227 | CGAAAATTTAAGTTTAGTTGAAAGAATGTCGTACGAAAGTGTTAAAAATTGTGTGTTAAA 228 | TAATTCAAAAGAAATTTCCTCGCGCGTTTTAAACCAACAAAAATTATTTTCGGGAGTTAA 229 | AAAATTTAATAATGAAACCTTATTAGAATTTGGGAATAGGGTACTTAAGGTCACTAGAGC 230 | GAGTATGTTACCCACTACTGCGAATGACGTTATAGAAGATTTAGCCATTACCCAGTTTTT 231 | ACAACAAATTGATAATCCCATTATACGTAACACCCTAATTTTGCGGCGCGAAAATTGTTC 232 | TTTTAAGCAGTTACTTCAGGATGCAGTTTCACTTCATGACCACAATAAATTTTTTAAACA 233 | AAATGATAATAATAGTAAGTTCGATAGCGGGAGAAAATTTACAACTCCTCGACCATCGCT 234 | CACGTTTGTGTCTAATGCGCACCCTCCTAAGTCAAAATCGTTGCCTTGTATTTTTTGTAA 235 | TAATATGCATGCGTCTAAAAAGTGTACAAGTTATAGTAATCTTAGTTCGCGAGTAAGTAG 236 | ACTTAAAGTTCTTGGTCGTTGTACTAAATGTTGTAAGATTGGTCATGTTACTAACTCTTG 237 | TTATGCTAAGTTAAGTTGTTCTAATTGTTCTTCAAACGGACACCATCCGTTTCTTTGTTT 238 | TTCATCAAGTGACTCTAATAGTAATACGTTATTAATTAATGACAATGATATCGTTAGTTC 239 | GGATTCCAAAAATTTGTTTAAGGTTCAAGCAGGATCAACTCTTTCACAAGTCGTGAAGGA 240 | TCCTGAGACCCCAGTTTTATTAAAATGTGTTCAGTGCGTTGTCTCTAATCCAAATTTTCC 241 | TAGTTGTACATCAAACGCCTTAGTATTATTAGACGACGGCAGTACAACTTCTTACATTTC 242 | TTCGTCTTTGTCTAACAAACTTAAGTTATCACCTGTTAGTTCAGATTTATTAAAATTTAG 243 | TGTTTTTAATGAAACGCTTGTTAAAGAAGTCCCTACAGACTTAGTCTCATTTAATTTTGA 244 | AGTTAAAAATGGCAAATCATTTAAAGTAAATGCTCATACTATATCTCATATAGCTAAAGC 245 | TATTCCTCATAGTATTCTACCAAGTAAGGACTTTTCGTCTAATTTAAATATTGTGTCATA 246 | TAAGTTTGGTTCACCCGATATTCTTATCGGCAGTGATTTTTATTATGATCTTAATATAAA 
247 | ACCAATTAAAACTCTATCTTCTGGGTTTACCCTATTGGAATCTTCTTTAGGAAATATTTT 248 | AGCAGGCAAGGGTCAAGCTAAAGCTTTAAATTCTAATGCTTATTATCATACTAATTTAAG 249 | TGTCGCTTTTAGTAAATCAGCAAATAGTACGTCATCGATTAACGATCTAAATGATCAAGT 250 | TTCTAATTATTTTTCCTTAGAAAGTTTAGGTATTACTGATACAGTCGAGGAAAGTTTTTG 251 | TCTGGATAAATTTAAAGACTCAATTCGTTATAATGGTGAGAGATATGAAGTCACATTACC 252 | GTGGAAAAATTTCCCACCAGATCTCGATTCTAATTTAGGTTTAAGTATAGGTAGACTGCG 253 | TTCTACAATAAAACTTTTAAGGTCTAAACCAGATCTATTAGCTCAATATAATGATATTAT 254 | TGTTAATCAGTTAAGTAACGGTGTTATCGAAAAGGTCGATAAAAGTTTAAAATATTCGCC 255 | ATCTCACTATATCCCTCATCAACCTGTTATACGTGAGGACAAAAATAAAGTTAGAATAGT 256 | ATACGATGCCTCGGCTAAATCAGCTAAATGTTGTTATTCGTTAAATGAATGTTTGTATTC 257 | TGGACCATTGCTTTTACAAAACCTAAGCGGAATATTGCTTAGATTTCGATTATACCCAGT 258 | CGTTGTGTTATCTGATCTCGAAAAAGCCTTTTTACAAGTCGCACTTATTGAAAGAGATCG 259 | TGAATTTACTAGATTTCTATGGTTAAAAAATCCTTTTGTAGAGGAATGGAGCGAAAATAA 260 | TTTAGAAATATATCGTTTTTGTCGAGTTGCCTTCGGGTTGACGTGCTCACCCTTTTTGTT 261 | AGCTTTCACTATAATTTCCCATTTTAAAAATTCAGCACTTTCCTTTTCAGAAGAAATAAT 262 | AAATAACTTGTATGTTGATAACATCTTAATTAATGCGTTGGATGAAAAAACAGCTGCCTA 263 | TAAAAGTTTGACGTTAAAATCTGAGTTTCGTAAAATAGGTATGAATTTACGTGAGTTTAT 264 | CTCTAACTGTCCAAATGCGTTAAATGAAGTAAGTGATTGTGATAAACTTCATAATAATCT 265 | AAATAAGGTTTATGGACTATTATGGAATAGTCAAGAGGACAATATAAGGTTTGTTATCGA 266 | AAATCCAGCAAGTAATCAAGTTGTTTCTAAACAGTTTATCTTAAGTTACATAGCATCTGT 267 | TTTTGATCCTATGGGTATTCTAGTACCTGCGTTGTTACCTTTTAAATTATTTTTCCAAAA 268 | ATTGTGGAATTATAAACTTTCATGGTCAGAAAATATAAATAGTGATCTGTTATCTGAATG 269 | GTTAAAACTTGTTAAGTCAAATAAGTTTCCAATCTCTATAGTTATTCCTCGTAGAAGTAG 270 | TAATTTTGTTTCGTCTAATTGTAATCATGAGATACATGCGTTTTGTGATGCTTCAGGATT 271 | TGCCTTTTCTTGTTGTGTTTTTTTAAAAACAACATATAATAACAAGACCTCAGAATGTCA 272 | TATTATCTTTTCTAAATCAAAAGTTTATCCAAAAAAGCTTAAGGATTCTTTAGTAATTCA 273 | TCGTGCGGAATTACTTGGTTTATTAATAGCTGTTAGAGCTTTAAATTTTTGTTATTCTCA 274 | ACTCTTAAGTGACCCAATTTTAAAAAATACTTTAAGTAAAATAAAAACAATTTGGACTGA 275 | TTCCACAACTGTCCTCCACTGGCTTCGTAGTACCTCTAAACAACCTACTTTCATTGAAAA 276 | 
CAGATTAAAAGAAATTTCTTCTGTAGATAATCTTACTTGTCGTTATGTTAGCACTAGCGA 277 | AAACCCTGCTGACATTGCAACCAGAGGGTGTACATTTACCGAAATACAAGAAGATAAATT 278 | ATGGTGGTCTGGACCAAAATGGTTATCTAGTAACAACTATCCTAAATTTGATAATATTCC 279 | TATCTACGATTCTAATATCTCTAACCCCAGATCTACTATTATTGAAAACACTTTTATTGT 280 | TTCTGCTTCAAATTCATCTGTTATCGATGTTAATAGATTTTCTTCTTGGTTAAAAATAGT 281 | ACGTGTTCTTGGATATGTTTATCGTTTTCTGCGTAAAATATGTAAATCGAGTCTTCCTAA 282 | AATGTCTTTTAAGGATCTTTTGTTTTCATCTAAGTCATACTTATCTGTTCAAGAATTAAA 283 | ATTTTCTCAGCTTAAATTGTTTATGTTAACTCAAAAGGATTGCCTTCCTCAAAAGGATGA 284 | AATTGATAGCTTAAATCTTTTTCTAAGTGAGGGTATTTATCGATCTCAAGGGCGTATTAA 285 | TTATAGTGCATGTTCATACGAAAGTAAGCACCCTATTTGGTTATCGTGTAAAAGTAAATT 286 | TACACATTTATTTATTTTTTATGTTCATCGATTAATGTTACATCCTGGCACAATTTCATT 287 | GTTAAATCATCTCAGAACTGTCTGTTGGATCTCTCAAGGAAGGAGAACTGTAAGTAAGGT 288 | TATTTATCGTTATTGTTTTCCGTGCAGAAAGTTGTCTTGTAAACCCTATTCACGACCTAC 289 | ACCCCCACAATTACCTGAAGAAAGAGTTACAATAACCCCAGTTTTTCACAATACAGGGCT 290 | TGATTATTTTGGGCCTATTAATGTAAAAGAAAATGTAAAAGTGTGGTGTTGTCTCTTTAC 291 | ATGTTTATCAGTTAGAGCAATACATTTAGAATTAGCCGAGACTTTATCGGCAGAGTCTTT 292 | TATAGAGGCGTTTCGACGTTTTGTCGCGAGGCGCGGAAGGCCATTATCAGTGATAAGTGA 293 | CAATGGAAAGAATTTTATTTTAGCTAAAAAAGTTCTCGACCCCACTGTTCAAAAAAATCC 294 | ATTACATTCCTCGTCTTATAAAGAGTTTCTCTCCAAAAACGGAGTTAAGTGGTCATTTAT 295 | AACGGAAAGAGCGCCGTGGAAAGGCGGCTTCTACGAACGATTAATAGGTATAGTAAAAAA 296 | TCATATACGGCGAGTTGTCGGTAACGCTTATTTAAGCCTATCTAAACTTAATACTATTTT 297 | AAGTGAAATTGAATTCATAGTTAATTGCCGACCTTTAACATTTGTGTCTGATCAACCGGA 298 | AAATATTCACGTCATTAGACCTATAGATTTCTTGTCACCCAATGTTGACACTCAACTACT 299 | AACACCAGTCACTATTTTAAACAATACCTTAAGCTCTAAATCTAACAAAGATACTCTAGT 300 | CTTAAATTGGCAAGCATGTCAAGAAAGATTAAATAATTTTTGGAAAAAATGGTCAGTGGA 301 | CTACTTATTGTCATTGAGAGAAAGAAAAAATAAATTATTAAAAAGTAAAAGTGAAAAAGT 302 | GCCAAAAAATGGTGAAGTTGTTTTGGTATACGACGAAAATGTACCAAAGGGTCAATGGAA 303 | ATTGGCCGTAATTTACGATAATGAAAATTTAGGTTCACATACACGATCAGTAAAAATAAA 304 | ATTTGCGTCTGGTCACGTATCACGCCGAGCCGTCGATCATCTTTACCCATTAGAAACAAC 305 | 
AGGAGAATAAGTTTATTAAGTTTTTGTTCAATTACCCTCGGGTAATATTTTAATAATTTC 306 | TATATAATCGAATTTAGATCTTGATCTAGTCAGTCTTAACCAGCATTATCAGCGAAATAC 307 | GAGTGAAATTCATTTATATTATTATTTAATTTTAATTTTTTTGTTAATAAGTTAAATTTG 308 | TTATTATTTTAAATTATGTTATTATCTTAGTTACCAACAGCAATAATTTAAGAGTTATCA 309 | CAAATTCCTTTTATAATCTCATTTAAATTATTTAACTTTAATTATGTGGTTTATACTTCT 310 | CTAATTCTTAAAACTCCCATTACAAGCTCAGACGTAGCGAGCAGTATTGTTGGACGCCTT 311 | TTAGCCATCTGCTAGCTTTGCACGTTTTCTGGGCGACGGGAGTGTCGCGAACACTTCGCG 312 | AATTTAAATAAAATAAATCTAATAATTAAAATTAATTATAATTTTAAATAAATATATTTC 313 | AAATTTAAAAATGATGCTCATAAAATATTGCCAATTTTAAGTTTTAAATTCAAATATTTT 314 | TTTTGTTTTTTATTATTATTGTTGTTACCGTTTGGTTGAGTTTTGTTTCTGGACTCATAT 315 | TTTTTTGTTTTAAATATAATTTTCTTTGATTAAAATATAATTGTTATAATTATATTGTTT 316 | TTTTTGTTTACTTTAAATTTAGTTATTACTTATATAATAGCTGAATTATTAGTTCGTTTT 317 | TAATTAAAAATAAAAGTGTTAGTTAATTATTTTTTAGTTAAAACATTTAAATTTACAAGT 318 | TGTAAAAGCGCAACGTTTAAGTGTCGCTTTTAATACAGCGAGTGGTTAATACCCCTGGGT 319 | AGAGGCCAAAAGTTAATCCTTTTTTTTAGAAAAAATTGCTGCTACAGTACTTTTGAAACA 320 | ATCGAAATTTTAGTGATATTATAGAAGATTTGAATCATTAAGAACTCATTTTGATTGGCG 321 | AATAAATTTTTATAGAAGTTGTGATTTCATTCTCCAGTAAGCGTAAGTATTTCGCTAAAG 322 | TTTTTTTTCTTATATTTAAGATTTAAGTAAGCTATTACTTGTTTAGAAATTTATTTTAAT 323 | TTTAGTATTAAATTTATTTTCTCATTTTTTTTT 324 | -------------------------------------------------------------------------------- /example/assembly.list.txt: -------------------------------------------------------------------------------- 1 | contig_1 2 | contig_2 3 | -------------------------------------------------------------------------------- /example/blast.out: -------------------------------------------------------------------------------- 1 | contig_1 979556 200 2 | contig_2 979556 500 3 | contig_2 979556 1000 4 | contig_2 979556 500 5 | contig_2 979556 300 6 | contig_3 979556 10000 7 | contig_4 979556 1000 8 | contig_5 6252 2000 9 | contig_6 232323 2000 10 | contig_6 6252 2000 11 | contig_6 979556 2000 12 | contig_6 232323 2000 13 | contig_7 6252 2000 14 | contig_8 6252 2000 15 | 
contig_8 979556 2000 16 | contig_9 6252 200 17 | -------------------------------------------------------------------------------- /example/blobDB.json: -------------------------------------------------------------------------------- 1 | {"tax_collision_random": false, "min_score": 0.0, "nodesDB_f": "/Users/dom/git/blobtools/data/nodesDB.txt", "title": "blobDB.json", "lineages": {"6252": {"superkingdom": "Eukaryota", "family": "Ascarididae", "order": "Ascaridida", "phylum": "Nematoda", "genus": "Ascaris", "species": "Ascaris lumbricoides"}, "232323": {"superkingdom": "Eukaryota", "family": "Hypsibiidae", "order": "Parachela", "phylum": "Tardigrada", "genus": "Hypsibius", "species": "Hypsibius dujardini"}, "979556": {"superkingdom": "Bacteria", "family": "Microbacteriaceae", "order": "Micrococcales", "phylum": "Actinobacteria", "genus": "Microbacterium", "species": "Microbacterium testaceum"}}, "taxrules": ["bestsum"], "hitLibs": {"tax0": {"fmt": "tax", "name": "tax0", "f": "/Users/dom/git/blobtools/example/blast.out"}}, "length": 18477, "version": "blobtools v1.0", "n_count": 0, "order_of_blobs": ["contig_1", "contig_2", "contig_3", "contig_4", "contig_5", "contig_6", "contig_7", "contig_8", "contig_9", "contig_10"], "seqs": 10, "min_diff": 0.0, "dict_of_blobs": {"contig_10": {"hits": {}, "name": "contig_10", "taxonomy": {"bestsum": {"superkingdom": {"score": 0.0, "tax": "no-hit", "c_index": 0}, "family": {"score": 0.0, "tax": "no-hit", "c_index": 0}, "order": {"score": 0.0, "tax": "no-hit", "c_index": 0}, "phylum": {"score": 0.0, "tax": "no-hit", "c_index": 0}, "genus": {"score": 0.0, "tax": "no-hit", "c_index": 0}, "species": {"score": 0.0, "tax": "no-hit", "c_index": 0}}}, "agct_count": 6273, "length": 6273, "gc": 0.3067, "n_count": 0, "covs": {"cov0": 310.634}, "read_cov": {"cov0": 8741}}, "contig_9": {"hits": {"tax0": [{"score": 200.0, "name": "contig_9", "taxId": "6252"}]}, "name": "contig_9", "taxonomy": {"bestsum": {"superkingdom": {"score": 200.0, "tax": 
"Eukaryota", "c_index": 0}, "family": {"score": 200.0, "tax": "Ascarididae", "c_index": 0}, "order": {"score": 200.0, "tax": "Ascaridida", "c_index": 0}, "phylum": {"score": 200.0, "tax": "Nematoda", "c_index": 0}, "genus": {"score": 200.0, "tax": "Ascaris", "c_index": 0}, "species": {"score": 200.0, "tax": "Ascaris lumbricoides", "c_index": 0}}}, "agct_count": 1599, "length": 1599, "gc": 0.2439, "n_count": 0, "covs": {"cov0": 74.757}, "read_cov": {"cov0": 554}}, "contig_8": {"hits": {"tax0": [{"score": 2000.0, "name": "contig_8", "taxId": "6252"}, {"score": 2000.0, "name": "contig_8", "taxId": "979556"}]}, "name": "contig_8", "taxonomy": {"bestsum": {"superkingdom": {"score": 2000.0, "tax": "unresolved", "c_index": 1}, "family": {"score": 2000.0, "tax": "unresolved", "c_index": 1}, "order": {"score": 2000.0, "tax": "unresolved", "c_index": 1}, "phylum": {"score": 2000.0, "tax": "unresolved", "c_index": 1}, "genus": {"score": 2000.0, "tax": "unresolved", "c_index": 1}, "species": {"score": 2000.0, "tax": "unresolved", "c_index": 1}}}, "agct_count": 2346, "length": 2346, "gc": 0.2801, "n_count": 0, "covs": {"cov0": 91.742}, "read_cov": {"cov0": 1008}}, "contig_1": {"hits": {"tax0": [{"score": 200.0, "name": "contig_1", "taxId": "979556"}]}, "name": "contig_1", "taxonomy": {"bestsum": {"superkingdom": {"score": 200.0, "tax": "Bacteria", "c_index": 0}, "family": {"score": 200.0, "tax": "Microbacteriaceae", "c_index": 0}, "order": {"score": 200.0, "tax": "Micrococcales", "c_index": 0}, "phylum": {"score": 200.0, "tax": "Actinobacteria", "c_index": 0}, "genus": {"score": 200.0, "tax": "Microbacterium", "c_index": 0}, "species": {"score": 200.0, "tax": "Microbacterium testaceum", "c_index": 0}}}, "agct_count": 756, "length": 756, "gc": 0.2606, "n_count": 0, "covs": {"cov0": 90.406}, "read_cov": {"cov0": 369}}, "contig_3": {"hits": {"tax0": [{"score": 10000.0, "name": "contig_3", "taxId": "979556"}]}, "name": "contig_3", "taxonomy": {"bestsum": {"superkingdom": {"score": 
10000.0, "tax": "Bacteria", "c_index": 0}, "family": {"score": 10000.0, "tax": "Microbacteriaceae", "c_index": 0}, "order": {"score": 10000.0, "tax": "Micrococcales", "c_index": 0}, "phylum": {"score": 10000.0, "tax": "Actinobacteria", "c_index": 0}, "genus": {"score": 10000.0, "tax": "Microbacterium", "c_index": 0}, "species": {"score": 10000.0, "tax": "Microbacterium testaceum", "c_index": 0}}}, "agct_count": 602, "length": 602, "gc": 0.2342, "n_count": 0, "covs": {"cov0": 43.761}, "read_cov": {"cov0": 188}}, "contig_2": {"hits": {"tax0": [{"score": 500.0, "name": "contig_2", "taxId": "979556"}, {"score": 1000.0, "name": "contig_2", "taxId": "979556"}, {"score": 500.0, "name": "contig_2", "taxId": "979556"}, {"score": 300.0, "name": "contig_2", "taxId": "979556"}]}, "name": "contig_2", "taxonomy": {"bestsum": {"superkingdom": {"score": 2300.0, "tax": "Bacteria", "c_index": 0}, "family": {"score": 2300.0, "tax": "Microbacteriaceae", "c_index": 0}, "order": {"score": 2300.0, "tax": "Micrococcales", "c_index": 0}, "phylum": {"score": 2300.0, "tax": "Actinobacteria", "c_index": 0}, "genus": {"score": 2300.0, "tax": "Microbacterium", "c_index": 0}, "species": {"score": 2300.0, "tax": "Microbacterium testaceum", "c_index": 0}}}, "agct_count": 1060, "length": 1060, "gc": 0.2623, "n_count": 0, "covs": {"cov0": 168.409}, "read_cov": {"cov0": 844}}, "contig_5": {"hits": {"tax0": [{"score": 2000.0, "name": "contig_5", "taxId": "6252"}]}, "name": "contig_5", "taxonomy": {"bestsum": {"superkingdom": {"score": 2000.0, "tax": "Eukaryota", "c_index": 0}, "family": {"score": 2000.0, "tax": "Ascarididae", "c_index": 0}, "order": {"score": 2000.0, "tax": "Ascaridida", "c_index": 0}, "phylum": {"score": 2000.0, "tax": "Nematoda", "c_index": 0}, "genus": {"score": 2000.0, "tax": "Ascaris", "c_index": 0}, "species": {"score": 2000.0, "tax": "Ascaris lumbricoides", "c_index": 0}}}, "agct_count": 614, "length": 614, "gc": 0.329, "n_count": 0, "covs": {"cov0": 163.557}, "read_cov": 
{"cov0": 456}}, "contig_4": {"hits": {"tax0": [{"score": 1000.0, "name": "contig_4", "taxId": "979556"}]}, "name": "contig_4", "taxonomy": {"bestsum": {"superkingdom": {"score": 1000.0, "tax": "Bacteria", "c_index": 0}, "family": {"score": 1000.0, "tax": "Microbacteriaceae", "c_index": 0}, "order": {"score": 1000.0, "tax": "Micrococcales", "c_index": 0}, "phylum": {"score": 1000.0, "tax": "Actinobacteria", "c_index": 0}, "genus": {"score": 1000.0, "tax": "Microbacterium", "c_index": 0}, "species": {"score": 1000.0, "tax": "Microbacterium testaceum", "c_index": 0}}}, "agct_count": 951, "length": 951, "gc": 0.3155, "n_count": 0, "covs": {"cov0": 456.313}, "read_cov": {"cov0": 2096}}, "contig_7": {"hits": {"tax0": [{"score": 2000.0, "name": "contig_7", "taxId": "6252"}]}, "name": "contig_7", "taxonomy": {"bestsum": {"superkingdom": {"score": 2000.0, "tax": "Eukaryota", "c_index": 0}, "family": {"score": 2000.0, "tax": "Ascarididae", "c_index": 0}, "order": {"score": 2000.0, "tax": "Ascaridida", "c_index": 0}, "phylum": {"score": 2000.0, "tax": "Nematoda", "c_index": 0}, "genus": {"score": 2000.0, "tax": "Ascaris", "c_index": 0}, "species": {"score": 2000.0, "tax": "Ascaris lumbricoides", "c_index": 0}}}, "agct_count": 4060, "length": 4060, "gc": 0.2584, "n_count": 0, "covs": {"cov0": 52.312}, "read_cov": {"cov0": 1005}}, "contig_6": {"hits": {"tax0": [{"score": 2000.0, "name": "contig_6", "taxId": "232323"}, {"score": 2000.0, "name": "contig_6", "taxId": "6252"}, {"score": 2000.0, "name": "contig_6", "taxId": "979556"}, {"score": 2000.0, "name": "contig_6", "taxId": "232323"}]}, "name": "contig_6", "taxonomy": {"bestsum": {"superkingdom": {"score": 6000.0, "tax": "Eukaryota", "c_index": 1}, "family": {"score": 4000.0, "tax": "Hypsibiidae", "c_index": 2}, "order": {"score": 4000.0, "tax": "Parachela", "c_index": 2}, "phylum": {"score": 4000.0, "tax": "Tardigrada", "c_index": 2}, "genus": {"score": 4000.0, "tax": "Hypsibius", "c_index": 2}, "species": {"score": 4000.0, 
"tax": "Hypsibius dujardini", "c_index": 2}}}, "agct_count": 216, "length": 216, "gc": 0.1944, "n_count": 0, "covs": {"cov0": 25.88}, "read_cov": {"cov0": 52}}}, "assembly_f": "/Users/dom/git/blobtools/example/assembly.fna", "covLibs": {"cov0": {"reads_unmapped": 0, "mean_cov": 147.7771, "cov_sum": 1477.771, "name": "cov0", "f": "/Users/dom/git/blobtools/example/mapping_1.bam.cov", "fmt": "cov", "reads_total": 15313, "reads_mapped": 15313}}} -------------------------------------------------------------------------------- /example/blobDB.table.txt: -------------------------------------------------------------------------------- 1 | ## blobtools v1.0 2 | ## assembly : /Users/dom/git/blobtools/example/assembly.fna 3 | ## coverage : cov0 - /Users/dom/git/blobtools/example/mapping_1.bam.cov 4 | ## taxonomy : tax0 - /Users/dom/git/blobtools/example/blast.out 5 | ## nodesDB : /Users/dom/git/blobtools/data/nodesDB.txt 6 | ## taxrule : bestsum 7 | ## min_score : 0.0 8 | ## min_diff : 0.0 9 | ## tax_collision_random : False 10 | ## 11 | # name length GC N cov0 phylum.t.6 phylum.s.7 phylum.c.8 12 | contig_1 756 0.2606 0 90.406 Actinobacteria 200.0 0 13 | contig_2 1060 0.2623 0 168.409 Actinobacteria 2300.0 0 14 | contig_3 602 0.2342 0 43.761 Actinobacteria 10000.0 0 15 | contig_4 951 0.3155 0 456.313 Actinobacteria 1000.0 0 16 | contig_5 614 0.329 0 163.557 Nematoda 2000.0 0 17 | contig_6 216 0.1944 0 25.88 Tardigrada 4000.0 2 18 | contig_7 4060 0.2584 0 52.312 Nematoda 2000.0 0 19 | contig_8 2346 0.2801 0 91.742 unresolved 2000.0 1 20 | contig_9 1599 0.2439 0 74.757 Nematoda 200.0 0 21 | contig_10 6273 0.3067 0 310.634 no-hit 0.0 0 -------------------------------------------------------------------------------- /example/blobplot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DRL/blobtools/9426ca4a6dd0e0a2a67841d92b60b101fc52b921/example/blobplot.png 
-------------------------------------------------------------------------------- /example/catcolour.txt: -------------------------------------------------------------------------------- 1 | contig_1,A 2 | contig_2,A 3 | contig_3,A 4 | contig_4,B 5 | contig_5,B 6 | contig_6,B 7 | contig_7,B 8 | contig_8,B 9 | contig_9,C 10 | contig_10,C 11 | -------------------------------------------------------------------------------- /example/colours.txt: -------------------------------------------------------------------------------- 1 | Nematoda,#48a365 2 | Tardigrada,#48a365 3 | Actinobacteria,#926eb3 4 | other,#ffffff 5 | -------------------------------------------------------------------------------- /example/diamond.out: -------------------------------------------------------------------------------- 1 | contig_1 232323 200 2 | contig_2 232323 500 3 | contig_2 232323 1000 4 | contig_2 232323 500 5 | contig_2 979556 300 6 | contig_3 979556 10000 7 | contig_4 979556 1000 8 | contig_5 6252 1000 9 | contig_6 232323 1000 10 | contig_6 6252 1000 11 | contig_6 979556 1000 12 | contig_6 232323 1000 13 | contig_7 6252 1000 14 | contig_9 6252 100 15 | -------------------------------------------------------------------------------- /example/mapping_1.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DRL/blobtools/9426ca4a6dd0e0a2a67841d92b60b101fc52b921/example/mapping_1.bam -------------------------------------------------------------------------------- /example/mapping_1.sorted.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DRL/blobtools/9426ca4a6dd0e0a2a67841d92b60b101fc52b921/example/mapping_1.sorted.bam -------------------------------------------------------------------------------- /example/mapping_1.sorted.bam.bai: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/DRL/blobtools/9426ca4a6dd0e0a2a67841d92b60b101fc52b921/example/mapping_1.sorted.bam.bai -------------------------------------------------------------------------------- /example/mapping_2.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DRL/blobtools/9426ca4a6dd0e0a2a67841d92b60b101fc52b921/example/mapping_2.bam -------------------------------------------------------------------------------- /example/mapping_2.sorted.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DRL/blobtools/9426ca4a6dd0e0a2a67841d92b60b101fc52b921/example/mapping_2.sorted.bam -------------------------------------------------------------------------------- /example/mapping_2.sorted.bam.bai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DRL/blobtools/9426ca4a6dd0e0a2a67841d92b60b101fc52b921/example/mapping_2.sorted.bam.bai -------------------------------------------------------------------------------- /example/refcov.txt: -------------------------------------------------------------------------------- 1 | cov0,15313,15300 2 | bam0,15313,15300 3 | cov1,37278,15300 4 | bam1,37278,15300 5 | covsum,52591,30600 -------------------------------------------------------------------------------- /lib/BtIO.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | File : BtIO.py 6 | Author : Dominik R. 
def create_dir(directory="", overwrite=True):
    """Create *directory* (recursively).

    If it already exists and *overwrite* is set, wipe it (and all its
    subdirectories!) and recreate it. Returns the directory path, or
    None when no directory was given.
    """
    if not directory:
        return None
    if not isdir(directory):
        os.makedirs(directory)
    elif overwrite:
        shutil.rmtree(directory)  # removes all the subdirectories!
        os.makedirs(directory)
    return directory

def parseList(infile):
    """Return the lines of *infile* (newline-stripped) as a list."""
    if not isfile(infile):
        BtLog.error('0', infile)
    with open(infile) as fh:
        return [line.rstrip("\n") for line in fh]

def parseReferenceCov(infile):
    """Parse 'cov_lib,reads_total,reads_mapped' lines into
    {cov_lib: {'reads_total': int, 'reads_mapped': int}}.

    Returns an empty dict when *infile* is falsy.
    """
    refcov_dict = {}
    if infile:
        if not isfile(infile):
            BtLog.error('0', infile)
        with open(infile) as fh:
            for line in fh:
                # FIX: was a bare 'except:'; only malformed lines (wrong
                # field count / non-numeric counts) raise ValueError here.
                try:
                    cov_lib, reads_total_ref, reads_mapped_ref = line.split(",")
                    refcov_dict[cov_lib] = {'reads_total': int(reads_total_ref),
                                            'reads_mapped': int(reads_mapped_ref)}
                except ValueError:
                    BtLog.error('21', infile)
    return refcov_dict

def parseCmdlist(temp):
    """Split a comma-separated command-line string into a list
    (empty list for falsy input)."""
    _list = []
    if temp:
        if "," in temp:
            _list = temp.split(",")
        else:
            _list.append(temp)
    return _list

def parseCmdLabels(labels):
    """Parse 'name=group1,group2' label strings into {group: name}."""
    label_d = {}
    name, groups = '', ''
    if labels:
        # FIX: was a bare 'except:'; a label without '=' (or with more
        # than one) raises ValueError on unpacking.
        try:
            for label in labels:
                name, groups = str(label).split("=")
                if "," in groups:
                    for group in groups.split(","):
                        label_d[group] = name
                else:
                    label_d[groups] = name
        except ValueError:
            BtLog.error('17', labels)
    return label_d

def parseCatColour(infile):
    """Parse 'seq_name,category' lines into {seq_name: category}.

    Returns an empty dict when *infile* is falsy.
    """
    catcolour_dict = {}
    if infile:
        if not isfile(infile):
            BtLog.error('0', infile)
        with open(infile) as fh:
            for line in fh:
                # FIX: was a bare 'except:'; wrong field count raises ValueError.
                try:
                    seq_name, category = line.rstrip("\n").split(",")
                    catcolour_dict[seq_name] = category
                except ValueError:
                    BtLog.error('23', infile)
    return catcolour_dict
def parseDict(infile, key, value):
    """Parse whitespace-delimited *infile* into a dict mapping the column
    at index *key* to the column at index *value*.

    Returns an empty dict when *infile* is falsy.
    """
    items = {}
    if infile:
        if not isfile(infile):
            BtLog.error('0', infile)
        # FIX: removed redundant second 'items = {}' initialisation;
        # hoist the index conversions out of the loop.
        k_idx = int(key)
        v_idx = int(value)
        with open(infile) as fh:
            for line in fh:
                cols = line.rstrip("\n").split()
                items[cols[k_idx]] = cols[v_idx]
    return items

def parseColours(infile):
    """Parse 'name,colour' lines into {name: colour}.

    Returns an empty dict when *infile* is falsy.
    """
    items = {}
    if infile:
        if not isfile(infile):
            BtLog.error('0', infile)
        with open(infile) as fh:
            for line in fh:
                cols = line.rstrip("\n").split(",")
                items[cols[0]] = cols[1]
    return items

def parseSet(infile):
    """Read *infile* into a set of lines, stripping the newline and any
    leading FASTA '>' characters."""
    if not isfile(infile):
        BtLog.error('0', infile)
    with open(infile) as fh:
        return {line.rstrip("\n").lstrip(">") for line in fh}

def parseFastaNameOrder(infile):
    """Return the FASTA sequence names of *infile* in file order."""
    return [name for name, _seq in readFasta(infile)]

def readFasta(infile):
    """Yield (name, sequence) tuples from a FASTA file.

    The name is the header up to the first whitespace; the sequence is
    upper-cased.
    """
    if not isfile(infile):
        BtLog.error('0', infile)
    with open(infile) as fh:
        header, seqs = '', []
        for line in fh:
            if line[0] == '>':
                if header:
                    yield header, ''.join(seqs).upper()
                header, seqs = line[1:-1].split()[0], []  # header is split at first whitespace
            else:
                seqs.append(line[:-1])
        yield header, ''.join(seqs).upper()

def runCmd(**kwargs):
    """Run kwargs['command'] (split on whitespace, no shell).

    With wait=True, block until the process finishes; otherwise return an
    iterator over its stdout lines (stderr merged into stdout).

    BUG FIX: the EOF sentinel must be '' (str), not b'': the process is
    opened with universal_newlines=True, so readline() returns str and the
    original iter(..., b'') never matched EOF and looped on '' forever.
    """
    command = kwargs['command']
    cmd = command.split()  # sanitation
    p = subprocess.Popen(cmd,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.STDOUT,
                         universal_newlines=True,
                         bufsize=-1)  # system-default buffering
    if kwargs.get('wait', False):
        p.wait()
    else:
        return iter(p.stdout.readline, '')
def which(program):
    """Locate *program* and return its full path, or None.

    A name containing a path separator is checked directly; a bare name
    is searched for on $PATH. Only existing, executable files qualify.
    """
    def is_exe(fpath):
        return os.path.isfile(fpath) and os.access(fpath, os.X_OK)
    fpath, _fname = os.path.split(program)
    if fpath:
        if is_exe(program):
            return program
    else:
        for path in os.environ["PATH"].split(os.pathsep):
            exe_file = os.path.join(path.strip('"'), program)
            if is_exe(exe_file):
                return exe_file
    return None

def checkAlnIndex(aln):
    """Return True if the alignment file has a usable index.

    pysam raises ValueError from check_index() when no index exists.
    """
    try:
        return aln.check_index()
    except ValueError:
        return False

def getAlnHeaderIntersection(aln, headers):
    """Return (#assembly headers, #alignment references, #names in common)."""
    aln_set = set(aln.references)
    headers_set = set(headers)
    headers_aln_intersection = headers_set.intersection(aln_set)
    return (len(headers_set), len(aln_set), len(headers_aln_intersection))

def estimate_read_lengths(aln, set_of_blobs):
    """Estimate the mean read length from up to ~10,000 reads.

    BUG FIX: the original kept re-fetching the whole alignment until 10,000
    lengths were collected, which loops forever when no reads align and then
    divides by zero. We now make a single pass, stop early at 10,000 reads,
    and return 0.0 for an empty alignment.
    """
    _read_lengths = []
    for header in set_of_blobs:
        for read in aln.fetch(header):
            _read_lengths.append(read.query_length)
            if len(_read_lengths) >= 10000:
                break
        if len(_read_lengths) >= 10000:
            break
    if not _read_lengths:
        return 0.0
    return round(sum(_read_lengths) / len(_read_lengths), 4)

def checkBam(aln, set_of_blobs):
    """Sanity-check a BAM against the assembly.

    Exits if the BAM lacks an index or shares no sequence names with the
    assembly; prints mapping statistics. Returns (reads_total, reads_mapped).
    """
    if not checkAlnIndex(aln):
        print("[X] Please (sort and) index your BAM file")
        sys.exit()
    len_headers, len_aln, len_intersection = getAlnHeaderIntersection(aln, set_of_blobs)
    if len_intersection == 0:
        print("[X] Headers in FASTA and BAM don't seem to match")
        sys.exit()
    print("[+] -> %.2f (%s/%s) of sequences have reads aligned to them." % ((len_intersection / len_headers) * 100, len_intersection, len_headers))
    reads_total = aln.mapped + aln.unmapped
    print("[+] -> %.2f (%s/%s) of reads are mapped." % ((aln.mapped / reads_total) * 100, aln.mapped, reads_total))
    return reads_total, aln.mapped
def parseBam(infile, set_of_blobs, estimate_cov):
    """Parse a BAM file into per-contig coverage.

    Returns (base_cov_dict, reads_total, reads_mapped, read_cov_dict).
    With estimate_cov=True, coverage is read-count * estimated mean read
    length; otherwise it is summed from CIGAR-aligned bases.
    """
    # no_base_cov_flag [deprecated]
    reads_total, reads_mapped = 0, 0
    with pysam.AlignmentFile(infile) as aln:
        # exits via sys.exit() if the BAM is unindexed or names don't match
        reads_total, reads_mapped = checkBam(aln, set_of_blobs)
        if estimate_cov:
            base_cov_dict, read_cov_dict = estimate_coverage(aln, set_of_blobs)
        else:
            base_cov_dict, read_cov_dict = calculate_coverage(aln, reads_mapped, set_of_blobs)
    return base_cov_dict, reads_total, reads_mapped, read_cov_dict

def estimate_coverage(aln, set_of_blobs):
    """Fast coverage estimate: per-contig primary-read count multiplied by
    the estimated mean read length. Returns (base_cov_dict, read_cov_dict)."""
    base_cov_dict = {blob : 0.0 for blob in set_of_blobs}
    read_cov_dict = {blob : 0 for blob in set_of_blobs}
    est_read_length = estimate_read_lengths(aln, set_of_blobs)
    with tqdm(total=len(set_of_blobs), desc="[%] ", ncols=200, unit_scale=True) as pbar:
        for header in set_of_blobs:
            # count only primary mapped reads (see check_mapped_read)
            read_count = aln.count(header, read_callback=check_mapped_read)
            base_cov_dict[header] = read_count * est_read_length
            read_cov_dict[header] += read_count
            pbar.update()
    return base_cov_dict, read_cov_dict

def check_mapped_read(read):
    """Return True for primary, mapped reads (used as a pysam read_callback)."""
    if read.is_unmapped or read.is_secondary or read.is_supplementary:
        return False
    return True

def calculate_coverage(aln, reads_mapped, set_of_blobs):
    """Exact coverage: sum aligned bases per contig from CIGAR strings.

    Only CIGAR operations 0/7/8 (M, =, X in the SAM spec — bases aligned to
    the reference) contribute. Returns (base_cov_dict, read_cov_dict).
    """
    _base_cov_dict = {blob : [] for blob in set_of_blobs}
    read_cov_dict = {blob : 0 for blob in set_of_blobs}
    allowed_operations = set([0, 7, 8])
    with tqdm(total=reads_mapped, desc="[%] ", ncols=200, unit_scale=True) as pbar:
        for read in aln.fetch(until_eof=True):
            if not check_mapped_read(read):
                continue
            for operation, length in read.cigartuples:
                if operation in allowed_operations:
                    _base_cov_dict[read.reference_name].append(length)
            read_cov_dict[read.reference_name] += 1
            pbar.update()
    # collapse the per-read length lists into per-contig totals
    base_cov_dict = {ref_name: sum(_base_cov) for ref_name, _base_cov in _base_cov_dict.items()}
    return base_cov_dict, read_cov_dict
def write_read_pair_seqs(pair_count_by_type, seqs_by_type, out_fs_by_type):
    """Write the collected read records of each pair type to its output
    file(s): one interleaved file, or two files (mate 1 / mate 2) when the
    filenames differ."""
    for pair_type, pair_count in pair_count_by_type.items():
        print(BtLog.status_d['23'] % (pair_type, pair_count))
        if not pair_count:
            continue
        out_fs = out_fs_by_type[pair_type]
        if len(set(out_fs)) == 1:
            # single (interleaved) output file
            targets = [(out_fs[0], seqs_by_type[pair_type])]
        else:
            # records alternate mate1/mate2, so de-interleave by slicing
            targets = [(out_fs[0], seqs_by_type[pair_type][0::2]),
                       (out_fs[1], seqs_by_type[pair_type][1::2])]
        for out_f, records in targets:
            with open(out_f, 'w') as out_fh:
                print(BtLog.status_d['24'] % out_f)
                out_fh.write("\n".join(records) + "\n")


def get_read_pair_fasta(read, read_format):
    """Format a single read as FASTQ text (read_format == 'fq') or FASTA.

    Sequence comes from get_forward_sequence(); for reverse-strand reads
    the quality string is flipped to match that orientation.
    """
    name = read.query_name
    seq = read.get_forward_sequence()
    if read_format != "fq":
        return ">{name}\n{seq}".format(name=name, seq=seq)
    qual = read.qual[::-1] if read.is_reverse else read.qual
    return "@{name}\n{seq}\n+\n{qual}".format(name=name, seq=seq, qual=qual)


def init_read_pairs(outfile, include_unmapped, noninterleaved, include, exclude, read_format):
    """Initialise per-pair-type counters, record buffers and output filenames.

    Returns (pair_count_by_type, read_pair_seqs, read_pair_out_fs).
    """
    # strings have to be sorted alphabetically ('ExIn', not 'InEx')
    if include or exclude:
        read_pair_types = ['InUn', 'InIn', 'ExIn']
    else:
        read_pair_types = ['InUn', 'InIn']
    if include_unmapped:
        read_pair_types.append('UnUn')
    pair_count_by_type = {pair_type: 0 for pair_type in read_pair_types}
    read_pair_seqs = {pair_type: [] for pair_type in read_pair_types}
    read_pair_out_fs = defaultdict(list)
    for pair_type in read_pair_types:
        if noninterleaved:
            # two files per type: one per mate
            read_pair_out_fs[pair_type].append(getOutFile(outfile, None, pair_type + ".1." + read_format))
            read_pair_out_fs[pair_type].append(getOutFile(outfile, None, pair_type + ".2." + read_format))
        else:
            read_pair_out_fs[pair_type].append(getOutFile(outfile, None, pair_type + "." + read_format))
    return pair_count_by_type, read_pair_seqs, read_pair_out_fs
def print_bam(read_pair_out_fs, read_pair_type, read1, read2):
    """Append both mates as tab-joined lines to '<outfile>.txt'."""
    out_path = read_pair_out_fs[read_pair_type] + ".txt"
    with open(out_path, 'a') as fh:
        for mate in (read1, read2):
            fh.write("\t".join(mate) + "\n")

def read_pair_generator(aln, region_string=None):
    """
    Generate read pairs in a BAM file or within a region string.
    Reads are added to read_dict until a pair is found.
    """
    read_dict = defaultdict(lambda: [None, None])
    for read in aln.fetch(until_eof=True):
        # skip non-primary alignments so each mate is seen exactly once
        if read.is_secondary or read.is_supplementary:
            continue
        qname = read.query_name
        if qname in read_dict:
            # second mate seen: emit the pair (read1 first) and forget it
            stored_first, stored_second = read_dict.pop(qname)
            if read.is_read1:
                yield read, stored_second
            else:
                yield stored_first, read
        else:
            slot = 0 if read.is_read1 else 1
            read_dict[qname][slot] = read
def parseBamForFilter(infile, include_unmapped, noninterleaved, outfile, include, exclude, read_format):
    '''
    parse BAM to extract readpairs

    Pairs are classified by where each mate maps: 'In' (included contig),
    'Ex' (excluded contig), 'Un' (unmapped); the two labels are sorted and
    joined into a pair type (e.g. 'InUn'). Records for wanted pair types
    are buffered and written out, and a summary table is written to
    '<outfile>.info.txt'. Returns 1 on completion.
    '''
    pair_count_by_type, seqs_by_type, out_fs_by_type = init_read_pairs(outfile, include_unmapped, noninterleaved, include, exclude, read_format)
    # map reference name -> 'In'/'Ex'; None (unmapped mate) -> 'Un'
    if include:
        sequence_to_type_dict = defaultdict(lambda: 'Ex')
        for incl in include:
            sequence_to_type_dict[incl] = 'In'
        sequence_to_type_dict[None] = 'Un'
    elif exclude:
        sequence_to_type_dict = defaultdict(lambda: 'In')
        for excl in exclude:
            sequence_to_type_dict[excl] = 'Ex'
        sequence_to_type_dict[None] = 'Un'
    else:
        # no filter lists: everything mapped counts as 'In'
        sequence_to_type_dict = defaultdict(lambda: 'In')
        sequence_to_type_dict[None] = 'Un'

    seen_reads = 0
    print(BtLog.status_d['26'] % infile)
    with pysam.AlignmentFile(infile) as aln:
        with tqdm(total=(aln.mapped + aln.unmapped) / 2, desc="[%] ", ncols=200, unit_scale=True) as pbar:
            for read1, read2 in read_pair_generator(aln):
                seen_reads += 2
                # sorted join keeps pair types canonical ('ExIn', never 'InEx')
                read_pair_type = "".join(sorted([sequence_to_type_dict[read1.reference_name], sequence_to_type_dict[read2.reference_name]]))
                if read_pair_type in seqs_by_type:
                    seqs_by_type[read_pair_type].append(get_read_pair_fasta(read1, read_format))
                    seqs_by_type[read_pair_type].append(get_read_pair_fasta(read2, read_format))
                    pair_count_by_type[read_pair_type] += 1
                pbar.update()
    write_read_pair_seqs(pair_count_by_type, seqs_by_type, out_fs_by_type)
    # info log
    info_string = []
    info_string.append(('Total pairs', "{:,}".format(int(seen_reads / 2)), '{0:.1%}'.format(1.00)))
    for read_pair_type, count in pair_count_by_type.items():
        info_string.append((read_pair_type + ' pairs', "{:,}".format(count), '{0:.1%}'.format(count / int(seen_reads / 2))))
    info_out_f = getOutFile(outfile, None, "info.txt")
    with open(info_out_f, 'w') as info_fh:
        print(BtLog.status_d['24'] % info_out_f)
        info_fh.write(get_table(info_string))
    return 1
pbar.update() 381 | write_read_pair_seqs(pair_count_by_type, seqs_by_type, out_fs_by_type) 382 | # info log 383 | info_string = [] 384 | info_string.append(('Total pairs', "{:,}".format(int(seen_reads / 2)), '{0:.1%}'.format(1.00))) 385 | for read_pair_type, count in pair_count_by_type.items(): 386 | info_string.append((read_pair_type + ' pairs', "{:,}".format(count), '{0:.1%}'.format(count / int(seen_reads / 2)))) 387 | info_out_f = getOutFile(outfile, None, "info.txt") 388 | with open(info_out_f, 'w') as info_fh: 389 | print(BtLog.status_d['24'] % info_out_f) 390 | info_fh.write(get_table(info_string)) 391 | return 1 392 | 393 | def get_table(table): 394 | col_width = [max(len(x) for x in col) for col in zip(*table)] 395 | table_string = [] 396 | for line in table: 397 | table_string.append('| %s | %s | %s |' % (line[0].rjust(col_width[0]), line[1].rjust(col_width[1]), line[2].rjust(col_width[2]))) 398 | return "\n".join(table_string) + "\n" 399 | 400 | 401 | def parseCovFromHeader(fasta_type, header): 402 | ''' 403 | Returns the coverage from the header of a FASTA 404 | sequence depending on the assembly type 405 | ''' 406 | ASSEMBLY_TYPES = [None, 'spades', 'velvet', 'platanus'] 407 | if not fasta_type in ASSEMBLY_TYPES: 408 | BtLog.error('2', ",".join(ASSEMBLY_TYPES[1:])) 409 | if fasta_type == 'spades': 410 | spades_match_re = re.compile(r"_cov_(\d+\.*\d*)") 411 | #cov = re.findall(r"_cov_(\d+\.*\d*)", header) 412 | return float(spades_match_re.findall(header)[0]) 413 | elif fasta_type == 'velvet': 414 | return float(header.split("_")[-1]) 415 | #elif fasta_type == 'abyss' or fasta_type == 'soap': 416 | # temp = header.split(" ") 417 | # return float(temp[2]/(temp[1]+1-75)) 418 | elif fasta_type == 'platanus': 419 | temp = header.rstrip("\n").split("_") 420 | if len(temp) >= 3: 421 | return float(temp[2].replace("cov", "")) # scaffold/scaffoldBubble/contig 422 | else: 423 | return float(temp[1].replace("cov", "")) # gapClosed 424 | else: 425 | pass 426 | 427 
def parseCov(infile, set_of_blobs):
    """Parse a coverage ('cov') file.

    Comment lines beginning '## Total/Mapped/Unmapped Reads = N' carry the
    global read counts; data lines are 'name<TAB>read_cov<TAB>base_cov'.
    Names not present in *set_of_blobs* are warned about and skipped.
    Returns (base_cov_dict, reads_total, reads_mapped, reads_unmapped,
    read_cov_dict).
    """
    if not isfile(infile):
        BtLog.error('0', infile)
    base_cov_dict = {}

    cov_line_re = re.compile(r"^(\S+)\t(\d+\.*\d*)\t(\d+\.*\d*)")
    reads_total = 0
    reads_mapped = 0
    reads_unmapped = 0
    read_cov_dict = {}

    with tqdm(total=len(set_of_blobs), desc="[%] ", ncols=200, unit_scale=True) as pbar:
        with open(infile) as fh:
            for line in fh:
                if line.startswith('#'):
                    # header comments carry the global read counts
                    if line.startswith("## Total Reads"):
                        reads_total = int(line.split(" = ")[1])
                    elif line.startswith("## Mapped Reads"):
                        reads_mapped = int(line.split(" = ")[1])
                    elif line.startswith("## Unmapped Reads"):
                        reads_unmapped = int(line.split(" = ")[1])
                    else:
                        pass
                else:
                    match = cov_line_re.search(line)
                    if match:
                        name, read_cov, base_cov = match.group(1), int(match.group(2)), float(match.group(3))
                        if name not in set_of_blobs:
                            # unknown sequence: warn but keep parsing
                            print(BtLog.warn_d['2'] % (name))
                        else:
                            read_cov_dict[name] = read_cov
                            base_cov_dict[name] = base_cov
                        pbar.update()
    #BtLog.progress(len(set_of_blobs), progress_unit, len(set_of_blobs))
    return base_cov_dict, reads_total, reads_mapped, reads_unmapped, read_cov_dict

def checkCas(infile):
    """Validate a CLC .cas mapping file via 'clc_mapping_info -s'.

    Requires the clc_mapping_info executable on PATH (errors out otherwise).
    Returns (seqs_total, reads_total, reads_mapped) scraped from its output.
    """
    print(BtLog.status_d['12'])
    if not isfile(infile):
        BtLog.error('0', infile)
    if not (which('clc_mapping_info')):
        BtLog.error('20')
    seqs_total_re = re.compile(r"\s+Contigs\s+(\d+)")
    reads_total_re = re.compile(r"\s+Reads\s+(\d+)")
    reads_mapping_re = re.compile(r"\s+Mapped reads\s+(\d+)\s+(\d+.\d+)\s+\%")
    seqs_total, reads_total, reads_mapped = 0, 0, 0
    output = ''
    command = "clc_mapping_info -s " + infile
    for line in runCmd(command=command):
        output += line
    seqs_total = int(seqs_total_re.search(output).group(1))
    reads_mapped = int(reads_mapping_re.search(output).group(1))
    reads_total = int(reads_total_re.search(output).group(1))
    print(BtLog.status_d['11'] % ('{:,}'.format(reads_mapped), '{:,}'.format(reads_total), '{0:.1%}'.format(reads_mapped/reads_total)))
    return seqs_total, reads_total, reads_mapped
def parseCas(infile, order_of_blobs):
    """Parse per-contig coverage from a CLC .cas file via 'clc_mapping_info -n'.

    Contigs are reported by 1-based index, so *order_of_blobs* maps indices
    back to names. Returns (cov_dict, reads_total, reads_mapped,
    read_cov_dict).
    """
    if not isfile(infile):
        BtLog.error('0', infile)
    seqs_total, reads_total, reads_mapped = checkCas(infile)
    progress_unit = int(len(order_of_blobs)/100)
    cas_line_re = re.compile(r"\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+.\d{2})\s+(\d+)\s+(\d+.\d{2})")
    command = "clc_mapping_info -n " + infile
    cov_dict = {}
    read_cov_dict = {}
    seqs_parsed = 0
    if (runCmd(command=command)):
        for line in runCmd(command=command):
            cas_line_match = cas_line_re.search(line)
            if cas_line_match:
                idx = int(cas_line_match.group(1)) - 1 # -1 because index of contig list starts with zero
                try:
                    name = order_of_blobs[idx]
                    reads = int(cas_line_match.group(3))
                    cov = float(cas_line_match.group(6))
                    cov_dict[name] = cov
                    read_cov_dict[name] = reads
                    seqs_parsed += 1
                except:
                    pass
                BtLog.progress(seqs_parsed, progress_unit, seqs_total)
    return cov_dict, reads_total, reads_mapped, read_cov_dict

def readTax(infile, set_of_blobs):
    '''
    Generator over taxonomy hit files: yields one hitDict per line with
    keys 'name', 'taxId', 'score'. Lines are whitespace-delimited:
    <sequence name> <taxId> <score>.
    If more fields need to be parsed:
    - add as key-value pairs to hitDict
    '''
    if not isfile(infile):
        BtLog.error('0', infile)
    with open(infile) as fh:
        for line in fh:
            col = line.split()
            try:
                hitDict = {
                    'name' : col[0],
                    'taxId' : col[1], # string because if int, conversion is a nightmare ...
                    'score' : float(col[2])
                }
            except ValueError:
                # non-numeric score column
                BtLog.error('46', infile, col[2])
            if hitDict['name'] not in set_of_blobs:
                # hit for a sequence that is not in the assembly
                BtLog.error('19', hitDict['name'], infile)
            yield hitDict
529 | 'score' : float(col[2]) 530 | } 531 | except ValueError: 532 | BtLog.error('46', infile, col[2]) 533 | if hitDict['name'] not in set_of_blobs: 534 | #print(BtLog.warn_d['13'] % (hitDict['name'], infile)) 535 | BtLog.error('19', hitDict['name'], infile) 536 | yield hitDict 537 | #hitDict = { 538 | # 'name' : match.group(1), 539 | # 'taxId' : match.group(2), # string because if int, conversion is a nightmare ... 540 | # 'score' : float(match.group(3)) 541 | # } 542 | #if hitDict['name'] not in set_of_blobs: 543 | # print(BtLog.warn_d['13'] % (hitDict['name'], infile)) 544 | # #BtLog.error('19', hitDict['name'], infile) 545 | #if hitDict['taxId'] == 'N/A': 546 | # BtLog.error('22', infile) 547 | #yield hitDict 548 | 549 | def getOutFile(base_file, prefix, suffix): 550 | EXTENSIONS = ['.fasta', '.fa', '.fna', '.txt', '.cov', '.out', '.json'] 551 | out_f, extension = splitext(basename(base_file)) 552 | if not extension in EXTENSIONS: 553 | out_f = '%s%s' % (out_f, extension) 554 | if (prefix): 555 | if prefix.endswith("/"): 556 | out_f = "%s" % (join(prefix, out_f)) 557 | else: 558 | out_f = "%s.%s" % (prefix, out_f) 559 | if (suffix): 560 | out_f = "%s.%s" % (out_f, suffix) 561 | return out_f 562 | 563 | def parseNodesDB(**kwargs): 564 | ''' 565 | Parsing names.dmp and nodes.dmp into the 'nodes_db' dict of dicts that 566 | gets JSON'ed into blobtools/data/nodes_db.json if this file 567 | does not exist. Nodes_db.json is used if neither "--names" and "--nodes" 568 | nor "--db" is specified. If all three are specified and "--db" does not 569 | exist, then write 'nodes_db' to file specified by "--db". If all three 570 | are specified and "--db" exists, error out. 
571 | ''' 572 | nodesDB = {} 573 | names_f = kwargs['names'] 574 | nodes_f = kwargs['nodes'] 575 | nodesDB_f = kwargs['nodesDB'] 576 | nodesDB_default = kwargs['nodesDBdefault'] 577 | 578 | if (nodes_f and names_f): 579 | if not isfile(names_f): 580 | BtLog.error('0', names_f) 581 | if not isfile(nodes_f): 582 | BtLog.error('0', nodes_f) 583 | if (nodesDB_f): 584 | if isfile(nodesDB_f): 585 | BtLog.error('47', nodesDB_f) 586 | BtLog.status_d['27'] % (nodesDB_f, nodes_f, names_f) 587 | else: 588 | print(BtLog.status_d['3'] % (nodes_f, names_f)) 589 | try: 590 | nodesDB = readNamesNodes(names_f, nodes_f) 591 | except: 592 | BtLog.error('3', nodes_f, names_f) 593 | elif (nodesDB_f): 594 | if not isfile(nodesDB_f): 595 | BtLog.error('0', nodesDB_f) 596 | print(BtLog.status_d['4'] % (nodesDB_f)) 597 | try: 598 | nodesDB = readNodesDB(nodesDB_f) 599 | except: 600 | BtLog.error('27', nodesDB_f) 601 | elif (nodesDB_default): 602 | if not isfile(nodesDB_default): 603 | BtLog.error('28') 604 | print(BtLog.status_d['4'] % (nodesDB_default)) 605 | try: 606 | nodesDB = readNodesDB(nodesDB_default) 607 | except: 608 | BtLog.error('27', nodesDB_default) 609 | 610 | # Write nodesDB if names, nodes, nodesDB all given and nodesDB does not 611 | # exist. Otherwise, write to nodesDB_default if it does not exist, unless 612 | # nodesDB given, then do nothing with nodesDB_default. 
613 | if (nodes_f and names_f and nodesDB_f): 614 | print(BtLog.status_d['28'] % nodesDB_f) 615 | writeNodesDB(nodesDB, nodesDB_f) 616 | elif (not nodesDB_f and not isfile(nodesDB_default)): 617 | nodesDB_f = nodesDB_default 618 | print(BtLog.status_d['5'] % nodesDB_f) 619 | writeNodesDB(nodesDB, nodesDB_f) 620 | 621 | return nodesDB, nodesDB_f 622 | 623 | def readNamesNodes(names_f, nodes_f): 624 | nodesDB = {} 625 | nodes_count = 0 626 | with open(nodes_f) as fh: 627 | for line in fh: 628 | nodes_col = line.split("\t") 629 | node = {} 630 | node_id = nodes_col[0] 631 | node['parent'] = nodes_col[2] 632 | node['rank'] = nodes_col[4] 633 | nodesDB[node_id] = node 634 | nodes_count += 1 635 | with open(names_f) as fh: 636 | for line in fh: 637 | names_col = line.split("\t") 638 | if names_col[6] == "scientific name": 639 | nodesDB[names_col[0]]['name'] = names_col[2] 640 | nodesDB['nodes_count'] = nodes_count 641 | return nodesDB 642 | 643 | def readNodesDB(nodesDB_f): 644 | nodesDB = {} 645 | with open(nodesDB_f) as fh: 646 | nodes_count = int(fh.readline().lstrip("# nodes_count = ").rstrip("\n")) 647 | with tqdm(total=nodes_count, desc="[%] ", ncols=200, unit_scale=True) as pbar: 648 | for line in fh: 649 | if line.startswith("#"): 650 | pass 651 | else: 652 | node, rank, name, parent = line.rstrip("\n").split("\t") 653 | nodesDB[node] = {'rank' : rank, 'name' : name, 'parent' : parent} 654 | pbar.update() 655 | nodesDB['nodes_count'] = nodes_count 656 | return nodesDB 657 | 658 | def writeNodesDB(nodesDB, nodesDB_f): 659 | nodes_count = nodesDB['nodes_count'] 660 | with open(nodesDB_f, 'w') as fh: 661 | fh.write("# nodes_count = %s\n" % nodes_count) 662 | with tqdm(total=nodes_count, desc="[%] ", ncols=200, unit_scale=True) as pbar: 663 | for node in nodesDB: 664 | if not node == "nodes_count": 665 | fh.write("%s\t%s\t%s\t%s\n" % (node, nodesDB[node]['rank'], nodesDB[node]['name'], nodesDB[node]['parent'])) 666 | pbar.update() 667 | 668 | def byteify(input): 669 
| ''' 670 | http://stackoverflow.com/a/13105359 671 | ''' 672 | if isinstance(input, dict): 673 | return {byteify(key):byteify(value) for key, value in input.items} 674 | elif isinstance(input, list): 675 | return [byteify(element) for element in input] 676 | #elif isinstance(input, unicode): 677 | # return input.encode('utf-8') 678 | else: 679 | return input 680 | 681 | 682 | 683 | def writeJsonGzip(obj, outfile): 684 | import json 685 | import gzip 686 | with gzip.open(outfile, 'wb') as fh: 687 | json.dump(obj, fh) 688 | 689 | def writeJson(obj, outfile, indent=0, separators=(',', ': ')): 690 | import json 691 | with open(outfile, 'w') as fh: 692 | #if (indent): 693 | # json.dump(obj, fh, indent=indent, separators=separators) 694 | #else: 695 | # json.dump(obj, fh) 696 | json.dump(obj, fh) 697 | #json.dump(obj, fh, indent=4, separators=(',', ': ')) # 698 | 699 | def parseJsonGzip(infile): 700 | import json 701 | import gzip 702 | with gzip.open(infile, 'rb') as fh: 703 | #obj = json.loads(fh.read().decode("ascii")) 704 | obj = json.loads(fh.read()) 705 | #return byteify(obj) 706 | return obj 707 | 708 | def parseJson(infile): 709 | '''http://artem.krylysov.com/blog/2015/09/29/benchmark-python-json-libraries/''' 710 | if not isfile(infile): 711 | BtLog.error('0', infile) 712 | import time 713 | start = time.time() 714 | json_parser = '' 715 | with open(infile, 'r') as fh: 716 | print(BtLog.status_d['15']) 717 | json_string = fh.read() 718 | try: 719 | import ujson as json # fastest 720 | json_parser = 'ujson' 721 | print(BtLog.status_d['16'] % json_parser) 722 | except ImportError: 723 | try: 724 | import simplejson as json # fast 725 | json_parser = 'simplejson' 726 | except ImportError: 727 | import json # default 728 | json_parser = 'json' 729 | print(BtLog.status_d['17'] % json_parser) 730 | try: 731 | #obj = json.loads(json_string.decode("ascii")) 732 | obj = json.loads(json_string) 733 | except ValueError: 734 | BtLog.error('37', infile, "BlobDB") 735 | 
def readYaml(infile):
    """Read a YAML file and return the parsed data structure.

    Exits through BtLog.error() if the file is missing or not valid YAML.
    """
    if not isfile(infile):
        BtLog.error('0', infile)
    data = None
    with open(infile) as fh:
        # renamed local from 'str' (shadowed the builtin)
        content = "".join(fh.readlines())
    try:
        # BUGFIX: PyYAML's API is yaml.safe_load; 'yaml.safeload' does not
        # exist and raised AttributeError on every call
        data = yaml.safe_load(content)
    except yaml.YAMLError:
        BtLog.error('37', infile, "yaml")
    return data

if __name__ == "__main__":
    pass

# ---------------------------------------------------------------------------
# /lib/BtLog.py
# ---------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
File   : BtLog.py
Author : Dominik R. Laetsch, dominik.laetsch at gmail dot com
"""

# from __future__ import division
# NOTE(review): the __future__ import above is a no-op for this module (it
# performs no division) and cannot legally appear mid-file in this
# concatenated view, so it is kept for reference as a comment only.
import sys

def error(message, *argv):
    """Exit the program with the formatted error message for key 'message'.

    Extra positional arguments are %-interpolated into the message template
    from error_d.
    """
    # BUGFIX: *argv is always a tuple (possibly empty), never None, so the
    # original "if argv is None" branch was unreachable.
    if not argv:
        sys.exit(error_d[message])
    else:
        sys.exit(error_d[message] % (argv))

# Error message templates, keyed by error number.
# NOTE: there is no '22' entry in this version.
error_d = {
    '0': '[ERROR:0]\t: File %s does not exist.',
    '1': '[ERROR:1]\t: Please provide coverage information.',
    '2': '[ERROR:2]\t: Assembly type is not valid (%s).',
    '3': '[ERROR:3]\t: names.dmp/nodes.dmp ("--names", "--nodes") could not be read. %s, %s',
    '4': '[ERROR:4]\t: BlobDB.parseFasta() - no sequences found. Check FASTA file.',
    '5': '[ERROR:5]\t: Sequence header %s is not unique.',
    '6': '[ERROR:6]\t: BlobDB.readBam() - sequence header %s in %s was not in FASTA.',
    '7': '[ERROR:7]\t: Please add "samtools" to you PATH variable.',
    '8': '[ERROR:8]\t: Unsupported taxrule "%s".',
    '9': '[ERROR:9]\t: Unsupported taxonomic rank "%s".',
    '10': '[ERROR:10]\t: Unsupported output format "%s".',
    '11': '[ERROR:11]\t: Taxrule "%s" was not computed for this BlobDb. Available taxrule(s) : %s.',
    '12': '[ERROR:12]\t: Please provide an output file.',
    '13': '[ERROR:13]\t: %s does not appear to be a comma-separated list or a file.',
    '14': '[ERROR:14]\t: Unsupported sort order for plotting : %s. Must be either "span" or "count".',
    '15': '[ERROR:15]\t: Unsupported histogram type for plotting : %s. Must be either "span" or "count".',
    '16': '[ERROR:16]\t: Group "%s" was specified in multiple clusters.',
    '17': '[ERROR:17]\t: Label could not be parsed from "%s".',
    '18': '[ERROR:18]\t: Please provide a tax file in BLAST format.',
    '19': '[ERROR:19]\t: Sequence %s in file %s is not part of the assembly.',
    '20': '[ERROR:20]\t: Please add "clc_mapping_info" to your PATH variable.',
    '21': '[ERROR:21]\t: Refcov file %s does not seem to have the right format.',
    '23': '[ERROR:23]\t: Catcolour file %s does not seem to have the right format.',
    '24': '[ERROR:24]\t: Catcolour file incompatible with c-index colouring.',
    '25': '[ERROR:25]\t: COV file %s does not seem to have the right format.',
    '26': '[ERROR:26]\t: TaxID must be integer.',
    '27': '[ERROR:27]\t: nodesDB ("--db") %s could not be read.',
    '28': '[ERROR:28]\t: Please specify "--names" and "--nodes", or "--db"',
    '29': '[ERROR:29]\t: No mapping reads found in %s',
    '30': '[ERROR:30]\t: The module docopt is not installed. Please install it to run blobtools\n\tpip install docopt',
    '31': '[ERROR:31]\t: Please specify a read mapping file (BAM/SAM/CAS)',
    '32': '[ERROR:32]\t: Choose either --cumulative or --multiplot',
    '33': '[ERROR:33] : CovLib(s) not found. The available covlibs are: \n%s',
    '34': '[ERROR:34] : Invalid plot type : %s',
    '35': '[ERROR:35] : Directory %s could not be created',
    '36': '[ERROR:36] : View %s could not be created',
    '37': '[ERROR:37] : %s does not seem to be a valid %s file',
    '38': '[ERROR:38] : %s is not an integer',
    '39': '[ERROR:39] : Please specify a taxid file (mapping subjects to taxids)',
    '40': '[ERROR:40] : CovLib \'%s\' not specified in refcov file',
    '41': '[ERROR:41] : Please specify either a mapping file or a taxID.',
    '42': '[ERROR:42] : SubjectID %s not found in mapping file %s.',
    '43': '[ERROR:43] : %s could not be found.',
    '44': '[ERROR:44] : Please specify integers for --map_col_sseqid and --map_col_taxid.',
    '45': '[ERROR:45] : Both --min_score and --min_diff must be numbers.',
    '46': '[ERROR:46] : Score in %s must be a float, not \'%s\'.',
    '47': '[ERROR:47] : Cannot create new "--db" file from "--names", "--nodes", "--db" file exists. %s'
}

# Non-fatal warning templates (printed, execution continues).
warn_d = {
    '0': '[-] No tax files specified.',
    '1': '[-] %s not in colour file %s ...',
    '2': '[-] %s is not part of the assembly',
    '3': '\n[-] Based on samtools flagstat: expected %s reads, %s reads were parsed',
    '4': '[-] No coverage data found in %s',
    '5': '[-] Hit for sequence %s in tax file %s has multiple taxIds, only first one is used.',
    '6': '[-] Sum of coverage in cov lib %s is 0.0. Please ignore this warning if "--no_base_cov" was specified.',
    '7': '[-] No taxonomy information found.',
    '8': '[-] Duplicated sequences found :\n\t\t\t%s',
    '9': '[-] Taxrule "%s" was not computed for this BlobDb. Available taxrule(s) : %s. Will proceed without taxonomic annotation ...',
    '10': '[-] Line %s: sequence "%s" already has TaxID "%s". Skipped. (use --force to overwrite)',
    '11': '\n[-] The BAM file appears to be truncated.',
    '12': '[-] sseqid %s not found in ID-to-taxID mapping file %s.',
    '13': '[-] Sequence %s in file %s is not part of the assembly.'
}

# Progress/status templates. NOTE: there is no '21' entry in this version.
status_d = {
    '0': '[+] Nothing to be done. %s',
    '1': '[+] Parsing %s - %s',
    '2': '[+] Done',
    '3': '[+] Creating nodesDB from %s and %s',
    '4': '[+] names.dmp/nodes.dmp not specified. Retrieving nodesDB from %s',
    '5': '[+] Store nodesDB in default location %s',
    '6': '[+] Computing taxonomy using taxrule(s) %s',
    '7': '[+] Generating BlobDB and writing to file %s',
    '8': '[+] Plotting %s',
    '9': '[+] Reading BlobDB %s',
    '10': '[+] \tChecking with \'samtools flagstat\'',
    '11': '[+] \tMapping reads = %s, total reads = %s (mapping rate = %s)',
    '12': '[+] \tChecking with \'clc_mapping_info\'',
    '13': '[+] \tWriting %s',
    '14': '[+] Preparing view(s) ...',
    '15': '[+] \tLoading BlobDB into memory ...',
    '16': '[+] \tDeserialising BlobDB (using \'%s\' module) (this may take a while) ...',
    '17': '[+] \tDeserialising BlobDB (using \'%s\' module) (this may take a while, consider installing the \'ujson\' module) ...',
    '18': '[+] Extracting data for plots ...',
    '19': '[+] Writing output ...',
    '20': '[+] \tFinished in %ss',
    '22': '[+] Filtering %s ...',
    '23': '[+] Filtered %s (pairs=%s) ...',
    '24': '[+] Writing %s',
    '25': '[+] Gzip\'ing %s',
    '26': '[+] Reading %s',
    '27': '[+] Creating nodesDB %s from %s and %s',
    '28': '[+] Store nodesDB in %s',
}

# Informational summary templates.
info_d = {
    '0': '[I]\t%s : sequences = %s, span = %s MB, N50 = %s nt'
}

if __name__ == "__main__":
    pass

# ---------------------------------------------------------------------------
# /lib/BtPlot.py
# ---------------------------------------------------------------------------
#!/usr/bin/env python3
2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | File : BtPlot.py 6 | Author : Dominik R. Laetsch, dominik.laetsch at gmail dot com 7 | """ 8 | 9 | from numpy import array, arange, logspace, mean, std 10 | import math 11 | import lib.BtLog as BtLog 12 | import lib.BtTax as BtTax 13 | import matplotlib as mat 14 | from matplotlib import cm 15 | from matplotlib.ticker import NullFormatter, MultipleLocator, AutoMinorLocator 16 | from matplotlib.lines import Line2D 17 | from matplotlib.colors import rgb2hex 18 | mat.use('agg') 19 | import matplotlib.pyplot as plt 20 | from operator import itemgetter 21 | #from itertools import izip 22 | 23 | mat.rcParams.update({'font.size': 36}) 24 | mat.rcParams['xtick.major.pad'] = '8' 25 | mat.rcParams['ytick.major.pad'] = '8' 26 | mat.rcParams['lines.antialiased'] = True 27 | 28 | LEGEND_FONTSIZE = 20 29 | COLOURMAP = "tab10" # "Set1", "Paired", "Set2", "Spectral" 30 | #GREY, BGGREY, WHITE, DGREY = '#d3d3d3', '#F0F0F5', '#ffffff', '#4d4d4d' 31 | GREY, BGGREY, BGCOLOUR, WHITE, DGREY = '#d3d3d3', '#F0F0F5', '#F8F8F8', '#ffffff', '#4d4d4d' 32 | nullfmt = NullFormatter() 33 | 34 | def n50(list_of_lengths): 35 | total_span = 0 36 | sorted_list_of_lengths=sorted(list_of_lengths, reverse=True) 37 | for contig_length in sorted_list_of_lengths: 38 | total_span += contig_length 39 | teoN50 = total_span/2.0 40 | running_sum = 0 41 | N50 = 0 42 | for contig_length in sorted_list_of_lengths: 43 | running_sum += contig_length 44 | if teoN50 <= running_sum: 45 | N50 = contig_length 46 | break 47 | return N50 48 | 49 | def getSortedGroups(data_dict, sort_order, sort_first=()): 50 | """ Returns list of sorted groups based on span or count. 
""" 51 | sorted_groups = [] 52 | visible_by_group = {} 53 | if sort_order == 'span': 54 | visible_by_group = {group: _dict['span_visible'] for group, _dict in data_dict.items()} 55 | #sorted_groups = sorted(data_dict, key = lambda x : data_dict[x]['span_visible'] if data_dict[x]['span_visible'] > 0 else 0, reverse=True) 56 | #sorted_groups = sorted(data_dict, key = lambda x : max(data_dict[x]['span_visible'], 0), reverse=True) 57 | elif sort_order == 'count': 58 | visible_by_group = {group: _dict['count_visible'] for group, _dict in data_dict.items()} 59 | #sorted_groups = sorted(data_dict, key = lambda x : data_dict[x]['count_visible'] if data_dict[x]['count_visible'] > 0 else 0, reverse=True) 60 | #sorted_groups = sorted(data_dict, key = lambda x : max(data_dict[x]['count_visible'], 0), reverse=True) 61 | else: 62 | pass 63 | for group, visible in sorted(visible_by_group.items(), key=itemgetter(1), reverse=True): 64 | if visible > 0: 65 | sorted_groups.append(group) 66 | 67 | # Now shuffle the stuff in sort_first to the front 68 | for sf in reversed(sort_first): 69 | try: 70 | sorted_groups.remove(sf) 71 | sorted_groups.insert(0, sf) 72 | except ValueError: 73 | #It wasn't in the list then. No probs. 
74 | pass 75 | return sorted_groups 76 | 77 | def generateColourDict(colour_groups, groups): 78 | cmap = [rgb2hex(rgb) for rgb in cm.get_cmap(name=COLOURMAP).colors] 79 | # remove green 80 | del cmap[2] 81 | # remove brown 82 | del cmap[4] 83 | colour_d = {} 84 | idx_delay = 0 85 | for idx, group in enumerate(groups): 86 | if group in colour_groups: 87 | #print(group,) 88 | if group == 'no-hit' or group == 'None': 89 | colour_d[group] = GREY 90 | #print("GREY") 91 | idx_delay -= 1 92 | else: 93 | colour_d[group] = cmap[idx+idx_delay] 94 | #print(colour_d[group], idx+idx_delay) 95 | return colour_d 96 | 97 | def set_canvas(): 98 | left, width = 0.1, 0.60 99 | bottom, height = 0.1, 0.60 100 | bottom_h = left_h = left+width+0.02 101 | rect_scatter = [left, bottom, width, height] 102 | rect_histx = [left, bottom_h, width, 0.2] 103 | rect_histy = [left_h, bottom, 0.2, height] 104 | rect_legend = [left_h, bottom_h, 0.2, 0.2] 105 | return rect_scatter, rect_histx, rect_histy, rect_legend 106 | 107 | def set_format_scatterplot(axScatter, **kwargs): 108 | min_x, max_x = None, None 109 | min_y, max_y = None, None 110 | if kwargs['plot'] == 'blobplot': 111 | min_x, max_x = 0, 1 112 | major_xticks = MultipleLocator(0.2) 113 | minor_xticks = AutoMinorLocator(20) 114 | min_y, max_y = kwargs['min_cov']*0.1, kwargs['max_cov']+100 115 | axScatter.set_yscale('log') 116 | axScatter.set_xscale('linear') 117 | axScatter.xaxis.set_major_locator(major_xticks) 118 | axScatter.xaxis.set_minor_locator(minor_xticks) 119 | elif kwargs['plot'] == 'covplot': 120 | min_x, max_x = kwargs['min_cov']*0.1, kwargs['max_cov']+100 121 | min_y, max_y = kwargs['min_cov']*0.1, kwargs['max_cov']+100 122 | axScatter.set_yscale('log') 123 | axScatter.set_xscale('log') 124 | else: 125 | BtLog.error('34' % kwargs['plot']) 126 | axScatter.set_xlim( (min_x, max_x) ) 127 | axScatter.set_ylim( (min_y, max_y) ) # This sets the max-Coverage so that all libraries + sum are at the same scale 128 | axScatter.grid(True, 
which="major", lw=2., color=BGGREY, linestyle='-') 129 | axScatter.set_axisbelow(True) 130 | axScatter.xaxis.labelpad = 20 131 | axScatter.yaxis.labelpad = 20 132 | axScatter.yaxis.get_major_ticks()[0].label1.set_visible(False) 133 | axScatter.tick_params(axis='both', which='both', direction='out') 134 | return axScatter 135 | 136 | def set_format_hist_x(axHistx, axScatter): 137 | axHistx.set_xlim(axScatter.get_xlim()) 138 | axHistx.set_xscale(axScatter.get_xscale()) 139 | axHistx.grid(True, which="major", lw=2., color=BGGREY, linestyle='-') 140 | axHistx.xaxis.set_major_locator(axScatter.xaxis.get_major_locator()) # no labels since redundant 141 | axHistx.xaxis.set_minor_locator(axScatter.xaxis.get_minor_locator()) 142 | axHistx.xaxis.set_major_formatter(nullfmt) # no labels since redundant 143 | axHistx.set_axisbelow(True) 144 | axHistx.yaxis.labelpad = 20 145 | axHistx.tick_params(axis='both', which='both', direction='out') 146 | return axHistx 147 | 148 | def set_format_hist_y(axHisty, axScatter): 149 | axHisty.set_ylim(axScatter.get_ylim()) 150 | axHisty.set_yscale(axScatter.get_yscale()) 151 | axHisty.grid(True, which="major", lw=2., color=BGGREY, linestyle='-') 152 | axHisty.yaxis.set_major_formatter(nullfmt) # no labels since redundant 153 | axHisty.set_axisbelow(True) 154 | axHisty.xaxis.labelpad = 20 155 | axHisty.tick_params(axis='both', which='both', direction='out') 156 | return axHisty 157 | 158 | def get_ref_label(max_length, max_marker_size, fraction): 159 | length = int(math.ceil(fraction * max_length / 100.0)) * 100 160 | string = "%snt" % "{:,}".format(length) 161 | markersize = length/max_length * max_marker_size 162 | return length, string, markersize 163 | 164 | def plot_ref_legend(axScatter, max_length, max_marker_size, ignore_contig_length): 165 | if not (ignore_contig_length): 166 | ref1_length, ref1_string, ref1_markersize = get_ref_label(max_length, max_marker_size, 0.05) 167 | ref2_length, ref2_string, ref2_markersize = 
get_ref_label(max_length, max_marker_size, 0.1) 168 | ref3_length, ref3_string, ref3_markersize = get_ref_label(max_length, max_marker_size, 0.25) 169 | # markersize in scatter is in "points^2", markersize in Line2D is in "points" ... that's why we need math.sqrt() 170 | ref_1 = (Line2D([0], [0], linewidth = 0.5, linestyle="none", marker="o", alpha=1, markersize=math.sqrt(ref1_markersize), markeredgecolor=WHITE, markerfacecolor=GREY)) 171 | ref_2 = (Line2D([0], [0], linewidth = 0.5, linestyle="none", marker="o", alpha=1, markersize=math.sqrt(ref2_markersize), markeredgecolor=WHITE, markerfacecolor=GREY)) 172 | ref_3 = (Line2D([0], [0], linewidth = 0.5, linestyle="none", marker="o", alpha=1, markersize=math.sqrt(ref3_markersize), markeredgecolor=WHITE, markerfacecolor=GREY)) 173 | axScatter.legend([ref_1,ref_2,ref_3], [ref1_string, ref2_string, ref3_string], numpoints=1, ncol = 3, loc = 8, fontsize=LEGEND_FONTSIZE, borderpad=1.2, labelspacing=1.8, handlelength=1, handletextpad=1) 174 | 175 | def plot_legend(fig, axLegend, out_f, legend_flag, format, cumulative_flag): 176 | if (legend_flag): 177 | extent = axLegend.get_window_extent().transformed(fig.dpi_scale_trans.inverted()) 178 | legend_out_f = '%s.%s.%s' % (out_f, "legend", format) 179 | print(BtLog.status_d['8'] % legend_out_f) 180 | fig.savefig('%s' % legend_out_f, bbox_inches=extent, format=format) 181 | fig.delaxes(axLegend) 182 | return fig 183 | 184 | def check_input(args): 185 | rank = args['--rank'] 186 | c_index = args['--cindex'] 187 | multiplot = args['--multiplot'] 188 | sort_order = args['--sort'] 189 | sort_first = args['--sort_first'] 190 | taxrule = args['--taxrule'] 191 | hist_type = args['--hist'] 192 | catcolour_f = args['--catcolour'] 193 | cumulative_flag = args['--cumulative'] 194 | 195 | #Convert sort_first to a list 196 | if sort_first: 197 | args['--sort_first'] = sort_first.split(',') 198 | else: 199 | args['--sort_first'] = () 200 | 201 | if 'blobplot' in args or 'covplot' in args: 202 
| # Are ranks sane ? 203 | if rank not in BtTax.RANKS: 204 | BtLog.error('9', rank) 205 | # is taxrule provided? 206 | if taxrule not in BtTax.TAXRULES: 207 | BtLog.error('8', taxrule) 208 | # Are sort_order and hist_type sane? 209 | if not sort_order in ['span', 'count']: 210 | BtLog.error('14', sort_order) 211 | if not hist_type in ['span', 'count']: 212 | BtLog.error('15', hist_type) 213 | if (catcolour_f) and (c_index): 214 | BtLog.error('24') 215 | if (cumulative_flag) and (multiplot): 216 | BtLog.error('32') 217 | return args 218 | 219 | class PlotObj(): 220 | def __init__(self, data_dict, cov_lib_dict, cov_lib_selection, plot_type, sort_first=()): 221 | self.labels = {'all'} 222 | self.plot = plot_type # type of plot 223 | self.group_labels = {} 224 | self.cov_lib_dict = cov_lib_dict 225 | self.cov_libs = self.subselect_cov_libs(cov_lib_dict, cov_lib_selection) 226 | self.cov_libs_total_reads_dict = self.get_cov_libs_total_reads_dict(cov_lib_dict) 227 | self.cov_libs_mapped_reads_dict = self.get_cov_libs_mapped_reads_dict(cov_lib_dict) 228 | self.data_dict = data_dict 229 | self.stats = {} 230 | self.exclude_groups = [] 231 | self.version = None 232 | self.colours = {} 233 | self.group_order = [] 234 | self.plot_order = [] 235 | self.sort_first = sort_first 236 | self.min_cov = 0.1 237 | self.max_cov = 0.0 238 | self.out_f = '' 239 | self.no_title = '' 240 | self.max_group_plot = 0 241 | self.format = '' 242 | self.legend_flag = '' 243 | self.cumulative_flag = '' 244 | self.dpi = 200 245 | self.scatter_size = (35, 35) 246 | self.readcov_size = (30, 10) 247 | self.cov_y_dict = {} 248 | self.xlabel = None 249 | self.ylabel = None 250 | 251 | self.refcov_dict = {} 252 | 253 | def subselect_cov_libs(self, cov_lib_dict, cov_lib_selection): 254 | selected_cov_libs = [] 255 | cov_lib_selection_error = 0 256 | if (cov_lib_selection): 257 | if cov_lib_selection == 'covsum': 258 | selected_cov_libs.append('covsum') 259 | elif "," in cov_lib_selection: 260 | 
selected_cov_libs = cov_lib_selection.split(",") 261 | if not set(selected_cov_libs).issubset(set(cov_lib_dict.keys())): 262 | cov_lib_selection_error = 1 263 | else: 264 | selected_cov_libs.append(cov_lib_selection) 265 | if not cov_lib_selection in cov_lib_dict: 266 | cov_lib_selection_error = 1 267 | else: 268 | selected_cov_libs = cov_lib_dict.keys() 269 | if cov_lib_selection_error: 270 | covlib_string = [] 271 | for covlib in cov_lib_dict: 272 | cov_lib_f = cov_lib_dict[covlib]['f'] 273 | if not cov_lib_f: 274 | cov_lib_f = "sum of coverages from all covlibs" 275 | covlib_string.append("\t\t%s : %s" % (covlib, cov_lib_f)) 276 | BtLog.error('33', "\n".join(covlib_string)) 277 | return selected_cov_libs 278 | 279 | def get_cov_libs_total_reads_dict(self, cov_lib_dict): 280 | return { x : cov_lib_dict[x]['reads_total'] for x in self.cov_libs} 281 | 282 | def get_cov_libs_mapped_reads_dict(self, cov_lib_dict): 283 | return { x : cov_lib_dict[x]['reads_mapped'] for x in self.cov_libs} 284 | 285 | def get_stats_for_group(self, group): 286 | stats = { 'name' : group, 287 | 'count_total' : "{:,}".format(self.stats[group]['count']), 288 | 'count_visible' : "{:,}".format(self.stats[group]['count_visible']), 289 | 'count_visible_perc' : '{0:.1%}'.format(self.stats[group]['count_visible']/self.stats[group]['count']) if self.stats[group]['count'] > 0 else '{0:.1%}'.format(0.0), 290 | 'span_visible' : "{:,}".format(self.stats[group]['span_visible']), 291 | 'span_total' : "{:,}".format(self.stats[group]['span']), 292 | 'span_visible_perc' : '{0:.1%}'.format(self.stats[group]['span_visible']/self.stats[group]['span']) if self.stats[group]['span'] > 0 else '{0:.1%}'.format(0.0), 293 | 'colour' : str(self.colours[group] if group in self.colours else None), 294 | 'n50' : "{:,}".format(self.stats[group]['n50']), 295 | 'gc_mean' : "{0:.2}".format(self.stats[group]['gc_mean']), 296 | 'gc_std' : "{0:.2}".format(self.stats[group]['gc_std']), 297 | 'cov_mean' : {cov_lib : 
"{0:0.1f}".format(cov_mean) for cov_lib, cov_mean in self.stats[group]['cov_mean'].items()}, 298 | 'cov_std' : {cov_lib : "{0:0.1f}".format(cov_std) for cov_lib, cov_std in self.stats[group]['cov_std'].items()}, 299 | 'reads_mapped' : {cov_lib : "{:,}".format(reads_mapped) for cov_lib, reads_mapped in self.stats[group]['reads_mapped'].items()}, 300 | 'reads_mapped_perc' : {cov_lib : '{0:.1%}'.format(reads_mapped_perc) for cov_lib, reads_mapped_perc in self.stats[group]['reads_mapped_perc'].items()} 301 | } 302 | return stats 303 | 304 | def write_stats(self, out_f): 305 | stats = [] 306 | stats.append(self.get_stats_for_group('all')) 307 | for group in self.plot_order: # group/label/other that has been plotted 308 | stats.append(self.get_stats_for_group(group)) 309 | if not group in self.group_labels: # it is either a label or "other" 310 | label = group 311 | for g, labels in self.group_labels.items(): 312 | if label in labels: 313 | stats.append(self.get_stats_for_group(g)) 314 | output = [] 315 | output.append('## %s' % self.version) 316 | for cov_lib, cov_lib_dict in self.cov_lib_dict.items(): 317 | if cov_lib in self.cov_libs: 318 | output.append("## %s=%s" % (cov_lib, cov_lib_dict['f'])) 319 | fields = ['name', 'colour', 'count_visible', 'count_visible_perc', 'span_visible','span_visible_perc', 'n50', 'gc_mean', 'gc_std'] 320 | header = [field for field in fields] 321 | for cov_lib in sorted(self.cov_libs): 322 | header.append('%s_mean' % cov_lib) 323 | header.append('%s_std' % cov_lib) 324 | header.append('%s_read_map' % cov_lib) 325 | header.append('%s_read_map_p' % cov_lib) 326 | output.append('# %s' % "\t".join(header)) 327 | for stat in stats: 328 | line = [] 329 | for field in fields: 330 | line.append(stat[field]) 331 | for cov_lib in sorted(self.cov_libs): 332 | line.append(stat['cov_mean'][cov_lib]) 333 | line.append(stat['cov_std'][cov_lib]) 334 | line.append(stat['reads_mapped'][cov_lib]) 335 | line.append(stat['reads_mapped_perc'][cov_lib]) 336 | 
output.append("%s" % "\t".join(line)) 337 | out_f = "%s.stats.txt" % out_f 338 | with open(out_f, 'w') as fh: 339 | print(BtLog.status_d['24'] % ("%s" % out_f)) 340 | fh.write("\n".join(output)) 341 | 342 | def compute_stats(self): 343 | stats = {} 344 | for label in self.labels: 345 | stats[label] = { 346 | 'name' : [], 347 | 'gc' : [], 348 | 'length': [], 349 | 'covs' : {cov_lib : [] for cov_lib in self.cov_libs}, 350 | 'cov_mean' : {cov_lib : 0.0 for cov_lib in self.cov_libs}, 351 | 'cov_std' : {cov_lib : 0.0 for cov_lib in self.cov_libs}, 352 | 'reads_mapped' : {cov_lib : 0 for cov_lib in self.cov_libs}, 353 | 'reads_mapped_perc' : {cov_lib: 0.0 for cov_lib in self.cov_libs}, 354 | 'n50' : 0, 355 | 'gc_mean' : 0.0, 356 | 'gc_std' : 0.0, 357 | 'groups' : set(), 358 | 'count' : 0, 359 | 'span' : 0, 360 | 'count_visible' : 0, 361 | 'span_visible' : 0, 362 | 'count_hidden' : 0, 363 | 'span_hidden' : 0 364 | } 365 | 366 | for group, labels in self.group_labels.items(): 367 | for label in labels: 368 | stats[label]['name'] = stats[label]['name'] + self.data_dict[group]['name'] 369 | stats[label]['groups'].add(group) 370 | stats[label]['gc'] = stats[label]['gc'] + self.data_dict[group]['gc'] 371 | stats[label]['length'] = stats[label]['length'] + self.data_dict[group]['length'] 372 | stats[label]['count'] += self.data_dict[group]['count'] 373 | stats[label]['span'] += self.data_dict[group]['span'] 374 | stats[label]['count_visible'] += self.data_dict[group]['count_visible'] 375 | stats[label]['count_hidden'] += self.data_dict[group]['count_hidden'] 376 | stats[label]['span_visible'] += self.data_dict[group]['span_visible'] 377 | stats[label]['span_hidden'] += self.data_dict[group]['span_hidden'] 378 | for cov_lib in self.cov_libs: 379 | stats[label]['covs'][cov_lib] = stats[label]['covs'][cov_lib] + self.data_dict[group]['covs'][cov_lib] 380 | stats[label]['reads_mapped'][cov_lib] += self.data_dict[group]['reads_mapped'][cov_lib] 381 | for label in stats: 382 | 
stats[label]['gc_mean'] = mean(array(stats[label]['gc'])) if stats[label]['count_visible'] > 0.0 else 0.0 383 | stats[label]['gc_std'] = std(array(stats[label]['gc'])) if stats[label]['count_visible'] > 0.0 else 0.0 384 | stats[label]['n50'] = n50(stats[label]['length']) if stats[label]['count_visible'] > 0.0 else 0.0 385 | for cov_lib in self.cov_libs: 386 | stats[label]['cov_mean'][cov_lib] = mean(array(stats[label]['covs'][cov_lib])) if stats[label]['count_visible'] > 0.0 else 0.0 387 | stats[label]['cov_std'][cov_lib] = std(array(stats[label]['covs'][cov_lib])) if stats[label]['count_visible'] > 0.0 else 0.0 388 | if self.cov_libs_total_reads_dict[cov_lib]: 389 | stats[label]['reads_mapped_perc'][cov_lib] = stats[label]['reads_mapped'][cov_lib]/self.cov_libs_total_reads_dict[cov_lib] 390 | self.stats = stats 391 | 392 | def relabel_and_colour(self, colour_dict, user_labels): 393 | #print(user_labels) 394 | groups = self.group_order[0:self.max_group_plot] 395 | if (colour_dict): 396 | groups_not_in_colour_dict = set(groups) - set(colour_dict.keys()) 397 | for _group in groups_not_in_colour_dict: 398 | colour_dict[_group] = WHITE 399 | else: 400 | #print(groups) 401 | colour_groups = list(set([_group if not (_group in user_labels) else user_labels[_group] for _group in groups])) 402 | #print(colour_groups) 403 | colour_dict = generateColourDict(colour_groups, groups) 404 | #print(colour_dict) 405 | for idx, group in enumerate(self.group_order): 406 | if group in self.exclude_groups: 407 | pass 408 | elif group in user_labels: 409 | #print(group, "in user_labels") 410 | label = user_labels[group] 411 | #print(label) 412 | self.group_labels[group].add(label) 413 | #print(self.group_labels[group]) 414 | self.group_labels[group].add(group) 415 | #print(self.group_labels[group]) 416 | self.colours[label] = colour_dict[user_labels[group]] 417 | if label not in self.plot_order: 418 | self.plot_order.append(label) 419 | elif group in colour_dict: 420 | 
    def setupPlot(self, plot):
        """Create the figure and axes for one plot type.

        plot -- 'blobplot'/'covplot': scatter panel with two marginal
                histograms and a legend axes; returns
                (fig, axScatter, axHistx, axHisty, axLegend, top_bins, right_bins).
                'readcov': two bar-chart panels; returns
                (fig, ax_main, ax_group, x_pos_main, x_pos_group).
                Any other value returns None.
        """
        if plot == 'blobplot' or plot == 'covplot':
            rect_scatter, rect_histx, rect_histy, rect_legend = set_canvas()
            # Setting up plots and axes
            fig = plt.figure(1, figsize=self.scatter_size, dpi=self.dpi)
            # try/except pairs below handle matplotlib versions where
            # plt.axes() takes 'axisbg' instead of 'facecolor'
            try:
                axScatter = plt.axes(rect_scatter, facecolor=BGCOLOUR)
            except AttributeError:
                axScatter = plt.axes(rect_scatter, axisbg=BGCOLOUR)
            axScatter = set_format_scatterplot(axScatter, min_cov=self.min_cov, max_cov=self.max_cov, plot=plot)
            axScatter.set_xlabel(self.xlabel)
            axScatter.set_ylabel(self.ylabel)
            try:
                axHistx = plt.axes(rect_histx, facecolor=BGCOLOUR)
                axHisty = plt.axes(rect_histy, facecolor=BGCOLOUR)
            except AttributeError:
                axHistx = plt.axes(rect_histx, axisbg=BGCOLOUR)
                axHisty = plt.axes(rect_histy, axisbg=BGCOLOUR)
            axHistx = set_format_hist_x(axHistx, axScatter)
            axHisty = set_format_hist_y(axHisty, axScatter)
            # Histogram axis labels depend on whether histograms are
            # span-weighted or plain counts
            if self.hist_type == "span":
                axHistx.set_ylabel("Span (kb)")
                axHisty.set_xlabel("Span (kb)", rotation='horizontal')
            else:
                axHistx.set_ylabel("Count")
                axHisty.set_xlabel("Count", rotation='horizontal')
            for xtick in axHisty.get_xticklabels(): # rotate text for ticks in cov histogram
                xtick.set_rotation(270)
            try:
                axLegend = plt.axes(rect_legend, facecolor=WHITE)
            except AttributeError:
                axLegend = plt.axes(rect_legend, axisbg=WHITE)
            # Legend axes carries no ticks of its own
            axLegend.xaxis.set_major_locator(plt.NullLocator())
            axLegend.xaxis.set_major_formatter(nullfmt)
            axLegend.yaxis.set_major_locator(plt.NullLocator())
            axLegend.yaxis.set_major_formatter(nullfmt)
            # Histogram bins: GC axis is linear [0,1); coverage axes are
            # log10-spaced with the lower exponent picked from min_cov.
            # NOTE(review): math.log is the NATURAL log; with base-10
            # logspace the upper exponent int(math.log(max_cov)) + 1
            # overshoots the top decade for large max_cov — presumably
            # math.log10 was intended; confirm before changing.
            top_bins, right_bins = None, None
            if plot == 'blobplot':
                top_bins = arange(0, 1, 0.01)
                if self.min_cov >= 1:
                    right_bins = logspace(0, (int(math.log(self.max_cov)) + 1), 200, base=10.0)
                elif self.min_cov >= 0.1:
                    right_bins = logspace(-1, (int(math.log(self.max_cov)) + 1), 200, base=10.0)
                else:
                    right_bins = logspace(-2, (int(math.log(self.max_cov)) + 1), 200, base=10.0)
            elif plot == 'covplot':
                if self.min_cov >= 1:
                    top_bins = logspace(0, (int(math.log(self.max_cov)) + 1), 200, base=10.0)
                    right_bins = logspace(0, (int(math.log(self.max_cov)) + 1), 200, base=10.0)
                elif self.min_cov >= 0.1:
                    top_bins = logspace(-1, (int(math.log(self.max_cov)) + 1), 200, base=10.0)
                    right_bins = logspace(-1, (int(math.log(self.max_cov)) + 1), 200, base=10.0)
                else:
                    top_bins = logspace(-2, (int(math.log(self.max_cov)) + 1), 200, base=10.0)
                    right_bins = logspace(-2, (int(math.log(self.max_cov)) + 1), 200, base=10.0)
            else:
                pass
            return fig, axScatter, axHistx, axHisty, axLegend, top_bins, right_bins
        elif plot == 'readcov':
            # Two columns for assembly mapped/unmapped, plus two more when
            # reference-coverage data was supplied
            main_columns = 2
            if (self.refcov_dict):
                main_columns += 2
            group_columns = len(self.plot_order)
            fig = plt.figure(1, figsize=self.readcov_size, dpi=self.dpi)
            gs = mat.gridspec.GridSpec(1, 2, width_ratios=[main_columns, group_columns])

            ax_main = plt.subplot(gs[0])
            # set_facecolor/set_color fallback for older matplotlib Axes API
            try:
                ax_main.set_facecolor(WHITE)
            except AttributeError:
                ax_main.set_color(WHITE)
            ax_main.set_ylim(0, 1.1)
            ax_main.set_yticklabels(['{:.0f}%'.format(x*100) for x in ax_main.get_yticks()])
            ax_main.grid(True, axis='y', which="major", lw=2., color=BGGREY, linestyle='--')

            ax_group = plt.subplot(gs[1])
            try:
                ax_group.set_facecolor(WHITE)
            except AttributeError:
                ax_group.set_color(WHITE)
            ax_group.set_ylim(0, 1.1)
            ax_group.set_yticklabels(['{:.0f}%'.format(x*100) for x in ax_group.get_yticks()])
            ax_group.grid(True, axis='y', which="major", lw=2., color=BGGREY, linestyle='--')

            x_pos_main = arange(main_columns)
            x_pos_group = arange(len(self.plot_order))
            return fig, ax_main, ax_group, x_pos_main, x_pos_group
        else:
            return None
    def plotBar(self, cov_lib, out_f):
        """Plot the read-coverage bar chart for one coverage library.

        Left panel: mapped/unmapped fractions for the whole assembly (and
        for the reference, when refcov data was supplied). Right panel:
        mapped-read percentage per plotted (taxonomic) group. Saves the
        figure as "<out_f>.read_cov.<cov_lib>.<format>".
        """
        fig, ax_main, ax_group, x_pos_main, x_pos_group = self.setupPlot('readcov')
        ax_main_data = {'labels' : [], 'values' : [], 'colours' : [] }
        ax_group_data = {'labels' : [], 'values' : [], 'colours' : [] }
        reads_total = self.cov_libs_total_reads_dict[cov_lib]
        reads_mapped = self.stats['all']['reads_mapped'][cov_lib]
        reads_unmapped = reads_total - self.stats['all']['reads_mapped'][cov_lib]
        # Assembly-wide bars (values are fractions of reads_total)
        ax_main_data['labels'].append('Unmapped (assembly)')
        ax_main_data['values'].append(reads_unmapped/reads_total)
        ax_main_data['colours'].append(DGREY)
        ax_main_data['labels'].append('Mapped (assembly)')
        ax_main_data['values'].append(reads_mapped/reads_total)
        ax_main_data['colours'].append(DGREY)
        if (self.refcov_dict):
            if cov_lib in self.refcov_dict:
                # Reference bars from user-supplied total/mapped counts
                reads_total_ref = self.refcov_dict[cov_lib]['reads_total']
                reads_mapped_ref = self.refcov_dict[cov_lib]['reads_mapped']
                reads_unmapped_ref = reads_total_ref - reads_mapped_ref
                ax_main_data['labels'].append('Unmapped (ref)')
                ax_main_data['values'].append(reads_unmapped_ref/reads_total_ref)
                ax_main_data['colours'].append(DGREY)
                ax_main_data['labels'].append('Mapped (ref)')
                ax_main_data['values'].append(reads_mapped_ref/reads_total_ref)
                ax_main_data['colours'].append(DGREY)
            else:
                BtLog.error('40', cov_lib)

        # mapped plotted groups
        for group in self.plot_order:
            ax_group_data['labels'].append(group)
            ax_group_data['values'].append(self.stats[group]['reads_mapped_perc'][cov_lib])
            ax_group_data['colours'].append(self.colours[group])
        rect_group = ax_group.bar(x_pos_group, ax_group_data['values'], width = 0.5, tick_label=ax_group_data['labels'], align='center', color = ax_group_data['colours'])
        # Annotate each bar with its percentage just above the bar top
        for rect_g in rect_group:
            height_g = float(rect_g.get_height())
            ax_group.text(rect_g.get_x() + rect_g.get_width()/2., 0.005 + height_g, '{:.2f}%'.format(height_g*100), ha='center', va='bottom', fontsize=LEGEND_FONTSIZE)
        rect_main = ax_main.bar(x_pos_main, ax_main_data['values'], width = 0.5, tick_label=ax_main_data['labels'], align='center', color = ax_main_data['colours'])
        for rect_m in rect_main:
            height_m = float(rect_m.get_height())
            ax_main.text(rect_m.get_x() + rect_m.get_width()/2., 0.005 + height_m, '{:.2f}%'.format(height_m*100), ha='center', va='bottom', fontsize=LEGEND_FONTSIZE)

        ax_main.set_xticklabels(ax_main_data['labels'], rotation=45, ha='center', fontsize=LEGEND_FONTSIZE)
        ax_group.set_xticklabels(ax_group_data['labels'], rotation=45, ha='center', fontsize=LEGEND_FONTSIZE)
        #figsuptitle = fig.suptitle(out_f, verticalalignment='top')
        out_f = "%s.read_cov.%s" % (out_f, cov_lib)
        # NOTE(review): chained %-formatting — (status_d['8'] % "%s.%s") is
        # formatted first, then filled with (out_f, format); relies on
        # status_d['8'] containing exactly one %s — confirm in BtLog.
        print(BtLog.status_d['8'] % "%s.%s" % (out_f, self.format))
        fig.tight_layout()
        #fig.savefig("%s.%s" % (out_f, self.format), format=self.format, bbox_extra_artists=(figsuptitle,))
        fig.savefig("%s.%s" % (out_f, self.format), format=self.format)
        plt.close(fig)
    def plotScatter(self, cov_lib, info_flag, out_f):
        """Draw the blobplot/covplot scatter figure for one coverage library.

        Iterates the groups in self.plot_order, scattering each group's
        points (GC vs coverage for 'blobplot', coverage vs coverage for
        'covplot') plus marginal histograms. Depending on flags, also
        writes one figure per group (multiplot) or one figure per
        incremental group addition (cumulative). Saves the final figure as
        "<out_f>.<cov_lib>.<format>".
        """

        fig, axScatter, axHistx, axHisty, axLegend, top_bins, right_bins = self.setupPlot(self.plot)
        # empty handles for big legend
        legend_handles = []
        legend_labels = []
        # marker size scaled by biggest blob (size in points^2)
        max_length = max(array(self.stats['all']['length'])) # length of biggest blob
        max_marker_size = 12500 # marker size for biggest blob, i.e. area of 12500^2 pixel
        for idx, group in enumerate(self.plot_order):
            idx += 1
            lw, alpha = 0.5, 0.8
            if group == 'no-hit':
                alpha = 0.5
            group_length_array = array(self.stats[group]['length'])
            # skip empty and explicitly excluded groups
            if len(group_length_array) > 0 and group not in self.exclude_groups:
                colour = self.colours[group]
                group_x_array = ''
                group_y_array = ''
                if self.plot == 'blobplot':
                    group_x_array = array(self.stats[group]['gc'])
                    group_y_array = array(self.stats[group]['covs'][cov_lib])
                elif self.plot == 'covplot':
                    group_x_array = array(self.stats[group]['covs'][cov_lib])
                    # 0.02 is the fallback y-coverage for sequences absent
                    # from cov_y_dict
                    group_y_array = array([self.cov_y_dict.get(name, 0.02) for name in self.stats[group]['name']])
                else:
                    BtLog.error('34', self.plot)
                marker_size_array = []
                if (self.ignore_contig_length): # no scaling
                    if group == "no-hit":
                        s = 20
                    else:
                        s = 100
                    marker_size_array = [s for length in group_length_array]
                else: # scaling by max_length
                    marker_size_array = [(length/max_length)*max_marker_size for length in group_length_array]
                # generate label for legend
                group_span_in_mb = round(self.stats[group]['span_visible']/1000000, 2)
                group_number_of_seqs = self.stats[group]['count_visible']
                group_n50 = self.stats[group]['n50']
                fmt_seqs = "{:,}".format(group_number_of_seqs)
                fmt_span = "{:,}".format(group_span_in_mb)
                fmt_n50 = "{:,}".format(group_n50)
                label = "%s (%s;%sMB;%snt)" % (group, fmt_seqs, fmt_span, fmt_n50)
                if (info_flag):
                    print(BtLog.info_d['0'] % (group, fmt_seqs, fmt_span, fmt_n50))
                # 'other' markers get a dark edge so white-filled blobs stay visible
                if group == "other":
                    legend_handles.append(Line2D([0], [0], linewidth = 0.5, linestyle="none", marker="o", alpha=1, markersize=24, markeredgecolor=DGREY, markerfacecolor=colour))
                else:
                    legend_handles.append(Line2D([0], [0], linewidth = 0.5, linestyle="none", marker="o", alpha=1, markersize=24, markeredgecolor=WHITE, markerfacecolor=colour))
                legend_labels.append(label)

                weights_array = None
                if self.hist_type == "span":
                    weights_array = group_length_array/1000

                axHistx.hist(group_x_array, weights=weights_array, color = colour, bins = top_bins, histtype='step', lw = 3)
                axHisty.hist(group_y_array, weights=weights_array, color = colour, bins = right_bins, histtype='step', orientation='horizontal', lw = 3)
                if group == 'other':
                    axScatter.scatter(group_x_array, group_y_array, color = colour, s = marker_size_array, lw = lw, alpha=alpha, edgecolor=DGREY, label=label)
                else:
                    axScatter.scatter(group_x_array, group_y_array, color = colour, s = marker_size_array, lw = lw, alpha=alpha, edgecolor=WHITE, label=label)
                axLegend.axis('off')
                if (self.multiplot):
                    # one standalone figure for this group only
                    fig_m, axScatter_m, axHistx_m, axHisty_m, axLegend_m, top_bins, right_bins = self.setupPlot(self.plot)
                    legend_handles_m = []
                    legend_labels_m = []
                    legend_handles_m.append(Line2D([0], [0], linewidth = 0.5, linestyle="none", marker="o", alpha=1, markersize=24, markeredgecolor=WHITE, markerfacecolor=colour))
                    legend_labels_m.append(label)
                    axHistx_m.hist(group_x_array, weights=weights_array, color = colour, bins = top_bins, histtype='step', lw = 3)
                    axHisty_m.hist(group_y_array, weights=weights_array, color = colour, bins = right_bins, histtype='step', orientation='horizontal', lw = 3)
                    if group == 'other':
                        axScatter_m.scatter(group_x_array, group_y_array, color = colour, s = marker_size_array, lw = lw, alpha=alpha, edgecolor=DGREY, label=label)
                    else:
                        axScatter_m.scatter(group_x_array, group_y_array, color = colour, s = marker_size_array, lw = lw, alpha=alpha, edgecolor=WHITE, label=label)
                    axLegend_m.axis('off')
                    axLegend_m.legend(legend_handles_m, legend_labels_m, loc=6, numpoints=1, fontsize=LEGEND_FONTSIZE, frameon=True)
                    plot_ref_legend(axScatter_m, max_length, max_marker_size, self.ignore_contig_length)
                    # sanitise group name for use in the file name
                    m_out_f = "%s.%s.%s.%s" % (out_f, cov_lib, idx, group.replace("/", "_").replace(" ", "_"))
                    fig_m = plot_legend(fig_m, axLegend_m, m_out_f, self.legend_flag, self.format, self.cumulative_flag)
                    print(BtLog.status_d['8'] % "%s.%s" % (m_out_f, self.format))
                    fig_m.savefig("%s.%s" % (m_out_f, self.format), format=self.format)
                    plt.close(fig_m)
                elif (self.cumulative_flag):
                    # snapshot of the main figure after adding this group
                    axLegend.legend(legend_handles, legend_labels, loc=6, numpoints=1, fontsize=LEGEND_FONTSIZE, frameon=True)
                    plot_ref_legend(axScatter, max_length, max_marker_size, self.ignore_contig_length)
                    m_out_f = "%s.%s.%s.%s" % (out_f, cov_lib, idx, group.replace("/", "_").replace(" ", "_"))
                    fig.add_axes(axLegend)
                    fig = plot_legend(fig, axLegend, m_out_f, self.legend_flag, self.format, self.cumulative_flag)
                    if not (self.no_title):
                        fig.suptitle(out_f, fontsize=35, verticalalignment='top')
                    print(BtLog.status_d['8'] % "%s.%s" % (m_out_f, self.format))
                    fig.savefig("%s.%s" % (m_out_f, self.format), format=self.format)
                else:
                    pass
        # final (all groups) figure
        plot_ref_legend(axScatter, max_length, max_marker_size, self.ignore_contig_length)
        axLegend.legend(legend_handles, legend_labels, numpoints=1, fontsize=LEGEND_FONTSIZE, frameon=True, loc=6 )
        out_f = "%s.%s" % (out_f, cov_lib)
        fig.add_axes(axLegend)
        fig = plot_legend(fig, axLegend, out_f, self.legend_flag, self.format, self.cumulative_flag)
        if not (self.no_title):
            fig.suptitle(out_f, fontsize=35, verticalalignment='top')
        print(BtLog.status_d['8'] % "%s.%s" % (out_f, self.format))
        fig.savefig("%s.%s" % (out_f, self.format), format=self.format)
        plt.close(fig)
def getTreeList(taxIds, nodesDB):
    """Trace each taxId up its parent chain to the root.

    Returns a dict mapping each input taxId to its path of taxIds
    (the taxId itself first). The path is terminated with '1' — either
    the root itself or a sentinel for a taxId missing from nodesDB.
    Duplicate taxIds are resolved only once.
    """
    known_tree_lists = {}
    for taxId in taxIds:
        if taxId in known_tree_lists:
            continue
        lineage_path = []
        current = taxId
        while True:
            if current == '1' or current not in nodesDB:
                # root reached (or unknown node): close the path with '1'
                lineage_path.append('1')
                break
            lineage_path.append(current)
            current = nodesDB[current]['parent']
        known_tree_lists[taxId] = lineage_path
    return known_tree_lists
def taxRuleBestSum(taxDict, taxonomy, min_bitscore, min_bitscore_diff, tax_collision_random):
    """Assign taxonomy per rank by summing scores across all hit libraries.

    Scores for the same taxon are summed over libraries first; the
    highest-scoring taxon then wins a rank if its score reaches
    min_bitscore. Ties keep the first taxon when tax_collision_random is
    set, otherwise become 'unresolved'; runners-up within
    min_bitscore_diff of the winner also force 'unresolved'. c_index
    counts candidates that disagree with the current assignment.
    Mutates and returns `taxonomy`.
    """
    summed = {rank: {} for rank in RANKS}
    for lib in sorted(taxDict):
        for rank in RANKS:
            # sorted() keeps accumulation order deterministic
            for tax, score in sorted(taxDict[lib][rank].items()):
                summed[rank][tax] = summed[rank].get(tax, 0.0) + score
    for rank in summed:
        ranked = sorted(summed[rank].items(), key=lambda item: item[1], reverse=True)
        for tax, score in ranked:
            entry = taxonomy[rank]
            if entry['tax'] == 'no-hit':
                # best candidate so far fixes the rank's score; the taxon
                # is only taken when it clears the bitscore threshold
                entry['score'] = score
                if score >= min_bitscore:
                    entry['tax'] = tax
            elif score == entry['score']:
                if not tax_collision_random:
                    entry['tax'] = 'unresolved'
            elif (entry['score'] - score) <= min_bitscore_diff:
                entry['tax'] = 'unresolved'
            if entry['tax'] != tax:
                entry['c_index'] += 1
    return taxonomy
def getTaxDict(hits, lineages):
    """Sum hit scores per library, rank and taxon name.

    hits     -- dict: library name -> list of {'taxId': ..., 'score': ...}
    lineages -- dict: taxId -> {rank: name} as built by getLineages()
    Returns dict: lib -> rank -> taxon name -> summed score.
    """
    taxDict = {}
    # Fix: the original loop rebound the parameter (`for lib, hits in
    # hits.items()`), shadowing `hits`; use a distinct name instead.
    for lib, lib_hits in hits.items():
        taxDict[lib] = {}
        for hit in lib_hits:
            taxId = hit['taxId']
            score = hit['score']
            for rank in RANKS:
                name = lineages[taxId][rank]
                if rank not in taxDict[lib]:
                    taxDict[lib][rank] = {}
                taxDict[lib][rank][name] = taxDict[lib][rank].get(name, 0.0) + score
    return taxDict
#!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | """usage: blobtools bamfilter -b FILE [-i FILE] [-e FILE] [-U] [-n] [-o PREFIX] [-f FORMAT] 5 | [-h|--help] 6 | 7 | Options: 8 | -h --help show this 9 | -b, --bam FILE BAM file (sorted by name) 10 | -i, --include FILE List of contigs whose reads are included 11 | - writes FASTAs of pairs where at least 12 | one read maps sequences in list 13 | (InUn.fq, InIn.fq, ExIn.fq) 14 | -e, --exclude FILE List of contigs whose reads are excluded (outputs reads that do not map to sequences in list) 15 | - writes FASTAs of pairs where at least 16 | one read does not maps to sequences in list 17 | (InUn.fq, InIn.fq, ExIn.fq) 18 | -U, --exclude_unmapped Exclude pairs where both reads are unmapped 19 | -n, --noninterleaved Use if fw and rev reads should be in separate files 20 | -f, --read_format FORMAT FASTQ = fq, FASTA = fa [default: fa] 21 | -o, --out PREFIX Output prefix 22 | """ 23 | 24 | from __future__ import division 25 | from docopt import docopt 26 | 27 | import sys 28 | 29 | import lib.BtLog as BtLog 30 | import lib.BtIO as BtIO 31 | 32 | def main(): 33 | args = docopt(__doc__) 34 | #print(args) 35 | bam_f = args['--bam'] 36 | include_f = args['--include'] 37 | exclude_f = args['--exclude'] 38 | out_prefix = args['--out'] 39 | read_format = args['--read_format'] 40 | if not read_format in set(['fq', 'fa']): 41 | sys.exit("[X] Read format must be fq or fa!") 42 | noninterleaved = args['--noninterleaved'] 43 | include_unmapped = True 44 | if args['--exclude_unmapped']: 45 | include_unmapped = False 46 | out_f = BtIO.getOutFile(bam_f, out_prefix, None) 47 | if include_f and exclude_f: 48 | print(BtLog.error('43')) 49 | elif include_f: 50 | sequence_list = BtIO.parseList(include_f) 51 | BtIO.parseBamForFilter(bam_f, include_unmapped, noninterleaved, out_f, sequence_list, None, read_format) 52 | elif exclude_f: 53 | sequence_list = BtIO.parseList(exclude_f) 54 | BtIO.parseBamForFilter(bam_f, include_unmapped, 
noninterleaved, out_f, None, sequence_list, read_format) 55 | else: 56 | BtIO.parseBamForFilter(bam_f, include_unmapped, noninterleaved, out_f, None, None, read_format) 57 | 58 | if __name__ == '__main__': 59 | main() 60 | -------------------------------------------------------------------------------- /lib/blobplot.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | """usage: blobtools plot -i 5 | [-p INT] [-l INT] [--cindex] [-n] [-s] 6 | [-r RANK] [-x TAXRULE] [--label GROUPS...] 7 | [--lib COVLIB] [-o PREFIX] [-m] 8 | [--sort ORDER] [--sort_first LABELS] [--hist HIST] [--notitle] [--filelabel] 9 | [--colours FILE] [--exclude FILE] 10 | [--refcov FILE] [--catcolour FILE] 11 | [--format FORMAT] [--noblobs] [--noreads] [--legend] 12 | [--cumulative] [--multiplot] 13 | [-h|--help] 14 | 15 | Options: 16 | -h --help show this 17 | -i, --infile BLOBDB BlobDB file (created with "blobtools create") 18 | --lib COVLIB Plot only certain covlib(s). Separated by "," 19 | --notitle Do not add filename as title to plot 20 | --filelabel Label axis based on filenames 21 | -p, --plotgroups INT Number of (taxonomic) groups to plot, remaining 22 | groups are placed in 'other' [default: 8] 23 | -l, --length INT Minimum sequence length considered for plotting [default: 100] 24 | --cindex Colour blobs by 'c index' [default: False] 25 | -n, --nohit Hide sequences without taxonomic annotation [default: False] 26 | -s, --noscale Do not scale sequences by length [default: False] 27 | --legend Plot legend of blobplot in separate figure 28 | -m, --multiplot Multi-plot. 
Print blobplot for each (taxonomic) group separately 29 | --cumulative Print plot after addition of each (taxonomic) group 30 | --sort Sort order for plotting [default: span] 31 | span : plot with decreasing span 32 | count : plot with decreasing count 33 | --sort_first Labels that should always be plotted first, regardless of sort order 34 | ("no-hit,other,undef" is often a useful setting) 35 | --hist Data for histograms [default: span] 36 | span : span-weighted histograms 37 | count : count histograms 38 | -r, --rank Taxonomic rank used for colouring of blobs [default: phylum] 39 | (Supported: species, genus, family, order, 40 | phylum, superkingdom) 41 | -x, --taxrule Taxrule which has been used for computing taxonomy 42 | (Supported: bestsum, bestsumorder) [default: bestsum] 43 | --format FORMAT Figure format for plot (png, pdf, eps, jpeg, 44 | ps, svg, svgz, tiff) [default: png] 45 | --noblobs Omit blobplot [default: False] 46 | --noreads Omit plot of reads mapping [default: False] 47 | 48 | -o, --out PREFIX Output prefix 49 | 50 | --label GROUPS... Relabel (taxonomic) groups, can be used several times. 51 | e.g. "A=Actinobacteria,Proteobacteria" 52 | --colours COLOURFILE File containing colours for (taxonomic) groups. This allows having more than 9 colours. 53 | --exclude GROUPS Exclude these (taxonomic) groups (also works for 'other') 54 | e.g. "Actinobacteria,Proteobacteria,other" 55 | --refcov File containing number of "total" and "mapped" reads 56 | per coverage file. (e.g.: bam0,900,100). If provided, info 57 | will be used in read coverage plot(s). 58 | --catcolour Colour plot based on categories from FILE 59 | (format : "seq\tcategory"). 
def main():
    """Entry point for `blobtools plot` (blobplot).

    Parses docopt arguments, loads the BlobDB, builds a BtPlot.PlotObj
    configured from the command line, and writes the blobplot scatter,
    optional read-coverage bars and the stats table per coverage library.
    """
    args = docopt(__doc__)
    args = BtPlot.check_input(args)

    blobdb_f = args['--infile']
    rank = args['--rank']
    min_length = int(args['--length'])
    max_group_plot = int(args['--plotgroups'])
    colour_f = args['--colours']
    # automatic colour assignment only supports up to 8 groups
    if max_group_plot > 8 and not colour_f:
        sys.exit("[X] '--plotgroups' must be less than 9 for using automatic colour assignation.")
    hide_nohits = args['--nohit']
    taxrule = args['--taxrule']
    c_index = args['--cindex']
    exclude_groups = args['--exclude']
    labels = args['--label']
    colour_f = args['--colours']
    refcov_f = args['--refcov']
    catcolour_f = args['--catcolour']

    multiplot = args['--multiplot']
    out_prefix = args['--out']
    sort_order = args['--sort']
    sort_first = args['--sort_first']
    hist_type = args['--hist']
    no_title = args['--notitle']
    ignore_contig_length = args['--noscale']
    format_plot = args['--format']
    no_plot_blobs = args['--noblobs']
    no_plot_reads = args['--noreads']
    legend_flag = args['--legend']
    cumulative_flag = args['--cumulative']
    cov_lib_selection = args['--lib']

    filelabel = args['--filelabel']

    # parse the auxiliary option files / comma-separated lists
    exclude_groups = BtIO.parseCmdlist(exclude_groups)
    refcov_dict = BtIO.parseReferenceCov(refcov_f)
    user_labels = BtIO.parseCmdLabels(labels)
    catcolour_dict = BtIO.parseCatColour(catcolour_f)
    colour_dict = BtIO.parseColours(colour_f)

    # Load BlobDb
    print(BtLog.status_d['9'] % blobdb_f)
    blobDb = BtCore.BlobDb('blobplot')
    blobDb.version = interface.__version__
    blobDb.load(blobdb_f)

    # Generate plot data
    print(BtLog.status_d['18'])
    data_dict, min_cov, max_cov, cov_lib_dict = blobDb.getPlotData(rank, min_length, hide_nohits, taxrule, c_index, catcolour_dict)
    plotObj = BtPlot.PlotObj(data_dict, cov_lib_dict, cov_lib_selection, 'blobplot', sort_first)
    plotObj.exclude_groups = exclude_groups
    plotObj.version = blobDb.version
    plotObj.format = format_plot
    plotObj.max_cov = max_cov
    plotObj.min_cov = min_cov
    plotObj.no_title = no_title
    plotObj.multiplot = multiplot
    plotObj.hist_type = hist_type
    plotObj.ignore_contig_length = ignore_contig_length
    plotObj.max_group_plot = max_group_plot
    plotObj.legend_flag = legend_flag
    plotObj.cumulative_flag = cumulative_flag
    # order by which to plot (should know about user label)
    plotObj.group_order = BtPlot.getSortedGroups(data_dict, sort_order, sort_first)
    # labels for each level of stats
    plotObj.labels.update(plotObj.group_order)
    # plotObj.group_labels is dict that contains labels for each group : all/other/user_label
    if (user_labels):
        for group, label in user_labels.items():
            plotObj.labels.add(label)
    plotObj.group_labels = {group : set() for group in plotObj.group_order}
    plotObj.relabel_and_colour(colour_dict, user_labels)
    plotObj.compute_stats()
    plotObj.refcov_dict = refcov_dict
    # Plotting
    info_flag = 1
    out_f = ''
    for cov_lib in plotObj.cov_libs:
        plotObj.ylabel = "Coverage"
        plotObj.xlabel = "GC proportion"
        if (filelabel):
            plotObj.ylabel = basename(cov_lib_dict[cov_lib]['f'])
        # output file name encodes all plot-affecting options
        out_f = "%s.%s.%s.p%s.%s.%s" % (blobDb.title, taxrule, rank, max_group_plot, hist_type, min_length)
        if catcolour_dict:
            out_f = "%s.%s" % (out_f, "catcolour")
        if ignore_contig_length:
            out_f = "%s.%s" % (out_f, "noscale")
        if c_index:
            out_f = "%s.%s" % (out_f, "c_index")
        if exclude_groups:
            out_f = "%s.%s" % (out_f, "exclude_" + "_".join(exclude_groups))
        if labels:
            out_f = "%s.%s" % (out_f, "userlabel_" + "_".join(set([name for name in user_labels.values()])))
        out_f = "%s.%s" % (out_f, "blobplot")
        if (plotObj.cumulative_flag):
            out_f = "%s.%s" % (out_f, "cumulative")
        if (plotObj.multiplot):
            out_f = "%s.%s" % (out_f, "multiplot")
        out_f = BtIO.getOutFile(out_f, out_prefix, None)
        if not (no_plot_blobs):
            plotObj.plotScatter(cov_lib, info_flag, out_f)
            # group summary is only printed for the first cov library
            info_flag = 0
        if not (no_plot_reads) and (plotObj.cov_libs_total_reads_dict[cov_lib]):
            # prevent plotting if --noreads or total_reads == 0
            plotObj.plotBar(cov_lib, out_f)
    plotObj.write_stats(out_f)
Separated by "," 27 | --notitle Do not add filename as title to plot 28 | -p, --plotgroups INT Number of (taxonomic) groups to plot, remaining 29 | groups are placed in 'other' [default: 7] 30 | -l, --length INT Minimum sequence length considered for plotting [default: 100] 31 | --cindex Colour blobs by 'c index' [default: False] 32 | -n, --nohit Hide sequences without taxonomic annotation [default: False] 33 | -s, --noscale Do not scale sequences by length [default: False] 34 | --legend Plot legend of blobplot in separate figure 35 | -m, --multiplot Multi-plot. Print blobplot for each (taxonomic) group separately 36 | --cumulative Print plot after addition of each (taxonomic) group 37 | --sort Sort order for plotting [default: span] 38 | span : plot with decreasing span 39 | count : plot with decreasing count 40 | --sort_first Labels that should always be plotted first, regardless of sort order 41 | ("no-hit,other,undef" is often a useful setting) 42 | --hist Data for histograms [default: span] 43 | span : span-weighted histograms 44 | count : count histograms 45 | -r, --rank Taxonomic rank used for colouring of blobs [default: phylum] 46 | (Supported: species, genus, family, order, 47 | phylum, superkingdom) 48 | -x, --taxrule Taxrule which has been used for computing taxonomy 49 | (Supported: bestsum, bestsumorder) [default: bestsum] 50 | --format FORMAT Figure format for plot (png, pdf, eps, jpeg, 51 | ps, svg, svgz, tiff) [default: png] 52 | --noblobs Omit blobplot [default: False] 53 | --noreads Omit plot of reads mapping [default: False] 54 | -o, --out PREFIX Output prefix 55 | --label GROUPS... Relabel (taxonomic) groups, can be used several times. 56 | e.g. "A=Actinobacteria,Proteobacteria" 57 | --colours COLOURFILE File containing colours for (taxonomic) groups 58 | --exclude GROUPS Exclude these (taxonomic) groups (also works for 'other') 59 | e.g. 
def main():
    """Entry point for 'blobtools covplot'.

    Loads a BlobDB, parses a second COV file for the y-axis, and draws a
    scatter plot of per-contig coverage (BlobDB cov lib on x vs COV file on y),
    coloured by taxonomy. Writes one plot (plus stats) per selected cov lib.
    """
    args = docopt(__doc__)
    args = BtPlot.check_input(args)
    blobdb_f = args['--infile']
    cov_f = args['--cov']
    rank = args['--rank']
    min_length = int(args['--length'])
    max_group_plot = int(args['--plotgroups'])
    hide_nohits = args['--nohit']
    taxrule = args['--taxrule']
    c_index = args['--cindex']
    exclude_groups = args['--exclude']
    labels = args['--label']
    colour_f = args['--colours']
    refcov_f = args['--refcov']
    catcolour_f = args['--catcolour']
    # BUG FIX: sort_order was assigned twice from the same option; read it once.
    sort_order = args['--sort']
    sort_first = args['--sort_first']
    multiplot = args['--multiplot']
    out_prefix = args['--out']
    hist_type = args['--hist']
    no_title = args['--notitle']
    ignore_contig_length = args['--noscale']
    format_plot = args['--format']
    #no_plot_reads = args['--noreads']  # read plot intentionally not drawn by covplot
    no_plot_blobs = args['--noblobs']
    legend_flag = args['--legend']
    cumulative_flag = args['--cumulative']
    cov_lib_selection = args['--lib']

    xlabel = args['--xlabel']
    ylabel = args['--ylabel']
    axis_max = float(args['--max'])

    exclude_groups = BtIO.parseCmdlist(exclude_groups)
    refcov_dict = BtIO.parseReferenceCov(refcov_f)
    user_labels = BtIO.parseCmdLabels(labels)
    catcolour_dict = BtIO.parseCatColour(catcolour_f)
    colour_dict = BtIO.parseColours(colour_f)

    # Load BlobDb
    print(BtLog.status_d['9'] % blobdb_f)
    blobDb = Bt.BlobDb('blobplot')
    blobDb.version = interface.__version__
    blobDb.load(blobdb_f)

    # Generate plot data
    print(BtLog.status_d['1'] % ('cov_y_axis', cov_f))
    cov_y_dict, reads_total, reads_mapped, reads_unmapped, read_cov_dict = BtIO.parseCov(cov_f, set(blobDb.dict_of_blobs))
    print(BtLog.status_d['18'])
    data_dict, min_cov, max_cov, cov_lib_dict = blobDb.getPlotData(rank, min_length, hide_nohits, taxrule, c_index, catcolour_dict)
    plotObj = BtPlot.PlotObj(data_dict, cov_lib_dict, cov_lib_selection, 'covplot', sort_first)
    # Clamp y-axis coverages below 0.1 up to 0.1 so they remain visible on a
    # log-scaled axis. (COMMENT FIX: the original comment said 0.01 but the
    # code has always clamped to 0.1.)
    for contig in cov_y_dict:
        if cov_y_dict[contig] < 0.1:
            cov_y_dict[contig] = 0.1
    plotObj.cov_y_dict = cov_y_dict
    plotObj.exclude_groups = exclude_groups
    plotObj.version = blobDb.version
    plotObj.format = format_plot
    plotObj.max_cov = axis_max
    plotObj.no_title = no_title
    plotObj.multiplot = multiplot
    plotObj.hist_type = hist_type
    plotObj.ignore_contig_length = ignore_contig_length
    plotObj.max_group_plot = max_group_plot
    plotObj.legend_flag = legend_flag
    plotObj.cumulative_flag = cumulative_flag
    # order by which to plot (should know about user label)
    plotObj.group_order = BtPlot.getSortedGroups(data_dict, sort_order)
    # labels for each level of stats
    plotObj.labels.update(plotObj.group_order)
    # plotObj.group_labels is dict that contains labels for each group : all/other/user_label
    if (user_labels):
        for group, label in user_labels.items():
            plotObj.labels.add(label)
    plotObj.group_labels = {group : set() for group in plotObj.group_order}
    plotObj.relabel_and_colour(colour_dict, user_labels)
    plotObj.compute_stats()
    plotObj.refcov_dict = refcov_dict

    # Plotting: one figure per selected cov lib; the info block is printed only
    # for the first scatter plot.
    info_flag = 1
    out_f = ''
    for cov_lib in plotObj.cov_libs:
        plotObj.xlabel = basename(cov_lib_dict[cov_lib]['f'])
        plotObj.ylabel = cov_f
        if (ylabel):
            plotObj.ylabel = ylabel
        if (xlabel):
            plotObj.xlabel = xlabel
        # Output name encodes every option that changes the plot content.
        out_f = "%s.%s.%s.p%s.%s.%s" % (blobDb.title, taxrule, rank, max_group_plot, hist_type, min_length)
        if catcolour_dict:
            out_f = "%s.%s" % (out_f, "catcolour")
        if ignore_contig_length:
            out_f = "%s.%s" % (out_f, "noscale")
        if c_index:
            out_f = "%s.%s" % (out_f, "c_index")
        if exclude_groups:
            out_f = "%s.%s" % (out_f, "exclude_" + "_".join(exclude_groups))
        if labels:
            out_f = "%s.%s" % (out_f, "userlabel_" + "_".join(set([name for name in user_labels.values()])))
        out_f = "%s.%s" % (out_f, "covplot")
        if (plotObj.cumulative_flag):
            out_f = "%s.%s" % (out_f, "cumulative")
        if (plotObj.multiplot):
            out_f = "%s.%s" % (out_f, "multiplot")
        out_f = BtIO.getOutFile(out_f, out_prefix, None)
        if not (no_plot_blobs):
            plotObj.plotScatter(cov_lib, info_flag, out_f)
            info_flag = 0
        plotObj.write_stats(out_f)

if __name__ == '__main__':
    main()
15 | (Parsing supported for 'spades', 'velvet', 'platanus') 16 | -t, --hitsfile HITS... Hits file in format (qseqid\\ttaxid\\tbitscore) 17 | (e.g. BLAST output "--outfmt '6 qseqid staxids bitscore'") 18 | Can be specified multiple times 19 | -x, --taxrule ... Taxrule determines how taxonomy of blobs 20 | is computed (by default both are calculated) 21 | "bestsum" : sum bitscore across all 22 | hits for each taxonomic rank 23 | "bestsumorder" : sum bitscore across all 24 | hits for each taxonomic rank. 25 | - If first file supplies hits, bestsum is calculated. 26 | - If no hit is found, the next file is used. 27 | -m, --min_score Minimal score necessary to be considered for taxonomy calculation, otherwise set to 'no-hit' 28 | [default: 0.0] 29 | -d, --min_diff Minimal score difference between highest scoring 30 | taxonomies (otherwise "unresolved") [default: 0.0] 31 | --tax_collision_random Random allocation of taxonomy if highest scoring 32 | taxonomies have equal scores (otherwise "unresolved") [default: False] 33 | --nodes NCBI nodes.dmp file. Not required if '--db' 34 | --names NCBI names.dmp file. Not required if '--db' 35 | --db NodesDB file (default: $BLOBTOOLS/data/nodesDB.txt). If --nodes, --names and --db 36 | are all given and NODESDB does not exist, create it from NODES and NAMES. 37 | -b, --bam ... BAM file(s), can be specified multiple times 38 | -a, --cas ... CAS file(s) (requires clc_mapping_info in $PATH), can be specified multiple times 39 | -c, --cov ... COV file(s), can be specified multiple times 40 | -C, --calculate_cov Legacy coverage when getting coverage from BAM (does not apply to COV parsing).
def main():
    """Entry point for 'blobtools create'.

    Builds a BlobDB JSON from an assembly FASTA plus optional coverage
    (BAM/CAS/COV) and similarity-hit (TSV) inputs, resolving taxonomy via an
    NCBI nodesDB.
    """
    args = docopt(__doc__)
    fasta_f = args['--infile']
    fasta_type = args['--type']
    bam_fs = args['--bam']
    cov_fs = args['--cov']
    cas_fs = args['--cas']
    hit_fs = args['--hitsfile']
    prefix = args['--out']
    nodesDB_f = args['--db']
    names_f = args['--names']
    # Idiom fix: docopt flags are already booleans; no conditional expression needed.
    estimate_cov_flag = not args['--calculate_cov']
    nodes_f = args['--nodes']
    taxrules = args['--taxrule']
    try:
        min_bitscore_diff = float(args['--min_diff'])
        min_score = float(args['--min_score'])
    except ValueError:
        # BUG FIX: the original had "except ValueError():" — catching an exception
        # *instance* raises TypeError at handling time instead of reporting error 45.
        BtLog.error('45')
    tax_collision_random = args['--tax_collision_random']
    title = args['--title']

    # outfile; the BlobDB title defaults to the output filename
    out_f = BtIO.getOutFile("blobDB", prefix, "json")
    if not (title):
        title = out_f

    # coverage: at least one coverage source (FASTA-header type, BAM, CAS or COV) is required
    if not (fasta_type) and not bam_fs and not cov_fs and not cas_fs:
        BtLog.error('1')
    cov_libs = [BtCore.CovLibObj('bam' + str(idx), 'bam', lib_f) for idx, lib_f in enumerate(bam_fs)] + \
               [BtCore.CovLibObj('cas' + str(idx), 'cas', lib_f) for idx, lib_f in enumerate(cas_fs)] + \
               [BtCore.CovLibObj('cov' + str(idx), 'cov', lib_f) for idx, lib_f in enumerate(cov_fs)]

    # taxonomy: one HitLibObj per hits file, in command-line order (matters for 'bestsumorder')
    hit_libs = [BtCore.HitLibObj('tax' + str(idx), 'tax', lib_f) for idx, lib_f in enumerate(hit_fs)]

    # Create BlobDB object
    blobDb = BtCore.BlobDb(title)
    blobDb.version = interface.__version__
    # Parse FASTA
    blobDb.parseFasta(fasta_f, fasta_type)

    # Parse nodesDB OR names.dmp, nodes.dmp
    nodesDB_default = join(dirname(abspath(__file__)), "../data/nodesDB.txt")
    nodesDB, nodesDB_f = BtIO.parseNodesDB(nodes=nodes_f, names=names_f, nodesDB=nodesDB_f, nodesDBdefault=nodesDB_default)
    blobDb.nodesDB_f = nodesDB_f

    # Parse similarity hits; with >1 hit file and no explicit taxrule, compute both rules
    if (hit_libs):
        blobDb.parseHits(hit_libs)
        if not taxrules:
            if len(hit_libs) > 1:
                taxrules = ['bestsum', 'bestsumorder']
            else:
                taxrules = ['bestsum']
        blobDb.computeTaxonomy(taxrules, nodesDB, min_score, min_bitscore_diff, tax_collision_random)
    else:
        print(BtLog.warn_d['0'])

    # Parse coverage
    blobDb.parseCoverage(covLibObjs=cov_libs, estimate_cov=estimate_cov_flag, prefix=prefix)

    # Generating BlobDB and writing to file
    print(BtLog.status_d['7'] % out_f)
    BtIO.writeJson(blobDb.dump(), out_f)

if __name__ == '__main__':
    main()
def main():
    """Dispatch a blobtools subcommand to its module's main().

    Modules are imported lazily so that only the requested subcommand's
    dependencies are loaded. On Ctrl-C, reports elapsed time and exits -1.
    """
    # subcommand name -> dotted module path (note: 'plot' maps to lib.blobplot)
    dispatch = {
        'create': 'lib.create',
        'view': 'lib.view',
        'plot': 'lib.blobplot',
        'map2cov': 'lib.map2cov',
        'seqfilter': 'lib.seqfilter',
        'covplot': 'lib.covplot',
        'taxify': 'lib.taxify',
        'bamfilter': 'lib.bamfilter',
        'nodesdb': 'lib.nodesdb',
    }
    try:
        start_time = timer()
        try:
            args = docopt(__doc__, version=__version__, options_first=True)
        except DocoptExit:
            # Malformed invocation: show usage and fall through (exit 0).
            print(__doc__)
        else:
            command = args['']
            if not command:
                print(__doc__)
            elif command in dispatch:
                # Lazy import of the subcommand module, then run it.
                module = __import__(dispatch[command], fromlist=['main'])
                module.main()
            else:
                sys.exit("%r is not a blobtools module. See 'blobtools -h'." % command)
    except KeyboardInterrupt:
        sys.stderr.write("\n[X] Interrupted by user after %i seconds!\n" % (timer() - start_time))
        sys.exit(-1)
def main():
    """Entry point for 'blobtools map2cov'.

    Parses an assembly FASTA plus BAM and/or CAS mapping files and writes a
    COV (per-contig coverage) file for each mapping library.
    """
    args = docopt(__doc__)
    fasta_f = args['--infile']
    bam_fs = args['--bam']
    cas_fs = args['--cas']
    prefix = args['--output']
    # Idiom fix: 'True if not x else False' is just 'not x' (docopt flags are booleans).
    estimate_cov_flag = not args['--calculate_cov']

    # Make covLibs: one CovLibObj per mapping file, named bam0..N / cas0..N
    cov_libs = [BtCore.CovLibObj('bam' + str(idx), 'bam', lib_f) for idx, lib_f in enumerate(bam_fs)] + \
               [BtCore.CovLibObj('cas' + str(idx), 'cas', lib_f) for idx, lib_f in enumerate(cas_fs)]
    if not (cov_libs):
        BtLog.error('31')  # no mapping input supplied
    blobDb = BtCore.BlobDb('cov')
    blobDb.version = interface.__version__
    blobDb.parseFasta(fasta_f, None)
    # COV files are written as a side effect of coverage parsing
    blobDb.parseCoverage(covLibObjs=cov_libs, estimate_cov=estimate_cov_flag, prefix=prefix)

if __name__ == '__main__':
    main()
def main():
    """Build the bundled nodesDB from NCBI taxdump nodes.dmp and names.dmp.

    The useful work happens inside BtIO.parseNodesDB (presumably it writes the
    database to the default location — verify in BtIO); the return values are
    not used here.
    """
    args = docopt(__doc__)
    nodes_f = args['--nodes']
    names_f = args['--names']

    default_db = join(dirname(abspath(__file__)), "../data/nodesDB.txt")
    _nodesDB, _nodesDB_f = BtIO.parseNodesDB(
        nodes=nodes_f, names=names_f, nodesDB=None, nodesDBdefault=default_db)

if __name__ == '__main__':
    main()
def main():
    """Entry point for 'blobtools seqfilter'.

    Writes a FASTA containing only the sequences whose headers are in (or,
    with --invert, not in) the given list file. Warns about headers that were
    written more than once (duplicate headers in the input FASTA).
    """
    from collections import Counter  # local import; module header not visible here

    args = docopt(__doc__)
    fasta_f = args['--infile']
    list_f = args['--list']
    invert = args['--invert']
    prefix = args['--out']

    output = []
    out_f = BtIO.getOutFile(fasta_f, prefix, "filtered.fna")

    print(BtLog.status_d['1'] % ("list", list_f))
    items = BtIO.parseSet(list_f)
    items_count = len(items)
    print(BtLog.status_d['22'] % fasta_f)
    items_parsed = []

    with tqdm(total=items_count, desc="[%] ", ncols=200, unit_scale=True) as pbar:
        for header, sequence in BtIO.readFasta(fasta_f):
            # Keep a sequence when list membership matches the filter direction:
            # in-list without --invert, or not-in-list with --invert (XOR).
            if (header in items) != bool(invert):
                items_parsed.append(header)
                output.append(">%s\n%s\n" % (header, sequence))
            pbar.update()

    # PERF FIX: the original called items_parsed.count(x) for every element,
    # which is O(n^2); Counter finds duplicates in one pass.
    duplicated = sorted(header for header, n in Counter(items_parsed).items() if n > 1)
    if duplicated:
        print(BtLog.warn_d['8'] % "\n\t\t\t".join(duplicated))

    with open(out_f, "w") as fh:
        print(BtLog.status_d['24'] % out_f)
        fh.write("".join(output))

if __name__ == '__main__':
    main()
def main():
    """Entry point for 'blobtools taxify'.

    Converts a BLAST/Diamond-style hits TSV into the blobtools hits format
    (qseqid, taxid, score, sseqid), assigning taxids either from a
    sseqid->taxid mapping file or as a single fixed --custom_taxid.
    """
    args = docopt(__doc__)
    out_f, hit_f, map_f, taxid_d = None, None, None, {}
    hit_f = args['--hit_file']
    hit_col_qseqid = args['--hit_column_qseqid']
    hit_col_sseqid = args['--hit_column_sseqid']
    hit_col_score = args['--hit_column_score']
    map_f = args['--taxid_mapping_file']
    map_col_sseqid = args['--map_col_sseqid']
    map_col_taxid = args['--map_col_taxid']
    custom_taxid = args['--custom_taxid']
    prefix = args['--out']

    try:
        hit_col_qseqid = int(hit_col_qseqid)
        hit_col_sseqid = int(hit_col_sseqid)
        hit_col_score = int(hit_col_score)
    except ValueError:
        # BUG FIX: the original did BtLog.error('41' % (...)); applying '%' to a
        # string with no placeholders raises TypeError instead of reporting the error.
        BtLog.error('41')

    if custom_taxid:
        try:
            custom_taxid = int(custom_taxid)
        except ValueError:
            # BUG FIX: int() on a malformed string raises ValueError, not TypeError,
            # so the original handler could never fire.
            BtLog.error('26')
        out_f = BtIO.getOutFile(hit_f, prefix, "taxID_%s.out" % custom_taxid)
        # every sseqid gets the same fixed taxid
        taxid_d = defaultdict(lambda: custom_taxid)
    elif map_f:
        if map_col_sseqid and map_col_taxid:
            try:
                map_col_sseqid = int(map_col_sseqid)
                map_col_taxid = int(map_col_taxid)
            except ValueError:
                BtLog.error('44')
            print(BtLog.status_d['1'] % ("Mapping file", map_f))
            taxid_d = BtIO.parseDict(map_f, map_col_sseqid, map_col_taxid)
            out_f = BtIO.getOutFile(hit_f, prefix, "taxified.out")
        else:
            BtLog.error('44')
    else:
        BtLog.error('41')

    output = []
    print(BtLog.status_d['1'] % ("similarity search result", hit_f))
    with open(hit_f) as fh:
        for line in fh:
            col = line.rstrip("\n").split()
            qseqid = col[hit_col_qseqid]
            sseqid = col[hit_col_sseqid]
            score = col[hit_col_score]
            tax_id = None
            if custom_taxid:
                tax_id = taxid_d[sseqid]
            else:
                if sseqid not in taxid_d:
                    # BUG FIX: the original built this warning string but discarded
                    # it; print it so unmapped sseqids are actually reported.
                    print(BtLog.warn_d['12'] % (sseqid, map_f))
                tax_id = taxid_d.get(sseqid, "N/A")
            output.append("%s\t%s\t%s\t%s" % (qseqid, tax_id, score, sseqid))
    if output:
        with open(out_f, "w") as fh:
            print(BtLog.status_d['24'] % out_f)
            fh.write("\n".join(output) + "\n")

if __name__ == '__main__':
    main()
def main():
    """Entry point for 'blobtools view'.

    Loads a BlobDB and writes one or more views: a tabular view (default),
    CONCOCT input files (--concoct), COV files (--cov) and/or the
    experimental output (--experimental).
    """
    args = docopt(__doc__)
    blobdb_f = args['--input']
    prefix = args['--out']
    ranks = args['--rank']
    taxrule = args['--taxrule']
    hits_flag = args['--hits']
    seq_list_f = args['--list']
    concoct = args['--concoct']
    cov = args['--cov']
    notable = args['--notable']
    experimental = args['--experimental']
    # Does blobdb_f exist ?
    if not isfile(blobdb_f):
        BtLog.error('0', blobdb_f)

    out_f = BtIO.getOutFile(blobdb_f, prefix, None)

    # Are ranks sane ? 'all' expands to every concrete rank, superkingdom first.
    if 'all' in ranks:
        temp_ranks = RANKS[0:-1]
        ranks = temp_ranks[::-1]
    else:
        for rank in ranks:
            if rank not in RANKS:
                BtLog.error('9', rank)

    # Does seq_list file exist?
    seqs = []
    if (seq_list_f):
        if isfile(seq_list_f):
            seqs = BtIO.parseList(seq_list_f)
        else:
            BtLog.error('0', seq_list_f)

    # Load BlobDb
    blobDb = BtCore.BlobDb('new')
    print(BtLog.status_d['9'] % (blobdb_f))
    blobDb.load(blobdb_f)
    blobDb.version = interface.__version__

    # Is taxrule sane and was it computed?
    if (blobDb.hitLibs) and taxrule not in blobDb.taxrules:
        BtLog.error('11', taxrule, blobDb.taxrules)

    # Assemble the requested view objects; with several hit libs the taxrule
    # is baked into the output suffix to keep files distinguishable.
    viewObjs = []
    print(BtLog.status_d['14'])
    if not (notable):
        tableView = None
        if len(blobDb.hitLibs) > 1:
            tableView = BtCore.ViewObj(name="table", out_f=out_f, suffix="%s.table.txt" % (taxrule), body=[])
        else:
            tableView = BtCore.ViewObj(name="table", out_f=out_f, suffix="table.txt", body=[])
        viewObjs.append(tableView)
    # Idiom fix: 'not experimental == ...' -> '!='. --experimental defaults to
    # the *string* 'False' via docopt, hence the string comparison.
    if experimental != 'False':
        meta = {}
        if isfile(experimental):
            meta = BtIO.readYaml(experimental)
        experimentalView = BtCore.ExperimentalViewObj(name="experimental", view_dir=out_f, blobDb=blobDb, meta=meta)
        viewObjs.append(experimentalView)
    if (concoct):
        concoctTaxView = None
        concoctCovView = None
        if len(blobDb.hitLibs) > 1:
            concoctTaxView = BtCore.ViewObj(name="concoct_tax", out_f=out_f, suffix="%s.concoct_taxonomy_info.csv" % (taxrule), body=dict())
            concoctCovView = BtCore.ViewObj(name="concoct_cov", out_f=out_f, suffix="%s.concoct_coverage_info.tsv" % (taxrule), body=[])
        else:
            concoctTaxView = BtCore.ViewObj(name="concoct_tax", out_f=out_f, suffix="concoct_taxonomy_info.csv", body=dict())
            concoctCovView = BtCore.ViewObj(name="concoct_cov", out_f=out_f, suffix="concoct_coverage_info.tsv", body=[])
        viewObjs.append(concoctTaxView)
        viewObjs.append(concoctCovView)
    if (cov):
        # COV views are written immediately, one per coverage library.
        # NOTE(review): this rebinds out_f per covlib, but the earlier viewObjs
        # captured their own out_f at construction, so they are unaffected.
        for cov_lib_name, covLibDict in blobDb.covLibs.items():
            out_f = BtIO.getOutFile(covLibDict['f'], prefix, None)
            covView = BtCore.ViewObj(name="covlib", out_f=out_f, suffix="cov", body=[])
            blobDb.view(viewObjs=[covView], ranks=None, taxrule=None, hits_flag=None, seqs=None, cov_libs=[cov_lib_name], progressbar=True)
    if (viewObjs):
        blobDb.view(viewObjs=viewObjs, ranks=ranks, taxrule=taxrule, hits_flag=hits_flag, seqs=seqs, cov_libs=[], progressbar=True)
    print(BtLog.status_d['19'])

if __name__ == '__main__':
    main()
long_description=long_description, 19 | url='https://github.com/DRL/blobtools', 20 | download_url='https://github.com/DRL/blobtools/tarball/' + __version__, 21 | license='GnuGPL3', 22 | classifiers=[ 23 | 'Development Status :: 4 - Beta', 24 | 'Operating System :: POSIX', 25 | 'Topic :: Scientific/Engineering :: Bio-Informatics', 26 | 'Topic :: Scientific/Engineering :: Visualization', 27 | 'Programming Language :: Python :: 3', 28 | ], 29 | keywords='Bioinformatics visualisation genome assembly QC', 30 | packages=find_packages(exclude=['docs', 'tests*']), 31 | include_package_data=True, 32 | author='Dominik R Laetsch', 33 | entry_points={ 34 | 'console_scripts': [ 35 | "blobtools=lib.interface:main", 36 | ], 37 | }, 38 | author_email='dominik.laetsch@gmail.com' 39 | ) 40 | -------------------------------------------------------------------------------- /test/meta.json: -------------------------------------------------------------------------------- 1 | {"id": "test", "name": "test", "records": 10, "record_type": "contigs", "fields": [{"id": "length", "name": "Length", "type": "variable", "datatype": "integer", "range": [216, 6273], "scale": "scaleLog", "preload": true}, {"id": "gc", "name": "GC", "type": "variable", "datatype": "float", "range": --------------------------------------------------------------------------------