├── setup.cfg ├── tests └── fastqs │ └── .desc ├── .gitmodules ├── PATENTS ├── .gitignore ├── download_example_reads.sh ├── setup.py ├── LICENSE ├── LICENSE.txt ├── README.md └── stringMLST.py /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md 3 | -------------------------------------------------------------------------------- /tests/fastqs/.desc: -------------------------------------------------------------------------------- 1 | location of downloaded test fastq files 2 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "datasets"] 2 | path = datasets 3 | url = https://github.com/jordanlab/stringMLST_datasets 4 | -------------------------------------------------------------------------------- /PATENTS: -------------------------------------------------------------------------------- 1 | Patent Rights Grant 2 | 3 | Some portions of the allele selection algorithm in stringMLST are patent 4 | pending. 5 | 6 | The Jordan Lab and Applied Bioinformatics Laboratory / IHRC Inc. (ABiL-IHRC) 7 | and its affiliates promise not to assert any stringMLST related patents against 8 | you for using or modifying stringMLST for non-commericial, academic use 9 | consistent with the terms stated in the License. 10 | 11 | No rights other than those explicitly stated in the License or this Promise are 12 | granted or waived. 13 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Folder view configuration files 2 | .DS_Store 3 | Desktop.ini 4 | 5 | # Thumbnail cache files 6 | ._* 7 | Thumbs.db 8 | 9 | # Files that might appear on external disks 10 | .Spotlight-V100 11 | .Trashes 12 | 13 | # Compiled Python files 14 | *.pyc 15 | 16 | # Compiled C++ files 17 | *.out 18 | # Byte-compiled / optimized / DLL files 19 | __pycache__/ 20 | *.py[cod] 21 | *$py.class 22 | 23 | # C extensions 24 | *.so 25 | 26 | # Distribution / packaging 27 | .Python 28 | env/ 29 | build/ 30 | develop-eggs/ 31 | dist/ 32 | downloads/ 33 | eggs/ 34 | .eggs/ 35 | lib/ 36 | lib64/ 37 | parts/ 38 | sdist/ 39 | var/ 40 | wheels/ 41 | *.egg-info/ 42 | .installed.cfg 43 | *.egg 44 | 45 | # PyInstaller 46 | # Usually these files are written by a python script from a template 47 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 48 | *.manifest 49 | *.spec 50 | 51 | -------------------------------------------------------------------------------- /download_example_reads.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | testdir='tests/fastqs' 4 | 5 | echo -e "Downloading fastq files from EBI..." 6 | 7 | wget ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR026/ERR026529/ERR026529_1.fastq.gz -P $testdir 8 | wget ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR026/ERR026529/ERR026529_2.fastq.gz -P $testdir 9 | wget ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR027/ERR027250/ERR027250_1.fastq.gz -P $testdir 10 | wget ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR027/ERR027250/ERR027250_2.fastq.gz -P $testdir 11 | wget ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR036/ERR036104/ERR036104_1.fastq.gz -P $testdir 12 | wget ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR036/ERR036104/ERR036104_2.fastq.gz -P $testdir 13 | 14 | echo -e "Done downloading.\n\n" 15 | 16 | for file in `ls $testdir/*` ; do 17 | echo -e "Attempting to unzip $file..." 18 | gunzip $file; 19 | echo -e "Done unzipping $file.\n" 20 | done 21 | 22 | echo -e "Done downloading and extrating test read files." 23 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | try: 3 | import os 4 | from setuptools import setup, find_packages 5 | except ImportError: 6 | from distutils.core import setup 7 | from os import path 8 | here = path.abspath(path.dirname(__file__)) 9 | def readme(file): 10 | with open(path.join(here, 'README.md')) as fh: 11 | long_description_text = fh.read() 12 | return(long_description_text) 13 | 14 | setup( 15 | name = 'stringMLST', 16 | scripts = ['stringMLST.py'], 17 | version = "0.6.1", 18 | description = 'Fast k-mer based tool for alignment and assembly-free multi locus sequence typing (MLST) directly from genome sequencing reads.', 19 | long_description=readme('README.md'), 20 | long_description_content_type="text/markdown", 21 | author = 'Jordan Lab', 22 | author_email = 'pypi@atc.io', 23 | url = 'https://github.com/jordanlab/stringMLST', 24 | keywords = ['MLST', 'kmer', "NGS", "stringMSLT"], 25 | classifiers = [ 26 | 'Programming Language :: Python :: 2.7', 27 | 'Programming Language :: Python :: 3.5', 28 | ], 29 | install_requires=['lxml','pyfaidx'], 30 | ) 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International Public License 2 | 3 | By exercising the Licensed Rights (defined below), You accept and agree to be bound by the terms and conditions of this Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International Public License ("Public License"). To the extent this Public License may be interpreted as a contract, You are granted the Licensed Rights in consideration of Your acceptance of these terms and conditions, and the Licensor grants You such rights in consideration of benefits the Licensor receives from making the Licensed Material available under these terms and conditions. 4 | 5 | Section 1 – Definitions. 6 | 7 | Adapted Material means material subject to Copyright and Similar Rights that is derived from or based upon the Licensed Material and in which the Licensed Material is translated, altered, arranged, transformed, or otherwise modified in a manner requiring permission under the Copyright and Similar Rights held by the Licensor. For purposes of this Public License, where the Licensed Material is a musical work, performance, or sound recording, Adapted Material is always produced where the Licensed Material is synched in timed relation with a moving image. 8 | Adapter's License means the license You apply to Your Copyright and Similar Rights in Your contributions to Adapted Material in accordance with the terms and conditions of this Public License. 9 | BY-NC-SA Compatible License means a license listed at creativecommons.org/compatiblelicenses, approved by Creative Commons as essentially the equivalent of this Public License. 10 | Copyright and Similar Rights means copyright and/or similar rights closely related to copyright including, without limitation, performance, broadcast, sound recording, and Sui Generis Database Rights, without regard to how the rights are labeled or categorized. For purposes of this Public License, the rights specified in Section 2(b)(1)-(2) are not Copyright and Similar Rights. 11 | Effective Technological Measures means those measures that, in the absence of proper authority, may not be circumvented under laws fulfilling obligations under Article 11 of the WIPO Copyright Treaty adopted on December 20, 1996, and/or similar international agreements. 12 | Exceptions and Limitations means fair use, fair dealing, and/or any other exception or limitation to Copyright and Similar Rights that applies to Your use of the Licensed Material. 13 | License Elements means the license attributes listed in the name of a Creative Commons Public License. The License Elements of this Public License are Attribution, NonCommercial, and ShareAlike. 14 | Licensed Material means the artistic or literary work, database, or other material to which the Licensor applied this Public License. 15 | Licensed Rights means the rights granted to You subject to the terms and conditions of this Public License, which are limited to all Copyright and Similar Rights that apply to Your use of the Licensed Material and that the Licensor has authority to license. 16 | Licensor means the individual(s) or entity(ies) granting rights under this Public License. 17 | NonCommercial means not primarily intended for or directed towards commercial advantage or monetary compensation. For purposes of this Public License, the exchange of the Licensed Material for other material subject to Copyright and Similar Rights by digital file-sharing or similar means is NonCommercial provided there is no payment of monetary compensation in connection with the exchange. 18 | Share means to provide material to the public by any means or process that requires permission under the Licensed Rights, such as reproduction, public display, public performance, distribution, dissemination, communication, or importation, and to make material available to the public including in ways that members of the public may access the material from a place and at a time individually chosen by them. 19 | Sui Generis Database Rights means rights other than copyright resulting from Directive 96/9/EC of the European Parliament and of the Council of 11 March 1996 on the legal protection of databases, as amended and/or succeeded, as well as other essentially equivalent rights anywhere in the world. 20 | You means the individual or entity exercising the Licensed Rights under this Public License. Your has a corresponding meaning. 21 | Section 2 – Scope. 22 | 23 | License grant. 24 | Subject to the terms and conditions of this Public License, the Licensor hereby grants You a worldwide, royalty-free, non-sublicensable, non-exclusive, irrevocable license to exercise the Licensed Rights in the Licensed Material to: 25 | reproduce and Share the Licensed Material, in whole or in part, for NonCommercial purposes only; and 26 | produce, reproduce, and Share Adapted Material for NonCommercial purposes only. 27 | Exceptions and Limitations. For the avoidance of doubt, where Exceptions and Limitations apply to Your use, this Public License does not apply, and You do not need to comply with its terms and conditions. 28 | Term. The term of this Public License is specified in Section 6(a). 29 | Media and formats; technical modifications allowed. The Licensor authorizes You to exercise the Licensed Rights in all media and formats whether now known or hereafter created, and to make technical modifications necessary to do so. The Licensor waives and/or agrees not to assert any right or authority to forbid You from making technical modifications necessary to exercise the Licensed Rights, including technical modifications necessary to circumvent Effective Technological Measures. For purposes of this Public License, simply making modifications authorized by this Section 2(a)(4) never produces Adapted Material. 30 | Downstream recipients. 31 | Offer from the Licensor – Licensed Material. Every recipient of the Licensed Material automatically receives an offer from the Licensor to exercise the Licensed Rights under the terms and conditions of this Public License. 32 | Additional offer from the Licensor – Adapted Material. Every recipient of Adapted Material from You automatically receives an offer from the Licensor to exercise the Licensed Rights in the Adapted Material under the conditions of the Adapter’s License You apply. 33 | No downstream restrictions. You may not offer or impose any additional or different terms or conditions on, or apply any Effective Technological Measures to, the Licensed Material if doing so restricts exercise of the Licensed Rights by any recipient of the Licensed Material. 34 | No endorsement. Nothing in this Public License constitutes or may be construed as permission to assert or imply that You are, or that Your use of the Licensed Material is, connected with, or sponsored, endorsed, or granted official status by, the Licensor or others designated to receive attribution as provided in Section 3(a)(1)(A)(i). 35 | Other rights. 36 | 37 | Moral rights, such as the right of integrity, are not licensed under this Public License, nor are publicity, privacy, and/or other similar personality rights; however, to the extent possible, the Licensor waives and/or agrees not to assert any such rights held by the Licensor to the limited extent necessary to allow You to exercise the Licensed Rights, but not otherwise. 38 | Patent and trademark rights are not licensed under this Public License. 39 | To the extent possible, the Licensor waives any right to collect royalties from You for the exercise of the Licensed Rights, whether directly or through a collecting society under any voluntary or waivable statutory or compulsory licensing scheme. In all other cases the Licensor expressly reserves any right to collect such royalties, including when the Licensed Material is used other than for NonCommercial purposes. 40 | Section 3 – License Conditions. 41 | 42 | Your exercise of the Licensed Rights is expressly made subject to the following conditions. 43 | 44 | Attribution. 45 | 46 | If You Share the Licensed Material (including in modified form), You must: 47 | 48 | retain the following if it is supplied by the Licensor with the Licensed Material: 49 | identification of the creator(s) of the Licensed Material and any others designated to receive attribution, in any reasonable manner requested by the Licensor (including by pseudonym if designated); 50 | a copyright notice; 51 | a notice that refers to this Public License; 52 | a notice that refers to the disclaimer of warranties; 53 | a URI or hyperlink to the Licensed Material to the extent reasonably practicable; 54 | indicate if You modified the Licensed Material and retain an indication of any previous modifications; and 55 | indicate the Licensed Material is licensed under this Public License, and include the text of, or the URI or hyperlink to, this Public License. 56 | You may satisfy the conditions in Section 3(a)(1) in any reasonable manner based on the medium, means, and context in which You Share the Licensed Material. For example, it may be reasonable to satisfy the conditions by providing a URI or hyperlink to a resource that includes the required information. 57 | If requested by the Licensor, You must remove any of the information required by Section 3(a)(1)(A) to the extent reasonably practicable. 58 | ShareAlike. 59 | In addition to the conditions in Section 3(a), if You Share Adapted Material You produce, the following conditions also apply. 60 | 61 | The Adapter’s License You apply must be a Creative Commons license with the same License Elements, this version or later, or a BY-NC-SA Compatible License. 62 | You must include the text of, or the URI or hyperlink to, the Adapter's License You apply. You may satisfy this condition in any reasonable manner based on the medium, means, and context in which You Share Adapted Material. 63 | You may not offer or impose any additional or different terms or conditions on, or apply any Effective Technological Measures to, Adapted Material that restrict exercise of the rights granted under the Adapter's License You apply. 64 | Section 4 – Sui Generis Database Rights. 65 | 66 | Where the Licensed Rights include Sui Generis Database Rights that apply to Your use of the Licensed Material: 67 | 68 | for the avoidance of doubt, Section 2(a)(1) grants You the right to extract, reuse, reproduce, and Share all or a substantial portion of the contents of the database for NonCommercial purposes only; 69 | if You include all or a substantial portion of the database contents in a database in which You have Sui Generis Database Rights, then the database in which You have Sui Generis Database Rights (but not its individual contents) is Adapted Material, including for purposes of Section 3(b); and 70 | You must comply with the conditions in Section 3(a) if You Share all or a substantial portion of the contents of the database. 71 | For the avoidance of doubt, this Section 4 supplements and does not replace Your obligations under this Public License where the Licensed Rights include other Copyright and Similar Rights. 72 | Section 5 – Disclaimer of Warranties and Limitation of Liability. 73 | 74 | Unless otherwise separately undertaken by the Licensor, to the extent possible, the Licensor offers the Licensed Material as-is and as-available, and makes no representations or warranties of any kind concerning the Licensed Material, whether express, implied, statutory, or other. This includes, without limitation, warranties of title, merchantability, fitness for a particular purpose, non-infringement, absence of latent or other defects, accuracy, or the presence or absence of errors, whether or not known or discoverable. Where disclaimers of warranties are not allowed in full or in part, this disclaimer may not apply to You. 75 | To the extent possible, in no event will the Licensor be liable to You on any legal theory (including, without limitation, negligence) or otherwise for any direct, special, indirect, incidental, consequential, punitive, exemplary, or other losses, costs, expenses, or damages arising out of this Public License or use of the Licensed Material, even if the Licensor has been advised of the possibility of such losses, costs, expenses, or damages. Where a limitation of liability is not allowed in full or in part, this limitation may not apply to You. 76 | The disclaimer of warranties and limitation of liability provided above shall be interpreted in a manner that, to the extent possible, most closely approximates an absolute disclaimer and waiver of all liability. 77 | Section 6 – Term and Termination. 78 | 79 | This Public License applies for the term of the Copyright and Similar Rights licensed here. However, if You fail to comply with this Public License, then Your rights under this Public License terminate automatically. 80 | Where Your right to use the Licensed Material has terminated under Section 6(a), it reinstates: 81 | 82 | automatically as of the date the violation is cured, provided it is cured within 30 days of Your discovery of the violation; or 83 | upon express reinstatement by the Licensor. 84 | For the avoidance of doubt, this Section 6(b) does not affect any right the Licensor may have to seek remedies for Your violations of this Public License. 85 | For the avoidance of doubt, the Licensor may also offer the Licensed Material under separate terms or conditions or stop distributing the Licensed Material at any time; however, doing so will not terminate this Public License. 86 | Sections 1, 5, 6, 7, and 8 survive termination of this Public License. 87 | Section 7 – Other Terms and Conditions. 88 | 89 | The Licensor shall not be bound by any additional or different terms or conditions communicated by You unless expressly agreed. 90 | Any arrangements, understandings, or agreements regarding the Licensed Material not stated herein are separate from and independent of the terms and conditions of this Public License. 91 | Section 8 – Interpretation. 92 | 93 | For the avoidance of doubt, this Public License does not, and shall not be interpreted to, reduce, limit, restrict, or impose conditions on any use of the Licensed Material that could lawfully be made without permission under this Public License. 94 | To the extent possible, if any provision of this Public License is deemed unenforceable, it shall be automatically reformed to the minimum extent necessary to make it enforceable. If the provision cannot be reformed, it shall be severed from this Public License without affecting the enforceability of the remaining terms and conditions. 95 | No term or condition of this Public License will be waived and no failure to comply consented to unless expressly agreed to by the Licensor. 96 | Nothing in this Public License constitutes or may be interpreted as a limitation upon, or waiver of, any privileges and immunities that apply to the Licensor or You, including from the legal processes of any jurisdiction or authority. 97 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International Public License 2 | 3 | By exercising the Licensed Rights (defined below), You accept and agree to be bound by the terms and conditions of this Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International Public License ("Public License"). To the extent this Public License may be interpreted as a contract, You are granted the Licensed Rights in consideration of Your acceptance of these terms and conditions, and the Licensor grants You such rights in consideration of benefits the Licensor receives from making the Licensed Material available under these terms and conditions. 4 | 5 | Section 1 – Definitions. 6 | 7 | Adapted Material means material subject to Copyright and Similar Rights that is derived from or based upon the Licensed Material and in which the Licensed Material is translated, altered, arranged, transformed, or otherwise modified in a manner requiring permission under the Copyright and Similar Rights held by the Licensor. For purposes of this Public License, where the Licensed Material is a musical work, performance, or sound recording, Adapted Material is always produced where the Licensed Material is synched in timed relation with a moving image. 8 | Adapter's License means the license You apply to Your Copyright and Similar Rights in Your contributions to Adapted Material in accordance with the terms and conditions of this Public License. 9 | BY-NC-SA Compatible License means a license listed at creativecommons.org/compatiblelicenses, approved by Creative Commons as essentially the equivalent of this Public License. 10 | Copyright and Similar Rights means copyright and/or similar rights closely related to copyright including, without limitation, performance, broadcast, sound recording, and Sui Generis Database Rights, without regard to how the rights are labeled or categorized. For purposes of this Public License, the rights specified in Section 2(b)(1)-(2) are not Copyright and Similar Rights. 11 | Effective Technological Measures means those measures that, in the absence of proper authority, may not be circumvented under laws fulfilling obligations under Article 11 of the WIPO Copyright Treaty adopted on December 20, 1996, and/or similar international agreements. 12 | Exceptions and Limitations means fair use, fair dealing, and/or any other exception or limitation to Copyright and Similar Rights that applies to Your use of the Licensed Material. 13 | License Elements means the license attributes listed in the name of a Creative Commons Public License. The License Elements of this Public License are Attribution, NonCommercial, and ShareAlike. 14 | Licensed Material means the artistic or literary work, database, or other material to which the Licensor applied this Public License. 15 | Licensed Rights means the rights granted to You subject to the terms and conditions of this Public License, which are limited to all Copyright and Similar Rights that apply to Your use of the Licensed Material and that the Licensor has authority to license. 16 | Licensor means the individual(s) or entity(ies) granting rights under this Public License. 17 | NonCommercial means not primarily intended for or directed towards commercial advantage or monetary compensation. For purposes of this Public License, the exchange of the Licensed Material for other material subject to Copyright and Similar Rights by digital file-sharing or similar means is NonCommercial provided there is no payment of monetary compensation in connection with the exchange. 18 | Share means to provide material to the public by any means or process that requires permission under the Licensed Rights, such as reproduction, public display, public performance, distribution, dissemination, communication, or importation, and to make material available to the public including in ways that members of the public may access the material from a place and at a time individually chosen by them. 19 | Sui Generis Database Rights means rights other than copyright resulting from Directive 96/9/EC of the European Parliament and of the Council of 11 March 1996 on the legal protection of databases, as amended and/or succeeded, as well as other essentially equivalent rights anywhere in the world. 20 | You means the individual or entity exercising the Licensed Rights under this Public License. Your has a corresponding meaning. 21 | Section 2 – Scope. 22 | 23 | License grant. 24 | Subject to the terms and conditions of this Public License, the Licensor hereby grants You a worldwide, royalty-free, non-sublicensable, non-exclusive, irrevocable license to exercise the Licensed Rights in the Licensed Material to: 25 | reproduce and Share the Licensed Material, in whole or in part, for NonCommercial purposes only; and 26 | produce, reproduce, and Share Adapted Material for NonCommercial purposes only. 27 | Exceptions and Limitations. For the avoidance of doubt, where Exceptions and Limitations apply to Your use, this Public License does not apply, and You do not need to comply with its terms and conditions. 28 | Term. The term of this Public License is specified in Section 6(a). 29 | Media and formats; technical modifications allowed. The Licensor authorizes You to exercise the Licensed Rights in all media and formats whether now known or hereafter created, and to make technical modifications necessary to do so. The Licensor waives and/or agrees not to assert any right or authority to forbid You from making technical modifications necessary to exercise the Licensed Rights, including technical modifications necessary to circumvent Effective Technological Measures. For purposes of this Public License, simply making modifications authorized by this Section 2(a)(4) never produces Adapted Material. 30 | Downstream recipients. 31 | Offer from the Licensor – Licensed Material. Every recipient of the Licensed Material automatically receives an offer from the Licensor to exercise the Licensed Rights under the terms and conditions of this Public License. 32 | Additional offer from the Licensor – Adapted Material. Every recipient of Adapted Material from You automatically receives an offer from the Licensor to exercise the Licensed Rights in the Adapted Material under the conditions of the Adapter’s License You apply. 33 | No downstream restrictions. You may not offer or impose any additional or different terms or conditions on, or apply any Effective Technological Measures to, the Licensed Material if doing so restricts exercise of the Licensed Rights by any recipient of the Licensed Material. 34 | No endorsement. Nothing in this Public License constitutes or may be construed as permission to assert or imply that You are, or that Your use of the Licensed Material is, connected with, or sponsored, endorsed, or granted official status by, the Licensor or others designated to receive attribution as provided in Section 3(a)(1)(A)(i). 35 | Other rights. 36 | 37 | Moral rights, such as the right of integrity, are not licensed under this Public License, nor are publicity, privacy, and/or other similar personality rights; however, to the extent possible, the Licensor waives and/or agrees not to assert any such rights held by the Licensor to the limited extent necessary to allow You to exercise the Licensed Rights, but not otherwise. 38 | Patent and trademark rights are not licensed under this Public License. 39 | To the extent possible, the Licensor waives any right to collect royalties from You for the exercise of the Licensed Rights, whether directly or through a collecting society under any voluntary or waivable statutory or compulsory licensing scheme. In all other cases the Licensor expressly reserves any right to collect such royalties, including when the Licensed Material is used other than for NonCommercial purposes. 40 | Section 3 – License Conditions. 41 | 42 | Your exercise of the Licensed Rights is expressly made subject to the following conditions. 43 | 44 | Attribution. 45 | 46 | If You Share the Licensed Material (including in modified form), You must: 47 | 48 | retain the following if it is supplied by the Licensor with the Licensed Material: 49 | identification of the creator(s) of the Licensed Material and any others designated to receive attribution, in any reasonable manner requested by the Licensor (including by pseudonym if designated); 50 | a copyright notice; 51 | a notice that refers to this Public License; 52 | a notice that refers to the disclaimer of warranties; 53 | a URI or hyperlink to the Licensed Material to the extent reasonably practicable; 54 | indicate if You modified the Licensed Material and retain an indication of any previous modifications; and 55 | indicate the Licensed Material is licensed under this Public License, and include the text of, or the URI or hyperlink to, this Public License. 56 | You may satisfy the conditions in Section 3(a)(1) in any reasonable manner based on the medium, means, and context in which You Share the Licensed Material. For example, it may be reasonable to satisfy the conditions by providing a URI or hyperlink to a resource that includes the required information. 57 | If requested by the Licensor, You must remove any of the information required by Section 3(a)(1)(A) to the extent reasonably practicable. 58 | ShareAlike. 59 | In addition to the conditions in Section 3(a), if You Share Adapted Material You produce, the following conditions also apply. 60 | 61 | The Adapter’s License You apply must be a Creative Commons license with the same License Elements, this version or later, or a BY-NC-SA Compatible License. 62 | You must include the text of, or the URI or hyperlink to, the Adapter's License You apply. You may satisfy this condition in any reasonable manner based on the medium, means, and context in which You Share Adapted Material. 63 | You may not offer or impose any additional or different terms or conditions on, or apply any Effective Technological Measures to, Adapted Material that restrict exercise of the rights granted under the Adapter's License You apply. 64 | Section 4 – Sui Generis Database Rights. 65 | 66 | Where the Licensed Rights include Sui Generis Database Rights that apply to Your use of the Licensed Material: 67 | 68 | for the avoidance of doubt, Section 2(a)(1) grants You the right to extract, reuse, reproduce, and Share all or a substantial portion of the contents of the database for NonCommercial purposes only; 69 | if You include all or a substantial portion of the database contents in a database in which You have Sui Generis Database Rights, then the database in which You have Sui Generis Database Rights (but not its individual contents) is Adapted Material, including for purposes of Section 3(b); and 70 | You must comply with the conditions in Section 3(a) if You Share all or a substantial portion of the contents of the database. 71 | For the avoidance of doubt, this Section 4 supplements and does not replace Your obligations under this Public License where the Licensed Rights include other Copyright and Similar Rights. 72 | Section 5 – Disclaimer of Warranties and Limitation of Liability. 73 | 74 | Unless otherwise separately undertaken by the Licensor, to the extent possible, the Licensor offers the Licensed Material as-is and as-available, and makes no representations or warranties of any kind concerning the Licensed Material, whether express, implied, statutory, or other. This includes, without limitation, warranties of title, merchantability, fitness for a particular purpose, non-infringement, absence of latent or other defects, accuracy, or the presence or absence of errors, whether or not known or discoverable. Where disclaimers of warranties are not allowed in full or in part, this disclaimer may not apply to You. 75 | To the extent possible, in no event will the Licensor be liable to You on any legal theory (including, without limitation, negligence) or otherwise for any direct, special, indirect, incidental, consequential, punitive, exemplary, or other losses, costs, expenses, or damages arising out of this Public License or use of the Licensed Material, even if the Licensor has been advised of the possibility of such losses, costs, expenses, or damages. Where a limitation of liability is not allowed in full or in part, this limitation may not apply to You. 76 | The disclaimer of warranties and limitation of liability provided above shall be interpreted in a manner that, to the extent possible, most closely approximates an absolute disclaimer and waiver of all liability. 77 | Section 6 – Term and Termination. 78 | 79 | This Public License applies for the term of the Copyright and Similar Rights licensed here. However, if You fail to comply with this Public License, then Your rights under this Public License terminate automatically. 80 | Where Your right to use the Licensed Material has terminated under Section 6(a), it reinstates: 81 | 82 | automatically as of the date the violation is cured, provided it is cured within 30 days of Your discovery of the violation; or 83 | upon express reinstatement by the Licensor. 84 | For the avoidance of doubt, this Section 6(b) does not affect any right the Licensor may have to seek remedies for Your violations of this Public License. 85 | For the avoidance of doubt, the Licensor may also offer the Licensed Material under separate terms or conditions or stop distributing the Licensed Material at any time; however, doing so will not terminate this Public License. 86 | Sections 1, 5, 6, 7, and 8 survive termination of this Public License. 87 | Section 7 – Other Terms and Conditions. 88 | 89 | The Licensor shall not be bound by any additional or different terms or conditions communicated by You unless expressly agreed. 90 | Any arrangements, understandings, or agreements regarding the Licensed Material not stated herein are separate from and independent of the terms and conditions of this Public License. 91 | Section 8 – Interpretation. 92 | 93 | For the avoidance of doubt, this Public License does not, and shall not be interpreted to, reduce, limit, restrict, or impose conditions on any use of the Licensed Material that could lawfully be made without permission under this Public License. 94 | To the extent possible, if any provision of this Public License is deemed unenforceable, it shall be automatically reformed to the minimum extent necessary to make it enforceable. If the provision cannot be reformed, it shall be severed from this Public License without affecting the enforceability of the remaining terms and conditions. 95 | No term or condition of this Public License will be waived and no failure to comply consented to unless expressly agreed to by the Licensor. 96 | Nothing in this Public License constitutes or may be interpreted as a limitation upon, or waiver of, any privileges and immunities that apply to the Licensor or You, including from the legal processes of any jurisdiction or authority. 97 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # stringMLST 2 | 3 | Fast k-mer based tool for multi locus sequence typing (MLST) 4 | stringMLST is a tool for detecting the MLST of an isolate directly from the genome sequencing reads. stringMLST predicts the ST of an isolate in a completely assembly and alignment free manner. The tool is designed in a light-weight, platform-independent fashion with minimum dependencies. 5 | 6 | Some portions of the allele selection algorithm in stringMLST are patent 7 | pending. Please refer to the PATENTS file for additional inforamation 8 | regarding licencing and use. 9 | 10 | 11 | Reference 12 | *http://jordan.biology.gatech.edu/page/software/stringmlst/* 13 | 14 | Abstract 15 | *http://bioinformatics.oxfordjournals.org/content/early/2016/09/06/bioinformatics.btw586.short?rss=1* 16 | 17 | Application Note 18 | *http://bioinformatics.oxfordjournals.org/content/early/2016/09/06/bioinformatics.btw586.full.pdf+html* 19 | 20 | [![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat-square)](http://bioconda.github.io/recipes/stringmlst/README.html) [![PyPI version](https://badge.fury.io/py/stringMLST.svg)](https://badge.fury.io/py/stringMLST) ![downloads](https://img.shields.io/conda/dn/bioconda/stringmlst.svg?style=flat) [![container ready](https://quay.io/repository/biocontainers/stringmlst/status)](https://quay.io/repository/biocontainers/stringmlst) 21 | 22 | 23 | 24 | **stringMLST is a *tool* not a *database*, always use the most up-to-date database files as possible.** To facilitate 25 | keeping your databases updated, stringMLST can download and build databases from pubMLST using the most recent allele 26 | and profile definitions. Please see the "Included databases and automated retrieval of databases from pubMLST" section 27 | below for instructions. *The databases bundled here are for convenience only, do not rely on them being up-to-date*. 28 | 29 | stringMLST is licensed and distributed under [CC Attribution-NonCommercial-ShareAlike 4.0 International (CC BY-NC-SA 4.0)](https://creativecommons.org/licenses/by-nc-sa/4.0) 30 | and is free for academic users and requires permission before any commercial use for any version of this code/algorithm. 31 | If you are a commercial user, please contact king.jordan@biology.gatech.edu for permissions 32 | 33 | ## Recommended installation method 34 | 35 | ``` 36 | pip install stringMLST 37 | 38 | ``` 39 | 40 | #### Installation via git (Not recommended for most users) 41 | 42 | ``` 43 | git clone https://github.com/jordanlab/stringMLST 44 | # Optional, download prebuilt databases 45 | # We don't recommend this method, instead build the databases locally 46 | cd stringMLST 47 | git submodule init 48 | git submodule update 49 | ``` 50 | 51 | ## Quickstart guide 52 | 53 | ```bash 54 | pip install stringMLST 55 | mkdir -p stringMLST_analysis; cd stringMLST_analysis 56 | stringMLST.py --getMLST -P neisseria/nmb --species neisseria 57 | # Download all available databases with: 58 | # stringMLST.py --getMLST -P mlst_dbs --species all 59 | wget ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR026/ERR026529/ERR026529_1.fastq.gz ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR026/ERR026529/ERR026529_2.fastq.gz 60 | stringMLST.py --predict -P neisseria/nmb -1 ERR026529_1.fastq.gz -2 ERR026529_2.fastq.gz 61 | Sample abcZ adk aroE fumC gdh pdhC pgm ST 62 | ERR026529 231 180 306 612 269 277 260 10174 63 | 64 | ``` 65 | 66 | ## Python dependencies and external programs 67 | 68 | stringMLST does not require any python dependencies for basic usage (Building databases and predicting STs). 69 | 70 | For advanced used (genome coverage), stringMLST depends on the `pyfaidx` python module and `bamtools`, `bwa`, and `samtools`. 71 | See the coverage section for more information 72 | 73 | stringMLST has been tested with: 74 | ``` 75 | pyfaidx: 0.4.8.1 76 | samtools: 1.3 (Using htslib 1.3.1) [Requires the 1.x branch of samtools] 77 | bedtools: v2.24.0 78 | bwa: 0.7.13-r1126 79 | ``` 80 | 81 | ### To install the dependencies 82 | 83 | ```bash 84 | # pyfaidx 85 | pip install --user pyfaidx 86 | # samtools 87 | wget https://github.com/samtools/samtools/releases/download/1.3.1/samtools-1.3.1.tar.bz2 -o samtools-1.3.1.tar.bz2 88 | tar xf samtools-1.3.1.tar.bz2 89 | cd samtools-1.3.1.tar 90 | make 91 | make prefix=$HOME install 92 | # bedtools 93 | wget https://github.com/arq5x/bedtools2/releases/download/v2.25.0/bedtools-2.25.0.tar.gz 94 | tar -zxvf bedtools-2.25.0.tar.gz 95 | cd bedtools2; make 96 | cp ./bin/* ~/bin 97 | # bwa 98 | git clone https://github.com/lh3/bwa.git 99 | cd bwa; make 100 | cp bwa ~/bin/bwa 101 | export PATH=$PATH:$HOME/bin 102 | ``` 103 | 104 | 105 | ## Usage for Example Read Files (Neisseria meningitidis) 106 | 107 | * Download stringMLST.py, example read files (ERR026529, ERR027250, ERR036104) and the dataset for Neisseria meningitidis (Neisseria_spp.zip). 108 | ### Build database: 109 | 110 | ``` 111 | # Add dir to path 112 | export PATH=$PATH:$PWD 113 | # Will connect to EBI's SRA servers 114 | download_example_reads.sh 115 | ```` 116 | 117 | * Extract the MLST loci dataset. 118 | 119 | ``` 120 | unzip datasets/Neisseria_spp.zip -d datasets 121 | ``` 122 | 123 | * Create or use a config file specifying the location of all the locus and profile files. 124 | Example config file (Neisseria_spp/config.txt): 125 | 126 | ``` 127 | [loci] 128 | abcZ datasets/Neisseria_spp/abcZ.fa 129 | adk datasets/Neisseria_spp/adk.fa 130 | aroE datasets/Neisseria_spp/aroE.fa 131 | fumC datasets/Neisseria_spp/fumC.fa 132 | gdh datasets/Neisseria_spp/gdh.fa 133 | pdhC datasets/Neisseria_spp/pdhC.fa 134 | pgm datasets/Neisseria_spp/pgm.fa 135 | [profile] 136 | profile datasets/Neisseria_spp/neisseria.txt 137 | ``` 138 | 139 | * Run stringMLST.py --buildDB to create DB. Choose a k value and prefix (optional). 140 | 141 | ``` 142 | stringMLST.py --buildDB -c databases/Neisseria_spp/config.txt -k 35 -P NM 143 | ``` 144 | 145 | ### Predict: 146 | 147 | #### Single sample : 148 | ``` 149 | stringMLST.py --predict -1 tests/fastqs/ERR026529_1.fastq -2 tests/fastqs/ERR026529_2.fastq -k 35 -P NM 150 | ``` 151 | #### Batch mode (all the samples together): 152 | ``` 153 | stringMLST.py --predict -d ./tests/fastqs/ -k 35 -P NM 154 | ``` 155 | #### List mode: 156 | Create a list file (list_paired.txt) as : 157 | ``` 158 | tests/fastqs/ERR026529_1.fastq tests/fastqs/ERR026529_2.fastq 159 | tests/fastqs/ERR027250_1.fastq tests/fastqs/ERR027250_2.fastq 160 | tests/fastqs/ERR036104_1.fastq tests/fastqs/ERR036104_2.fastq 161 | ``` 162 | Run the tool as: 163 | ``` 164 | stringMLST.py --predict -l list_paired.txt -k 35 -P NM 165 | ``` 166 | #### Working with gziped files 167 | ``` 168 | stringMLST.py --predict -1 tests/fastqs/ERR026529_1.fq.gz -2 tests/fastqs/ERR026529_2.fq.gz -p -P NM -k 35 -o ST_NM.txt 169 | ``` 170 | ## Usage Documentation 171 | 172 | stringMLST's workflow is divided into two routines: 173 | * Database building and 174 | * ST discovery 175 | 176 | *Database building:* Builds the stringMLST database which is used for assigning STs to input sample files. This step is required once for each organism. Please note that stringMLST is capable of working on a custom user defined typing scheme but its efficiency has not been tested on other typing scheme. 177 | 178 | *ST discovery:* This routine takes the database created in the last step and predicts the ST of the input sample(s). Please note that the database building is required prior to this routine. stringMLST is capable of processing single-end and paired-end files. It can run in three modes: 179 | * Single sample mode - for running stringMLST on a single sample 180 | * Batch mode - for running stringMLST on all the FASTQ files present in a directory 181 | * List mode - for running stringMLST on all the FASTQ files provided in a list file 182 | 183 | 184 | ``` 185 | Readme for stringMLST 186 | ============================================================================================= 187 | Usage 188 | ./stringMLST.py 189 | [--buildDB] 190 | [--predict] 191 | [-1 filename_fastq1][--fastq1 filename_fastq1] 192 | [-2 filename_fastq2][--fastq2 filename_fastq2] 193 | [-d directory][--dir directory][--directory directory] 194 | [-l list_file][--list list_file] 195 | [-p][--paired] 196 | [-s][--single] 197 | [-c][--config] 198 | [-P][--prefix] 199 | [-z][--fuzzy] 200 | [-a] 201 | [-C][--coverage] 202 | [-k] 203 | [-o output_filename][--output output_filename] 204 | [-x][--overwrite] 205 | [-t] 206 | [-r] 207 | [-v] 208 | [-h][--help] 209 | ============================================================================================== 210 | 211 | There are two steps to predicting ST using stringMLST. 212 | 1. Create DB : stringMLST.py --buildDB 213 | 2. Predict : stringMLST --predict 214 | 215 | 1. stringMLST.py --buildDB 216 | 217 | Synopsis: 218 | stringMLST.py --buildDB -c -k -P 219 | config file : is a tab delimited file which has the information for typing scheme ie loci, its multifasta file and profile definition file. 220 | Format : 221 | [loci] 222 | locus1 locusFile1 223 | locus2 locusFile2 224 | [profile] 225 | profile profileFile 226 | kmer length : is the kmer length for the db. Note, while processing this should be smaller than the read length. 227 | We suggest kmer lengths of 35, 66 depending on the read length. 228 | DB prefix(optional) : holds the information for DB files to be created and their location. This module creates 3 files with this prefix. 229 | You can use a folder structure with prefix to store your db at particular location. 230 | 231 | Required arguments 232 | --buildDB 233 | Identifier for build db module 234 | -c,--config = 235 | Config file in the format described above. 236 | All the files follow the structure followed by pubmlst. Refer extended document for details. 237 | 238 | Optional arguments 239 | -k = 240 | Kmer size for which the db has to be formed(Default k = 35). Note the tool works best with kmer length in between 35 and 66 241 | for read lengths of 55 to 150 bp. Kmer size can be increased accordingly. It is advised to keep lower kmer sizes 242 | if the quality of reads is not very good. 243 | -P,--prefix = 244 | Prefix for db and log files to be created(Default = kmer). Also you can specify folder where you want the dbb to be created. 245 | -a 246 | File location to write build log 247 | -h,--help 248 | Prints the help manual for this application 249 | 250 | -------------------------------------------------------------------------------------------- 251 | 252 | 2. stringMLST.py --predict 253 | 254 | stringMLST --predict : can run in three modes 255 | 1) single sample (default mode) 256 | 2) batch mode : run stringMLST for all the samples in a folder (for a particular specie) 257 | 3) list mode : run stringMLST on samples specified in a file 258 | stringMLST can process both single and paired end files. By default program expects paired end files. 259 | 260 | Synopsis 261 | stringMLST.py --predict -1 -2 -d -l -p -s -P -k -o -x 262 | 263 | Required arguments 264 | --predict 265 | Identifier for predict miodule 266 | 267 | Optional arguments 268 | -1,--fastq1 = 269 | Path to first fastq file for paired end sample and path to the fastq file for single end file. 270 | Should have extension fastq or fq. 271 | -2,--fastq2 = 272 | Path to second fastq file for paired end sample. 273 | Should have extension fastq or fq. 274 | -d,--dir,--directory = 275 | BATCH MODE : Location of all the samples for batch mode. 276 | -C,--coverage 277 | Calculate seqence coverage for each allele. Turns on read generation (-r) and turns off fuzzy (-z 1) 278 | Requires bwa, bamtools and samtools be in your path 279 | -k = 280 | Kmer length for which the db was created(Default k = 35). Could be verified by looking at the name of the db file. 281 | Could be used if the reads are of very bad quality or have a lot of N's. 282 | -l,--list = 283 | LIST MODE : Location of list file and flag for list mode. 284 | list file should have full file paths for all the samples/files. 285 | Each sample takes one line. For paired end samples the 2 files should be tab separated on single line. 286 | -o,--output = 287 | Prints the output to a file instead of stdio. 288 | -p,--paired 289 | Flag for specifying paired end files. Default option so would work the same if you do not specify for all modes. 290 | For batch mode the paired end samples should be differentiated by 1/2.fastq or 1/2.fq 291 | -P,--prefix = 292 | Prefix using which the db was created(Defaults = kmer). The location of the db could also be provided. 293 | -r 294 | A seperate reads file is created which has all the reads covering all the locus. 295 | -s,--single 296 | Flag for specifying single end files. 297 | -t 298 | Time for each analysis will also be reported. 299 | -v 300 | Prints the version of the software. 301 | -x,--overwrite 302 | By default stringMLST appends the results to the output_filename if same name is used. 303 | This argument overwrites the previously specified output file. 304 | -z,--fuzzy = 305 | Threshold for reporting a fuzzy match (Default=300). For higher coverage reads this threshold should be set higher to avoid 306 | indicating fuzzy match when exact match was more likely. For lower coverage reads, threshold of <100 is recommended 307 | -h,--help 308 | Prints the help manual for this application 309 | 310 | -------------------------------------------------------------------------------------------- 311 | 312 | 3. stringMLST.py --getMLST 313 | 314 | Synopsis: 315 | stringMLST.py --getMLST --species= [-k kmer length] [-P DB prefix] 316 | 317 | Required arguments 318 | --getMLST 319 | Identifier for getMLST module 320 | --species= 321 | Species name from the pubMLST schemes (use "--species show" to get list of available schemes) 322 | "all" will download and build all 323 | 324 | Optional arguments 325 | -k = 326 | Kmer size for which the db has to be formed(Default k = 35). Note the tool works best with kmer length in between 35 and 66 327 | for read lengths of 55 to 150 bp. Kmer size can be increased accordingly. It is advised to keep lower kmer sizes 328 | if the quality of reads is not very good. 329 | -P,--prefix = 330 | Prefix for db and log files to be created(Default = kmer). Also you can specify folder where you want the db to be created. 331 | We recommend that prefix and config point to the same folder for cleanliness but this is not required 332 | --schemes 333 | Display the list of available schemes 334 | -h,--help 335 | Prints the help manual for this application 336 | 337 | ``` 338 | 339 | 340 | **stringMLST expects paired end reads to be in [Illumina naming convention](http://support.illumina.com/help/SequencingAnalysisWorkflow/Content/Vault/Informatics/Sequencing_Analysis/CASAVA/swSEQ_mCA_FASTQFiles.htm), minimally ending with _1.fq and _2.fq to delineate read1 and read2:** 341 | 342 | *Periods (.) are disallowed delimiters except for file extensions* 343 | 344 | ``` 345 | Illumina FASTQ files use the following naming scheme: 346 | 347 | __L_R_.fastq.gz 348 | 349 | For example, the following is a valid FASTQ file name: 350 | 351 | NA10831_ATCACG_L002_R1_001.fastq.gz 352 | ``` 353 | 354 | ## Running stringMLST 355 | 356 | #### Included databases and automated retrieval of databases from pubMLST 357 | 358 | stringMLST includes all the pubMLST databases as of **February 15, 2017**, built with the default kmer (*35*). They can be found in the `datasets/` folder. 359 | Simply unzip the databases you need and begin using stringMSLT as described below. 360 | 361 | All the databases from pubMLST can be downloaded and prepared with your kmer choice 362 | 363 | *Getting all pubMLST schemes* 364 | ``` 365 | stringMLST.py --getMLST -P datasets/ --species all 366 | ``` 367 | 368 | 369 | Individual databases from pubMLST can also be downloaded as needed, using the scheme identifiers 370 | 371 | *Downloading a scheme* 372 | ``` 373 | # List available schemes 374 | stringMLST.py --getMLST --schemes 375 | 376 | # Download the Neisseria spp. scheme 377 | 378 | stringMLST.py --getMLST -P datasets/nmb --species Neisseria 379 | 380 | ``` 381 | 382 | 383 | 384 | #### Database Preparation 385 | In order to create the database, files can be downloaded from the database page. 386 | 387 | If the organism of interest is not present in the provided link, the required files can be downloaded from PubMLST as follows: 388 | * On your browser, navigate to http://pubmlst.org/ 389 | * Navigate to "Download MLST definitions" link or go to http://pubmlst.org/data/ 390 | * Scroll to the species of interest. For each species, user may find the file for typing definitions and multi-FASTA files for each locus. Download these files. 391 | 392 | E.g.: 393 | 394 | Species of interest: Neisseria spp. 395 | Corresponding definition file: http://pubmlst.org/data/profiles/neisseria.txt 396 | Corresponding multi fasta locus files: 397 | http://pubmlst.org/data/alleles/neisseria/abcZ.tfa 398 | http://pubmlst.org/data/alleles/neisseria/adk.tfa 399 | http://pubmlst.org/data/alleles/neisseria/aroE.tfa 400 | http://pubmlst.org/data/alleles/neisseria/fumC.tfa 401 | http://pubmlst.org/data/alleles/neisseria/gdh.tfa 402 | http://pubmlst.org/data/alleles/neisseria/pdhC.tfa 403 | http://pubmlst.org/data/alleles/neisseria/pgm.tfa 404 | 405 | Download these files at a desired location. 406 | 407 | 408 | Custom user files can also be used for building database. The database building routine requires the profile definition file and allele sequence file. The profile definition file is a tab separated file that contains the ST and the allele profile corresponding to the ST. An example of the profile definition file is shown below: 409 | ``` 410 | ST abcZ adk aroE fumC gdh pdhC pgm clonal_complex 411 | 1 1 3 1 1 1 1 3 ST-1 complex/subgroup I/II 412 | 2 1 3 4 7 1 1 3 ST-1 complex/subgroup I/II 413 | 3 1 3 1 1 1 23 13 ST-1 complex/subgroup I/II 414 | 4 1 3 3 1 4 2 3 ST-4 complex/subgroup IV 415 | ``` 416 | The allele sequence file is a standard multi-FASTA with the description being the loci name with the allele number. An example abcZ allele sequence is shown below: 417 | ``` 418 | >abcZ_1 419 | TTTGATACTGTTGCCGA... 420 | >abcZ_2 421 | TTTGATACCGTTGCCGA... 422 | >abcZ_3 423 | TTTGATACCGTTGCGAA... 424 | >abcZ_4 425 | TTTGATACCGTTGCCAA... 426 | ``` 427 | 428 | These files can be obtained from PubMLST/BIGSdb or can be create by the user themselves. 429 | 430 | In either case, an accompanying configuration file is also required to describe the profile definition and allele sequence files. An example configuration file is shown below: 431 | ``` 432 | [loci] 433 | abcZ /data/home/stringMLST/pubmlst/Neisseria_sp/abcZ.fa 434 | adk /data/home/stringMLST/pubmlst/Neisseria_sp/adk.fa 435 | aroE /data/home/stringMLST/pubmlst/Neisseria_sp/aroE.fa 436 | fumC /data/home/stringMLST/pubmlst/Neisseria_sp/fumC.fa 437 | gdh /data/home/stringMLST/pubmlst/Neisseria_sp/gdh.fa 438 | pdhC /data/home/stringMLST/pubmlst/Neisseria_sp/pdhC.fa 439 | pgm /data/home/stringMLST/pubmlst/Neisseria_sp/pgm.fa 440 | 441 | [profile] 442 | profile /data/home/stringMLST/pubmlst/Neisseria_sp/neisseria.txt 443 | ``` 444 | 445 | This file is pre-packed on stringMLSTs website and can easily be created by the user for custom database. 446 | 447 | #### Database Building 448 | The next step is for database building is running the buildDB module to create the database files. buildDB module requires the user to specify the config file. The default k-mer size is 35 but can be changed using the -k option. Specifying the prefix for the created database files is optional but is recommended. 449 | 450 | The choice of k-mer depends on the size of the sequencing read. In general, the value of k can never be greater than the read length. The application has been tested on a number of read lengths ranging from 55 to 150 bps using k-mer sizes of 21 to 66. In our testing, the k-mer size does not affect the accuracy of the read length. A smaller k-mer size will increase the runtime and a larger k-mer size will increase the file size. The user should ideally pick a k-mer with a length around half of the average read length. For lower quality data, it also advised to choose smaller k-mer values to reduce false hits. 451 | ``` 452 | stringMLST.py --buildDB --config -k -P 453 | ``` 454 | Example: 455 | ``` 456 | stringMLST.py --buildDB --config config.txt -k 35 -P NM 457 | ``` 458 | This command will produce 3 database files and a log file. The log file is used for debugging purposes in the event an error is encountered. The 3 database files created are: 459 | * _.txt : The main database file for the application. This is a tab delimited file describing k-mer to locus relationship. 460 | * _weight.txt : Contains the weight factors for alleles which differ in lengths by more than 5%. Will be empty otherwise. 461 | * _profile.txt : Profile definition file used for finding the ST from the predicted allelic profile. 462 | 463 | For the example above, the following files will be created: 464 | NM_35.txt, NM_weight.txt and NM_profile.txt 465 | 466 | Please note that in the prediction routine the database is identified with the prefix. 467 | 468 | ST discovery routine 469 | As discussed earlier, StringMLST has 3 running modes 470 | * Single sample mode - for running stringMLST on a single sample 471 | * Batch mode - for running stringMLST on all the FASTQ files present in a directory 472 | * List mode - for running stringMLST on all the FASTQ files provided in a list file 473 | 474 | #### Single sample mode: 475 | This is the default mode for stringMLST and takes in one sample at a time. The sample can be single-end or paired-end. The sample has to be in FASTQ format. In order to run, the user should know the prefix of the database created and the k-mer size. 476 | 477 | By default, the tool expects paired-end samples. 478 | ``` 479 | stringMLST.py --predict -1 -2 -p --prefix -k -o 480 | ``` 481 | *For single-end samples:* 482 | ``` 483 | stringMLST.py --predict -1 -s --prefix -k -o 484 | ``` 485 | #### Batch Mode: 486 | This mode can be used for processing multiple files with one command. All the samples will be queried against the same database. Also all samples should be in the same directory. All the samples will be treated either as single-end or paired-end. The paired-end samples should be differentiated with the character _1 and _2 at the end (E.g.: sampleX_1.fastq and sampleX_2.fastq). 487 | 488 | *Paired-end samples:* 489 | ``` 490 | stringMLST.py --predict -d -p --prefix -k -o 491 | ``` 492 | 493 | *Single-end samples:* 494 | ``` 495 | stringMLST.py --predict -d -s --prefix -k -o 496 | ``` 497 | #### List Mode: 498 | This mode could be used if user has samples at different locations or if the paired-end samples are not stored in traditional way. All the samples will be queried against the same database. All the samples will be treated either as single-end or paired-end. This mode requires the user to provide a list file which has the list of all samples along with the location. Each line in the list file represents a new sample. 499 | A sample list file for single-end sample looks like the following. 500 | ``` 501 | 502 | 503 | 504 | . 505 | . 506 | 507 | ``` 508 | A sample list file for paired-end sample looks like the following. 509 | 510 | ``` 511 | 512 | 513 | 514 | . 515 | . 516 | 517 | ``` 518 | 519 | Once the user has the list file, he can directly use the tool. 520 | 521 | *Paired-end samples:* 522 | ``` 523 | stringMLST.py --predict -l -p --prefix -k -o 524 | ``` 525 | *Single-end samples:* 526 | ``` 527 | stringMLST.py --predict -l -s --prefix -k -o 528 | ``` 529 | 530 | #### Gene coverage and match confidence 531 | 532 | stringMLST provides two, complimentary methods for determining confidence in an inferred ST. There's the `-C|--coverage` flag and `-z|--fuzzy` threshold option. 533 | 534 | stringMLST determines an allele based on its kmer support; the more kmers seen for allele 1, the more likely that allele 1 is the allele present in the genome. Unlike SRST2 and other mapping/BLAST based tools, stringMLST always infers an ST, using the maximimally supported allele (allele with most kmer hits). The difference between the maximum support (the reported allele) and the second support (next closest allele) can be informative for low coverage reads. The `-z|--fuzzy` threshold (Default = 300), assigns significance to the difference between supports. Much like SRST2 and Torsten Seemann's popular [pubMLST script](https://github.com/tseemann/mlst), stringMLST reports potentially new or closely supported alleles in allele* syntax. For high coverage reads, we suggest a fuzzy threshold >500. For low coverage reads, a fuzzy threshold of <50. 535 | 536 | Coverage mode requires `bedtools`, `bwa`, and `samtools` in your PATH and an additional python module, `pyfaidx` (See the dependencies section for installion information). Coverage mode by default disables display of fuzzy alleles in favor of sequence coverage information made by mapping potential reads to the putative allele sequence. In our testing, coverage mode slightly increases prediction time (<1 sec increase per sample). 537 | 538 | **Please note:** stringMLST *always* infers the ST from the reads, fuzzy matches and/or <100% coverage do not necessarily mean a new allele has been found. 539 | 540 | *Getting gene coverage from reads* 541 | ``` 542 | stringMLST.py --predict -1 -2 -p --prefix -k -r -o - -c -C 543 | ``` 544 | *Changing the fuzziness of the search for low coverage reads* 545 | ``` 546 | stringMLST.py --predict -1 -2 -p --prefix -k -r -o - -f 50 547 | ``` 548 | 549 | #### Other Examples : 550 | 551 | *Reporting time along with the output.* 552 | ``` 553 | stringMLST.py --predict -1 -2 -p --prefix -k -t -o 554 | ``` 555 | *Getting reads file relevant to typing scheme.* 556 | ``` 557 | stringMLST.py --predict -1 -2 -p --prefix -k -r -o 558 | ``` 559 | -------------------------------------------------------------------------------- /stringMLST.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import getopt 3 | import sys 4 | import logging 5 | import os 6 | import time 7 | import ast 8 | import gzip 9 | import re 10 | import tempfile 11 | import shutil 12 | import xml.etree.ElementTree as ET 13 | try: 14 | from urllib.request import urlopen, urlretrieve 15 | except ImportError: 16 | from urllib import urlopen, urlretrieve 17 | import argparse 18 | version = """ stringMLST v0.6.3 (updated : September 02, 2020) """ 19 | """ 20 | 21 | stringMLST free for academic users and requires permission before any commercial 22 | use for any version of this code/algorithm. If you are a commercial user, please 23 | contact king.jordan@biology.gatech.edu for permissions 24 | 25 | LICENSE TERMS FOR stringMLST 26 | Adopted from: https://creativecommons.org/licenses/by-nc-sa/4.0/ 27 | 28 | Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International Public 29 | License 30 | 31 | By exercising the Licensed Rights (defined below), You accept and agree to be 32 | bound by the terms and conditions of this Creative Commons Attribution- 33 | NonCommercial-ShareAlike 4.0 International Public License ("Public License"). To 34 | the extent this Public License may be interpreted as a contract, You are granted 35 | the Licensed Rights in consideration of Your acceptance of these terms and 36 | conditions, and the Licensor grants You such rights in consideration of benefits 37 | the Licensor receives from making the Licensed Material available under these 38 | terms and conditions. 39 | 40 | Section 1 - Definitions. 41 | 42 | Adapted Material means material subject to Copyright and Similar Rights that is 43 | derived from or based upon the Licensed Material and in which the Licensed 44 | Material is translated, altered, arranged, transformed, or otherwise modified in 45 | a manner requiring permission under the Copyright and Similar Rights held by the 46 | Licensor. For purposes of this Public License, where the Licensed Material is a 47 | musical work, performance, or sound recording, Adapted Material is always 48 | produced where the Licensed Material is synched in timed relation with a moving 49 | image. Adapter's License means the license You apply to Your Copyright and 50 | Similar Rights in Your contributions to Adapted Material in accordance with the 51 | terms and conditions of this Public License. BY-NC-SA Compatible License means a 52 | license listed at creativecommons.org/compatiblelicenses, approved by Creative 53 | Commons as essentially the equivalent of this Public License. Copyright and 54 | Similar Rights means copyright and/or similar rights closely related to 55 | copyright including, without limitation, performance, broadcast, sound 56 | recording, and Sui Generis Database Rights, without regard to how the rights are 57 | labeled or categorized. For purposes of this Public License, the rights 58 | specified in Section 2(b)(1)-(2) are not Copyright and Similar Rights. Effective 59 | Technological Measures means those measures that, in the absence of proper 60 | authority, may not be circumvented under laws fulfilling obligations under 61 | Article 11 of the WIPO Copyright Treaty adopted on December 20, 1996, and/or 62 | similar international agreements. Exceptions and Limitations means fair use, 63 | fair dealing, and/or any other exception or limitation to Copyright and Similar 64 | Rights that applies to Your use of the Licensed Material. License Elements means 65 | the license attributes listed in the name of a Creative Commons Public License. 66 | The License Elements of this Public License are Attribution, NonCommercial, and 67 | ShareAlike. Licensed Material means the artistic or literary work, database, or 68 | other material to which the Licensor applied this Public License. Licensed 69 | Rights means the rights granted to You subject to the terms and conditions of 70 | this Public License, which are limited to all Copyright and Similar Rights that 71 | apply to Your use of the Licensed Material and that the Licensor has authority 72 | to license. Licensor means the individual(s) or entity(ies) granting rights 73 | under this Public License. NonCommercial means not primarily intended for or 74 | directed towards commercial advantage or monetary compensation. For purposes of 75 | this Public License, the exchange of the Licensed Material for other material 76 | subject to Copyright and Similar Rights by digital file-sharing or similar means 77 | is NonCommercial provided there is no payment of monetary compensation in 78 | connection with the exchange. Share means to provide material to the public by 79 | any means or process that requires permission under the Licensed Rights, such as 80 | reproduction, public display, public performance, distribution, dissemination, 81 | communication, or importation, and to make material available to the public 82 | including in ways that members of the public may access the material from a 83 | place and at a time individually chosen by them. Sui Generis Database Rights 84 | means rights other than copyright resulting from Directive 96/9/EC of the 85 | European Parliament and of the Council of 11 March 1996 on the legal protection 86 | of databases, as amended and/or succeeded, as well as other essentially 87 | equivalent rights anywhere in the world. You means the individual or entity 88 | exercising the Licensed Rights under this Public License. Your has a 89 | corresponding meaning. Section 2 - Scope. 90 | 91 | License grant. Subject to the terms and conditions of this Public License, the 92 | Licensor hereby grants You a worldwide, royalty-free, non-sublicensable, non- 93 | exclusive, irrevocable license to exercise the Licensed Rights in the Licensed 94 | Material to: reproduce and Share the Licensed Material, in whole or in part, for 95 | NonCommercial purposes only; and produce, reproduce, and Share Adapted Material 96 | for NonCommercial purposes only. Exceptions and Limitations. For the avoidance 97 | of doubt, where Exceptions and Limitations apply to Your use, this Public 98 | License does not apply, and You do not need to comply with its terms and 99 | conditions. Term. The term of this Public License is specified in Section 6(a). 100 | Media and formats; technical modifications allowed. The Licensor authorizes You 101 | to exercise the Licensed Rights in all media and formats whether now known or 102 | hereafter created, and to make technical modifications necessary to do so. The 103 | Licensor waives and/or agrees not to assert any right or authority to forbid You 104 | from making technical modifications necessary to exercise the Licensed Rights, 105 | including technical modifications necessary to circumvent Effective 106 | Technological Measures. For purposes of this Public License, simply making 107 | modifications authorized by this Section 2(a)(4) never produces Adapted 108 | Material. Downstream recipients. Offer from the Licensor - Licensed Material. 109 | Every recipient of the Licensed Material automatically receives an offer from 110 | the Licensor to exercise the Licensed Rights under the terms and conditions of 111 | this Public License. Additional offer from the Licensor - Adapted Material. 112 | Every recipient of Adapted Material from You automatically receives an offer 113 | from the Licensor to exercise the Licensed Rights in the Adapted Material under 114 | the conditions of the Adapter's License You apply. No downstream restrictions. 115 | You may not offer or impose any additional or different terms or conditions on, 116 | or apply any Effective Technological Measures to, the Licensed Material if doing 117 | so restricts exercise of the Licensed Rights by any recipient of the Licensed 118 | Material. No endorsement. Nothing in this Public License constitutes or may be 119 | construed as permission to assert or imply that You are, or that Your use of the 120 | Licensed Material is, connected with, or sponsored, endorsed, or granted 121 | official status by, the Licensor or others designated to receive attribution as 122 | provided in Section 3(a)(1)(A)(i). Other rights. 123 | 124 | Moral rights, such as the right of integrity, are not licensed under this Public 125 | License, nor are publicity, privacy, and/or other similar personality rights; 126 | however, to the extent possible, the Licensor waives and/or agrees not to assert 127 | any such rights held by the Licensor to the limited extent necessary to allow 128 | You to exercise the Licensed Rights, but not otherwise. Patent and trademark 129 | rights are not licensed under this Public License. To the extent possible, the 130 | Licensor waives any right to collect royalties from You for the exercise of the 131 | Licensed Rights, whether directly or through a collecting society under any 132 | voluntary or waivable statutory or compulsory licensing scheme. In all other 133 | cases the Licensor expressly reserves any right to collect such royalties, 134 | including when the Licensed Material is used other than for NonCommercial 135 | purposes. Section 3 - License Conditions. 136 | 137 | Your exercise of the Licensed Rights is expressly made subject to the following 138 | conditions. 139 | 140 | Attribution. 141 | 142 | If You Share the Licensed Material (including in modified form), You must: 143 | 144 | retain the following if it is supplied by the Licensor with the Licensed 145 | Material: identification of the creator(s) of the Licensed Material and any 146 | others designated to receive attribution, in any reasonable manner requested by 147 | the Licensor (including by pseudonym if designated); a copyright notice; a 148 | notice that refers to this Public License; a notice that refers to the 149 | disclaimer of warranties; a URI or hyperlink to the Licensed Material to the 150 | extent reasonably practicable; indicate if You modified the Licensed Material 151 | and retain an indication of any previous modifications; and indicate the 152 | Licensed Material is licensed under this Public License, and include the text 153 | of, or the URI or hyperlink to, this Public License. You may satisfy the 154 | conditions in Section 3(a)(1) in any reasonable manner based on the medium, 155 | means, and context in which You Share the Licensed Material. For example, it may 156 | be reasonable to satisfy the conditions by providing a URI or hyperlink to a 157 | resource that includes the required information. If requested by the Licensor, 158 | You must remove any of the information required by Section 3(a)(1)(A) to the 159 | extent reasonably practicable. ShareAlike. In addition to the conditions in 160 | Section 3(a), if You Share Adapted Material You produce, the following 161 | conditions also apply. 162 | 163 | The Adapter's License You apply must be a Creative Commons license with the same 164 | License Elements, this version or later, or a BY-NC-SA Compatible License. You 165 | must include the text of, or the URI or hyperlink to, the Adapter's License You 166 | apply. You may satisfy this condition in any reasonable manner based on the 167 | medium, means, and context in which You Share Adapted Material. You may not 168 | offer or impose any additional or different terms or conditions on, or apply any 169 | Effective Technological Measures to, Adapted Material that restrict exercise of 170 | the rights granted under the Adapter's License You apply. Section 4 - Sui 171 | Generis Database Rights. 172 | 173 | Where the Licensed Rights include Sui Generis Database Rights that apply to Your 174 | use of the Licensed Material: 175 | 176 | for the avoidance of doubt, Section 2(a)(1) grants You the right to extract, 177 | reuse, reproduce, and Share all or a substantial portion of the contents of the 178 | database for NonCommercial purposes only; if You include all or a substantial 179 | portion of the database contents in a database in which You have Sui Generis 180 | Database Rights, then the database in which You have Sui Generis Database Rights 181 | (but not its individual contents) is Adapted Material, including for purposes of 182 | Section 3(b); and You must comply with the conditions in Section 3(a) if You 183 | Share all or a substantial portion of the contents of the database. For the 184 | avoidance of doubt, this Section 4 supplements and does not replace Your 185 | obligations under this Public License where the Licensed Rights include other 186 | Copyright and Similar Rights. Section 5 - Disclaimer of Warranties and 187 | Limitation of Liability. 188 | 189 | Unless otherwise separately undertaken by the Licensor, to the extent possible, 190 | the Licensor offers the Licensed Material as-is and as-available, and makes no 191 | representations or warranties of any kind concerning the Licensed Material, 192 | whether express, implied, statutory, or other. This includes, without 193 | limitation, warranties of title, merchantability, fitness for a particular 194 | purpose, non-infringement, absence of latent or other defects, accuracy, or the 195 | presence or absence of errors, whether or not known or discoverable. Where 196 | disclaimers of warranties are not allowed in full or in part, this disclaimer 197 | may not apply to You. To the extent possible, in no event will the Licensor be 198 | liable to You on any legal theory (including, without limitation, negligence) or 199 | otherwise for any direct, special, indirect, incidental, consequential, 200 | punitive, exemplary, or other losses, costs, expenses, or damages arising out of 201 | this Public License or use of the Licensed Material, even if the Licensor has 202 | been advised of the possibility of such losses, costs, expenses, or damages. 203 | Where a limitation of liability is not allowed in full or in part, this 204 | limitation may not apply to You. The disclaimer of warranties and limitation of 205 | liability provided above shall be interpreted in a manner that, to the extent 206 | possible, most closely approximates an absolute disclaimer and waiver of all 207 | liability. Section 6 - Term and Termination. 208 | 209 | This Public License applies for the term of the Copyright and Similar Rights 210 | licensed here. However, if You fail to comply with this Public License, then 211 | Your rights under this Public License terminate automatically. Where Your right 212 | to use the Licensed Material has terminated under Section 6(a), it reinstates: 213 | 214 | automatically as of the date the violation is cured, provided it is cured within 215 | 30 days of Your discovery of the violation; or upon express reinstatement by the 216 | Licensor. For the avoidance of doubt, this Section 6(b) does not affect any 217 | right the Licensor may have to seek remedies for Your violations of this Public 218 | License. For the avoidance of doubt, the Licensor may also offer the Licensed 219 | Material under separate terms or conditions or stop distributing the Licensed 220 | Material at any time; however, doing so will not terminate this Public License. 221 | Sections 1, 5, 6, 7, and 8 survive termination of this Public License. Section 7 222 | - Other Terms and Conditions. 223 | 224 | The Licensor shall not be bound by any additional or different terms or 225 | conditions communicated by You unless expressly agreed. Any arrangements, 226 | understandings, or agreements regarding the Licensed Material not stated herein 227 | are separate from and independent of the terms and conditions of this Public 228 | License. Section 8 - Interpretation. 229 | 230 | For the avoidance of doubt, this Public License does not, and shall not be 231 | interpreted to, reduce, limit, restrict, or impose conditions on any use of the 232 | Licensed Material that could lawfully be made without permission under this 233 | Public License. To the extent possible, if any provision of this Public License 234 | is deemed unenforceable, it shall be automatically reformed to the minimum 235 | extent necessary to make it enforceable. If the provision cannot be reformed, it 236 | shall be severed from this Public License without affecting the enforceability 237 | of the remaining terms and conditions. No term or condition of this Public 238 | License will be waived and no failure to comply consented to unless expressly 239 | agreed to by the Licensor. Nothing in this Public License constitutes or may be 240 | interpreted as a limitation upon, or waiver of, any privileges and immunities 241 | that apply to the Licensor or You, including from the legal processes of any 242 | jurisdiction or authority. 243 | 244 | 245 | 246 | The program has 3 basic modes : 247 | mainTool: for single sample (both single and paired end) 248 | batchTool: for multiple samples stored at a common location (both single and paired end samples) 249 | listTool: for multiple samples with location information stored in a list (both single and paired end samples) 250 | predict part starts here 251 | """ 252 | ############################################################# 253 | # Function : get_links 254 | # Input : speciesName and schemes dict 255 | # Output : Dict containing links to alleles and profile 256 | # Description: Gets the URLs from pubMLST for the required 257 | # files (alleles, profile) 258 | ############################################################# 259 | def get_links(xmlData, savePath, speciesName): 260 | lociList = {} 261 | profileURL = None 262 | for species in xmlData: 263 | if re.search(re.escape(speciesName), species.text, re.IGNORECASE, ): 264 | for mlst in species: 265 | for database in mlst: 266 | for child in database: 267 | if child.tag == "profiles": 268 | profileURL = child[0].text 269 | if child.tag == "loci": 270 | for locus in child: 271 | lociList[locus.text.rstrip()] = locus[0].text 272 | if profileURL is None: 273 | profileError = "Parsing failed: could not find profiles file" 274 | print(profileError) 275 | print("This usually means the provided species, '{}', does not exist on PubMLST".format(speciesName)) 276 | print("Use `{} --getMLST --species list` to list available species".format(sys.argv[0])) 277 | print("Or visit PubMLST for more information:\nhttps://pubmlst.org/data/") 278 | logging.debug(profileError) 279 | sys.exit(1) 280 | elif lociList == {}: 281 | lociError = "Parsing failed: could not find allele sequences" 282 | logging.debug(lociError) 283 | print(lociError) 284 | sys.exit(1) 285 | else: 286 | return profileURL, lociList 287 | ############################################################# 288 | # Function : get_files 289 | # Input : URLs from get_links 290 | # Output : Downloads files and builds database 291 | ############################################################# 292 | def get_files(filePrefix, loci, profileURL, speciesName): 293 | with open(config, "w") as configFile: 294 | configFile.write("[loci]\n") 295 | for file in loci: 296 | localFile = filePrefix + "_" + file + ".tfa" 297 | try: 298 | localFile, headers = urlretrieve(loci[file], localFile) 299 | except: 300 | print('\033[91m' + "There was an error downloading " + file + '\033[0m') 301 | pass 302 | configFile.write(file + "\t" + filePrefix + "_" + file + ".tfa\n") 303 | localFile = filePrefix + "_profile.txt" 304 | localFile, headers = urlretrieve(profileURL, localFile) 305 | configFile.write("[profile]\n") 306 | configFile.write("profile\t" + filePrefix + "_profile.txt\n") 307 | configFile.close() 308 | try: 309 | makeCustomDB(config, k, filePrefix) 310 | except: 311 | print('\033[91m' + "Failed to create database " + speciesName + '\033[0m') 312 | pass 313 | else: 314 | print("\t" + '\033[92m' + "Database ready for " + speciesName + '\033[0m') 315 | print("\t" + filePrefix) 316 | ############################################################ 317 | # Function : batchTool 318 | # Input : Directory name, paired or single, k value 319 | # Output : STs and allelic profiles for each FASTQ file 320 | # Description: Processes all FASTQ files present in the input 321 | # directory 322 | ############################################################# 323 | def batchTool(fdir, paired, k): 324 | fileList = [] 325 | if not dir.endswith('/'): 326 | fdir += '/' 327 | for inputFile in os.listdir(fdir): 328 | if paired is True: 329 | if inputFile.endswith('1.fastq') or inputFile.endswith('1.fq') or inputFile.endswith('1.fq.gz') or inputFile.endswith('1.fastq.gz'): 330 | fastq1 = fdir+inputFile 331 | fastq2 = fdir+inputFile.replace('1.', '2.') 332 | fileList.append((fastq1, fastq2)) 333 | else: 334 | if inputFile.endswith('.fastq') or inputFile.endswith('.fq') or inputFile.endswith('.fq.gz') or inputFile.endswith('.fastq.gz'): 335 | fastq1 = fdir + inputFile 336 | fileList.append(fastq1) 337 | results = multiSampleTool(fileList, paired, k) 338 | return results 339 | ############################################################# 340 | # Function : listTool 341 | # Input : List file, paired or single, k value 342 | # Output : STs and allelic profiles for each FASTQ file 343 | # Description: Processes all FASTQ files present in the input 344 | # list file 345 | ############################################################# 346 | def listTool(fList, paired, k): 347 | fileList = [] 348 | listf = open(fList, 'r') 349 | samples = listf.readlines() 350 | for sample in samples: 351 | if paired is True: 352 | s = sample.strip().split() 353 | fastq1 = s[0] 354 | try: 355 | fastq2 = s[1] 356 | except IndexError: 357 | print("Error: Paired end files should be whitespace/tab seperated") 358 | exit(0) 359 | fileList.append((fastq1, fastq2)) 360 | else: 361 | fastq1 = sample.rstrip() 362 | fileList.append(fastq1) 363 | results = multiSampleTool(fileList, paired, k) 364 | return results 365 | ############################################################# 366 | # Function : multiSampleTool 367 | # Input : List of files to process, paired or single, k value 368 | # Output : STs and allelic profiles for each FASTQ file 369 | # Description: Processes all FASTQ files present in the input list 370 | ############################################################# 371 | def multiSampleTool(fileList, paired, k): 372 | results = {} 373 | for sample in fileList: 374 | if paired is True: 375 | fastq1 = sample[0] 376 | fastq2 = sample[1] 377 | else: 378 | fastq1 = sample 379 | fastq2 = None 380 | results = singleSampleTool(fastq1, fastq2, paired, k, results) 381 | return results 382 | ############################################################# 383 | # Function : singleSampleTool 384 | # Input : fastq file 1 and 2, paired or single, k value, output dictionary 385 | # Output : STs and allelic profiles for each FASTQ file 386 | # Description: Processes both FASTQ files passed to the function 387 | ############################################################# 388 | def singleSampleTool(fastq1, fastq2, paired, k, results): 389 | if paired is True: 390 | fileName = fastq1.split('/')[-1].split('.')[0][:-1] 391 | else: 392 | fileName = fastq1.split('/')[-1].split('.')[0] 393 | if reads is True: 394 | readFileName = fileName + '_reads.fq' 395 | global readFile 396 | readFile = open(readFileName, 'w+') 397 | if paired is True: 398 | msg = "singleSampleTool : " + fastq1 + ' and ' + fastq2 399 | else: 400 | msg = "singleSampleTool : " + fastq1 401 | logging.debug(msg) 402 | global alleleCount 403 | alleleCount = {} 404 | t1 = time.time() 405 | if paired is True: 406 | logging.debug("singleSampleTool : paired True") 407 | logging.debug("singleSampleTool : fastq1 start") 408 | singleFileTool(fastq1, k) 409 | logging.debug("singleSampleTool : fastq1 done") 410 | logging.debug("singleSampleTool : fastq2 start") 411 | singleFileTool(fastq2, k) 412 | logging.debug("singleSampleTool : fastq2 done") 413 | if alleleCount == {}: 414 | string = "No k-mer matches were found for the sample " + fastq1 + " and "+ fastq2 + ". Probable cause of the error: low quality data/too many N's in the data" 415 | logging.error("singleSampleTool : " + string) 416 | print(string) 417 | # exit(0) 418 | profileCount = alleleCount 419 | else: 420 | logging.debug("singleSampleTool : paired False") 421 | logging.debug("singleSampleTool : fastq start") 422 | singleFileTool(fastq1, k) 423 | profileCount = alleleCount 424 | logging.debug("singleSampleTool : fastq done") 425 | if alleleCount == 0: 426 | string = "No k-mer matches were found for the sample " + fastq1 + ". Probable cause of the error: low quality data/too many N's in the data" 427 | logging.error("singleSampleTool : " + string) 428 | print(string) 429 | logging.debug("singleSampleTool : weightedProfile start") 430 | weightedProfile = weightedProf(profileCount, weightDict) 431 | logging.debug("singleSampleTool : weightedProfile finished") 432 | logging.debug("singleSampleTool : getMaxCount start") 433 | finalProfile = getMaxCount(weightedProfile, fileName) 434 | logging.debug("singleSampleTool : getMaxCount end") 435 | st = 0 436 | if profileFile != '': 437 | logging.debug("singleSampleTool : findST start") 438 | st = findST(finalProfile, stProfile) 439 | logging.debug("singleSampleTool : findST end") 440 | if reads is True: 441 | readFile.close() 442 | t3 = time.time() 443 | finalProfile['ST'] = st 444 | finalProfile['t'] = t3-t1 445 | results[fileName] = finalProfile 446 | return results 447 | ############################################################# 448 | # Function : singleFileTool 449 | # Input : fastq file, k value 450 | # Output : Edits a global dictionary - results 451 | # Description: Processes the single fastq file 452 | ############################################################# 453 | def singleFileTool(fastq, k): 454 | msg = "singleFileTool :" + fastq 455 | logging.debug(msg) 456 | if os.path.isfile(fastq): 457 | logging.debug("singleFileTool : fastq") 458 | non_overlapping_window = 1 459 | finalProfile = {} 460 | t1 = time.time() 461 | fileExplorer(fastq, k, non_overlapping_window) 462 | t3 = time.time() 463 | else: 464 | msg = "File does not exist: " + fastq 465 | logging.error("singleFileTool : msg") 466 | print(msg) 467 | def fileExplorer(file, k, non_overlapping_window): 468 | if file.endswith('.gz'): 469 | if sys.version_info[0] == 3: 470 | f = gzip.open(file, 'rt') 471 | else: 472 | f = gzip.open(file, 'rb') 473 | else: 474 | f = open(file) 475 | msg = "fileExplorer :" + file 476 | logging.debug(msg) 477 | lines = f.readlines() 478 | i = 1 479 | n_reads = 0 480 | try: 481 | if len(lines[1]) < k: 482 | m1 = "Read length " + len(lines[1])+" for file " + file + " smaller than " + k 483 | print(m1) 484 | print("Skipping to next file.") 485 | logging.debug(m1) 486 | return 0 487 | except Exception: 488 | m2 = "Check fastq file " + file 489 | print(m2) 490 | logging.debug(m2) 491 | return 0 492 | start = int((len(lines[1])-k)//2) 493 | end = int((len(lines[1])-k)//2) 494 | yesRead = False 495 | for line in lines: 496 | if i % 4 == 0 and yesRead: 497 | readFile.write(line) 498 | if i % 4 != 3: 499 | yesRead = False 500 | if i%4 == 1: 501 | head = line 502 | if i%4 == 2: 503 | s1 = str(line[start:k+start]) 504 | sn_1 = str(line[-k-end:-end]).rstrip() 505 | if s1 in kmerDict[k]: 506 | n_reads += 1 507 | goodReads(line, k, non_overlapping_window) 508 | if reads is True: 509 | readFile.write(head) 510 | readFile.write(line) 511 | readFile.write('+\n') 512 | yesRead = True 513 | i += 1 514 | ############################################################# 515 | # Function : goodReads 516 | # Input : sequence read, k, step size 517 | # Output : Edits the count of global variable alleleCount 518 | # Description: Increment the count for each k-mer match 519 | ############################################################# 520 | def goodReads(read, k, non_overlapping_window): 521 | n = 0 522 | line = read.rstrip() 523 | while n+k <= len(line): 524 | s = str(line[n:n+k]) 525 | if s in kmerDict[k]: 526 | for probLoc in kmerDict[k][s]: 527 | if probLoc not in alleleCount: 528 | alleleCount[probLoc] = {} 529 | a = kmerDict[k][s][probLoc] 530 | for allele in a: 531 | allele = allele.rstrip() 532 | if allele in alleleCount[probLoc]: 533 | alleleCount[probLoc][allele] += 1 534 | else: 535 | alleleCount[probLoc][allele] = 1 536 | n += non_overlapping_window 537 | ############################################################# 538 | # Function : weightedProf 539 | # Input : allele count global var, weight factors 540 | # Output/Desc: Normalizes alleleCount by weight factor 541 | ############################################################# 542 | def weightedProf(alleleCount, weightDict): 543 | logging.debug("weightedProf") 544 | weightedDict = {} 545 | for loc in alleleCount: 546 | weightedDict[loc] = {} 547 | for allele in alleleCount[loc]: 548 | if loc in weightDict: 549 | if allele in weightDict[loc]: 550 | weightedDict[loc][allele] = (alleleCount[loc][allele] / weightDict[loc][allele]) 551 | else: 552 | weightedDict[loc][allele] = alleleCount[loc][allele] 553 | else: 554 | weightedDict[loc][allele] = alleleCount[loc][allele] 555 | return weightedDict 556 | ############################################################# 557 | # Function : getMaxCount 558 | # Input : allele counts 559 | # Output : allelic profile and ST 560 | # Description: Finds the alleles with maximum counts and 561 | # generates the allelic profile and ST 562 | ############################################################# 563 | def getMaxCount(alleleCount, fileName): 564 | logging.debug("getMaxCount") 565 | max_n = {} 566 | secondMax = {} 567 | maxSupport = {} 568 | secondSupport = {} 569 | finalProfileCount = {} 570 | for locus in alleleNames: 571 | finalProfileCount[locus] = {} 572 | num = '' 573 | for loc in alleleCount: 574 | n = 0 575 | m = 0 576 | for num in alleleCount[loc]: 577 | if alleleCount[loc][num] >= n: 578 | m = n 579 | n = alleleCount[loc][num] 580 | if n-m < fuzzy: 581 | try: 582 | alleleCount[loc][num] 583 | except: 584 | pass 585 | else: 586 | alleleCount[loc][num] = str(alleleCount[loc][num])+'*' 587 | max_n[loc] = str(n)+'*' 588 | else: 589 | max_n[loc] = n 590 | secondMax[loc] = m 591 | for loc in alleleCount: 592 | try: 593 | max_n[loc] 594 | except: 595 | pass 596 | else: 597 | maxSupport[loc] = {} 598 | secondSupport[loc] = {} 599 | num_max = [] 600 | num_max2 = [] 601 | compare = float(re.sub("\*$", "", str(max_n[loc]))) 602 | for num in alleleCount[loc]: 603 | if float(re.sub("\*$", "", str(alleleCount[loc][num]))) == compare: 604 | if "\*" in str(max_n[loc]): 605 | insert = num + '*' 606 | num_max.append(insert) 607 | else: 608 | num_max.append(num) 609 | maxSupport[loc][num] = max_n[loc] 610 | 611 | if alleleCount[loc][num] == secondMax[loc]: 612 | num_max2.append(num) 613 | secondSupport[loc][num] = secondMax[loc] 614 | try: 615 | finalProfileCount[loc] = num_max[0] 616 | except LookupError: 617 | finalProfileCount[loc] = 'NA' 618 | msgs = "Max Support :" + fileName + " : " + str(maxSupport) 619 | logging.debug(msgs) 620 | msgs = "Second Max Support :" + fileName + " : " + str(secondSupport) 621 | logging.debug(msgs) 622 | return finalProfileCount 623 | ############################################################# 624 | # Function : findST 625 | # Input : allelic profile for one sample and profiles for all STs 626 | # Output : ST number, or 0 if no ST match was found 627 | # Description: Finds the ST number which best matches the given sample profile. 628 | ############################################################# 629 | def findST(finalProfile, stProfile): 630 | if not stProfile: 631 | return 0 632 | oneProfile = next(iter(stProfile.values())) 633 | # The gene names in finalProfile may not exactly match those in stProfile. To deal with this, 634 | # each finalProfile gene is associated with the best matching gene in the ST profiles. 635 | finalGeneToSTGene = {} 636 | profileGenes = list(oneProfile.keys()) 637 | for finalGene in list(finalProfile.keys()): 638 | if finalGene in profileGenes: # exact match is preferable 639 | finalGeneToSTGene[finalGene] = finalGene 640 | else: # failing an exact match, look for a case-sensitive containment 641 | for profileGene in profileGenes: 642 | if finalGene in profileGene: 643 | finalGeneToSTGene[finalGene] = profileGene 644 | break 645 | if finalGene not in finalGeneToSTGene: # if there's still no match, try a case-insensitive containment 646 | for profileGene in profileGenes: 647 | if finalGene.lower() in profileGene.lower(): 648 | finalGeneToSTGene[finalGene] = profileGene 649 | break 650 | if finalGene not in finalGeneToSTGene: 651 | print("ERROR: gene names in config file do not match gene names in profile file") 652 | exit(0) 653 | transformedFinalProfile = {} 654 | for gene, allele in finalProfile.items(): 655 | if allele: 656 | allele = re.sub("\*", "", allele) 657 | transformedFinalProfile[finalGeneToSTGene[gene]] = allele 658 | # Check to see if the dictionary is empty, if so then means no allele were found at all 659 | if bool(transformedFinalProfile) is False: 660 | return 0 661 | # Find the best matching ST, considering only the genes in the sample's profile. This is to 662 | # allow for superfluous columns in the ST profile. 663 | logging.debug("findST") 664 | for stNum, profile in stProfile.items(): 665 | if all(x in list(profile.items()) for x in list(transformedFinalProfile.items())): 666 | return stNum 667 | return 0 668 | ############################################################# 669 | # Function : loadModule 670 | # Input : k value and prefix of the DB file 671 | # Output : Updates the DB dictionary variables 672 | # Description: Used in loading the DB as set of variables 673 | # by calling other functions 674 | ############################################################# 675 | def loadModule(k, dbPrefix): 676 | global dbFile 677 | dbFile = dbPrefix+'_'+str(k)+'.txt' 678 | global weightFile 679 | weightFile = dbPrefix+'_weight.txt' 680 | global profileFile 681 | profileFile = dbPrefix+'_profile.txt' 682 | global kmerDict 683 | kmerDict = {} 684 | kmerDict[k] = loadKmerDict(dbFile) 685 | global weightDict 686 | weightDict = loadWeightDict(weightFile) 687 | global stProfile 688 | stProfile = loadSTfromFile(profileFile) 689 | ############################################################# 690 | # Function : loadSTfromFile 691 | # Input : profile definition file 692 | # Output : Updates the DB dictionary variables 693 | # Description: Used in loading the DB as set of variables 694 | ############################################################# 695 | def loadSTfromFile(profileF): 696 | with open(profileF, 'r') as definitionFile: 697 | st = {} 698 | index = {} 699 | lines = definitionFile.readlines() 700 | heads = lines[0].rstrip().split('\t') 701 | for locus in heads: 702 | index[locus] = heads.index(locus) 703 | for line in lines: 704 | pro = line.rstrip().split('\t') 705 | l = {} 706 | for locus in heads[1:]: 707 | try: 708 | l[locus] = pro[index[locus]] 709 | except LookupError: 710 | logging.debug("ERROR while loading ST") 711 | pass 712 | st[pro[0]] = l 713 | return st 714 | ############################################################# 715 | # Function : loadKmerDict 716 | # Input : DB prefix 717 | # Output : Updates the DB dictionary variables 718 | # Description: Used in loading the DB as set of variables 719 | ############################################################# 720 | def loadKmerDict(dbFile): 721 | kmerTableDict = {} 722 | with open(dbFile, 'r') as kmerTableFile: 723 | lines = kmerTableFile.readlines() 724 | global alleleNames 725 | alleleNames = set() 726 | for line in lines: 727 | array = line.rstrip().rsplit('\t') 728 | kmerTableDict[array[0]] = {} 729 | kmerTableDict[array[0]][array[1]] = array[2][1:-1].rsplit(',') 730 | alleleNames.add(array[1]) 731 | return kmerTableDict 732 | ############################################################# 733 | # Function : loadWeightDict 734 | # Input : Weight file prefix 735 | # Output : Updates the DB dictionary variables 736 | # Description: Used in loading the DB as set of variables 737 | ############################################################# 738 | def loadWeightDict(weightFile): 739 | weightDict = {} 740 | with open(weightFile, 'r') as weightTableFile: 741 | lines = weightTableFile.readlines() 742 | for line in lines: 743 | array = line.rstrip().rsplit('\t') 744 | try: 745 | (loc, allele) = array[0].replace('-', '_').rsplit('_', 1) 746 | except ValueError: 747 | print("Error : Allele name in locus file should be seperated by '_' or '-'") 748 | exit(0) 749 | if loc not in weightDict: 750 | weightDict[loc] = {} 751 | weightDict[loc][allele] = float(array[1]) 752 | return weightDict 753 | ############################################################# 754 | # Function : loadConfig 755 | # Input : config file path from getopts 756 | # Output : Updates configDict 757 | # Description: Used to find allele fasta files for getCoverage 758 | ############################################################# 759 | def loadConfig(config): 760 | global configDict 761 | configDict = {} 762 | with open(config) as configFile: 763 | lines = configFile.readlines() 764 | head = '' 765 | for line in lines: 766 | if line.rstrip() == '': 767 | continue 768 | if line.rstrip() == '[loci]': 769 | head = 'loci' 770 | configDict[head] = {} 771 | elif line.rstrip() == '[profile]': 772 | head = 'profile' 773 | configDict[head] = {} 774 | else: 775 | arr = line.strip().split() 776 | configDict[head][arr[0]] = arr[1] 777 | for head in configDict: 778 | for element in configDict[head]: 779 | if not os.path.isfile(configDict[head][element]): 780 | print("ERROR: %s file does not exist at %s" % (element, configDict[head][element])) 781 | exit(0) 782 | return configDict 783 | ############################################################# 784 | # Function : getCoverage 785 | # Input : results dictionary 786 | # Output : Updates results to include coverage info 787 | ############################################################# 788 | def getCoverage(results): 789 | tmpdir = tempfile.mkdtemp() 790 | for sample in results: 791 | file = tmpdir +'/'+ sample + '.fasta' 792 | bed = tmpdir +'/'+ sample + '.bed' 793 | sortedFile = tmpdir +'/'+ sample + '.sorted' 794 | covOut = tmpdir +'/'+ sample + '.out' 795 | with open(file, 'w') as tmpFasta: 796 | with open(bed, 'w') as bedFile: 797 | for gene in configDict['loci']: 798 | genes = Fasta(configDict['loci'][gene]) 799 | allele = gene+'_'+re.sub('\*', "", str(results[sample][gene])) 800 | tmpFasta.write('>'+gene+'\n') 801 | bedFile.write(gene+'\t0\t'+str(len(genes[allele]))+'\n') 802 | for line in genes[allele]: 803 | tmpFasta.write(str(line)+'\n') 804 | cmdIndex = "bwa index %s 2>/dev/null"%(file) 805 | os.system(cmdIndex) 806 | readBWA = sample+'_reads.fq' 807 | cmdBwaMem = "bwa mem %s %s 2>/dev/null| samtools view -uS - | samtools sort - -o %s"%(file, readBWA, sortedFile) 808 | os.system(cmdBwaMem) 809 | cmdCov = "bedtools coverage -a %s -b %s > %s"%(bed, sortedFile, covOut) 810 | os.system(cmdCov) 811 | with open(covOut, 'r') as cov: 812 | for line in cov.readlines(): 813 | records = line.rstrip().rsplit('\t') 814 | gene = records[0] 815 | geneCov = float(records[6]) * 100 816 | results[sample][gene] = results[sample][gene] + " (" + str("%.2f" % geneCov) + ")" 817 | shutil.rmtree(tmpdir) 818 | """Prints the results in the format asked by the user.""" 819 | ############################################################# 820 | # Function : printResults 821 | # Input : results, output file, overwrite? 822 | # Output : Prints on the screen or in a file 823 | # Description: Prints the results in the format asked by the user 824 | ############################################################# 825 | def printResults(results, output_filename, overwrite, timeDisp): 826 | if output_filename != None: 827 | if overwrite is False: 828 | outfile = open(output_filename, "a") 829 | else: 830 | outfile = open(output_filename, "w") 831 | heading = "Sample" 832 | for head in sorted(results[list(results.keys())[0]]): 833 | if head == 'ST' or head == 't': 834 | continue 835 | heading += '\t' + head 836 | heading += '\tST' 837 | if timeDisp is True: 838 | heading += '\tTime' 839 | if output_filename != None: 840 | outfile.write(heading) 841 | outfile.write('\n') 842 | else: 843 | print(heading) 844 | for s in results: 845 | sample = s.split("_")[0] 846 | for l in sorted(results[s]): 847 | if l == 'ST' or l == 't': 848 | continue 849 | if results[s][l]: 850 | sample += '\t'+results[s][l] 851 | else: 852 | sample += '\tNA' 853 | if timeDisp is True: 854 | sample += '\t' + str(results[s]['ST']) + '\t%.2f ' %results[s]['t'] 855 | else: 856 | sample += '\t' + str(results[s]['ST']) 857 | if output_filename != None: 858 | outfile.write(sample) 859 | outfile.write('\n') 860 | else: 861 | print(sample) 862 | """Predict part ends here""" 863 | """Build DB part starts""" 864 | """Returns the reverse complement of the sequence""" 865 | def reverseComplement(seq): 866 | seqU = seq.upper() 867 | seq_dict = {'A':'T', 'T':'A', 'G':'C', 'C':'G', 'Y':'R', 'R':'Y', 'S':'S', 'W':'W', 'K':'M', 'M':'K', 'N':'N'} 868 | try: 869 | return "".join([seq_dict[base] for base in reversed(seqU)]) 870 | except Exception: 871 | strn = "Reverse Complement Error:" + seqU 872 | logging.debug(strn) 873 | pass 874 | ############################################################# 875 | # Function : getFastaDict 876 | # Input : locus file name 877 | # Output : dictionary with all the allele sequences 878 | # Description: Stores each allele sequence in a dictionary 879 | ############################################################# 880 | def getFastaDict(fullLocusFile): 881 | logging.debug("Create Fasta Dict") 882 | logging.debug(fullLocusFile) 883 | fastaFile = open(fullLocusFile, 'r').read() 884 | entries = [x for x in fastaFile.split('>') if len(x) != 0] 885 | fastaDict = {} 886 | for entry in entries: 887 | key = [x for x in entry.split('\n')[0].split() if len(x) != 0][0] 888 | sequence = ''.join(entry.split('\n')[1:]).rstrip() 889 | fastaDict[key] = {'sequence':sequence} 890 | return fastaDict 891 | ############################################################# 892 | # Function : formKmerDB 893 | # Input : configuration file, k value, output prefix 894 | # Output : stringMLST DB 895 | # Description: Constructs the k-mer DB in both strand orientation 896 | ############################################################# 897 | def formKmerDB(configDict, k, output_filename): 898 | dbFileName = output_filename+'_'+str(k)+'.txt' 899 | weightFileName = output_filename+'_weight.txt' 900 | kmerDict = {} 901 | mean = {} 902 | for locus in configDict['loci']: 903 | msgs = "formKmerDB :" +locus 904 | logging.debug(msgs) 905 | fastaDict = getFastaDict(configDict['loci'][locus]) 906 | sum = 0 907 | n = 0 908 | for allele in list(fastaDict.keys()): 909 | seq = fastaDict[allele]['sequence'].strip() 910 | l = len(seq) 911 | sum += l 912 | n += 1 913 | try: 914 | (loc, num) = allele.replace('-', '_').rsplit('_', 1) 915 | except ValueError: 916 | print("Error : Allele name in locus file should be seperated by '_' or '-'") 917 | exit(0) 918 | splitId = allele.replace('-', '_').rsplit('_', 1) 919 | i = 0 920 | while i+k <= l: 921 | kmer = seq[i:i+k] 922 | revCompKmer = reverseComplement(kmer) 923 | if kmer not in kmerDict: 924 | kmerDict[kmer] = {} 925 | kmerDict[kmer][splitId[0]] = [] 926 | kmerDict[kmer][splitId[0]].append(int(splitId[1])) 927 | else: 928 | if splitId[0] not in kmerDict[kmer]: 929 | kmerDict[kmer][splitId[0]] = [] 930 | kmerDict[kmer][splitId[0]].append(int(splitId[1])) 931 | else: 932 | kmerDict[kmer][splitId[0]].append(int(splitId[1])) 933 | if revCompKmer not in kmerDict: 934 | kmerDict[revCompKmer] = {} 935 | kmerDict[revCompKmer][splitId[0]] = [] 936 | kmerDict[revCompKmer][splitId[0]].append(int(splitId[1])) 937 | else: 938 | if splitId[0] not in kmerDict[revCompKmer]: 939 | kmerDict[revCompKmer][splitId[0]] = [] 940 | kmerDict[revCompKmer][splitId[0]].append(int(splitId[1])) 941 | else: 942 | kmerDict[revCompKmer][splitId[0]].append(int(splitId[1])) 943 | i += 1 944 | mean[locus] = sum/n*1.0 945 | with open(dbFileName, 'w') as kfile: 946 | for key in kmerDict: 947 | for key1 in kmerDict[key]: 948 | string = key+'\t'+key1+'\t'+str(kmerDict[key][key1]).replace(" ", "")+'\n' 949 | kfile.write(string) 950 | with open(weightFileName, 'w') as wfile: 951 | for locus in configDict['loci']: 952 | fastaDict = getFastaDict(configDict['loci'][locus]) 953 | for allele in list(fastaDict.keys()): 954 | splitId = allele.split('_') 955 | seq = fastaDict[allele]['sequence'] 956 | l = len(seq) 957 | fac = (l/mean[locus]) 958 | s = allele + '\t' + str(fac) + '\n' 959 | if fac > 1.05 or fac < 0.95: 960 | wfile.write(s) 961 | """Copies the profile definition file as a new file""" 962 | def copyProfileFile(profileDict, output_filename): 963 | profileFileName = output_filename+'_profile.txt' 964 | with open(profileDict['profile']) as f: 965 | lines = f.readlines() 966 | with open(profileFileName, "w") as f1: 967 | f1.writelines(lines) 968 | ############################################################# 969 | # Function : makeCustomDB 970 | # Input : configuration file, k value, output prefix 971 | # Output : None 972 | # Description: Processes the config file and calls the relevant 973 | # function 974 | ############################################################# 975 | def makeCustomDB(config, k, output_filename): 976 | configDict = {} 977 | if output_filename == None: 978 | output_filename = 'kmerDB' 979 | with open(config, 'r') as configFile: 980 | lines = configFile.readlines() 981 | head = '' 982 | for line in lines: 983 | if line.rstrip() == '': 984 | continue 985 | if line.rstrip() == '[loci]': 986 | head = 'loci' 987 | configDict[head] = {} 988 | elif line.rstrip() == '[profile]': 989 | head = 'profile' 990 | configDict[head] = {} 991 | else: 992 | arr = line.strip().split() 993 | configDict[head][arr[0]] = arr[1] 994 | for head in configDict: 995 | for element in configDict[head]: 996 | if not os.path.isfile(configDict[head][element]): 997 | print("ERROR: %s file does not exist at %s" % (element, configDict[head][element])) 998 | exit(0) 999 | formKmerDB(configDict, k, output_filename) 1000 | copyProfileFile(configDict['profile'], output_filename) 1001 | """Build DB part ends""" 1002 | """Check Parameters""" 1003 | def checkParams(buildDB, predict, config, k, listMode, list, batch, dir, fastq1, fastq2, paired, dbPrefix): 1004 | if predict is True and buildDB is True: 1005 | print(helpTextSmall) 1006 | print("Select either predict or buildDB module") 1007 | exit(0) 1008 | if predict is False and buildDB is False and downloadDB is False: 1009 | print(helpTextSmall) 1010 | print("Select either predict or buildDB module") 1011 | exit(0) 1012 | if predict is True: 1013 | if config != None and coverage is False: 1014 | print(helpTextSmall) 1015 | print("Config parameter is not required for predict mode.") 1016 | exit(0) 1017 | elif config is None and coverage is True: 1018 | print(helpTextSmall) 1019 | print("Config parameter is required to for coverage prediction") 1020 | exit(0) 1021 | if not os.path.isfile(dbPrefix+'_'+str(k)+'.txt'): 1022 | print(helpTextSmall) 1023 | print("DB file does not exist : ", dbPrefix, '_', str(k), '.txt or change DB prefix.') 1024 | exit(0) 1025 | if not os.path.isfile(dbPrefix+'_weight.txt'): 1026 | print(helpTextSmall) 1027 | print("DB file does not exist : ", dbPrefix, '_weight.txt or change DB prefix.') 1028 | exit(0) 1029 | if not os.path.isfile(dbPrefix+'_profile.txt'): 1030 | print(helpTextSmall) 1031 | print("DB file does not exist : ", dbPrefix, '_profile.txt or change DB prefix.') 1032 | exit(0) 1033 | if listMode is True: 1034 | if not os.path.isfile(fList): 1035 | print(helpTextSmall) 1036 | print("Error: List file ("+fList+") does not exist!") 1037 | exit(0) 1038 | elif batch is True: 1039 | if not os.path.isdir(dir): 1040 | print(helpTextSmall) 1041 | print("Error: Directory ("+dir+") does not exist!") 1042 | exit(0) 1043 | elif paired is True: 1044 | if not os.path.isfile(fastq1): 1045 | print(helpTextSmall) 1046 | print("Error: FASTQ file ("+fastq1+") does not exist!") 1047 | exit(0) 1048 | if not os.path.isfile(fastq2): 1049 | print(helpTextSmall) 1050 | print("Error: FASTQ file ("+fastq2+") does not exist!") 1051 | exit(0) 1052 | elif paired is False: 1053 | if not os.path.isfile(fastq1): 1054 | print(helpTextSmall) 1055 | print("Error: FASTQ file ("+fastq1+") does not exist!") 1056 | exit(0) 1057 | if buildDB is True: 1058 | try: 1059 | if not os.path.isfile(config): 1060 | print(helpTextSmall) 1061 | print("Error: Configuration file ("+config+") does not exist!") 1062 | exit(0) 1063 | except Exception: 1064 | print(helpTextSmall) 1065 | print("Error: Specify Configuration file") 1066 | exit(0) 1067 | helpText = """ 1068 | Readme for stringMLST 1069 | ============================================================================================= 1070 | Usage 1071 | ./stringMLST.py 1072 | [--buildDB] 1073 | [--predict] 1074 | [-1 filename_fastq1][--fastq1 filename_fastq1] 1075 | [-2 filename_fastq2][--fastq2 filename_fastq2] 1076 | [-d directory][--dir directory][--directory directory] 1077 | [-l list_file][--list list_file] 1078 | [-p][--paired] 1079 | [-s][--single] 1080 | [-c][--config] 1081 | [-P][--prefix] 1082 | [-z][--fuzzy] 1083 | [-a] 1084 | [-C][--coverage] 1085 | [-k] 1086 | [-o output_filename][--output output_filename] 1087 | [-x][--overwrite] 1088 | [-t] 1089 | [-r] 1090 | [-v] 1091 | [-h][--help] 1092 | ============================================================================================== 1093 | There are two steps to predicting ST using stringMLST. 1094 | 1. Create DB : stringMLST.py --buildDB 1095 | 2. Predict : stringMLST --predict 1096 | 1. stringMLST.py --buildDB 1097 | Synopsis: 1098 | stringMLST.py --buildDB -c -k -P 1099 | config file : is a tab delimited file which has the information for typing scheme ie loci, its multifasta file and profile definition file. 1100 | Format : 1101 | [loci] 1102 | locus1 locusFile1 1103 | locus2 locusFile2 1104 | [profile] 1105 | profile profileFile 1106 | kmer length : is the kmer length for the db. Note, while processing this should be smaller than the read length. 1107 | We suggest kmer lengths of 35, 66 depending on the read length. 1108 | DB prefix(optional) : holds the information for DB files to be created and their location. This module creates 3 files with this prefix. 1109 | You can use a folder structure with prefix to store your db at particular location. 1110 | Required arguments 1111 | --buildDB 1112 | Identifier for build db module 1113 | -c,--config = 1114 | Config file in the format described above. 1115 | All the files follow the structure followed by pubmlst. Refer extended document for details. 1116 | Optional arguments 1117 | -k = 1118 | Kmer size for which the db has to be formed(Default k = 35). Note the tool works best with kmer length in between 35 and 66 1119 | for read lengths of 55 to 150 bp. Kmer size can be increased accordingly. It is advised to keep lower kmer sizes 1120 | if the quality of reads is not very good. 1121 | -P,--prefix = 1122 | Prefix for db and log files to be created(Default = kmer). Also you can specify folder where you want the dbb to be created. 1123 | -a 1124 | File location to write build log 1125 | -h,--help 1126 | Prints the help manual for this application 1127 | -------------------------------------------------------------------------------------------- 1128 | 2. stringMLST.py --predict 1129 | stringMLST --predict : can run in three modes 1130 | 1) single sample (default mode) 1131 | 2) batch mode : run stringMLST for all the samples in a folder (for a particular specie) 1132 | 3) list mode : run stringMLST on samples specified in a file 1133 | stringMLST can process both single and paired end files. By default program expects paired end files. 1134 | Synopsis 1135 | stringMLST.py --predict -1 -2 -d -l -p -s -P -k -o -x 1136 | Required arguments 1137 | --predict 1138 | Identifier for predict module 1139 | Optional arguments 1140 | -1,--fastq1 = 1141 | Path to first fastq file for paired end sample and path to the fastq file for single end file. 1142 | Should have extension fastq or fq. 1143 | -2,--fastq2 = 1144 | Path to second fastq file for paired end sample. 1145 | Should have extension fastq or fq. 1146 | -d,--dir,--directory = 1147 | BATCH MODE : Location of all the samples for batch mode. 1148 | -C,--coverage 1149 | Calculate sequence coverage for each allele. Turns on read generation (-r) and turns off fuzzy (-z 1) 1150 | Requires bwa, bamtools and samtools be in your path 1151 | -k = 1152 | Kmer length for which the db was created(Default k = 35). Could be verified by looking at the name of the db file. 1153 | Could be used if the reads are of very bad quality or have a lot of N's. 1154 | -l,--list = 1155 | LIST MODE : Location of list file and flag for list mode. 1156 | list file should have full file paths for all the samples/files. 1157 | Each sample takes one line. For paired end samples the 2 files should be tab separated on single line. 1158 | -o,--output = 1159 | Prints the output to a file instead of stdout. 1160 | -p,--paired 1161 | Flag for specifying paired end files. Default option so would work the same if you do not specify for all modes. 1162 | For batch mode the paired end samples should be differentiated by 1/2.fastq or 1/2.fq 1163 | -P,--prefix = 1164 | Prefix using which the db was created(Defaults = kmer). The location of the db could also be provided. 1165 | -r 1166 | A separate reads file is created which has all the reads covering all the locus. 1167 | -s,--single 1168 | Flag for specifying single end files. 1169 | -t 1170 | Time for each analysis will also be reported. 1171 | -v 1172 | Prints the version of the software. 1173 | -x,--overwrite 1174 | By default stringMLST appends the results to the output_filename if same name is used. 1175 | This argument overwrites the previously specified output file. 1176 | -z,--fuzzy = 1177 | Threshold for reporting a fuzzy match (Default=300). For higher coverage reads this threshold should be set higher to avoid 1178 | indicating fuzzy match when exact match was more likely. For lower coverage reads, threshold of <100 is recommended 1179 | -h,--help 1180 | Prints the help manual for this application 1181 | ============================================================================================= 1182 | 3. stringMLST.py --getMLST 1183 | Synopsis: 1184 | stringMLST.py --getMLST --species= [-k kmer length] [-P DB prefix] 1185 | Required arguments 1186 | --getMLST 1187 | Identifier for getMLST module 1188 | --species= 1189 | Species name from the pubMLST schemes (use "--species show" to get list of available schemes) 1190 | "all" will download and build all 1191 | Optional arguments 1192 | -k = 1193 | Kmer size for which the db has to be formed(Default k = 35). Note the tool works best with kmer length in between 35 and 66 1194 | for read lengths of 55 to 150 bp. Kmer size can be increased accordingly. It is advised to keep lower kmer sizes 1195 | if the quality of reads is not very good. 1196 | -P,--prefix = 1197 | Prefix for db and log files to be created(Default = kmer). Also you can specify folder where you want the db to be created. 1198 | We recommend that prefix and config point to the same folder for cleanliness but this is not required 1199 | --schemes 1200 | Display the list of available schemes 1201 | -h,--help 1202 | Prints the help manual for this application 1203 | ============================================================================================= 1204 | Example usage: 1205 | ./stringMLST.py --buildDB 1206 | 1) Build DB 1207 | ./stringMLST.py --buildDB --config config.txt -k 35 -P NM 1208 | -------------------------------------------------------------------------------------------- 1209 | ./stringMLST.py --predict 1210 | 1) Single sample, paired end 1211 | ./stringMLST.py --predict -1 data/Neisseria/ERR017001_1.fastq -2 data/Neisseria/ERR017001_2.fastq -p --prefix NM -k 35 -o output.txt 1212 | 2) Single sample, single end, overwrite output 1213 | ./stringMLST.py --predict -1 data/Neisseria/ERR017001_1.fastq -s --prefix NM -k 35 -o output.txt -x 1214 | 3) Multiple sample batch mode, paired end 1215 | ./stringMLST.py --predict -d data/Neisseria/ -p --prefix NM -k 35 -o output.txt -x 1216 | 4) Multiple samples list mode, paired end 1217 | ./stringMLST.py --predict -l data/listFile.txt -p --prefix NM -k 35 -o output.txt -x 1218 | 5) Single, high coverage sample, paired end 1219 | ./stringMLST.py --predict -1 data/Neisseria/ERR017001_1.fastq -2 data/Neisseria/ERR017001_2.fastq -p --prefix NM -k 35 -z 1000 -o output.txt 1220 | -------------------------------------------------------------------------------------------- 1221 | ./stringMLST.py --getMLST 1222 | 1) List available schemes 1223 | ./stringMLST.py --getMLST --schemes 1224 | 2) Download the Neisseria spp. pubMLST scheme 1225 | ./stringMLST.py --getMLST --species=neisseria -P datasets/nmb 1226 | """ 1227 | helpTextSmall = """ 1228 | Usage 1229 | [--buildDB] 1230 | [--predict] 1231 | [-1 filename_fastq1][--fastq1 filename_fastq1] 1232 | [-2 filename_fastq2][--fastq2 filename_fastq2] 1233 | [-d directory][--dir directory][--directory directory] 1234 | [-l list_file][--list list_file] 1235 | [-p][--paired] 1236 | [-s][--single] 1237 | [-c][--config] 1238 | [-P][--prefix] 1239 | [-z][--fuzzy] 1240 | [-a] 1241 | [-C][--coverage] 1242 | [-k] 1243 | [-o output_filename][--output output_filename] 1244 | [-x][--overwrite] 1245 | [-t] 1246 | [-r] 1247 | [-v] 1248 | [-h][--help] 1249 | ============================================================================================== 1250 | There are two steps to predicting ST using stringMLST. 1251 | 1. Create DB : stringMLST.py --buildDB 1252 | 2. Predict : stringMLST --predict 1253 | 1. stringMLST.py --buildDB 1254 | Synopsis: 1255 | stringMLST.py --buildDB -c -k -P 1256 | Required arguments 1257 | --buildDB 1258 | Identifier for build db module 1259 | -c,--config = 1260 | Config file in the format described above. 1261 | All the files follow the structure followed by pubmlst. Refer extended document for details. 1262 | Optional arguments 1263 | -k = 1264 | Kmer size for which the db has to be formed(Default k = 35). Note the tool works best with kmer length in between 35 and 66 1265 | for read lengths of 55 to 150 bp. Kmer size can be increased accordingly. It is advised to keep lower kmer sizes 1266 | if the quality of reads is not very good. 1267 | -P,--prefix = 1268 | Prefix for db and log files to be created(Default = kmer). Also you can specify folder where you want the db to be created. 1269 | -h,--help 1270 | Prints the help manual for this application 1271 | ============================================================================================== 1272 | 2. stringMLST.py --predict 1273 | Synopsis 1274 | stringMLST.py --predict -1 -2 -d -l -p -s -P -k -o -x 1275 | Required arguments 1276 | --predict 1277 | Identifier for predict module 1278 | Optional arguments 1279 | -1,--fastq1 = 1280 | Path to first fastq file for paired end sample and path to the fastq file for single end file. 1281 | Should have extension fastq or fq. 1282 | -2,--fastq2 = 1283 | Path to second fastq file for paired end sample. 1284 | Should have extension fastq or fq. 1285 | -d,--dir,--directory = 1286 | BATCH MODE : Location of all the samples for batch mode. 1287 | -C,--coverage 1288 | Calculate sequence coverage for each allele. Turns on read generation (-r) and turns off fuzzy (-z 1) 1289 | Requires bwa, bamtools and samtools be in your path 1290 | -k = 1291 | Kmer length for which the db was created(Default k = 35). Could be verified by looking at the name of the db file. 1292 | Could be used if the reads are of very bad quality or have a lot of N's. 1293 | -l,--list = 1294 | LIST MODE : Location of list file and flag for list mode. 1295 | list file should have full file paths for all the samples/files. 1296 | Each sample takes one line. For paired end samples the 2 files should be tab separated on single line. 1297 | -o,--output = 1298 | Prints the output to a file instead of stdout. 1299 | -p,--paired 1300 | Flag for specifying paired end files. Default option so would work the same if you do not specify for all modes. 1301 | For batch mode the paired end samples should be differentiated by 1/2.fastq or 1/2.fq 1302 | -P,--prefix = 1303 | Prefix using which the db was created(Defaults = kmer). The location of the db could also be provided. 1304 | -r 1305 | A separate reads file is created which has all the reads covering all the locus. 1306 | -s,--single 1307 | Flag for specifying single end files. 1308 | -t 1309 | Time for each analysis will also be reported. 1310 | -v 1311 | Prints the version of the software. 1312 | -x,--overwrite 1313 | By default stringMLST appends the results to the output_filename if same name is used. 1314 | This argument overwrites the previously specified output file. 1315 | -z,--fuzzy = 1316 | Threshold for reporting a fuzzy match (Default=300). For higher coverage reads this threshold should be set higher to avoid 1317 | indicating fuzzy match when exact match was more likely. For lower coverage reads, threshold of <100 is recommended 1318 | -h,--help 1319 | Prints the help manual for this application 1320 | ============================================================================================= 1321 | 3. stringMLST.py --getMLST 1322 | Synopsis: 1323 | stringMLST.py --getMLST --species= [-k kmer length] [-P DB prefix] 1324 | Required arguments 1325 | --getMLST 1326 | Identifier for getMLST module 1327 | --species= 1328 | Species name from the pubMLST schemes 1329 | Use "show" or "list" to list available schemes 1330 | "all" will download and build all available schemes 1331 | Optional arguments 1332 | -k = 1333 | Kmer size for which the db has to be formed(Default k = 35). Note the tool works best with kmer length in between 35 and 66 1334 | for read lengths of 55 to 150 bp. Kmer size can be increased accordingly. It is advised to keep lower kmer sizes 1335 | if the quality of reads is not very good. 1336 | -P,--prefix = 1337 | Prefix for db and log files to be created(Default = kmer). Also you can specify folder where you want the db to be created. 1338 | We recommend that prefix and config point to the same folder for cleanliness but this is not required 1339 | --schemes 1340 | Display the list of available schemes 1341 | -h,--help 1342 | Prints the help manual for this application 1343 | ============================================================================================= 1344 | 1345 | """ 1346 | 1347 | """The Program Starts Execution Here""" 1348 | """Default Params""" 1349 | downloadDB = False 1350 | species = None 1351 | printSchemes = False 1352 | buildDB = False 1353 | predict = False 1354 | output_filename = None 1355 | batch = False 1356 | listMode = False 1357 | overwrite = False 1358 | paired = True 1359 | fastq1 = None 1360 | fastq2 = None 1361 | user_k = False 1362 | config = None 1363 | timeDisp = False 1364 | reads = False 1365 | dbPrefix = 'kmer' 1366 | log = '' 1367 | k = 35 1368 | fuzzy = 300 1369 | coverage = False 1370 | #print'ARGV :', sys.argv[1:] 1371 | #exit(0) 1372 | """Input arguments""" 1373 | options, remainder = getopt.getopt(sys.argv[1:], 'o:x1:2:k:l:bd:pshP:c:trva:z:C', [ 1374 | 'buildDB', 1375 | 'predict', 1376 | 'output=', 1377 | 'config=', 1378 | 'prefix=', 1379 | 'overwrite', 1380 | 'batch', 1381 | 'list', 1382 | 'fastq1=', 1383 | 'fastq2=', 1384 | 'dir=', 1385 | 'directory=', 1386 | 'paired', 1387 | 'single', 1388 | 'help', 1389 | 'fuzzy=', 1390 | 'coverage', 1391 | 'getMLST', 1392 | 'schemes', 1393 | 'species=']) 1394 | for opt, arg in options: 1395 | if opt in ('-o', '--output'): 1396 | output_filename = arg 1397 | elif opt in ('-x', '--overwrite'): 1398 | overwrite = True 1399 | elif opt in '--buildDB': 1400 | buildDB = True 1401 | elif opt in ('-P', '--prefix'): 1402 | dbPrefix = arg 1403 | elif opt in '--predict': 1404 | predict = True 1405 | elif opt in ('-c', '--config'): 1406 | config = arg 1407 | elif opt in '-k': 1408 | user_k = True 1409 | try: 1410 | k = int(arg) 1411 | except ValueError: 1412 | print("Error: Enter a numerical k value.") 1413 | exit(0) 1414 | # Check to make sure the arg is an int. 1415 | elif opt in ('-l', '--list'): 1416 | listMode = True 1417 | fList = arg 1418 | elif opt in ('-1', '--fastq1'): 1419 | fastq1 = arg 1420 | elif opt in ('-2', '--fastq2'): 1421 | fastq2 = arg 1422 | elif opt in ('-d', '--dir', '--directory'): 1423 | dir = arg 1424 | batch = True 1425 | elif opt in ('-p', '--paired'): 1426 | paired = True 1427 | single = False 1428 | elif opt in ('-s', '--single'): 1429 | single = True 1430 | paired = False 1431 | elif opt in '-t': 1432 | timeDisp = True 1433 | elif opt in '-a': 1434 | log = arg 1435 | elif opt in '-r': 1436 | reads = True 1437 | elif opt in '-v': 1438 | print(version) 1439 | exit(0) 1440 | elif opt in ('-C', '--coverage'): 1441 | coverage = True 1442 | reads = True 1443 | fuzzy = 1 1444 | elif opt in ('-z', '--fuzzy'): 1445 | try: 1446 | fuzzy = int(arg) 1447 | except ValueError: 1448 | print("You provided '" + arg + "' for your fuzziness threshold, which is not an integer value") 1449 | exit(0) 1450 | elif opt in '--schemes': 1451 | print("The `--schemes` option has been depreciated. Please use `--species list` to see available schemes") 1452 | exit(0) 1453 | elif opt in '--getMLST': 1454 | downloadDB = True 1455 | elif opt in '--species': 1456 | species = arg 1457 | elif opt in ('-h', '--help'): 1458 | print(helpText) 1459 | exit(0) 1460 | checkParams(buildDB, predict, config, k, listMode, list, batch, dir, fastq1, fastq2, paired, dbPrefix) 1461 | if buildDB is True: 1462 | try: 1463 | if not log: 1464 | log = dbPrefix+'.log' 1465 | except TypeError: 1466 | log = 'kmer.log' 1467 | logging.basicConfig(filename=log, level=logging.DEBUG, format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p') 1468 | if os.path.isfile(config): 1469 | print("Info: Making DB for k = ", k) 1470 | print("Info: Making DB with prefix =", dbPrefix) 1471 | print("Info: Log file written to ", log) 1472 | makeCustomDB(config, k, dbPrefix) 1473 | else: 1474 | print("Error: The input config file "+config +" does not exist.") 1475 | elif predict is True: 1476 | try: 1477 | if not log: 1478 | log = dbPrefix+'.log' 1479 | except TypeError: 1480 | log = 'kmer.log' 1481 | logging.basicConfig(filename=log, level=logging.DEBUG, format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p') 1482 | loadModule(k, dbPrefix) 1483 | if batch is True: 1484 | results = batchTool(dir, paired, k) 1485 | elif listMode is True: 1486 | results = listTool(fList, paired, k) 1487 | else: 1488 | results = {} 1489 | results = singleSampleTool(fastq1, fastq2, paired, k, results) 1490 | if coverage is True: 1491 | try: 1492 | from pyfaidx import Fasta 1493 | except ImportError: 1494 | print("pyfaidx is required for coverage calculation\npip install pyfaidx") 1495 | exit(0) 1496 | loadConfig(config) 1497 | getCoverage(results) 1498 | printResults(results, output_filename, overwrite, timeDisp) 1499 | elif downloadDB is True: 1500 | dbURL = "http://pubmlst.org/data/dbases.xml" 1501 | databaseXML = urlopen(dbURL) 1502 | dbTree = ET.parse(databaseXML) 1503 | dbRoot = dbTree.getroot() 1504 | if species is None: 1505 | print("Please refer to --help to more information") 1506 | print() 1507 | print("Expected command format:") 1508 | print("stringMLST.py --getMLST --species= [-k kmer length] [-P DB prefix]") 1509 | print() 1510 | print("To printavailable MLST Schemes use:") 1511 | print("stringMLST.py --getMLST --species show") 1512 | exit(0) 1513 | elif species == "show" or species == "list": 1514 | for species in dbRoot: 1515 | print(species.text.rstrip()) 1516 | elif species == "all": 1517 | print("Using a kmer size of " + str(k) + " for all databases.") 1518 | for species in dbRoot: 1519 | speciesName = species.text.rstrip() 1520 | print('\033[1m' + "Preparing: " + speciesName + '\033[0m') 1521 | if re.search('[/#. ()]', speciesName): 1522 | normSpeciesName = re.sub('[/# ]', "_", speciesName) 1523 | normSpeciesName = re.sub('[.()]', "", normSpeciesName) 1524 | print('\t\033[33m' + "INFO: normalizing name to: " + normSpeciesName + '\033[0m') 1525 | else: 1526 | normSpeciesName = speciesName 1527 | filePrefix = str(dbPrefix.rsplit("/", 1)[0]) + "/" + normSpeciesName 1528 | # Move the rest of this informational message into the download handler 1529 | # + " ( " + filePrefix + "/" + key + "_" +str(k) + " )") 1530 | try: 1531 | os.makedirs(filePrefix) 1532 | except OSError: 1533 | pass 1534 | filePrefix = filePrefix + "/" + normSpeciesName 1535 | config = filePrefix + "_config.txt" 1536 | profileURL, loci = get_links(dbRoot, filePrefix, speciesName) 1537 | get_files(filePrefix, loci, profileURL, speciesName) 1538 | else: 1539 | print('\033[1m' + "Preparing: " + species + '\033[0m') 1540 | if re.search('[/#. ()]', species): 1541 | normSpeciesName = re.sub('[/# ]', "_", species) 1542 | normSpeciesName = re.sub('[.()]', "", normSpeciesName) 1543 | print('\t\033[33m' + "INFO: normalizing name to: " + normSpeciesName + '\033[0m') 1544 | else: 1545 | normSpeciesName = species 1546 | try: 1547 | os.makedirs(dbPrefix.rsplit("/", 1)[0]) 1548 | except OSError: 1549 | pass 1550 | if len(re.findall("/", dbPrefix)) == 0: 1551 | filePrefix = dbPrefix + "/" + normSpeciesName 1552 | elif len(re.findall("/", dbPrefix)) == 1 and len(dbPrefix.rsplit("/", 1)[1]) > 0: 1553 | filePrefix = dbPrefix 1554 | elif len(re.findall("/", dbPrefix)) == 1 and len(dbPrefix.rsplit("/", 1)[1]) == 0: 1555 | filePrefix = dbPrefix + normSpeciesName 1556 | elif len(re.findall("/", dbPrefix)) > 1: 1557 | if dbPrefix.endswith('/'): 1558 | filePrefix = dbPrefix + normSpeciesName 1559 | else: 1560 | filePrefix = dbPrefix 1561 | config = filePrefix + "_config.txt" 1562 | profileURL, loci = get_links(dbRoot, filePrefix, species) 1563 | get_files(filePrefix, loci, profileURL, species) 1564 | else: 1565 | print(helpTextSmall) 1566 | print("Error: Please select the mode: buildDB (for database building) or predict (for ST discovery) module") 1567 | logging.debug('Command :' + str(sys.argv)) 1568 | 1569 | --------------------------------------------------------------------------------