├── setup.cfg
├── tests
    └── fastqs
    │   └── .desc
├── .gitmodules
├── PATENTS
├── .gitignore
├── download_example_reads.sh
├── setup.py
├── LICENSE
├── LICENSE.txt
├── README.md
└── stringMLST.py


/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | description-file = README.md
3 | 


--------------------------------------------------------------------------------
/tests/fastqs/.desc:
--------------------------------------------------------------------------------
1 | location of downloaded test fastq files
2 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "datasets"]
2 | 	path = datasets
3 | 	url = https://github.com/jordanlab/stringMLST_datasets
4 | 


--------------------------------------------------------------------------------
/PATENTS:
--------------------------------------------------------------------------------
 1 | Patent Rights Grant
 2 | 
 3 | Some portions of the allele selection algorithm in stringMLST are patent
 4 | pending.
 5 | 
 6 | The Jordan Lab and Applied Bioinformatics Laboratory / IHRC Inc.  (ABiL-IHRC)
 7 | and its affiliates promise not to assert any stringMLST related patents against
 8 | you for using or modifying stringMLST for non-commercial, academic use
 9 | consistent with the terms stated in the License.
10 | 
11 | No rights other than those explicitly stated in the License or this Promise are
12 | granted or waived.
13 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Folder view configuration files
 2 | .DS_Store
 3 | Desktop.ini
 4 | 
 5 | # Thumbnail cache files
 6 | ._*
 7 | Thumbs.db
 8 | 
 9 | # Files that might appear on external disks
10 | .Spotlight-V100
11 | .Trashes
12 | 
13 | # Compiled Python files
14 | *.pyc
15 | 
16 | # Compiled C++ files
17 | *.out
18 | # Byte-compiled / optimized / DLL files
19 | __pycache__/
20 | *.py[cod]
21 | *$py.class
22 | 
23 | # C extensions
24 | *.so
25 | 
26 | # Distribution / packaging
27 | .Python
28 | env/
29 | build/
30 | develop-eggs/
31 | dist/
32 | downloads/
33 | eggs/
34 | .eggs/
35 | lib/
36 | lib64/
37 | parts/
38 | sdist/
39 | var/
40 | wheels/
41 | *.egg-info/
42 | .installed.cfg
43 | *.egg
44 | 
45 | # PyInstaller
46 | #  Usually these files are written by a python script from a template
47 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
48 | *.manifest
49 | *.spec
50 | 
51 | 


--------------------------------------------------------------------------------
/download_example_reads.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | testdir='tests/fastqs'
 4 | 
 5 | echo -e "Downloading fastq files from EBI..."
 6 | 
 7 | wget ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR026/ERR026529/ERR026529_1.fastq.gz -P $testdir
 8 | wget ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR026/ERR026529/ERR026529_2.fastq.gz -P $testdir
 9 | wget ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR027/ERR027250/ERR027250_1.fastq.gz -P $testdir
10 | wget ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR027/ERR027250/ERR027250_2.fastq.gz -P $testdir
11 | wget ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR036/ERR036104/ERR036104_1.fastq.gz -P $testdir
12 | wget ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR036/ERR036104/ERR036104_2.fastq.gz -P $testdir
13 | 
14 | echo -e "Done downloading.\n\n"
15 | 
16 | for file in `ls $testdir/*` ; do
17 |     echo -e "Attempting to unzip $file..."
18 |     gunzip $file;
19 |     echo -e "Done unzipping $file.\n"
20 | done
21 | 
22 | echo -e "Done downloading and extrating test read files."
23 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | try:
 3 |   import os
 4 |   from setuptools import setup, find_packages
 5 | except ImportError:
 6 |   from distutils.core import setup
 7 | from os import path
 8 | here = path.abspath(path.dirname(__file__))
 9 | def readme(file):
10 |   with open(path.join(here, 'README.md')) as fh:
11 |       long_description_text = fh.read()
12 |   return(long_description_text)
13 | 
14 | setup(
15 |   name = 'stringMLST',
16 |   scripts = ['stringMLST.py'],
17 |   version = "0.6.1",
18 |   description = 'Fast k-mer based tool for alignment and assembly-free multi locus sequence typing (MLST) directly from genome sequencing reads.',
19 |   long_description=readme('README.md'),
20 |   long_description_content_type="text/markdown",
21 |   author = 'Jordan Lab',
22 |   author_email = 'pypi@atc.io',
23 |   url = 'https://github.com/jordanlab/stringMLST',
24 |   keywords = ['MLST', 'kmer', "NGS", "stringMSLT"],
25 |   classifiers = [
26 |       'Programming Language :: Python :: 2.7',
27 |       'Programming Language :: Python :: 3.5',
28 |   ],
29 |   install_requires=['lxml','pyfaidx'],
30 | )
31 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International Public License
 2 | 
 3 | By exercising the Licensed Rights (defined below), You accept and agree to be bound by the terms and conditions of this Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International Public License ("Public License"). To the extent this Public License may be interpreted as a contract, You are granted the Licensed Rights in consideration of Your acceptance of these terms and conditions, and the Licensor grants You such rights in consideration of benefits the Licensor receives from making the Licensed Material available under these terms and conditions.
 4 | 
 5 | Section 1 – Definitions.
 6 | 
 7 | Adapted Material means material subject to Copyright and Similar Rights that is derived from or based upon the Licensed Material and in which the Licensed Material is translated, altered, arranged, transformed, or otherwise modified in a manner requiring permission under the Copyright and Similar Rights held by the Licensor. For purposes of this Public License, where the Licensed Material is a musical work, performance, or sound recording, Adapted Material is always produced where the Licensed Material is synched in timed relation with a moving image.
 8 | Adapter's License means the license You apply to Your Copyright and Similar Rights in Your contributions to Adapted Material in accordance with the terms and conditions of this Public License.
 9 | BY-NC-SA Compatible License means a license listed at creativecommons.org/compatiblelicenses, approved by Creative Commons as essentially the equivalent of this Public License.
10 | Copyright and Similar Rights means copyright and/or similar rights closely related to copyright including, without limitation, performance, broadcast, sound recording, and Sui Generis Database Rights, without regard to how the rights are labeled or categorized. For purposes of this Public License, the rights specified in Section 2(b)(1)-(2) are not Copyright and Similar Rights.
11 | Effective Technological Measures means those measures that, in the absence of proper authority, may not be circumvented under laws fulfilling obligations under Article 11 of the WIPO Copyright Treaty adopted on December 20, 1996, and/or similar international agreements.
12 | Exceptions and Limitations means fair use, fair dealing, and/or any other exception or limitation to Copyright and Similar Rights that applies to Your use of the Licensed Material.
13 | License Elements means the license attributes listed in the name of a Creative Commons Public License. The License Elements of this Public License are Attribution, NonCommercial, and ShareAlike.
14 | Licensed Material means the artistic or literary work, database, or other material to which the Licensor applied this Public License.
15 | Licensed Rights means the rights granted to You subject to the terms and conditions of this Public License, which are limited to all Copyright and Similar Rights that apply to Your use of the Licensed Material and that the Licensor has authority to license.
16 | Licensor means the individual(s) or entity(ies) granting rights under this Public License.
17 | NonCommercial means not primarily intended for or directed towards commercial advantage or monetary compensation. For purposes of this Public License, the exchange of the Licensed Material for other material subject to Copyright and Similar Rights by digital file-sharing or similar means is NonCommercial provided there is no payment of monetary compensation in connection with the exchange.
18 | Share means to provide material to the public by any means or process that requires permission under the Licensed Rights, such as reproduction, public display, public performance, distribution, dissemination, communication, or importation, and to make material available to the public including in ways that members of the public may access the material from a place and at a time individually chosen by them.
19 | Sui Generis Database Rights means rights other than copyright resulting from Directive 96/9/EC of the European Parliament and of the Council of 11 March 1996 on the legal protection of databases, as amended and/or succeeded, as well as other essentially equivalent rights anywhere in the world.
20 | You means the individual or entity exercising the Licensed Rights under this Public License. Your has a corresponding meaning.
21 | Section 2 – Scope.
22 | 
23 | License grant.
24 | Subject to the terms and conditions of this Public License, the Licensor hereby grants You a worldwide, royalty-free, non-sublicensable, non-exclusive, irrevocable license to exercise the Licensed Rights in the Licensed Material to:
25 | reproduce and Share the Licensed Material, in whole or in part, for NonCommercial purposes only; and
26 | produce, reproduce, and Share Adapted Material for NonCommercial purposes only.
27 | Exceptions and Limitations. For the avoidance of doubt, where Exceptions and Limitations apply to Your use, this Public License does not apply, and You do not need to comply with its terms and conditions.
28 | Term. The term of this Public License is specified in Section 6(a).
29 | Media and formats; technical modifications allowed. The Licensor authorizes You to exercise the Licensed Rights in all media and formats whether now known or hereafter created, and to make technical modifications necessary to do so. The Licensor waives and/or agrees not to assert any right or authority to forbid You from making technical modifications necessary to exercise the Licensed Rights, including technical modifications necessary to circumvent Effective Technological Measures. For purposes of this Public License, simply making modifications authorized by this Section 2(a)(4) never produces Adapted Material.
30 | Downstream recipients.
31 | Offer from the Licensor – Licensed Material. Every recipient of the Licensed Material automatically receives an offer from the Licensor to exercise the Licensed Rights under the terms and conditions of this Public License.
32 | Additional offer from the Licensor – Adapted Material. Every recipient of Adapted Material from You automatically receives an offer from the Licensor to exercise the Licensed Rights in the Adapted Material under the conditions of the Adapter’s License You apply.
33 | No downstream restrictions. You may not offer or impose any additional or different terms or conditions on, or apply any Effective Technological Measures to, the Licensed Material if doing so restricts exercise of the Licensed Rights by any recipient of the Licensed Material.
34 | No endorsement. Nothing in this Public License constitutes or may be construed as permission to assert or imply that You are, or that Your use of the Licensed Material is, connected with, or sponsored, endorsed, or granted official status by, the Licensor or others designated to receive attribution as provided in Section 3(a)(1)(A)(i).
35 | Other rights.
36 | 
37 | Moral rights, such as the right of integrity, are not licensed under this Public License, nor are publicity, privacy, and/or other similar personality rights; however, to the extent possible, the Licensor waives and/or agrees not to assert any such rights held by the Licensor to the limited extent necessary to allow You to exercise the Licensed Rights, but not otherwise.
38 | Patent and trademark rights are not licensed under this Public License.
39 | To the extent possible, the Licensor waives any right to collect royalties from You for the exercise of the Licensed Rights, whether directly or through a collecting society under any voluntary or waivable statutory or compulsory licensing scheme. In all other cases the Licensor expressly reserves any right to collect such royalties, including when the Licensed Material is used other than for NonCommercial purposes.
40 | Section 3 – License Conditions.
41 | 
42 | Your exercise of the Licensed Rights is expressly made subject to the following conditions.
43 | 
44 | Attribution.
45 | 
46 | If You Share the Licensed Material (including in modified form), You must:
47 | 
48 | retain the following if it is supplied by the Licensor with the Licensed Material:
49 | identification of the creator(s) of the Licensed Material and any others designated to receive attribution, in any reasonable manner requested by the Licensor (including by pseudonym if designated);
50 | a copyright notice;
51 | a notice that refers to this Public License;
52 | a notice that refers to the disclaimer of warranties;
53 | a URI or hyperlink to the Licensed Material to the extent reasonably practicable;
54 | indicate if You modified the Licensed Material and retain an indication of any previous modifications; and
55 | indicate the Licensed Material is licensed under this Public License, and include the text of, or the URI or hyperlink to, this Public License.
56 | You may satisfy the conditions in Section 3(a)(1) in any reasonable manner based on the medium, means, and context in which You Share the Licensed Material. For example, it may be reasonable to satisfy the conditions by providing a URI or hyperlink to a resource that includes the required information.
57 | If requested by the Licensor, You must remove any of the information required by Section 3(a)(1)(A) to the extent reasonably practicable.
58 | ShareAlike.
59 | In addition to the conditions in Section 3(a), if You Share Adapted Material You produce, the following conditions also apply.
60 | 
61 | The Adapter’s License You apply must be a Creative Commons license with the same License Elements, this version or later, or a BY-NC-SA Compatible License.
62 | You must include the text of, or the URI or hyperlink to, the Adapter's License You apply. You may satisfy this condition in any reasonable manner based on the medium, means, and context in which You Share Adapted Material.
63 | You may not offer or impose any additional or different terms or conditions on, or apply any Effective Technological Measures to, Adapted Material that restrict exercise of the rights granted under the Adapter's License You apply.
64 | Section 4 – Sui Generis Database Rights.
65 | 
66 | Where the Licensed Rights include Sui Generis Database Rights that apply to Your use of the Licensed Material:
67 | 
68 | for the avoidance of doubt, Section 2(a)(1) grants You the right to extract, reuse, reproduce, and Share all or a substantial portion of the contents of the database for NonCommercial purposes only;
69 | if You include all or a substantial portion of the database contents in a database in which You have Sui Generis Database Rights, then the database in which You have Sui Generis Database Rights (but not its individual contents) is Adapted Material, including for purposes of Section 3(b); and
70 | You must comply with the conditions in Section 3(a) if You Share all or a substantial portion of the contents of the database.
71 | For the avoidance of doubt, this Section 4 supplements and does not replace Your obligations under this Public License where the Licensed Rights include other Copyright and Similar Rights.
72 | Section 5 – Disclaimer of Warranties and Limitation of Liability.
73 | 
74 | Unless otherwise separately undertaken by the Licensor, to the extent possible, the Licensor offers the Licensed Material as-is and as-available, and makes no representations or warranties of any kind concerning the Licensed Material, whether express, implied, statutory, or other. This includes, without limitation, warranties of title, merchantability, fitness for a particular purpose, non-infringement, absence of latent or other defects, accuracy, or the presence or absence of errors, whether or not known or discoverable. Where disclaimers of warranties are not allowed in full or in part, this disclaimer may not apply to You.
75 | To the extent possible, in no event will the Licensor be liable to You on any legal theory (including, without limitation, negligence) or otherwise for any direct, special, indirect, incidental, consequential, punitive, exemplary, or other losses, costs, expenses, or damages arising out of this Public License or use of the Licensed Material, even if the Licensor has been advised of the possibility of such losses, costs, expenses, or damages. Where a limitation of liability is not allowed in full or in part, this limitation may not apply to You.
76 | The disclaimer of warranties and limitation of liability provided above shall be interpreted in a manner that, to the extent possible, most closely approximates an absolute disclaimer and waiver of all liability.
77 | Section 6 – Term and Termination.
78 | 
79 | This Public License applies for the term of the Copyright and Similar Rights licensed here. However, if You fail to comply with this Public License, then Your rights under this Public License terminate automatically.
80 | Where Your right to use the Licensed Material has terminated under Section 6(a), it reinstates:
81 | 
82 | automatically as of the date the violation is cured, provided it is cured within 30 days of Your discovery of the violation; or
83 | upon express reinstatement by the Licensor.
84 | For the avoidance of doubt, this Section 6(b) does not affect any right the Licensor may have to seek remedies for Your violations of this Public License.
85 | For the avoidance of doubt, the Licensor may also offer the Licensed Material under separate terms or conditions or stop distributing the Licensed Material at any time; however, doing so will not terminate this Public License.
86 | Sections 1, 5, 6, 7, and 8 survive termination of this Public License.
87 | Section 7 – Other Terms and Conditions.
88 | 
89 | The Licensor shall not be bound by any additional or different terms or conditions communicated by You unless expressly agreed.
90 | Any arrangements, understandings, or agreements regarding the Licensed Material not stated herein are separate from and independent of the terms and conditions of this Public License.
91 | Section 8 – Interpretation.
92 | 
93 | For the avoidance of doubt, this Public License does not, and shall not be interpreted to, reduce, limit, restrict, or impose conditions on any use of the Licensed Material that could lawfully be made without permission under this Public License.
94 | To the extent possible, if any provision of this Public License is deemed unenforceable, it shall be automatically reformed to the minimum extent necessary to make it enforceable. If the provision cannot be reformed, it shall be severed from this Public License without affecting the enforceability of the remaining terms and conditions.
95 | No term or condition of this Public License will be waived and no failure to comply consented to unless expressly agreed to by the Licensor.
96 | Nothing in this Public License constitutes or may be interpreted as a limitation upon, or waiver of, any privileges and immunities that apply to the Licensor or You, including from the legal processes of any jurisdiction or authority.
97 | 


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International Public License
 2 | 
 3 | By exercising the Licensed Rights (defined below), You accept and agree to be bound by the terms and conditions of this Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International Public License ("Public License"). To the extent this Public License may be interpreted as a contract, You are granted the Licensed Rights in consideration of Your acceptance of these terms and conditions, and the Licensor grants You such rights in consideration of benefits the Licensor receives from making the Licensed Material available under these terms and conditions.
 4 | 
 5 | Section 1 – Definitions.
 6 | 
 7 | Adapted Material means material subject to Copyright and Similar Rights that is derived from or based upon the Licensed Material and in which the Licensed Material is translated, altered, arranged, transformed, or otherwise modified in a manner requiring permission under the Copyright and Similar Rights held by the Licensor. For purposes of this Public License, where the Licensed Material is a musical work, performance, or sound recording, Adapted Material is always produced where the Licensed Material is synched in timed relation with a moving image.
 8 | Adapter's License means the license You apply to Your Copyright and Similar Rights in Your contributions to Adapted Material in accordance with the terms and conditions of this Public License.
 9 | BY-NC-SA Compatible License means a license listed at creativecommons.org/compatiblelicenses, approved by Creative Commons as essentially the equivalent of this Public License.
10 | Copyright and Similar Rights means copyright and/or similar rights closely related to copyright including, without limitation, performance, broadcast, sound recording, and Sui Generis Database Rights, without regard to how the rights are labeled or categorized. For purposes of this Public License, the rights specified in Section 2(b)(1)-(2) are not Copyright and Similar Rights.
11 | Effective Technological Measures means those measures that, in the absence of proper authority, may not be circumvented under laws fulfilling obligations under Article 11 of the WIPO Copyright Treaty adopted on December 20, 1996, and/or similar international agreements.
12 | Exceptions and Limitations means fair use, fair dealing, and/or any other exception or limitation to Copyright and Similar Rights that applies to Your use of the Licensed Material.
13 | License Elements means the license attributes listed in the name of a Creative Commons Public License. The License Elements of this Public License are Attribution, NonCommercial, and ShareAlike.
14 | Licensed Material means the artistic or literary work, database, or other material to which the Licensor applied this Public License.
15 | Licensed Rights means the rights granted to You subject to the terms and conditions of this Public License, which are limited to all Copyright and Similar Rights that apply to Your use of the Licensed Material and that the Licensor has authority to license.
16 | Licensor means the individual(s) or entity(ies) granting rights under this Public License.
17 | NonCommercial means not primarily intended for or directed towards commercial advantage or monetary compensation. For purposes of this Public License, the exchange of the Licensed Material for other material subject to Copyright and Similar Rights by digital file-sharing or similar means is NonCommercial provided there is no payment of monetary compensation in connection with the exchange.
18 | Share means to provide material to the public by any means or process that requires permission under the Licensed Rights, such as reproduction, public display, public performance, distribution, dissemination, communication, or importation, and to make material available to the public including in ways that members of the public may access the material from a place and at a time individually chosen by them.
19 | Sui Generis Database Rights means rights other than copyright resulting from Directive 96/9/EC of the European Parliament and of the Council of 11 March 1996 on the legal protection of databases, as amended and/or succeeded, as well as other essentially equivalent rights anywhere in the world.
20 | You means the individual or entity exercising the Licensed Rights under this Public License. Your has a corresponding meaning.
21 | Section 2 – Scope.
22 | 
23 | License grant.
24 | Subject to the terms and conditions of this Public License, the Licensor hereby grants You a worldwide, royalty-free, non-sublicensable, non-exclusive, irrevocable license to exercise the Licensed Rights in the Licensed Material to:
25 | reproduce and Share the Licensed Material, in whole or in part, for NonCommercial purposes only; and
26 | produce, reproduce, and Share Adapted Material for NonCommercial purposes only.
27 | Exceptions and Limitations. For the avoidance of doubt, where Exceptions and Limitations apply to Your use, this Public License does not apply, and You do not need to comply with its terms and conditions.
28 | Term. The term of this Public License is specified in Section 6(a).
29 | Media and formats; technical modifications allowed. The Licensor authorizes You to exercise the Licensed Rights in all media and formats whether now known or hereafter created, and to make technical modifications necessary to do so. The Licensor waives and/or agrees not to assert any right or authority to forbid You from making technical modifications necessary to exercise the Licensed Rights, including technical modifications necessary to circumvent Effective Technological Measures. For purposes of this Public License, simply making modifications authorized by this Section 2(a)(4) never produces Adapted Material.
30 | Downstream recipients.
31 | Offer from the Licensor – Licensed Material. Every recipient of the Licensed Material automatically receives an offer from the Licensor to exercise the Licensed Rights under the terms and conditions of this Public License.
32 | Additional offer from the Licensor – Adapted Material. Every recipient of Adapted Material from You automatically receives an offer from the Licensor to exercise the Licensed Rights in the Adapted Material under the conditions of the Adapter’s License You apply.
33 | No downstream restrictions. You may not offer or impose any additional or different terms or conditions on, or apply any Effective Technological Measures to, the Licensed Material if doing so restricts exercise of the Licensed Rights by any recipient of the Licensed Material.
34 | No endorsement. Nothing in this Public License constitutes or may be construed as permission to assert or imply that You are, or that Your use of the Licensed Material is, connected with, or sponsored, endorsed, or granted official status by, the Licensor or others designated to receive attribution as provided in Section 3(a)(1)(A)(i).
35 | Other rights.
36 | 
37 | Moral rights, such as the right of integrity, are not licensed under this Public License, nor are publicity, privacy, and/or other similar personality rights; however, to the extent possible, the Licensor waives and/or agrees not to assert any such rights held by the Licensor to the limited extent necessary to allow You to exercise the Licensed Rights, but not otherwise.
38 | Patent and trademark rights are not licensed under this Public License.
39 | To the extent possible, the Licensor waives any right to collect royalties from You for the exercise of the Licensed Rights, whether directly or through a collecting society under any voluntary or waivable statutory or compulsory licensing scheme. In all other cases the Licensor expressly reserves any right to collect such royalties, including when the Licensed Material is used other than for NonCommercial purposes.
40 | Section 3 – License Conditions.
41 | 
42 | Your exercise of the Licensed Rights is expressly made subject to the following conditions.
43 | 
44 | Attribution.
45 | 
46 | If You Share the Licensed Material (including in modified form), You must:
47 | 
48 | retain the following if it is supplied by the Licensor with the Licensed Material:
49 | identification of the creator(s) of the Licensed Material and any others designated to receive attribution, in any reasonable manner requested by the Licensor (including by pseudonym if designated);
50 | a copyright notice;
51 | a notice that refers to this Public License;
52 | a notice that refers to the disclaimer of warranties;
53 | a URI or hyperlink to the Licensed Material to the extent reasonably practicable;
54 | indicate if You modified the Licensed Material and retain an indication of any previous modifications; and
55 | indicate the Licensed Material is licensed under this Public License, and include the text of, or the URI or hyperlink to, this Public License.
56 | You may satisfy the conditions in Section 3(a)(1) in any reasonable manner based on the medium, means, and context in which You Share the Licensed Material. For example, it may be reasonable to satisfy the conditions by providing a URI or hyperlink to a resource that includes the required information.
57 | If requested by the Licensor, You must remove any of the information required by Section 3(a)(1)(A) to the extent reasonably practicable.
58 | ShareAlike.
59 | In addition to the conditions in Section 3(a), if You Share Adapted Material You produce, the following conditions also apply.
60 | 
61 | The Adapter’s License You apply must be a Creative Commons license with the same License Elements, this version or later, or a BY-NC-SA Compatible License.
62 | You must include the text of, or the URI or hyperlink to, the Adapter's License You apply. You may satisfy this condition in any reasonable manner based on the medium, means, and context in which You Share Adapted Material.
63 | You may not offer or impose any additional or different terms or conditions on, or apply any Effective Technological Measures to, Adapted Material that restrict exercise of the rights granted under the Adapter's License You apply.
64 | Section 4 – Sui Generis Database Rights.
65 | 
66 | Where the Licensed Rights include Sui Generis Database Rights that apply to Your use of the Licensed Material:
67 | 
68 | for the avoidance of doubt, Section 2(a)(1) grants You the right to extract, reuse, reproduce, and Share all or a substantial portion of the contents of the database for NonCommercial purposes only;
69 | if You include all or a substantial portion of the database contents in a database in which You have Sui Generis Database Rights, then the database in which You have Sui Generis Database Rights (but not its individual contents) is Adapted Material, including for purposes of Section 3(b); and
70 | You must comply with the conditions in Section 3(a) if You Share all or a substantial portion of the contents of the database.
71 | For the avoidance of doubt, this Section 4 supplements and does not replace Your obligations under this Public License where the Licensed Rights include other Copyright and Similar Rights.
72 | Section 5 – Disclaimer of Warranties and Limitation of Liability.
73 | 
74 | Unless otherwise separately undertaken by the Licensor, to the extent possible, the Licensor offers the Licensed Material as-is and as-available, and makes no representations or warranties of any kind concerning the Licensed Material, whether express, implied, statutory, or other. This includes, without limitation, warranties of title, merchantability, fitness for a particular purpose, non-infringement, absence of latent or other defects, accuracy, or the presence or absence of errors, whether or not known or discoverable. Where disclaimers of warranties are not allowed in full or in part, this disclaimer may not apply to You.
75 | To the extent possible, in no event will the Licensor be liable to You on any legal theory (including, without limitation, negligence) or otherwise for any direct, special, indirect, incidental, consequential, punitive, exemplary, or other losses, costs, expenses, or damages arising out of this Public License or use of the Licensed Material, even if the Licensor has been advised of the possibility of such losses, costs, expenses, or damages. Where a limitation of liability is not allowed in full or in part, this limitation may not apply to You.
76 | The disclaimer of warranties and limitation of liability provided above shall be interpreted in a manner that, to the extent possible, most closely approximates an absolute disclaimer and waiver of all liability.
77 | Section 6 – Term and Termination.
78 | 
79 | This Public License applies for the term of the Copyright and Similar Rights licensed here. However, if You fail to comply with this Public License, then Your rights under this Public License terminate automatically.
80 | Where Your right to use the Licensed Material has terminated under Section 6(a), it reinstates:
81 | 
82 | automatically as of the date the violation is cured, provided it is cured within 30 days of Your discovery of the violation; or
83 | upon express reinstatement by the Licensor.
84 | For the avoidance of doubt, this Section 6(b) does not affect any right the Licensor may have to seek remedies for Your violations of this Public License.
85 | For the avoidance of doubt, the Licensor may also offer the Licensed Material under separate terms or conditions or stop distributing the Licensed Material at any time; however, doing so will not terminate this Public License.
86 | Sections 1, 5, 6, 7, and 8 survive termination of this Public License.
87 | Section 7 – Other Terms and Conditions.
88 | 
89 | The Licensor shall not be bound by any additional or different terms or conditions communicated by You unless expressly agreed.
90 | Any arrangements, understandings, or agreements regarding the Licensed Material not stated herein are separate from and independent of the terms and conditions of this Public License.
91 | Section 8 – Interpretation.
92 | 
93 | For the avoidance of doubt, this Public License does not, and shall not be interpreted to, reduce, limit, restrict, or impose conditions on any use of the Licensed Material that could lawfully be made without permission under this Public License.
94 | To the extent possible, if any provision of this Public License is deemed unenforceable, it shall be automatically reformed to the minimum extent necessary to make it enforceable. If the provision cannot be reformed, it shall be severed from this Public License without affecting the enforceability of the remaining terms and conditions.
95 | No term or condition of this Public License will be waived and no failure to comply consented to unless expressly agreed to by the Licensor.
96 | Nothing in this Public License constitutes or may be interpreted as a limitation upon, or waiver of, any privileges and immunities that apply to the Licensor or You, including from the legal processes of any jurisdiction or authority.
97 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # stringMLST
  2 | 
  3 | Fast k-mer based tool for multi locus sequence typing (MLST)
  4 | stringMLST is a tool for detecting the MLST of an isolate directly from the genome sequencing reads. stringMLST predicts the ST of an isolate in a completely assembly and alignment free manner. The tool is designed in a light-weight, platform-independent fashion with minimum dependencies.
  5 | 
  6 | Some portions of the allele selection algorithm in stringMLST are patent
  7 | pending.  Please refer to the PATENTS file for additional information
  8 | regarding licensing and use.
  9 | 
 10 | 
 11 | Reference
 12 | *http://jordan.biology.gatech.edu/page/software/stringmlst/*
 13 | 
 14 | Abstract
 15 | *http://bioinformatics.oxfordjournals.org/content/early/2016/09/06/bioinformatics.btw586.short?rss=1*
 16 | 
 17 | Application Note
 18 | *http://bioinformatics.oxfordjournals.org/content/early/2016/09/06/bioinformatics.btw586.full.pdf+html*
 19 | 
 20 | [![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat-square)](http://bioconda.github.io/recipes/stringmlst/README.html)  [![PyPI version](https://badge.fury.io/py/stringMLST.svg)](https://badge.fury.io/py/stringMLST)  ![downloads](https://img.shields.io/conda/dn/bioconda/stringmlst.svg?style=flat) [![container ready](https://quay.io/repository/biocontainers/stringmlst/status)](https://quay.io/repository/biocontainers/stringmlst)
 21 | 
 22 | 
 23 | 
 24 | **stringMLST is a *tool* not a *database*, always use the most up-to-date database files as possible.** To facilitate
 25 | keeping your databases updated, stringMLST can download and build databases from pubMLST using the most recent allele
 26 | and profile definitions. Please see the "Included databases and automated retrieval of databases from pubMLST" section
 27 | below for instructions. *The databases bundled here are for convenience only, do not rely on them being up-to-date*.
 28 | 
 29 | stringMLST is licensed and distributed under [CC Attribution-NonCommercial-ShareAlike 4.0 International (CC BY-NC-SA 4.0)](https://creativecommons.org/licenses/by-nc-sa/4.0)
 30 | and is free for academic users and requires permission before any commercial use for any version of this code/algorithm.
 31 | If you are a commercial user, please contact king.jordan@biology.gatech.edu for permissions
 32 | 
 33 | ## Recommended installation method
 34 | 
 35 | ```
 36 | pip install stringMLST
 37 | 
 38 | ```
 39 | 
 40 | #### Installation via git (Not recommended for most users)
 41 | 
 42 | ```
 43 | git clone https://github.com/jordanlab/stringMLST
 44 | # Optional, download prebuilt databases
 45 | # We don't recommend this method, instead build the databases locally
 46 | cd stringMLST
 47 | git submodule init
 48 | git submodule update
 49 | ```
 50 | 
 51 | ## Quickstart guide
 52 | 
 53 | ```bash
 54 | pip install stringMLST
 55 | mkdir -p stringMLST_analysis; cd stringMLST_analysis
 56 | stringMLST.py --getMLST -P neisseria/nmb --species neisseria
 57 | # Download all available databases with:
 58 | # stringMLST.py --getMLST -P mlst_dbs --species all
 59 | wget  ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR026/ERR026529/ERR026529_1.fastq.gz ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR026/ERR026529/ERR026529_2.fastq.gz
 60 | stringMLST.py --predict -P neisseria/nmb -1 ERR026529_1.fastq.gz -2 ERR026529_2.fastq.gz
 61 | Sample  abcZ    adk     aroE    fumC    gdh     pdhC    pgm     ST
 62 | ERR026529       231     180     306     612     269     277     260     10174
 63 | 
 64 | ```
 65 | 
 66 | ## Python dependencies and external programs
 67 | 
 68 | stringMLST does not require any python dependencies for basic usage (Building databases and predicting STs).
 69 | 
 70 | For advanced use (genome coverage), stringMLST depends on the `pyfaidx` python module and `bedtools`, `bwa`, and `samtools`.
 71 | See the coverage section for more information
 72 | 
 73 | stringMLST has been tested with:
 74 | ```
 75 | pyfaidx: 0.4.8.1
 76 | samtools: 1.3 (Using htslib 1.3.1)  [Requires the 1.x branch of samtools]
 77 | bedtools: v2.24.0
 78 | bwa: 0.7.13-r1126
 79 | ```
 80 | 
 81 | ### To install the dependencies
 82 | 
 83 | ```bash
 84 | # pyfaidx
 85 | pip install --user pyfaidx
 86 | # samtools
 87 | wget https://github.com/samtools/samtools/releases/download/1.3.1/samtools-1.3.1.tar.bz2 -O samtools-1.3.1.tar.bz2
 88 | tar xf samtools-1.3.1.tar.bz2
 89 | cd samtools-1.3.1
 90 | make
 91 | make prefix=$HOME install
 92 | # bedtools
 93 | wget https://github.com/arq5x/bedtools2/releases/download/v2.25.0/bedtools-2.25.0.tar.gz
 94 | tar -zxvf bedtools-2.25.0.tar.gz
 95 | cd bedtools2; make
 96 | cp ./bin/* ~/bin
 97 | # bwa
 98 | git clone https://github.com/lh3/bwa.git
 99 | cd bwa; make
100 | cp bwa ~/bin/bwa
101 | export PATH=$PATH:$HOME/bin
102 | ```
103 | 
104 | 
105 | ## Usage for Example Read Files (Neisseria meningitidis)
106 | 
107 | * Download stringMLST.py, example read files (ERR026529, ERR027250, ERR036104) and the dataset for Neisseria meningitidis (Neisseria_spp.zip).
108 | ### Build database:
109 | 
110 | ```
111 | # Add dir to path
112 | export PATH=$PATH:$PWD
113 | # Will connect to EBI's SRA servers
114 | download_example_reads.sh
115 | ```
116 | 
117 | * Extract the MLST loci dataset.
118 | 
119 | ```
120 | unzip datasets/Neisseria_spp.zip -d datasets
121 | ```
122 | 
123 | * Create or use a config file specifying the location of all the locus and profile files.
124 | Example config file (Neisseria_spp/config.txt):
125 | 
126 | ```
127 | [loci]
128 | abcZ  datasets/Neisseria_spp/abcZ.fa
129 | adk datasets/Neisseria_spp/adk.fa
130 | aroE  datasets/Neisseria_spp/aroE.fa
131 | fumC  datasets/Neisseria_spp/fumC.fa
132 | gdh datasets/Neisseria_spp/gdh.fa
133 | pdhC  datasets/Neisseria_spp/pdhC.fa
134 | pgm datasets/Neisseria_spp/pgm.fa
135 | [profile]
136 | profile datasets/Neisseria_spp/neisseria.txt
137 | ```
138 | 
139 | * Run stringMLST.py --buildDB to create DB. Choose a k value and prefix (optional).
140 | 
141 | ```
142 | stringMLST.py --buildDB -c datasets/Neisseria_spp/config.txt -k 35 -P NM
143 | ```
144 | 
145 | ### Predict:
146 | 
147 | #### Single sample :
148 | ```
149 | stringMLST.py --predict -1 tests/fastqs/ERR026529_1.fastq -2 tests/fastqs/ERR026529_2.fastq -k 35 -P NM
150 | ```
151 | #### Batch mode (all the samples together):
152 | ```
153 | stringMLST.py --predict -d ./tests/fastqs/ -k 35 -P NM
154 | ```
155 | #### List mode:
156 | Create a list file (list_paired.txt) as :
157 | ```
158 | tests/fastqs/ERR026529_1.fastq  tests/fastqs/ERR026529_2.fastq
159 | tests/fastqs/ERR027250_1.fastq  tests/fastqs/ERR027250_2.fastq
160 | tests/fastqs/ERR036104_1.fastq  tests/fastqs/ERR036104_2.fastq
161 | ```
162 | Run the tool as:
163 | ```
164 | stringMLST.py --predict -l list_paired.txt -k 35 -P NM
165 | ```
166 | #### Working with gzipped files
167 | ```
168 | stringMLST.py --predict -1 tests/fastqs/ERR026529_1.fq.gz -2 tests/fastqs/ERR026529_2.fq.gz -p -P NM -k 35 -o ST_NM.txt
169 | ```
170 | ## Usage Documentation
171 | 
172 | stringMLST's workflow is divided into two routines:
173 | * Database building and
174 | * ST discovery
175 | 
176 | *Database building:* Builds the stringMLST database which is used for assigning STs to input sample files. This step is required once for each organism. Please note that stringMLST is capable of working on a custom user defined typing scheme but its efficiency has not been tested on other typing schemes.
177 | 
178 | *ST discovery:* This routine takes the database created in the last step and predicts the ST of the input sample(s). Please note that the database building is required prior to this routine. stringMLST is capable of processing single-end and paired-end files. It can run in three modes:
179 | * Single sample mode - for running stringMLST on a single sample
180 | * Batch mode - for running stringMLST on all the FASTQ files present in a directory
181 | * List mode - for running stringMLST on all the FASTQ files provided in a list file
182 | 
183 | 
184 | ```
185 | Readme for stringMLST
186 | =============================================================================================
187 | Usage
188 | ./stringMLST.py
189 | [--buildDB]
190 | [--predict]
191 | [-1 filename_fastq1][--fastq1 filename_fastq1]
192 | [-2 filename_fastq2][--fastq2 filename_fastq2]
193 | [-d directory][--dir directory][--directory directory]
194 | [-l list_file][--list list_file]
195 | [-p][--paired]
196 | [-s][--single]
197 | [-c][--config]
198 | [-P][--prefix]
199 | [-z][--fuzzy]
200 | [-a]
201 | [-C][--coverage]
202 | [-k]
203 | [-o output_filename][--output output_filename]
204 | [-x][--overwrite]
205 | [-t]
206 | [-r]
207 | [-v]
208 | [-h][--help]
209 | ==============================================================================================
210 | 
211 | There are two steps to predicting ST using stringMLST.
212 | 1. Create DB : stringMLST.py --buildDB
213 | 2. Predict : stringMLST --predict
214 | 
215 | 1. stringMLST.py --buildDB
216 | 
217 | Synopsis:
218 | stringMLST.py --buildDB -c <config file> -k <kmer length(optional)> -P <DB prefix(optional)>
219 |   config file : is a tab delimited file which has the information for typing scheme ie loci, its multifasta file and profile definition file.
220 |     Format :
221 |       [loci]
222 |       locus1    locusFile1
223 |       locus2    locusFile2
224 |       [profile]
225 |       profile   profileFile
226 |   kmer length : is the kmer length for the db. Note, while processing this should be smaller than the read length.
227 |     We suggest kmer lengths of 35, 66 depending on the read length.
228 |   DB prefix(optional) : holds the information for DB files to be created and their location. This module creates 3 files with this prefix.
229 |     You can use a folder structure with prefix to store your db at particular location.
230 | 
231 | Required arguments
232 | --buildDB
233 |   Identifier for build db module
234 | -c,--config = <configuration file>
235 |   Config file in the format described above.
236 |   All the files follow the structure followed by pubmlst. Refer extended document for details.
237 | 
238 | Optional arguments
239 | -k = <kmer length>
240 |   Kmer size for which the db has to be formed(Default k = 35). Note the tool works best with kmer length in between 35 and 66
241 |   for read lengths of 55 to 150 bp. Kmer size can be increased accordingly. It is advised to keep lower kmer sizes
242 |   if the quality of reads is not very good.
243 | -P,--prefix = <prefix>
244 |   Prefix for db and log files to be created(Default = kmer). Also you can specify folder where you want the db to be created.
245 | -a
246 |         File location to write build log
247 | -h,--help
248 |   Prints the help manual for this application
249 | 
250 |  --------------------------------------------------------------------------------------------
251 | 
252 | 2. stringMLST.py --predict
253 | 
254 | stringMLST --predict : can run in three modes
255 |   1) single sample (default mode)
256 |   2) batch mode : run stringMLST for all the samples in a folder (for a particular species)
257 |   3) list mode : run stringMLST on samples specified in a file
258 | stringMLST can process both single and paired end files. By default program expects paired end files.
259 | 
260 | Synopsis
261 | stringMLST.py --predict -1 <fastq file> -2 <fastq file> -d <directory location> -l <list file> -p -s -P <DB prefix(optional)> -k <kmer length(optional)> -o <output file> -x
262 | 
263 | Required arguments
264 | --predict
265 |   Identifier for predict module
266 | 
267 | Optional arguments
268 | -1,--fastq1 = <fastq1_filename>
269 |   Path to first fastq file for paired end sample and path to the fastq file for single end file.
270 |   Should have extension fastq or fq.
271 | -2,--fastq2 = <fastq2_filename>
272 |   Path to second fastq file for paired end sample.
273 |   Should have extension fastq or fq.
274 | -d,--dir,--directory = <directory>
275 |   BATCH MODE : Location of all the samples for batch mode.
276 | -C,--coverage
277 |   Calculate sequence coverage for each allele. Turns on read generation (-r) and turns off fuzzy (-z 1)
278 |   Requires bwa, bedtools and samtools be in your path
279 | -k = <kmer_length>
280 |   Kmer length for which the db was created(Default k = 35). Could be verified by looking at the name of the db file.
281 |   Could be used if the reads are of very bad quality or have a lot of N's.
282 | -l,--list = <list_file>
283 |   LIST MODE : Location of list file and flag for list mode.
284 |   list file should have full file paths for all the samples/files.
285 |   Each sample takes one line. For paired end samples the 2 files should be tab separated on single line.
286 | -o,--output = <output_filename>
287 |   Prints the output to a file instead of stdio.
288 | -p,--paired
289 |   Flag for specifying paired end files. Default option so would work the same if you do not specify for all modes.
290 |   For batch mode the paired end samples should be differentiated by 1/2.fastq or 1/2.fq
291 | -P,--prefix = <prefix>
292 |   Prefix using which the db was created(Defaults = kmer). The location of the db could also be provided.
293 | -r
294 |   A separate reads file is created which has all the reads covering all the loci.
295 | -s,--single
296 |   Flag for specifying single end files.
297 | -t
298 |   Time for each analysis will also be reported.
299 | -v
300 |   Prints the version of the software.
301 | -x,--overwrite
302 |   By default stringMLST appends the results to the output_filename if same name is used.
303 |   This argument overwrites the previously specified output file.
304 | -z,--fuzzy = <fuzzy threshold int>
305 |   Threshold for reporting a fuzzy match (Default=300). For higher coverage reads this threshold should be set higher to avoid
306 |   indicating fuzzy match when exact match was more likely. For lower coverage reads, threshold of <100 is recommended
307 | -h,--help
308 |   Prints the help manual for this application
309 | 
310 |  --------------------------------------------------------------------------------------------
311 | 
312 | 3. stringMLST.py --getMLST
313 | 
314 | Synopsis:
315 | stringMLST.py --getMLST --species= <species> [-k kmer length] [-P DB prefix]
316 | 
317 | Required arguments
318 | --getMLST
319 |     Identifier for getMLST module
320 | --species= <species name>
321 |     Species name from the pubMLST schemes (use "--species show" to get list of available schemes)
322 |     "all" will download and build all
323 | 
324 | Optional arguments
325 | -k = <kmer length>
326 |     Kmer size for which the db has to be formed(Default k = 35). Note the tool works best with kmer length in between 35 and 66
327 |     for read lengths of 55 to 150 bp. Kmer size can be increased accordingly. It is advised to keep lower kmer sizes
328 |     if the quality of reads is not very good.
329 | -P,--prefix = <prefix>
330 |     Prefix for db and log files to be created(Default = kmer). Also you can specify folder where you want the db to be created.
331 |     We recommend that prefix and config point to the same folder for cleanliness but this is not required
332 | --schemes
333 |     Display the list of available schemes
334 | -h,--help
335 |   Prints the help manual for this application
336 | 
337 | ```
338 | 
339 | 
340 | **stringMLST expects paired end reads to be in [Illumina naming convention](http://support.illumina.com/help/SequencingAnalysisWorkflow/Content/Vault/Informatics/Sequencing_Analysis/CASAVA/swSEQ_mCA_FASTQFiles.htm), minimally ending with _1.fq and _2.fq to delineate read1 and read2:**
341 | 
342 | *Periods (.) are disallowed delimiters except for file extensions*
343 | 
344 | ```
345 | Illumina FASTQ files use the following naming scheme:
346 | 
347 | <sample name>_<barcode sequence>_L<lane (0-padded to 3 digits)>_R<read number>_<set number (0-padded to 3 digits)>.fastq.gz
348 | 
349 | For example, the following is a valid FASTQ file name:
350 | 
351 | NA10831_ATCACG_L002_R1_001.fastq.gz
352 | ```
353 | 
354 | ## Running stringMLST
355 | 
356 | #### Included databases and automated retrieval of databases from pubMLST
357 | 
358 | stringMLST includes all the pubMLST databases as of **February 15, 2017**, built with the default kmer (*35*). They can be found in the `datasets/` folder.
359 | Simply unzip the databases you need and begin using stringMLST as described below.
360 | 
361 | All the databases from pubMLST can be downloaded and prepared with your kmer choice
362 | 
363 | *Getting all pubMLST schemes*
364 | ```
365 | stringMLST.py --getMLST -P datasets/ --species all
366 | ```
367 | 
368 | 
369 | Individual databases from pubMLST can also be downloaded as needed, using the scheme identifiers
370 | 
371 | *Downloading a scheme*
372 | ```
373 | # List available schemes
374 | stringMLST.py --getMLST --schemes
375 | 
376 | # Download the Neisseria spp. scheme
377 | 
378 | stringMLST.py --getMLST -P datasets/nmb --species Neisseria
379 | 
380 | ```
381 | 
382 | 
383 | 
384 | #### Database Preparation
385 | In order to create the database, files can be downloaded from the database page.
386 | 
387 | If the organism of interest is not present in the provided link, the required files can be downloaded from PubMLST as follows:
388 | * On your browser, navigate to http://pubmlst.org/
389 | * Navigate to "Download MLST definitions" link or go to http://pubmlst.org/data/
390 | * Scroll to the species of interest. For each species, user may find the file for typing definitions and multi-FASTA files for each locus. Download these files.
391 | 
392 | E.g.:
393 | 
394 | Species of interest: Neisseria spp.
395 | Corresponding definition file: http://pubmlst.org/data/profiles/neisseria.txt
396 | Corresponding multi fasta locus files:
397 | http://pubmlst.org/data/alleles/neisseria/abcZ.tfa
398 | http://pubmlst.org/data/alleles/neisseria/adk.tfa
399 | http://pubmlst.org/data/alleles/neisseria/aroE.tfa
400 | http://pubmlst.org/data/alleles/neisseria/fumC.tfa
401 | http://pubmlst.org/data/alleles/neisseria/gdh.tfa
402 | http://pubmlst.org/data/alleles/neisseria/pdhC.tfa
403 | http://pubmlst.org/data/alleles/neisseria/pgm.tfa
404 | 
405 | Download these files at a desired location.
406 | 
407 | 
408 | Custom user files can also be used for building database. The database building routine requires the profile definition file and allele sequence file. The profile definition file is a tab separated file that contains the ST and the allele profile corresponding to the ST. An example of the profile definition file is shown below:
409 | ```
410 | ST  abcZ  adk aroE  fumC  gdh pdhC  pgm clonal_complex
411 | 1 1 3 1 1 1 1 3 ST-1 complex/subgroup I/II
412 | 2 1 3 4 7 1 1 3 ST-1 complex/subgroup I/II
413 | 3 1 3 1 1 1 23  13  ST-1 complex/subgroup I/II
414 | 4 1 3 3 1 4 2 3 ST-4 complex/subgroup IV
415 | ```
416 | The allele sequence file is a standard multi-FASTA with the description being the loci name with the allele number. An example abcZ allele sequence is shown below:
417 | ```
418 | >abcZ_1
419 | TTTGATACTGTTGCCGA...
420 | >abcZ_2
421 | TTTGATACCGTTGCCGA...
422 | >abcZ_3
423 | TTTGATACCGTTGCGAA...
424 | >abcZ_4
425 | TTTGATACCGTTGCCAA...
426 | ```
427 | 
428 | These files can be obtained from PubMLST/BIGSdb or can be created by the user themselves.
429 | 
430 | In either case, an accompanying configuration file is also required to describe the profile definition and allele sequence files. An example configuration file is shown below:
431 | ```
432 | [loci]
433 | abcZ  /data/home/stringMLST/pubmlst/Neisseria_sp/abcZ.fa
434 | adk /data/home/stringMLST/pubmlst/Neisseria_sp/adk.fa
435 | aroE  /data/home/stringMLST/pubmlst/Neisseria_sp/aroE.fa
436 | fumC  /data/home/stringMLST/pubmlst/Neisseria_sp/fumC.fa
437 | gdh /data/home/stringMLST/pubmlst/Neisseria_sp/gdh.fa
438 | pdhC  /data/home/stringMLST/pubmlst/Neisseria_sp/pdhC.fa
439 | pgm /data/home/stringMLST/pubmlst/Neisseria_sp/pgm.fa
440 | 
441 | [profile]
442 | profile /data/home/stringMLST/pubmlst/Neisseria_sp/neisseria.txt
443 | ```
444 | 
445 | This file is pre-packed on stringMLST's website and can easily be created by the user for a custom database.
446 | 
447 | #### Database Building
448 | The next step for database building is running the buildDB module to create the database files. buildDB module requires the user to specify the config file. The default k-mer size is 35 but can be changed using the -k option. Specifying the prefix for the created database files is optional but is recommended.
449 | 
450 | The choice of k-mer depends on the size of the sequencing read. In general, the value of k can never be greater than the read length. The application has been tested on a number of read lengths ranging from 55 to 150 bps using k-mer sizes of 21 to 66. In our testing, the k-mer size does not affect the accuracy of the read length. A smaller k-mer size will increase the runtime and a larger k-mer size will increase the file size. The user should ideally pick a k-mer with a length around half of the average read length. For lower quality data, it is also advised to choose smaller k-mer values to reduce false hits.
451 | ```
452 | stringMLST.py --buildDB --config <config file> -k  <k-mer length> -P <prefix>
453 | ```
454 | Example:
455 | ```
456 | stringMLST.py --buildDB --config config.txt -k 35 -P NM
457 | ```
458 | This command will produce 3 database files and a log file. The log file is used for debugging purposes in the event an error is encountered. The 3 database files created are:
459 | * <prefix>_<k-mer>.txt : The main database file for the application. This is a tab delimited file describing k-mer to locus relationship.
460 | * <prefix>_weight.txt : Contains the weight factors for alleles which differ in lengths by more than 5%. Will be empty otherwise.
461 | * <prefix>_profile.txt : Profile definition file used for finding the ST from the predicted allelic profile.
462 | 
463 | For the example above, the following files will be created:
464 | NM_35.txt, NM_weight.txt and NM_profile.txt
465 | 
466 | Please note that in the prediction routine the database is identified with the prefix.
467 | 
468 | ST discovery routine
469 | As discussed earlier, StringMLST has 3 running modes
470 | * Single sample mode - for running stringMLST on a single sample
471 | * Batch mode - for running stringMLST on all the FASTQ files present in a directory
472 | * List mode - for running stringMLST on all the FASTQ files provided in a list file
473 | 
474 | ####  Single sample mode:
475 | This is the default mode for stringMLST and takes in one sample at a time. The sample can be single-end or paired-end. The sample has to be in FASTQ format. In order to run, the user should know the prefix of the database created and the k-mer size.
476 | 
477 | By default, the tool expects paired-end samples.
478 | ```
479 | stringMLST.py --predict -1 <paired-end file 1> -2 <paired-end file 2> -p --prefix <prefix for the database> -k <k-mer size> -o <output file name>
480 | ```
481 | *For single-end samples:*
482 | ```
483 | stringMLST.py --predict -1 <single-end file> -s --prefix <prefix for the database> -k <k-mer size> -o <output file name>
484 | ```
485 | ####  Batch Mode:
486 | This mode can be used for processing multiple files with one command. All the samples will be queried against the same database. Also all samples should be in the same directory. All the samples will be treated either as single-end or paired-end. The paired-end samples should be differentiated with the character _1 and _2 at the end (E.g.: sampleX_1.fastq and sampleX_2.fastq).
487 | 
488 | *Paired-end samples:*
489 | ```
490 | stringMLST.py --predict -d <directory for samples> -p --prefix <prefix for the database> -k <k-mer size> -o <output file name>
491 | ```
492 | 
493 | *Single-end samples:*
494 | ```
495 | stringMLST.py --predict -d <directory for samples> -s --prefix <prefix for the database> -k <k-mer size> -o <output file name>
496 | ```
497 | #### List Mode:
498 | This mode could be used if user has samples at different locations or if the paired-end samples are not stored in traditional way. All the samples will be queried against the same database. All the samples will be treated either as single-end or paired-end. This mode requires the user to provide a list file which has the list of all samples along with the location. Each line in the list file represents a new sample.
499 | A sample list file for single-end sample looks like the following.
500 | ```
501 | <full path of sample 1 fastq file>
502 | <full path of sample 2 fastq file>
503 | <full path of sample 3 fastq file>
504 | .
505 | .
506 | <full path of sample n fastq file>
507 | ```
508 | A sample list file for paired-end sample looks like the following.
509 | 
510 | ```
511 | <full path of sample 1 fastq file 1>  <full path of sample 1 fastq file 2>
512 | <full path of sample 2 fastq file 1>  <full path of sample 2 fastq file 2>
513 | <full path of sample 3 fastq file 1>  <full path of sample 3 fastq file 2>
514 | .
515 | .
516 | <full path of sample n fastq file 1>  <full path of sample n fastq file 2>
517 | ```
518 | 
519 | Once the user has the list file, he can directly use the tool.
520 | 
521 | *Paired-end samples:*
522 | ```
523 | stringMLST.py --predict -l <full path to list file> -p --prefix <prefix for the database> -k <k-mer size> -o <output file name>
524 | ```
525 | *Single-end samples:*
526 | ```
527 | stringMLST.py --predict -l <full path to list file > -s --prefix <prefix for the database> -k <k-mer size> -o <output file name>
528 | ```
529 | 
530 | #### Gene coverage and match confidence
531 | 
532 | stringMLST provides two complementary methods for determining confidence in an inferred ST. There's the `-C|--coverage` flag and `-z|--fuzzy` threshold option.
533 | 
534 | stringMLST determines an allele based on its kmer support; the more kmers seen for allele 1, the more likely that allele 1 is the allele present in the genome. Unlike SRST2 and other mapping/BLAST based tools, stringMLST always infers an ST, using the maximally supported allele (allele with most kmer hits). The difference between the maximum support (the reported allele) and the second support (next closest allele) can be informative for low coverage reads. The `-z|--fuzzy` threshold (Default = 300), assigns significance to the difference between supports. Much like SRST2 and Torsten Seemann's popular [pubMLST script](https://github.com/tseemann/mlst), stringMLST reports potentially new or closely supported alleles in allele* syntax. For high coverage reads, we suggest a fuzzy threshold >500. For low coverage reads, a fuzzy threshold of <50.
535 | 
536 | Coverage mode requires `bedtools`, `bwa`, and `samtools` in your PATH and an additional python module, `pyfaidx` (See the dependencies section for installation information).  Coverage mode by default disables display of fuzzy alleles in favor of sequence coverage information made by mapping potential reads to the putative allele sequence. In our testing, coverage mode slightly increases prediction time (<1 sec increase per sample).
537 | 
538 | **Please note:** stringMLST *always* infers the ST from the reads, fuzzy matches and/or <100% coverage do not necessarily mean a new allele has been found.
539 | 
540 | *Getting gene coverage from reads*
541 | ```
542 | stringMLST.py --predict -1 <paired-end file 1> -2 <paired-end file 2> -p --prefix <prefix for the database> -k <k-mer size> -r -o <output file name> -c <path to config> -C
543 | ```
544 | *Changing the fuzziness of the search for low coverage reads*
545 | ```
546 | stringMLST.py --predict -1 <paired-end file 1> -2 <paired-end file 2> -p --prefix <prefix for the database> -k <k-mer size> -r -o <output file name> -z 50
547 | ```
548 | 
549 | #### Other Examples :
550 | 
551 | *Reporting time along with the output.*
552 | ```
553 | stringMLST.py --predict -1 <paired-end file 1> -2 <paired-end file 2> -p --prefix <prefix for the database> -k <k-mer size> -t -o <output file name>
554 | ```
555 | *Getting reads file relevant to typing scheme.*
556 | ```
557 | stringMLST.py --predict -1 <paired-end file 1> -2 <paired-end file 2> -p --prefix <prefix for the database> -k <k-mer size> -r -o <output file name>
558 | ```
559 | 


--------------------------------------------------------------------------------
/stringMLST.py:
--------------------------------------------------------------------------------
   1 | #!/usr/bin/env python
   2 | import getopt
   3 | import sys
   4 | import logging
   5 | import os
   6 | import time
   7 | import ast
   8 | import gzip
   9 | import re
  10 | import tempfile
  11 | import shutil
  12 | import xml.etree.ElementTree as ET
  13 | try:
  14 |     from urllib.request import urlopen, urlretrieve
  15 | except ImportError:
  16 |     from urllib import urlopen, urlretrieve
  17 | import argparse
  18 | version = """ stringMLST v0.6.3 (updated : September 02, 2020) """
  19 | """
  20 | 
  21 | stringMLST free for academic users and requires permission before any commercial
  22 | use for any version of this code/algorithm. If you are a commercial user, please
  23 | contact king.jordan@biology.gatech.edu for permissions
  24 | 
  25 | LICENSE TERMS FOR stringMLST
  26 | Adopted from: https://creativecommons.org/licenses/by-nc-sa/4.0/
  27 | 
  28 | Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International Public
  29 | License
  30 | 
  31 | By exercising the Licensed Rights (defined below), You accept and agree to be
  32 | bound by the terms and conditions of this Creative Commons Attribution-
  33 | NonCommercial-ShareAlike 4.0 International Public License ("Public License"). To
  34 | the extent this Public License may be interpreted as a contract, You are granted
  35 | the Licensed Rights in consideration of Your acceptance of these terms and
  36 | conditions, and the Licensor grants You such rights in consideration of benefits
  37 | the Licensor receives from making the Licensed Material available under these
  38 | terms and conditions.
  39 | 
  40 | Section 1 - Definitions.
  41 | 
  42 | Adapted Material means material subject to Copyright and Similar Rights that is
  43 | derived from or based upon the Licensed Material and in which the Licensed
  44 | Material is translated, altered, arranged, transformed, or otherwise modified in
  45 | a manner requiring permission under the Copyright and Similar Rights held by the
  46 | Licensor. For purposes of this Public License, where the Licensed Material is a
  47 | musical work, performance, or sound recording, Adapted Material is always
  48 | produced where the Licensed Material is synched in timed relation with a moving
  49 | image. Adapter's License means the license You apply to Your Copyright and
  50 | Similar Rights in Your contributions to Adapted Material in accordance with the
  51 | terms and conditions of this Public License. BY-NC-SA Compatible License means a
  52 | license listed at creativecommons.org/compatiblelicenses, approved by Creative
  53 | Commons as essentially the equivalent of this Public License. Copyright and
  54 | Similar Rights means copyright and/or similar rights closely related to
  55 | copyright including, without limitation, performance, broadcast, sound
  56 | recording, and Sui Generis Database Rights, without regard to how the rights are
  57 | labeled or categorized. For purposes of this Public License, the rights
  58 | specified in Section 2(b)(1)-(2) are not Copyright and Similar Rights. Effective
  59 | Technological Measures means those measures that, in the absence of proper
  60 | authority, may not be circumvented under laws fulfilling obligations under
  61 | Article 11 of the WIPO Copyright Treaty adopted on December 20, 1996, and/or
  62 | similar international agreements. Exceptions and Limitations means fair use,
  63 | fair dealing, and/or any other exception or limitation to Copyright and Similar
  64 | Rights that applies to Your use of the Licensed Material. License Elements means
  65 | the license attributes listed in the name of a Creative Commons Public License.
  66 | The License Elements of this Public License are Attribution, NonCommercial, and
  67 | ShareAlike. Licensed Material means the artistic or literary work, database, or
  68 | other material to which the Licensor applied this Public License. Licensed
  69 | Rights means the rights granted to You subject to the terms and conditions of
  70 | this Public License, which are limited to all Copyright and Similar Rights that
  71 | apply to Your use of the Licensed Material and that the Licensor has authority
  72 | to license. Licensor means the individual(s) or entity(ies) granting rights
  73 | under this Public License. NonCommercial means not primarily intended for or
  74 | directed towards commercial advantage or monetary compensation. For purposes of
  75 | this Public License, the exchange of the Licensed Material for other material
  76 | subject to Copyright and Similar Rights by digital file-sharing or similar means
  77 | is NonCommercial provided there is no payment of monetary compensation in
  78 | connection with the exchange. Share means to provide material to the public by
  79 | any means or process that requires permission under the Licensed Rights, such as
  80 | reproduction, public display, public performance, distribution, dissemination,
  81 | communication, or importation, and to make material available to the public
  82 | including in ways that members of the public may access the material from a
  83 | place and at a time individually chosen by them. Sui Generis Database Rights
  84 | means rights other than copyright resulting from Directive 96/9/EC of the
  85 | European Parliament and of the Council of 11 March 1996 on the legal protection
  86 | of databases, as amended and/or succeeded, as well as other essentially
  87 | equivalent rights anywhere in the world. You means the individual or entity
  88 | exercising the Licensed Rights under this Public License. Your has a
  89 | corresponding meaning. Section 2 - Scope.
  90 | 
  91 | License grant. Subject to the terms and conditions of this Public License, the
  92 | Licensor hereby grants You a worldwide, royalty-free, non-sublicensable, non-
  93 | exclusive, irrevocable license to exercise the Licensed Rights in the Licensed
  94 | Material to: reproduce and Share the Licensed Material, in whole or in part, for
  95 | NonCommercial purposes only; and produce, reproduce, and Share Adapted Material
  96 | for NonCommercial purposes only. Exceptions and Limitations. For the avoidance
  97 | of doubt, where Exceptions and Limitations apply to Your use, this Public
  98 | License does not apply, and You do not need to comply with its terms and
  99 | conditions. Term. The term of this Public License is specified in Section 6(a).
 100 | Media and formats; technical modifications allowed. The Licensor authorizes You
 101 | to exercise the Licensed Rights in all media and formats whether now known or
 102 | hereafter created, and to make technical modifications necessary to do so. The
 103 | Licensor waives and/or agrees not to assert any right or authority to forbid You
 104 | from making technical modifications necessary to exercise the Licensed Rights,
 105 | including technical modifications necessary to circumvent Effective
 106 | Technological Measures. For purposes of this Public License, simply making
 107 | modifications authorized by this Section 2(a)(4) never produces Adapted
 108 | Material. Downstream recipients. Offer from the Licensor - Licensed Material.
 109 | Every recipient of the Licensed Material automatically receives an offer from
 110 | the Licensor to exercise the Licensed Rights under the terms and conditions of
 111 | this Public License. Additional offer from the Licensor - Adapted Material.
 112 | Every recipient of Adapted Material from You automatically receives an offer
 113 | from the Licensor to exercise the Licensed Rights in the Adapted Material under
 114 | the conditions of the Adapter's License You apply. No downstream restrictions.
 115 | You may not offer or impose any additional or different terms or conditions on,
 116 | or apply any Effective Technological Measures to, the Licensed Material if doing
 117 | so restricts exercise of the Licensed Rights by any recipient of the Licensed
 118 | Material. No endorsement. Nothing in this Public License constitutes or may be
 119 | construed as permission to assert or imply that You are, or that Your use of the
 120 | Licensed Material is, connected with, or sponsored, endorsed, or granted
 121 | official status by, the Licensor or others designated to receive attribution as
 122 | provided in Section 3(a)(1)(A)(i). Other rights.
 123 | 
 124 | Moral rights, such as the right of integrity, are not licensed under this Public
 125 | License, nor are publicity, privacy, and/or other similar personality rights;
 126 | however, to the extent possible, the Licensor waives and/or agrees not to assert
 127 | any such rights held by the Licensor to the limited extent necessary to allow
 128 | You to exercise the Licensed Rights, but not otherwise. Patent and trademark
 129 | rights are not licensed under this Public License. To the extent possible, the
 130 | Licensor waives any right to collect royalties from You for the exercise of the
 131 | Licensed Rights, whether directly or through a collecting society under any
 132 | voluntary or waivable statutory or compulsory licensing scheme. In all other
 133 | cases the Licensor expressly reserves any right to collect such royalties,
 134 | including when the Licensed Material is used other than for NonCommercial
 135 | purposes. Section 3 - License Conditions.
 136 | 
 137 | Your exercise of the Licensed Rights is expressly made subject to the following
 138 | conditions.
 139 | 
 140 | Attribution.
 141 | 
 142 | If You Share the Licensed Material (including in modified form), You must:
 143 | 
 144 | retain the following if it is supplied by the Licensor with the Licensed
 145 | Material: identification of the creator(s) of the Licensed Material and any
 146 | others designated to receive attribution, in any reasonable manner requested by
 147 | the Licensor (including by pseudonym if designated); a copyright notice; a
 148 | notice that refers to this Public License; a notice that refers to the
 149 | disclaimer of warranties; a URI or hyperlink to the Licensed Material to the
 150 | extent reasonably practicable; indicate if You modified the Licensed Material
 151 | and retain an indication of any previous modifications; and indicate the
 152 | Licensed Material is licensed under this Public License, and include the text
 153 | of, or the URI or hyperlink to, this Public License. You may satisfy the
 154 | conditions in Section 3(a)(1) in any reasonable manner based on the medium,
 155 | means, and context in which You Share the Licensed Material. For example, it may
 156 | be reasonable to satisfy the conditions by providing a URI or hyperlink to a
 157 | resource that includes the required information. If requested by the Licensor,
 158 | You must remove any of the information required by Section 3(a)(1)(A) to the
 159 | extent reasonably practicable. ShareAlike. In addition to the conditions in
 160 | Section 3(a), if You Share Adapted Material You produce, the following
 161 | conditions also apply.
 162 | 
 163 | The Adapter's License You apply must be a Creative Commons license with the same
 164 | License Elements, this version or later, or a BY-NC-SA Compatible License. You
 165 | must include the text of, or the URI or hyperlink to, the Adapter's License You
 166 | apply. You may satisfy this condition in any reasonable manner based on the
 167 | medium, means, and context in which You Share Adapted Material. You may not
 168 | offer or impose any additional or different terms or conditions on, or apply any
 169 | Effective Technological Measures to, Adapted Material that restrict exercise of
 170 | the rights granted under the Adapter's License You apply. Section 4 - Sui
 171 | Generis Database Rights.
 172 | 
 173 | Where the Licensed Rights include Sui Generis Database Rights that apply to Your
 174 | use of the Licensed Material:
 175 | 
 176 | for the avoidance of doubt, Section 2(a)(1) grants You the right to extract,
 177 | reuse, reproduce, and Share all or a substantial portion of the contents of the
 178 | database for NonCommercial purposes only; if You include all or a substantial
 179 | portion of the database contents in a database in which You have Sui Generis
 180 | Database Rights, then the database in which You have Sui Generis Database Rights
 181 | (but not its individual contents) is Adapted Material, including for purposes of
 182 | Section 3(b); and You must comply with the conditions in Section 3(a) if You
 183 | Share all or a substantial portion of the contents of the database. For the
 184 | avoidance of doubt, this Section 4 supplements and does not replace Your
 185 | obligations under this Public License where the Licensed Rights include other
 186 | Copyright and Similar Rights. Section 5 - Disclaimer of Warranties and
 187 | Limitation of Liability.
 188 | 
 189 | Unless otherwise separately undertaken by the Licensor, to the extent possible,
 190 | the Licensor offers the Licensed Material as-is and as-available, and makes no
 191 | representations or warranties of any kind concerning the Licensed Material,
 192 | whether express, implied, statutory, or other. This includes, without
 193 | limitation, warranties of title, merchantability, fitness for a particular
 194 | purpose, non-infringement, absence of latent or other defects, accuracy, or the
 195 | presence or absence of errors, whether or not known or discoverable. Where
 196 | disclaimers of warranties are not allowed in full or in part, this disclaimer
 197 | may not apply to You. To the extent possible, in no event will the Licensor be
 198 | liable to You on any legal theory (including, without limitation, negligence) or
 199 | otherwise for any direct, special, indirect, incidental, consequential,
 200 | punitive, exemplary, or other losses, costs, expenses, or damages arising out of
 201 | this Public License or use of the Licensed Material, even if the Licensor has
 202 | been advised of the possibility of such losses, costs, expenses, or damages.
 203 | Where a limitation of liability is not allowed in full or in part, this
 204 | limitation may not apply to You. The disclaimer of warranties and limitation of
 205 | liability provided above shall be interpreted in a manner that, to the extent
 206 | possible, most closely approximates an absolute disclaimer and waiver of all
 207 | liability. Section 6 - Term and Termination.
 208 | 
 209 | This Public License applies for the term of the Copyright and Similar Rights
 210 | licensed here. However, if You fail to comply with this Public License, then
 211 | Your rights under this Public License terminate automatically. Where Your right
 212 | to use the Licensed Material has terminated under Section 6(a), it reinstates:
 213 | 
 214 | automatically as of the date the violation is cured, provided it is cured within
 215 | 30 days of Your discovery of the violation; or upon express reinstatement by the
 216 | Licensor. For the avoidance of doubt, this Section 6(b) does not affect any
 217 | right the Licensor may have to seek remedies for Your violations of this Public
 218 | License. For the avoidance of doubt, the Licensor may also offer the Licensed
 219 | Material under separate terms or conditions or stop distributing the Licensed
 220 | Material at any time; however, doing so will not terminate this Public License.
 221 | Sections 1, 5, 6, 7, and 8 survive termination of this Public License. Section 7
 222 | - Other Terms and Conditions.
 223 | 
 224 | The Licensor shall not be bound by any additional or different terms or
 225 | conditions communicated by You unless expressly agreed. Any arrangements,
 226 | understandings, or agreements regarding the Licensed Material not stated herein
 227 | are separate from and independent of the terms and conditions of this Public
 228 | License. Section 8 - Interpretation.
 229 | 
 230 | For the avoidance of doubt, this Public License does not, and shall not be
 231 | interpreted to, reduce, limit, restrict, or impose conditions on any use of the
 232 | Licensed Material that could lawfully be made without permission under this
 233 | Public License. To the extent possible, if any provision of this Public License
 234 | is deemed unenforceable, it shall be automatically reformed to the minimum
 235 | extent necessary to make it enforceable. If the provision cannot be reformed, it
 236 | shall be severed from this Public License without affecting the enforceability
 237 | of the remaining terms and conditions. No term or condition of this Public
 238 | License will be waived and no failure to comply consented to unless expressly
 239 | agreed to by the Licensor. Nothing in this Public License constitutes or may be
 240 | interpreted as a limitation upon, or waiver of, any privileges and immunities
 241 | that apply to the Licensor or You, including from the legal processes of any
 242 | jurisdiction or authority.
 243 | 
 244 | 
 245 | 
 246 | The program has 3 basic modes :
 247 |     mainTool: for single sample (both single and paired end)
 248 |     batchTool: for multiple samples stored at a common location (both single and paired end samples)
 249 |     listTool: for multiple samples with location information stored in a list (both single and paired end samples)
 250 | predict part starts here
 251 | """
 252 | #############################################################
 253 | # Function   : get_links
 254 | # Input      : parsed pubMLST XML data, save path, species name
 255 | # Output     : profile URL and dict of locus name -> allele URL
 256 | # Description: Gets the URLs from pubMLST for the required
 257 | #              files (alleles, profile)
 258 | #############################################################
 259 | def get_links(xmlData, savePath, speciesName):
 260 |     lociList = {}
 261 |     profileURL = None
 262 |     for species in xmlData:
 263 |         if re.search(re.escape(speciesName), species.text, re.IGNORECASE, ):
 264 |             for mlst in species:
 265 |                 for database in mlst:
 266 |                     for child in database:
 267 |                         if child.tag == "profiles":
 268 |                             profileURL = child[0].text
 269 |                         if child.tag == "loci":
 270 |                             for locus in child:
 271 |                                 lociList[locus.text.rstrip()] = locus[0].text
 272 |     if profileURL is None:
 273 |         profileError = "Parsing failed: could not find profiles file"
 274 |         print(profileError)
 275 |         print("This usually means the provided species, '{}', does not exist on PubMLST".format(speciesName))
 276 |         print("Use `{} --getMLST --species list` to list available species".format(sys.argv[0]))
 277 |         print("Or visit PubMLST for more information:\nhttps://pubmlst.org/data/")
 278 |         logging.debug(profileError)
 279 |         sys.exit(1)
 280 |     elif lociList == {}:
 281 |         lociError = "Parsing failed: could not find allele sequences"
 282 |         logging.debug(lociError)
 283 |         print(lociError)
 284 |         sys.exit(1)
 285 |     else:
 286 |         return profileURL, lociList
 287 | #############################################################
 288 | # Function   : get_files
 289 | # Input      : URLs from get_links
 290 | # Output     : Downloads files and builds database
 291 | #############################################################
 292 | def get_files(filePrefix, loci, profileURL, speciesName):
 293 |     with open(config, "w") as configFile:
 294 |         configFile.write("[loci]\n")
 295 |         for file in loci:
 296 |             localFile = filePrefix + "_" + file + ".tfa"
 297 |             try:
 298 |                 localFile, headers = urlretrieve(loci[file], localFile)
 299 |             except:
 300 |                 print('\033[91m' + "There was an error downloading " + file + '\033[0m')
 301 |                 pass
 302 |             configFile.write(file + "\t" + filePrefix + "_" + file + ".tfa\n")
 303 |         localFile = filePrefix + "_profile.txt"
 304 |         localFile, headers = urlretrieve(profileURL, localFile)
 305 |         configFile.write("[profile]\n")
 306 |         configFile.write("profile\t" + filePrefix + "_profile.txt\n")
 307 |         configFile.close()
 308 |         try:
 309 |             makeCustomDB(config, k, filePrefix)
 310 |         except:
 311 |             print('\033[91m' + "Failed to create database " + speciesName + '\033[0m')
 312 |             pass
 313 |         else:
 314 |             print("\t" + '\033[92m' + "Database ready for " + speciesName + '\033[0m')
 315 |             print("\t" + filePrefix)
 316 | ############################################################
 317 | # Function   : batchTool
 318 | # Input      : Directory name, paired or single, k value
 319 | # Output     : STs and allelic profiles for each FASTQ file
 320 | # Description: Processes all FASTQ files present in the input
 321 | #              directory
 322 | #############################################################
 323 | def batchTool(fdir, paired, k):
 324 |     fileList = []
 325 |     if not dir.endswith('/'):
 326 |         fdir += '/'
 327 |     for inputFile in os.listdir(fdir):
 328 |         if paired is True:
 329 |             if inputFile.endswith('1.fastq') or inputFile.endswith('1.fq') or inputFile.endswith('1.fq.gz') or inputFile.endswith('1.fastq.gz'):
 330 |                 fastq1 = fdir+inputFile
 331 |                 fastq2 = fdir+inputFile.replace('1.', '2.')
 332 |                 fileList.append((fastq1, fastq2))
 333 |         else:
 334 |             if inputFile.endswith('.fastq') or inputFile.endswith('.fq') or inputFile.endswith('.fq.gz') or inputFile.endswith('.fastq.gz'):
 335 |                 fastq1 = fdir + inputFile
 336 |                 fileList.append(fastq1)
 337 |     results = multiSampleTool(fileList, paired, k)
 338 |     return results
 339 | #############################################################
 340 | # Function   : listTool
 341 | # Input      : List file, paired or single, k value
 342 | # Output     : STs and allelic profiles for each FASTQ file
 343 | # Description: Processes all FASTQ files present in the input
 344 | #              list file
 345 | #############################################################
 346 | def listTool(fList, paired, k):
 347 |     fileList = []
 348 |     listf = open(fList, 'r')
 349 |     samples = listf.readlines()
 350 |     for sample in samples:
 351 |         if paired is True:
 352 |             s = sample.strip().split()
 353 |             fastq1 = s[0]
 354 |             try:
 355 |                 fastq2 = s[1]
 356 |             except IndexError:
 357 |                 print("Error: Paired end files should be whitespace/tab seperated")
 358 |                 exit(0)
 359 |             fileList.append((fastq1, fastq2))
 360 |         else:
 361 |             fastq1 = sample.rstrip()
 362 |             fileList.append(fastq1)
 363 |     results = multiSampleTool(fileList, paired, k)
 364 |     return results
 365 | #############################################################
 366 | # Function   : multiSampleTool
 367 | # Input      : List of files to process, paired or single, k value
 368 | # Output     : STs and allelic profiles for each FASTQ file
 369 | # Description: Processes all FASTQ files present in the input list
 370 | #############################################################
 371 | def multiSampleTool(fileList, paired, k):
 372 |     results = {}
 373 |     for sample in fileList:
 374 |         if paired is True:
 375 |             fastq1 = sample[0]
 376 |             fastq2 = sample[1]
 377 |         else:
 378 |             fastq1 = sample
 379 |             fastq2 = None
 380 |         results = singleSampleTool(fastq1, fastq2, paired, k, results)
 381 |     return results
 382 | #############################################################
 383 | # Function   : singleSampleTool
 384 | # Input      : fastq file 1 and 2, paired or single, k value, output dictionary
 385 | # Output     : STs and allelic profiles for each FASTQ file
 386 | # Description: Processes both FASTQ files passed to the function
 387 | #############################################################
 388 | def singleSampleTool(fastq1, fastq2, paired, k, results):
 389 |     if paired is True:
 390 |         fileName = fastq1.split('/')[-1].split('.')[0][:-1]
 391 |     else:
 392 |         fileName = fastq1.split('/')[-1].split('.')[0]
 393 |     if reads is True:
 394 |         readFileName = fileName + '_reads.fq'
 395 |         global readFile
 396 |         readFile = open(readFileName, 'w+')
 397 |     if paired is True:
 398 |         msg = "singleSampleTool : " + fastq1 + ' and ' + fastq2
 399 |     else:
 400 |         msg = "singleSampleTool : " + fastq1
 401 |     logging.debug(msg)
 402 |     global alleleCount
 403 |     alleleCount = {}
 404 |     t1 = time.time()
 405 |     if paired is True:
 406 |         logging.debug("singleSampleTool : paired True")
 407 |         logging.debug("singleSampleTool : fastq1 start")
 408 |         singleFileTool(fastq1, k)
 409 |         logging.debug("singleSampleTool : fastq1 done")
 410 |         logging.debug("singleSampleTool : fastq2 start")
 411 |         singleFileTool(fastq2, k)
 412 |         logging.debug("singleSampleTool : fastq2 done")
 413 |         if alleleCount == {}:
 414 |             string = "No k-mer matches were found for the sample " + fastq1 + " and "+ fastq2 + ".  Probable cause of the error:  low quality data/too many N's in the data"
 415 |             logging.error("singleSampleTool : " + string)
 416 |             print(string)
 417 | #           exit(0)
 418 |         profileCount = alleleCount
 419 |     else:
 420 |         logging.debug("singleSampleTool : paired False")
 421 |         logging.debug("singleSampleTool : fastq start")
 422 |         singleFileTool(fastq1, k)
 423 |         profileCount = alleleCount
 424 |         logging.debug("singleSampleTool : fastq done")
 425 |         if alleleCount == 0:
 426 |             string = "No k-mer matches were found for the sample " + fastq1 + ".  Probable cause of the error:  low quality data/too many N's in the data"
 427 |             logging.error("singleSampleTool : " + string)
 428 |             print(string)
 429 |     logging.debug("singleSampleTool : weightedProfile start")
 430 |     weightedProfile = weightedProf(profileCount, weightDict)
 431 |     logging.debug("singleSampleTool : weightedProfile finished")
 432 |     logging.debug("singleSampleTool : getMaxCount start")
 433 |     finalProfile = getMaxCount(weightedProfile, fileName)
 434 |     logging.debug("singleSampleTool : getMaxCount end")
 435 |     st = 0
 436 |     if profileFile != '':
 437 |         logging.debug("singleSampleTool : findST start")
 438 |         st = findST(finalProfile, stProfile)
 439 |         logging.debug("singleSampleTool : findST end")
 440 |     if reads is True:
 441 |         readFile.close()
 442 |     t3 = time.time()
 443 |     finalProfile['ST'] = st
 444 |     finalProfile['t'] = t3-t1
 445 |     results[fileName] = finalProfile
 446 |     return results
 447 | #############################################################
 448 | # Function   : singleFileTool
 449 | # Input      : fastq file, k value
 450 | # Output     : Edits a global dictionary - results
 451 | # Description: Processes the single fastq file
 452 | #############################################################
 453 | def singleFileTool(fastq, k):
 454 |     msg = "singleFileTool :" + fastq
 455 |     logging.debug(msg)
 456 |     if os.path.isfile(fastq):
 457 |         logging.debug("singleFileTool : fastq")
 458 |         non_overlapping_window = 1
 459 |         finalProfile = {}
 460 |         t1 = time.time()
 461 |         fileExplorer(fastq, k, non_overlapping_window)
 462 |         t3 = time.time()
 463 |     else:
 464 |         msg = "File does not exist: " + fastq
 465 |         logging.error("singleFileTool : msg")
 466 |         print(msg)
 467 | def fileExplorer(file, k, non_overlapping_window):
 468 |     if file.endswith('.gz'):
 469 |         if sys.version_info[0] == 3:
 470 |             f = gzip.open(file, 'rt')
 471 |         else:
 472 |             f = gzip.open(file, 'rb')
 473 |     else:
 474 |         f = open(file)
 475 |     msg = "fileExplorer :" + file
 476 |     logging.debug(msg)
 477 |     lines = f.readlines()
 478 |     i = 1
 479 |     n_reads = 0
 480 |     try:
 481 |         if len(lines[1]) < k:
 482 |             m1 = "Read length " + len(lines[1])+" for file " + file + " smaller than " + k
 483 |             print(m1)
 484 |             print("Skipping to next file.")
 485 |             logging.debug(m1)
 486 |             return 0
 487 |     except Exception:
 488 |         m2 = "Check fastq file " + file
 489 |         print(m2)
 490 |         logging.debug(m2)
 491 |         return 0
 492 |     start = int((len(lines[1])-k)//2)
 493 |     end = int((len(lines[1])-k)//2)
 494 |     yesRead = False
 495 |     for line in lines:
 496 |         if i % 4 == 0 and yesRead:
 497 |             readFile.write(line)
 498 |         if i % 4 != 3:
 499 |             yesRead = False
 500 |         if i%4 == 1:
 501 |             head = line
 502 |         if i%4 == 2:
 503 |             s1 = str(line[start:k+start])
 504 |             sn_1 = str(line[-k-end:-end]).rstrip()
 505 |             if s1 in kmerDict[k]:
 506 |                 n_reads += 1
 507 |                 goodReads(line, k, non_overlapping_window)
 508 |                 if reads is True:
 509 |                     readFile.write(head)
 510 |                     readFile.write(line)
 511 |                     readFile.write('+\n')
 512 |                     yesRead = True
 513 |         i += 1
 514 | #############################################################
 515 | # Function   : goodReads
 516 | # Input      : sequence read, k, step size
 517 | # Output     : Edits the count of global variable alleleCount
 518 | # Description: Increment the count for each k-mer match
 519 | #############################################################
 520 | def goodReads(read, k, non_overlapping_window):
 521 |     n = 0
 522 |     line = read.rstrip()
 523 |     while n+k <= len(line):
 524 |         s = str(line[n:n+k])
 525 |         if s in kmerDict[k]:
 526 |             for probLoc in kmerDict[k][s]:
 527 |                 if probLoc not in alleleCount:
 528 |                     alleleCount[probLoc] = {}
 529 |                 a = kmerDict[k][s][probLoc]
 530 |                 for allele in a:
 531 |                     allele = allele.rstrip()
 532 |                     if allele in alleleCount[probLoc]:
 533 |                         alleleCount[probLoc][allele] += 1
 534 |                     else:
 535 |                         alleleCount[probLoc][allele] = 1
 536 |         n += non_overlapping_window
 537 | #############################################################
 538 | # Function   : weightedProf
 539 | # Input      : allele count global var, weight factors
 540 | # Output/Desc: Normalizes alleleCount by weight factor
 541 | #############################################################
 542 | def weightedProf(alleleCount, weightDict):
 543 |     logging.debug("weightedProf")
 544 |     weightedDict = {}
 545 |     for loc in alleleCount:
 546 |         weightedDict[loc] = {}
 547 |         for allele in alleleCount[loc]:
 548 |             if loc in weightDict:
 549 |                 if allele in weightDict[loc]:
 550 |                     weightedDict[loc][allele] = (alleleCount[loc][allele] / weightDict[loc][allele])
 551 |                 else:
 552 |                     weightedDict[loc][allele] = alleleCount[loc][allele]
 553 |             else:
 554 |                 weightedDict[loc][allele] = alleleCount[loc][allele]
 555 |     return weightedDict
 556 | #############################################################
 557 | # Function   : getMaxCount
 558 | # Input      : allele counts
# Output     : allelic profile (locus -> best-supported allele)
# Description: Finds the alleles with maximum counts and
#              generates the allelic profile (the ST itself is resolved by findST)
 562 | #############################################################
 563 | def getMaxCount(alleleCount, fileName):
 564 |     logging.debug("getMaxCount")
 565 |     max_n = {}
 566 |     secondMax = {}
 567 |     maxSupport = {}
 568 |     secondSupport = {}
 569 |     finalProfileCount = {}
 570 |     for locus in alleleNames:
 571 |         finalProfileCount[locus] = {}
 572 |     num = ''
 573 |     for loc in alleleCount:
 574 |         n = 0
 575 |         m = 0
 576 |         for num in alleleCount[loc]:
 577 |             if alleleCount[loc][num] >= n:
 578 |                 m = n
 579 |                 n = alleleCount[loc][num]
 580 |         if n-m < fuzzy:
 581 |             try:
 582 |                 alleleCount[loc][num]
 583 |             except:
 584 |                 pass
 585 |             else:
 586 |                 alleleCount[loc][num] = str(alleleCount[loc][num])+'*'
 587 |                 max_n[loc] = str(n)+'*'
 588 |         else:
 589 |             max_n[loc] = n
 590 |         secondMax[loc] = m
 591 |     for loc in alleleCount:
 592 |         try:
 593 |             max_n[loc]
 594 |         except:
 595 |             pass
 596 |         else:
 597 |             maxSupport[loc] = {}
 598 |             secondSupport[loc] = {}
 599 |             num_max = []
 600 |             num_max2 = []
 601 |             compare = float(re.sub("\*$", "", str(max_n[loc])))
 602 |             for num in alleleCount[loc]:
 603 |                 if  float(re.sub("\*$", "", str(alleleCount[loc][num]))) == compare:
 604 |                     if "\*" in str(max_n[loc]):
 605 |                         insert = num + '*'
 606 |                         num_max.append(insert)
 607 |                     else:
 608 |                         num_max.append(num)
 609 |                     maxSupport[loc][num] = max_n[loc]
 610 | 
 611 |                 if  alleleCount[loc][num] == secondMax[loc]:
 612 |                     num_max2.append(num)
 613 |                     secondSupport[loc][num] = secondMax[loc]
 614 |             try:
 615 |                 finalProfileCount[loc] = num_max[0]
 616 |             except LookupError:
 617 |                 finalProfileCount[loc] = 'NA'
 618 |     msgs = "Max Support :" + fileName + " : " + str(maxSupport)
 619 |     logging.debug(msgs)
 620 |     msgs = "Second Max Support :" + fileName + " : " + str(secondSupport)
 621 |     logging.debug(msgs)
 622 |     return finalProfileCount
 623 | #############################################################
 624 | # Function   : findST
 625 | # Input      : allelic profile for one sample and profiles for all STs
 626 | # Output     : ST number, or 0 if no ST match was found
 627 | # Description: Finds the ST number which best matches the given sample profile.
 628 | #############################################################
 629 | def findST(finalProfile, stProfile):
 630 |     if not stProfile:
 631 |         return 0
 632 |     oneProfile = next(iter(stProfile.values()))
 633 |     # The gene names in finalProfile may not exactly match those in stProfile. To deal with this,
 634 |     # each finalProfile gene is associated with the best matching gene in the ST profiles.
 635 |     finalGeneToSTGene = {}
 636 |     profileGenes = list(oneProfile.keys())
 637 |     for finalGene in list(finalProfile.keys()):
 638 |         if finalGene in profileGenes:  # exact match is preferable
 639 |             finalGeneToSTGene[finalGene] = finalGene
 640 |         else:  # failing an exact match, look for a case-sensitive containment
 641 |             for profileGene in profileGenes:
 642 |                 if finalGene in profileGene:
 643 |                     finalGeneToSTGene[finalGene] = profileGene
 644 |                     break
 645 |         if finalGene not in finalGeneToSTGene:  # if there's still no match, try a case-insensitive containment
 646 |             for profileGene in profileGenes:
 647 |                 if finalGene.lower() in profileGene.lower():
 648 |                     finalGeneToSTGene[finalGene] = profileGene
 649 |                     break
 650 |         if finalGene not in finalGeneToSTGene:
 651 |             print("ERROR: gene names in config file do not match gene names in profile file")
 652 |             exit(0)
 653 |     transformedFinalProfile = {}
 654 |     for gene, allele in finalProfile.items():
 655 |         if allele:
 656 |             allele = re.sub("\*", "", allele)
 657 |         transformedFinalProfile[finalGeneToSTGene[gene]] = allele
 658 |         # Check to see if the dictionary is empty, if so then means no allele were found at all
 659 |         if bool(transformedFinalProfile) is False:
 660 |             return 0
 661 |     # Find the best matching ST, considering only the genes in the sample's profile. This is to
 662 |     # allow for superfluous columns in the ST profile.
 663 |     logging.debug("findST")
 664 |     for stNum, profile in stProfile.items():
 665 |         if all(x in list(profile.items()) for x in list(transformedFinalProfile.items())):
 666 |             return stNum
 667 |     return 0
 668 | #############################################################
 669 | # Function   : loadModule
 670 | # Input      : k value and prefix of the DB file
 671 | # Output     : Updates the DB dictionary variables
 672 | # Description: Used in loading the DB as set of variables
 673 | #              by calling other functions
 674 | #############################################################
 675 | def loadModule(k, dbPrefix):
 676 |     global dbFile
 677 |     dbFile = dbPrefix+'_'+str(k)+'.txt'
 678 |     global weightFile
 679 |     weightFile = dbPrefix+'_weight.txt'
 680 |     global profileFile
 681 |     profileFile = dbPrefix+'_profile.txt'
 682 |     global kmerDict
 683 |     kmerDict = {}
 684 |     kmerDict[k] = loadKmerDict(dbFile)
 685 |     global weightDict
 686 |     weightDict = loadWeightDict(weightFile)
 687 |     global stProfile
 688 |     stProfile = loadSTfromFile(profileFile)
 689 | #############################################################
 690 | # Function   : loadSTfromFile
 691 | # Input      : profile definition file
# Output     : Dictionary mapping each ST to its allelic profile
# Description: Parses the tab-delimited profile definition file
 694 | #############################################################
 695 | def loadSTfromFile(profileF):
 696 |     with open(profileF, 'r') as definitionFile:
 697 |         st = {}
 698 |         index = {}
 699 |         lines = definitionFile.readlines()
 700 |         heads = lines[0].rstrip().split('\t')
 701 |         for locus in heads:
 702 |             index[locus] = heads.index(locus)
 703 |         for line in lines:
 704 |             pro = line.rstrip().split('\t')
 705 |             l = {}
 706 |             for locus in heads[1:]:
 707 |                 try:
 708 |                     l[locus] = pro[index[locus]]
 709 |                 except LookupError:
 710 |                     logging.debug("ERROR while loading ST")
 711 |                     pass
 712 |             st[pro[0]] = l
 713 |     return st
 714 | #############################################################
 715 | # Function   : loadKmerDict
# Input      : k-mer DB file path
# Output     : k-mer dictionary (also sets the global alleleNames)
# Description: Parses the k-mer DB file into a nested dictionary
 719 | #############################################################
 720 | def loadKmerDict(dbFile):
 721 |     kmerTableDict = {}
 722 |     with open(dbFile, 'r') as kmerTableFile:
 723 |         lines = kmerTableFile.readlines()
 724 |         global alleleNames
 725 |         alleleNames = set()
 726 |         for line in lines:
 727 |             array = line.rstrip().rsplit('\t')
 728 |             kmerTableDict[array[0]] = {}
 729 |             kmerTableDict[array[0]][array[1]] = array[2][1:-1].rsplit(',')
 730 |             alleleNames.add(array[1])
 731 |     return kmerTableDict
 732 | #############################################################
 733 | # Function   : loadWeightDict
# Input      : Weight file path
# Output     : Dictionary of weight factors (locus -> allele -> factor)
# Description: Parses the weight file written when the DB was built
 737 | #############################################################
 738 | def loadWeightDict(weightFile):
 739 |     weightDict = {}
 740 |     with open(weightFile, 'r') as weightTableFile:
 741 |         lines = weightTableFile.readlines()
 742 |         for line in lines:
 743 |             array = line.rstrip().rsplit('\t')
 744 |             try:
 745 |                (loc, allele) = array[0].replace('-', '_').rsplit('_', 1)
 746 |             except ValueError:
 747 |                 print("Error : Allele name in locus file should be seperated by '_' or '-'")
 748 |                 exit(0)
 749 |             if loc not in weightDict:
 750 |                 weightDict[loc] = {}
 751 |             weightDict[loc][allele] = float(array[1])
 752 |     return weightDict
 753 | #############################################################
 754 | # Function   : loadConfig
 755 | # Input      : config file path from getopts
 756 | # Output     : Updates configDict
 757 | # Description: Used to find allele fasta files for getCoverage
 758 | #############################################################
 759 | def loadConfig(config):
 760 |     global configDict
 761 |     configDict = {}
 762 |     with open(config) as configFile:
 763 |         lines = configFile.readlines()
 764 |         head = ''
 765 |         for line in lines:
 766 |             if line.rstrip() == '':
 767 |                 continue
 768 |             if line.rstrip() == '[loci]':
 769 |                 head = 'loci'
 770 |                 configDict[head] = {}
 771 |             elif line.rstrip() == '[profile]':
 772 |                 head = 'profile'
 773 |                 configDict[head] = {}
 774 |             else:
 775 |                 arr = line.strip().split()
 776 |                 configDict[head][arr[0]] = arr[1]
 777 |     for head in configDict:
 778 |         for element in configDict[head]:
 779 |             if not os.path.isfile(configDict[head][element]):
 780 |                 print("ERROR: %s file does not exist at %s" % (element, configDict[head][element]))
 781 |                 exit(0)
 782 |     return configDict
 783 | #############################################################
 784 | # Function   : getCoverage
 785 | # Input      : results dictionary
 786 | # Output     : Updates results to include coverage info
 787 | #############################################################
 788 | def getCoverage(results):
 789 |     tmpdir = tempfile.mkdtemp()
 790 |     for sample in results:
 791 |         file = tmpdir +'/'+ sample + '.fasta'
 792 |         bed = tmpdir +'/'+ sample + '.bed'
 793 |         sortedFile = tmpdir +'/'+ sample + '.sorted'
 794 |         covOut = tmpdir +'/'+ sample + '.out'
 795 |         with open(file, 'w') as tmpFasta:
 796 |             with open(bed, 'w') as bedFile:
 797 |                 for gene in configDict['loci']:
 798 |                     genes = Fasta(configDict['loci'][gene])
 799 |                     allele = gene+'_'+re.sub('\*', "", str(results[sample][gene]))
 800 |                     tmpFasta.write('>'+gene+'\n')
 801 |                     bedFile.write(gene+'\t0\t'+str(len(genes[allele]))+'\n')
 802 |                     for line in genes[allele]:
 803 |                         tmpFasta.write(str(line)+'\n')
 804 |         cmdIndex = "bwa index %s 2>/dev/null"%(file)
 805 |         os.system(cmdIndex)
 806 |         readBWA = sample+'_reads.fq'
 807 |         cmdBwaMem = "bwa mem %s %s 2>/dev/null| samtools view -uS - | samtools sort - -o %s"%(file, readBWA, sortedFile)
 808 |         os.system(cmdBwaMem)
 809 |         cmdCov = "bedtools coverage -a %s -b %s > %s"%(bed, sortedFile, covOut)
 810 |         os.system(cmdCov)
 811 |         with open(covOut, 'r') as cov:
 812 |             for line in cov.readlines():
 813 |                 records = line.rstrip().rsplit('\t')
 814 |                 gene = records[0]
 815 |                 geneCov = float(records[6]) * 100
 816 |                 results[sample][gene] = results[sample][gene] + " (" + str("%.2f" % geneCov) + ")"
 817 |     shutil.rmtree(tmpdir)
 818 | """Prints the results in the format asked by the user."""
 819 | #############################################################
 820 | # Function   : printResults
 821 | # Input      : results, output file, overwrite?
 822 | # Output     : Prints on the screen or in a file
 823 | # Description: Prints the results in the format asked by the user
 824 | #############################################################
 825 | def printResults(results, output_filename, overwrite, timeDisp):
 826 |     if output_filename != None:
 827 |         if overwrite is False:
 828 |             outfile = open(output_filename, "a")
 829 |         else:
 830 |             outfile = open(output_filename, "w")
 831 |     heading = "Sample"
 832 |     for head in sorted(results[list(results.keys())[0]]):
 833 |         if head == 'ST' or head == 't':
 834 |             continue
 835 |         heading += '\t' + head
 836 |     heading += '\tST'
 837 |     if timeDisp is True:
 838 |         heading += '\tTime'
 839 |     if output_filename != None:
 840 |         outfile.write(heading)
 841 |         outfile.write('\n')
 842 |     else:
 843 |         print(heading)
 844 |     for s in results:
 845 |         sample = s.split("_")[0]
 846 |         for l in sorted(results[s]):
 847 |             if l == 'ST' or l == 't':
 848 |                 continue
 849 |             if results[s][l]:
 850 |                 sample += '\t'+results[s][l]
 851 |             else:
 852 |                 sample += '\tNA'
 853 |         if timeDisp is True:
 854 |             sample += '\t' + str(results[s]['ST']) + '\t%.2f ' %results[s]['t']
 855 |         else:
 856 |             sample += '\t' + str(results[s]['ST'])
 857 |         if output_filename != None:
 858 |             outfile.write(sample)
 859 |             outfile.write('\n')
 860 |         else:
 861 |             print(sample)
 862 | """Predict part ends here"""
 863 | """Build DB part starts"""
 864 | """Returns the reverse complement of the sequence"""
 865 | def reverseComplement(seq):
 866 |     seqU = seq.upper()
 867 |     seq_dict = {'A':'T', 'T':'A', 'G':'C', 'C':'G', 'Y':'R', 'R':'Y', 'S':'S', 'W':'W', 'K':'M', 'M':'K', 'N':'N'}
 868 |     try:
 869 |         return "".join([seq_dict[base] for base in reversed(seqU)])
 870 |     except Exception:
 871 |         strn = "Reverse Complement Error:" + seqU
 872 |         logging.debug(strn)
 873 |         pass
 874 | #############################################################
 875 | # Function   : getFastaDict
 876 | # Input      : locus file name
 877 | # Output     : dictionary with all the allele sequences
 878 | # Description: Stores each allele sequence in a dictionary
 879 | #############################################################
 880 | def getFastaDict(fullLocusFile):
 881 |     logging.debug("Create Fasta Dict")
 882 |     logging.debug(fullLocusFile)
 883 |     fastaFile = open(fullLocusFile, 'r').read()
 884 |     entries = [x for x in fastaFile.split('>') if len(x) != 0]
 885 |     fastaDict = {}
 886 |     for entry in entries:
 887 |         key = [x for x in entry.split('\n')[0].split() if len(x) != 0][0]
 888 |         sequence = ''.join(entry.split('\n')[1:]).rstrip()
 889 |         fastaDict[key] = {'sequence':sequence}
 890 |     return fastaDict
 891 | #############################################################
 892 | # Function   : formKmerDB
 893 | # Input      : configuration file, k value, output prefix
 894 | # Output     : stringMLST DB
 895 | # Description: Constructs the k-mer DB in both strand orientation
 896 | #############################################################
 897 | def formKmerDB(configDict, k, output_filename):
 898 |     dbFileName = output_filename+'_'+str(k)+'.txt'
 899 |     weightFileName = output_filename+'_weight.txt'
 900 |     kmerDict = {}
 901 |     mean = {}
 902 |     for locus in configDict['loci']:
 903 |         msgs = "formKmerDB :" +locus
 904 |         logging.debug(msgs)
 905 |         fastaDict = getFastaDict(configDict['loci'][locus])
 906 |         sum = 0
 907 |         n = 0
 908 |         for allele in list(fastaDict.keys()):
 909 |             seq = fastaDict[allele]['sequence'].strip()
 910 |             l = len(seq)
 911 |             sum += l
 912 |             n += 1
 913 |             try:
 914 |                 (loc, num) = allele.replace('-', '_').rsplit('_', 1)
 915 |             except ValueError:
 916 |                 print("Error : Allele name in locus file should be seperated by '_' or '-'")
 917 |                 exit(0)
 918 |             splitId = allele.replace('-', '_').rsplit('_', 1)
 919 |             i = 0
 920 |             while i+k <= l:
 921 |                 kmer = seq[i:i+k]
 922 |                 revCompKmer = reverseComplement(kmer)
 923 |                 if kmer not in kmerDict:
 924 |                     kmerDict[kmer] = {}
 925 |                     kmerDict[kmer][splitId[0]] = []
 926 |                     kmerDict[kmer][splitId[0]].append(int(splitId[1]))
 927 |                 else:
 928 |                     if splitId[0] not in kmerDict[kmer]:
 929 |                         kmerDict[kmer][splitId[0]] = []
 930 |                         kmerDict[kmer][splitId[0]].append(int(splitId[1]))
 931 |                     else:
 932 |                         kmerDict[kmer][splitId[0]].append(int(splitId[1]))
 933 |                 if revCompKmer not in kmerDict:
 934 |                     kmerDict[revCompKmer] = {}
 935 |                     kmerDict[revCompKmer][splitId[0]] = []
 936 |                     kmerDict[revCompKmer][splitId[0]].append(int(splitId[1]))
 937 |                 else:
 938 |                     if splitId[0] not in kmerDict[revCompKmer]:
 939 |                         kmerDict[revCompKmer][splitId[0]] = []
 940 |                         kmerDict[revCompKmer][splitId[0]].append(int(splitId[1]))
 941 |                     else:
 942 |                         kmerDict[revCompKmer][splitId[0]].append(int(splitId[1]))
 943 |                 i += 1
 944 |         mean[locus] = sum/n*1.0
 945 |     with open(dbFileName, 'w') as kfile:
 946 |         for key in kmerDict:
 947 |             for key1 in kmerDict[key]:
 948 |                 string = key+'\t'+key1+'\t'+str(kmerDict[key][key1]).replace(" ", "")+'\n'
 949 |                 kfile.write(string)
 950 |     with open(weightFileName, 'w') as wfile:
 951 |         for locus in configDict['loci']:
 952 |             fastaDict = getFastaDict(configDict['loci'][locus])
 953 |             for allele in list(fastaDict.keys()):
 954 |                 splitId = allele.split('_')
 955 |                 seq = fastaDict[allele]['sequence']
 956 |                 l = len(seq)
 957 |                 fac = (l/mean[locus])
 958 |                 s = allele  + '\t' + str(fac) + '\n'
 959 |                 if fac > 1.05 or fac < 0.95:
 960 |                     wfile.write(s)
 961 | """Copies the profile definition file as a new file"""
 962 | def copyProfileFile(profileDict, output_filename):
 963 |     profileFileName = output_filename+'_profile.txt'
 964 |     with open(profileDict['profile']) as f:
 965 |         lines = f.readlines()
 966 |         with open(profileFileName, "w") as f1:
 967 |             f1.writelines(lines)
 968 | #############################################################
 969 | # Function   : makeCustomDB
 970 | # Input      : configuration file, k value, output prefix
 971 | # Output     : None
 972 | # Description: Processes the config file and calls the relevant
 973 | #              function
 974 | #############################################################
 975 | def makeCustomDB(config, k, output_filename):
 976 |     configDict = {}
 977 |     if output_filename == None:
 978 |         output_filename = 'kmerDB'
 979 |     with open(config, 'r') as configFile:
 980 |         lines = configFile.readlines()
 981 |         head = ''
 982 |         for line in lines:
 983 |             if line.rstrip() == '':
 984 |                 continue
 985 |             if line.rstrip() == '[loci]':
 986 |                 head = 'loci'
 987 |                 configDict[head] = {}
 988 |             elif line.rstrip() == '[profile]':
 989 |                 head = 'profile'
 990 |                 configDict[head] = {}
 991 |             else:
 992 |                 arr = line.strip().split()
 993 |                 configDict[head][arr[0]] = arr[1]
 994 |     for head in configDict:
 995 |         for element in configDict[head]:
 996 |             if not os.path.isfile(configDict[head][element]):
 997 |                 print("ERROR: %s file does not exist at %s" % (element, configDict[head][element]))
 998 |                 exit(0)
 999 |     formKmerDB(configDict, k, output_filename)
1000 |     copyProfileFile(configDict['profile'], output_filename)
1001 | """Build DB part ends"""
1002 | """Check Parameters"""
def checkParams(buildDB, predict, config, k, listMode, list, batch, dir, fastq1, fastq2, paired, dbPrefix):
    """Validate the command-line options; print help and exit on a problem.

    Checks that exactly one module is selected, that the config option fits
    the chosen mode, that the three database files exist for prediction, and
    that the input list, directory or FASTQ file(s) exist. Every failure
    prints ``helpTextSmall`` followed by a message and exits.

    Besides its parameters the function reads the module globals
    ``downloadDB``, ``coverage``, ``fList`` and ``helpTextSmall``.
    NOTE(review): the ``list`` parameter is not used; the list file path is
    taken from the global ``fList`` - confirm this is intended.
    """
    def bail(*pieces):
        # Help text goes out before the message is assembled, as before.
        print(helpTextSmall)
        message = pieces[0]
        for piece in pieces[1:]:
            message = message + piece
        print(message)
        exit(0)

    def require_fastq(path):
        if not os.path.isfile(path):
            bail("Error: FASTQ file (", path, ") does not exist!")

    if (predict is True and buildDB is True) or (
            predict is False and buildDB is False and downloadDB is False):
        bail("Select either predict or buildDB module")
    if predict is True:
        if config is not None and coverage is False:
            bail("Config parameter is not required for predict mode.")
        elif config is None and coverage is True:
            bail("Config parameter is required to for coverage prediction")
        if not os.path.isfile(dbPrefix + '_' + str(k) + '.txt'):
            print(helpTextSmall)
            print("DB file does not exist : ", dbPrefix, '_', str(k), '.txt or change DB prefix.')
            exit(0)
        if not os.path.isfile(dbPrefix + '_weight.txt'):
            print(helpTextSmall)
            print("DB file does not exist : ", dbPrefix, '_weight.txt or change DB prefix.')
            exit(0)
        if not os.path.isfile(dbPrefix + '_profile.txt'):
            print(helpTextSmall)
            print("DB file does not exist : ", dbPrefix, '_profile.txt or change DB prefix.')
            exit(0)
        if listMode is True:
            if not os.path.isfile(fList):
                bail("Error: List file (", fList, ") does not exist!")
        elif batch is True:
            if not os.path.isdir(dir):
                bail("Error: Directory (", dir, ") does not exist!")
        elif paired is True:
            require_fastq(fastq1)
            require_fastq(fastq2)
        elif paired is False:
            require_fastq(fastq1)
    if buildDB is True:
        try:
            if not os.path.isfile(config):
                bail("Error: Configuration file (", config, ") does not exist!")
        except Exception:
            # Reached when config cannot be used as a path (e.g. it is None).
            bail("Error: Specify Configuration file")
1067 | helpText = """
1068 | Readme for stringMLST
1069 | =============================================================================================
1070 | Usage
1071 | ./stringMLST.py
1072 | [--buildDB]
1073 | [--predict]
1074 | [-1 filename_fastq1][--fastq1 filename_fastq1]
1075 | [-2 filename_fastq2][--fastq2 filename_fastq2]
1076 | [-d directory][--dir directory][--directory directory]
1077 | [-l list_file][--list list_file]
1078 | [-p][--paired]
1079 | [-s][--single]
1080 | [-c][--config]
1081 | [-P][--prefix]
1082 | [-z][--fuzzy]
1083 | [-a]
1084 | [-C][--coverage]
1085 | [-k]
1086 | [-o output_filename][--output output_filename]
1087 | [-x][--overwrite]
1088 | [-t]
1089 | [-r]
1090 | [-v]
1091 | [-h][--help]
1092 | ==============================================================================================
1093 | There are two steps to predicting ST using stringMLST.
1094 | 1. Create DB : stringMLST.py --buildDB
1095 | 2. Predict : stringMLST --predict
1096 | 1. stringMLST.py --buildDB
1097 | Synopsis:
1098 | stringMLST.py --buildDB -c <config file> -k <kmer length(optional)> -P <DB prefix(optional)>
1099 |     config file : is a tab delimited file which has the information for typing scheme ie loci, its multifasta file and profile definition file.
1100 |         Format :
1101 |             [loci]
1102 |             locus1      locusFile1
1103 |             locus2      locusFile2
1104 |             [profile]
1105 |             profile     profileFile
1106 |     kmer length : is the kmer length for the db. Note, while processing this should be smaller than the read length.
1107 |         We suggest kmer lengths of 35, 66 depending on the read length.
1108 |     DB prefix(optional) : holds the information for DB files to be created and their location. This module creates 3 files with this prefix.
1109 |         You can use a folder structure with prefix to store your db at particular location.
1110 | Required arguments
1111 | --buildDB
1112 |     Identifier for build db module
1113 | -c,--config = <configuration file>
1114 |     Config file in the format described above.
1115 |     All the files follow the structure followed by pubmlst. Refer extended document for details.
1116 | Optional arguments
1117 | -k = <kmer length>
1118 |     Kmer size for which the db has to be formed(Default k = 35). Note the tool works best with kmer length in between 35 and 66
1119 |     for read lengths of 55 to 150 bp. Kmer size can be increased accordingly. It is advised to keep lower kmer sizes
1120 |     if the quality of reads is not very good.
1121 | -P,--prefix = <prefix>
1122 |     Prefix for db and log files to be created(Default = kmer). Also you can specify folder where you want the dbb to be created.
1123 | -a
1124 |         File location to write build log
1125 | -h,--help
1126 |   Prints the help manual for this application
1127 |  --------------------------------------------------------------------------------------------
1128 | 2. stringMLST.py --predict
1129 | stringMLST --predict : can run in three modes
1130 |   1) single sample (default mode)
1131 |   2) batch mode : run stringMLST for all the samples in a folder (for a particular specie)
1132 |   3) list mode : run stringMLST on samples specified in a file
1133 | stringMLST can process both single and paired end files. By default program expects paired end files.
1134 | Synopsis
1135 | stringMLST.py --predict -1 <fastq file> -2 <fastq file> -d <directory location> -l <list file> -p -s -P <DB prefix(optional)> -k <kmer length(optional)> -o <output file> -x
1136 | Required arguments
1137 | --predict
1138 |     Identifier for predict module
1139 | Optional arguments
1140 | -1,--fastq1 = <fastq1_filename>
1141 |   Path to first fastq file for paired end sample and path to the fastq file for single end file.
1142 |   Should have extension fastq or fq.
1143 | -2,--fastq2 = <fastq2_filename>
1144 |   Path to second fastq file for paired end sample.
1145 |   Should have extension fastq or fq.
1146 | -d,--dir,--directory = <directory>
1147 |   BATCH MODE : Location of all the samples for batch mode.
1148 | -C,--coverage
1149 |     Calculate sequence coverage for each allele. Turns on read generation (-r) and turns off fuzzy (-z 1)
1150 |     Requires bwa, bamtools and samtools be in your path
1151 | -k = <kmer_length>
1152 |   Kmer length for which the db was created(Default k = 35). Could be verified by looking at the name of the db file.
1153 |   Could be used if the reads are of very bad quality or have a lot of N's.
1154 | -l,--list = <list_file>
1155 |   LIST MODE : Location of list file and flag for list mode.
1156 |   list file should have full file paths for all the samples/files.
1157 |   Each sample takes one line. For paired end samples the 2 files should be tab separated on single line.
1158 | -o,--output = <output_filename>
1159 |   Prints the output to a file instead of stdout.
1160 | -p,--paired
1161 |   Flag for specifying paired end files. Default option so would work the same if you do not specify for all modes.
1162 |   For batch mode the paired end samples should be differentiated by 1/2.fastq or 1/2.fq
1163 | -P,--prefix = <prefix>
1164 |     Prefix using which the db was created(Defaults = kmer). The location of the db could also be provided.
1165 | -r
1166 |   A separate reads file is created which has all the reads covering all the locus.
1167 | -s,--single
1168 |   Flag for specifying single end files.
1169 | -t
1170 |   Time for each analysis will also be reported.
1171 | -v
1172 |   Prints the version of the software.
1173 | -x,--overwrite
1174 |   By default stringMLST appends the results to the output_filename if same name is used.
1175 |   This argument overwrites the previously specified output file.
1176 | -z,--fuzzy = <fuzzy threshold int>
1177 |     Threshold for reporting a fuzzy match (Default=300). For higher coverage reads this threshold should be set higher to avoid
1178 |     indicating fuzzy match when exact match was more likely. For lower coverage reads, threshold of <100 is recommended
1179 | -h,--help
1180 |   Prints the help manual for this application
1181 | =============================================================================================
1182 | 3. stringMLST.py --getMLST
1183 | Synopsis:
1184 | stringMLST.py --getMLST --species= <species> [-k kmer length] [-P DB prefix]
1185 | Required arguments
1186 | --getMLST
1187 |     Identifier for getMLST module
1188 | --species= <species name>
1189 |     Species name from the pubMLST schemes (use "--species show" to get list of available schemes)
1190 |     "all" will download and build all
1191 | Optional arguments
1192 | -k = <kmer length>
1193 |     Kmer size for which the db has to be formed(Default k = 35). Note the tool works best with kmer length in between 35 and 66
1194 |     for read lengths of 55 to 150 bp. Kmer size can be increased accordingly. It is advised to keep lower kmer sizes
1195 |     if the quality of reads is not very good.
1196 | -P,--prefix = <prefix>
1197 |     Prefix for db and log files to be created(Default = kmer). Also you can specify folder where you want the db to be created.
1198 |     We recommend that prefix and config point to the same folder for cleanliness but this is not required
1199 | --schemes
1200 |     Display the list of available schemes
1201 | -h,--help
1202 |   Prints the help manual for this application
1203 | =============================================================================================
1204 | Example usage:
1205 | ./stringMLST.py --buildDB
1206 | 1) Build DB
1207 |  ./stringMLST.py --buildDB --config config.txt -k 35 -P NM
1208 |  --------------------------------------------------------------------------------------------
1209 | ./stringMLST.py --predict
1210 | 1) Single sample, paired end
1211 |  ./stringMLST.py --predict -1 data/Neisseria/ERR017001_1.fastq -2 data/Neisseria/ERR017001_2.fastq -p --prefix NM -k 35 -o output.txt
1212 | 2) Single sample, single end, overwrite output
1213 |   ./stringMLST.py --predict -1 data/Neisseria/ERR017001_1.fastq -s --prefix NM -k 35 -o output.txt -x
1214 | 3) Multiple sample batch mode, paired end
1215 |    ./stringMLST.py --predict -d data/Neisseria/ -p --prefix NM -k 35 -o output.txt -x
1216 | 4) Multiple samples list mode, paired end
1217 |    ./stringMLST.py --predict -l data/listFile.txt -p --prefix NM -k 35 -o output.txt -x
1218 | 5) Single, high coverage sample, paired end
1219 |  ./stringMLST.py --predict -1 data/Neisseria/ERR017001_1.fastq -2 data/Neisseria/ERR017001_2.fastq -p --prefix NM -k 35 -z 1000 -o output.txt
1220 | --------------------------------------------------------------------------------------------
1221 | ./stringMLST.py --getMLST
1222 | 1) List available schemes
1223 |  ./stringMLST.py --getMLST --schemes
1224 | 2) Download the Neisseria spp. pubMLST scheme
1225 |   ./stringMLST.py --getMLST --species=neisseria -P datasets/nmb
1226 | """
# Condensed usage text. It is printed together with an error message when the
# command line cannot be parsed or no mode (--buildDB / --predict / --getMLST)
# was selected. The wording is kept in sync with the option parser below:
# --getMLST and --species are listed in the usage summary, -a is shown with
# the value it requires, and --schemes is documented as deprecated because the
# parser only prints a deprecation notice for it.
helpTextSmall = """
Usage
[--buildDB]
[--predict]
[--getMLST]
[--species species_name]
[-1 filename_fastq1][--fastq1 filename_fastq1]
[-2 filename_fastq2][--fastq2 filename_fastq2]
[-d directory][--dir directory][--directory directory]
[-l list_file][--list list_file]
[-p][--paired]
[-s][--single]
[-c][--config]
[-P][--prefix]
[-z][--fuzzy]
[-a log_filename]
[-C][--coverage]
[-k]
[-o output_filename][--output output_filename]
[-x][--overwrite]
[-t]
[-r]
[-v]
[-h][--help]
==============================================================================================
There are two steps to predicting ST using stringMLST.
1. Create DB : stringMLST.py --buildDB
2. Predict : stringMLST --predict
1. stringMLST.py --buildDB
Synopsis:
stringMLST.py --buildDB -c <config file> -k <kmer length(optional)> -P <DB prefix(optional)>
Required arguments
--buildDB
    Identifier for build db module
-c,--config = <configuration file>
    Config file in the format described above.
    All the files follow the structure followed by pubmlst. Refer extended document for details.
Optional arguments
-k = <kmer length>
    Kmer size for which the db has to be formed(Default k = 35). Note the tool works best with kmer length in between 35 and 66
    for read lengths of 55 to 150 bp. Kmer size can be increased accordingly. It is advised to keep lower kmer sizes
    if the quality of reads is not very good.
-P,--prefix = <prefix>
    Prefix for db and log files to be created(Default = kmer). Also you can specify folder where you want the db to be created.
-h,--help
  Prints the help manual for this application
==============================================================================================
2. stringMLST.py --predict
Synopsis
stringMLST.py --predict -1 <fastq file> -2 <fastq file> -d <directory location> -l <list file> -p -s -P <DB prefix(optional)> -k <kmer length(optional)> -o <output file> -x
Required arguments
--predict
    Identifier for predict module
Optional arguments
-1,--fastq1 = <fastq1_filename>
  Path to first fastq file for paired end sample and path to the fastq file for single end file.
  Should have extension fastq or fq.
-2,--fastq2 = <fastq2_filename>
  Path to second fastq file for paired end sample.
  Should have extension fastq or fq.
-d,--dir,--directory = <directory>
  BATCH MODE : Location of all the samples for batch mode.
-C,--coverage
    Calculate sequence coverage for each allele. Turns on read generation (-r) and turns off fuzzy (-z 1)
    Requires bwa, bamtools and samtools be in your path
-k = <kmer_length>
  Kmer length for which the db was created(Default k = 35). Could be verified by looking at the name of the db file.
  Could be used if the reads are of very bad quality or have a lot of N's.
-l,--list = <list_file>
  LIST MODE : Location of list file and flag for list mode.
  list file should have full file paths for all the samples/files.
  Each sample takes one line. For paired end samples the 2 files should be tab separated on single line.
-o,--output = <output_filename>
  Prints the output to a file instead of stdout.
-p,--paired
  Flag for specifying paired end files. Default option so would work the same if you do not specify for all modes.
  For batch mode the paired end samples should be differentiated by 1/2.fastq or 1/2.fq
-P,--prefix = <prefix>
    Prefix using which the db was created(Defaults = kmer). The location of the db could also be provided.
-r
  A separate reads file is created which has all the reads covering all the locus.
-s,--single
  Flag for specifying single end files.
-t
  Time for each analysis will also be reported.
-v
  Prints the version of the software.
-x,--overwrite
  By default stringMLST appends the results to the output_filename if same name is used.
  This argument overwrites the previously specified output file.
-z,--fuzzy = <fuzzy threshold int>
    Threshold for reporting a fuzzy match (Default=300). For higher coverage reads this threshold should be set higher to avoid
    indicating fuzzy match when exact match was more likely. For lower coverage reads, threshold of <100 is recommended
-h,--help
  Prints the help manual for this application
=============================================================================================
3. stringMLST.py --getMLST
Synopsis:
stringMLST.py --getMLST --species= <species> [-k kmer length] [-P DB prefix]
Required arguments
--getMLST
    Identifier for getMLST module
--species= <species name>
    Species name from the pubMLST schemes
    Use "show" or "list" to list available schemes
    "all" will download and build all available schemes
Optional arguments
-k = <kmer length>
    Kmer size for which the db has to be formed(Default k = 35). Note the tool works best with kmer length in between 35 and 66
    for read lengths of 55 to 150 bp. Kmer size can be increased accordingly. It is advised to keep lower kmer sizes
    if the quality of reads is not very good.
-P,--prefix = <prefix>
    Prefix for db and log files to be created(Default = kmer). Also you can specify folder where you want the db to be created.
    We recommend that prefix and config point to the same folder for cleanliness but this is not required
--schemes
    Deprecated. Use "--species list" (or "--species show") to display the available schemes
-h,--help
  Prints the help manual for this application
=============================================================================================

"""
1346 | 
# ---------------------------------------------------------------------------
# Script entry point: parse the command line, then run exactly one of the
# three modules (--buildDB, --predict or --getMLST).
#
# Everything below runs at module level and the option values are plain
# module-level names.
# NOTE(review): functions defined earlier in this file may read some of these
# names as globals, so they are deliberately neither renamed nor wrapped in a
# main() function -- confirm before refactoring further.
# ---------------------------------------------------------------------------


def _normalizeSpeciesName(name):
    """Return `name` with path-unfriendly characters replaced or removed.

    '/', '#' and ' ' become '_'; '.', '(' and ')' are dropped. When the name
    had to be changed, an informational message is printed.
    """
    if not re.search('[/#. ()]', name):
        return name
    normalized = re.sub('[/# ]', "_", name)
    normalized = re.sub('[.()]', "", normalized)
    print('\t\033[33m' + "INFO: normalizing name to: " + normalized + '\033[0m')
    return normalized


def _makeDirs(path):
    """Create `path` (and missing parents); an existing directory is fine.

    Failure is reported as a warning instead of being silently swallowed, but
    it is not fatal: the caller continues exactly as it did before.
    """
    if not path:
        return
    try:
        os.makedirs(path)
    except OSError as err:
        if not os.path.isdir(path):
            print("Warning: could not create directory " + path + " (" + str(err) + ")")


# Default parameters
downloadDB = False       # set by --getMLST
species = None           # set by --species
printSchemes = False
buildDB = False          # set by --buildDB
predict = False          # set by --predict
output_filename = None   # set by -o/--output
batch = False            # set together with `dir` by -d/--dir/--directory
listMode = False         # set together with `fList` by -l/--list
fList = None             # list file path; only meaningful when listMode is True
overwrite = False        # set by -x/--overwrite
paired = True            # paired-end is the default; -s/--single turns it off
fastq1 = None
fastq2 = None
user_k = False           # True when -k was given explicitly
config = None
timeDisp = False         # set by -t
reads = False            # set by -r (and by -C/--coverage)
dbPrefix = 'kmer'
log = ''                 # set by -a; defaults to <dbPrefix>.log when empty
k = 35
fuzzy = 300
coverage = False
# `dir` is only assigned when -d is given; otherwise the name still refers to
# the builtin, exactly as in the original script.

# Input arguments
try:
    options, remainder = getopt.getopt(sys.argv[1:], 'o:x1:2:k:l:bd:pshP:c:trva:z:C', [
        'buildDB',
        'predict',
        'output=',
        'config=',
        'prefix=',
        'overwrite',
        'batch',
        # '--list' takes the list file as its value, like '-l' ('l:' above).
        'list=',
        'fastq1=',
        'fastq2=',
        'dir=',
        'directory=',
        'paired',
        'single',
        'help',
        'fuzzy=',
        'coverage',
        'getMLST',
        'schemes',
        'species='])
except getopt.GetoptError as err:
    # Unknown option or missing option value: show the short usage text
    # rather than a traceback.
    print(helpTextSmall)
    print("Error: " + str(err))
    sys.exit(2)

# Single option names are compared with `==`. A substring test such as
# `opt in '--predict'` also matches '-p', and `opt in '--buildDB'` matches
# '-b', which silently selected the wrong mode.
for opt, arg in options:
    if opt in ('-o', '--output'):
        output_filename = arg
    elif opt in ('-x', '--overwrite'):
        overwrite = True
    elif opt == '--buildDB':
        buildDB = True
    elif opt in ('-P', '--prefix'):
        dbPrefix = arg
    elif opt == '--predict':
        predict = True
    elif opt in ('-c', '--config'):
        config = arg
    elif opt == '-k':
        user_k = True
        try:
            k = int(arg)
        except ValueError:
            print("Error: Enter a numerical k value.")
            sys.exit(1)
    elif opt in ('-l', '--list'):
        listMode = True
        fList = arg
    elif opt in ('-1', '--fastq1'):
        fastq1 = arg
    elif opt in ('-2', '--fastq2'):
        fastq2 = arg
    elif opt in ('-d', '--dir', '--directory'):
        dir = arg
        batch = True
    elif opt in ('-b', '--batch'):
        # Accepted for backward compatibility only; batch mode is switched on
        # by -d/--dir/--directory.
        pass
    elif opt in ('-p', '--paired'):
        paired = True
        single = False
    elif opt in ('-s', '--single'):
        single = True
        paired = False
    elif opt == '-t':
        timeDisp = True
    elif opt == '-a':
        log = arg
    elif opt == '-r':
        reads = True
    elif opt == '-v':
        print(version)
        sys.exit(0)
    elif opt in ('-C', '--coverage'):
        # Coverage needs the per-locus reads and disables fuzzy matching.
        coverage = True
        reads = True
        fuzzy = 1
    elif opt in ('-z', '--fuzzy'):
        try:
            fuzzy = int(arg)
        except ValueError:
            print("You provided '" + arg + "' for your fuzziness threshold, which is not an integer value")
            sys.exit(1)
    elif opt == '--schemes':
        print("The `--schemes` option has been deprecated.  Please use `--species list` to see available schemes")
        sys.exit(0)
    elif opt == '--getMLST':
        downloadDB = True
    elif opt == '--species':
        species = arg
    elif opt in ('-h', '--help'):
        print(helpText)
        sys.exit(0)

# The sixth argument is the list file path (previously the builtin `list`
# was passed by mistake).
checkParams(buildDB, predict, config, k, listMode, fList, batch, dir, fastq1, fastq2, paired, dbPrefix)

if buildDB is True:
    if not log:
        log = dbPrefix + '.log'
    logging.basicConfig(filename=log, level=logging.DEBUG, format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')
    if os.path.isfile(config):
        print("Info: Making DB for k = ", k)
        print("Info: Making DB with prefix =", dbPrefix)
        print("Info: Log file written to ", log)
        makeCustomDB(config, k, dbPrefix)
    else:
        print("Error: The input config file "+config +" does not exist.")
elif predict is True:
    if not log:
        log = dbPrefix + '.log'
    logging.basicConfig(filename=log, level=logging.DEBUG, format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')
    loadModule(k, dbPrefix)
    if batch is True:
        results = batchTool(dir, paired, k)
    elif listMode is True:
        results = listTool(fList, paired, k)
    else:
        results = {}
        results = singleSampleTool(fastq1, fastq2, paired, k, results)
    if coverage is True:
        # pyfaidx is an optional dependency that is only needed for coverage.
        # NOTE(review): the import binds `Fasta` at module level, presumably
        # for getCoverage() -- confirm against its definition.
        try:
            from pyfaidx import Fasta
        except ImportError:
            print("pyfaidx is required for coverage calculation\npip install pyfaidx")
            sys.exit(1)
        loadConfig(config)
        getCoverage(results)
    printResults(results, output_filename, overwrite, timeDisp)
elif downloadDB is True:
    # The pubMLST database index is an XML document with one child element
    # per scheme; each child's text is the scheme name.
    dbURL = "http://pubmlst.org/data/dbases.xml"
    databaseXML = urlopen(dbURL)
    dbTree = ET.parse(databaseXML)
    dbRoot = dbTree.getroot()
    if species is None:
        print("Please refer to --help for more information")
        print()
        print("Expected command format:")
        print("stringMLST.py --getMLST --species= <species> [-k kmer length] [-P DB prefix]")
        print()
        print("To print available MLST Schemes use:")
        print("stringMLST.py --getMLST --species show")
        sys.exit(1)
    elif species in ("show", "list"):
        for species in dbRoot:
            print(species.text.rstrip())
    elif species == "all":
        print("Using a kmer size of " + str(k) + " for all databases.")
        for species in dbRoot:
            speciesName = species.text.rstrip()
            print('\033[1m' + "Preparing: " + speciesName + '\033[0m')
            normSpeciesName = _normalizeSpeciesName(speciesName)
            # Each scheme gets its own directory next to the prefix:
            # <prefix dir>/<name>/<name>_...
            filePrefix = str(dbPrefix.rsplit("/", 1)[0]) + "/" + normSpeciesName
            _makeDirs(filePrefix)
            filePrefix = filePrefix + "/" + normSpeciesName
            config = filePrefix + "_config.txt"
            profileURL, loci = get_links(dbRoot, filePrefix, speciesName)
            get_files(filePrefix, loci, profileURL, speciesName)
    else:
        print('\033[1m' + "Preparing: " + species + '\033[0m')
        normSpeciesName = _normalizeSpeciesName(species)
        _makeDirs(dbPrefix.rsplit("/", 1)[0])
        if "/" not in dbPrefix:
            # Bare name: treat it as a directory and put the files inside it.
            filePrefix = dbPrefix + "/" + normSpeciesName
        elif dbPrefix.endswith('/'):
            # Directory given: name the files after the species.
            filePrefix = dbPrefix + normSpeciesName
        else:
            # Directory plus file prefix given: use it unchanged.
            filePrefix = dbPrefix
        config = filePrefix + "_config.txt"
        profileURL, loci = get_links(dbRoot, filePrefix, species)
        get_files(filePrefix, loci, profileURL, species)
else:
    print(helpTextSmall)
    print("Error: Please select the mode: buildDB (for database building) or predict (for ST discovery) module")
logging.debug('Command :' + str(sys.argv))
1568 | 
1569 | 


--------------------------------------------------------------------------------