├── .circleci └── config.yml ├── .gitignore ├── LICENSE ├── README.md ├── bin ├── __init__.py └── fastmlst.py ├── fastmlst ├── mlst.py └── update_mlst_kit.py ├── get_citations.py └── setup.py /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2.1 2 | 3 | orbs: 4 | python: circleci/python@0.2.1 5 | 6 | jobs: 7 | build: 8 | docker: 9 | - image: cimg/python:3.8.5 10 | steps: 11 | - checkout 12 | - run: 13 | command: | 14 | python --version 15 | python setup.py install --user 16 | fastmlst -V 17 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | .DS_Store 3 | fastmlst.egg-info/ 4 | build/ 5 | dist/ 6 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU LESSER GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | 9 | This version of the GNU Lesser General Public License incorporates 10 | the terms and conditions of version 3 of the GNU General Public 11 | License, supplemented by the additional permissions listed below. 12 | 13 | 0. Additional Definitions. 14 | 15 | As used herein, "this License" refers to version 3 of the GNU Lesser 16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU 17 | General Public License. 18 | 19 | "The Library" refers to a covered work governed by this License, 20 | other than an Application or a Combined Work as defined below. 21 | 22 | An "Application" is any work that makes use of an interface provided 23 | by the Library, but which is not otherwise based on the Library. 
24 | Defining a subclass of a class defined by the Library is deemed a mode 25 | of using an interface provided by the Library. 26 | 27 | A "Combined Work" is a work produced by combining or linking an 28 | Application with the Library. The particular version of the Library 29 | with which the Combined Work was made is also called the "Linked 30 | Version". 31 | 32 | The "Minimal Corresponding Source" for a Combined Work means the 33 | Corresponding Source for the Combined Work, excluding any source code 34 | for portions of the Combined Work that, considered in isolation, are 35 | based on the Application, and not on the Linked Version. 36 | 37 | The "Corresponding Application Code" for a Combined Work means the 38 | object code and/or source code for the Application, including any data 39 | and utility programs needed for reproducing the Combined Work from the 40 | Application, but excluding the System Libraries of the Combined Work. 41 | 42 | 1. Exception to Section 3 of the GNU GPL. 43 | 44 | You may convey a covered work under sections 3 and 4 of this License 45 | without being bound by section 3 of the GNU GPL. 46 | 47 | 2. Conveying Modified Versions. 48 | 49 | If you modify a copy of the Library, and, in your modifications, a 50 | facility refers to a function or data to be supplied by an Application 51 | that uses the facility (other than as an argument passed when the 52 | facility is invoked), then you may convey a copy of the modified 53 | version: 54 | 55 | a) under this License, provided that you make a good faith effort to 56 | ensure that, in the event an Application does not supply the 57 | function or data, the facility still operates, and performs 58 | whatever part of its purpose remains meaningful, or 59 | 60 | b) under the GNU GPL, with none of the additional permissions of 61 | this License applicable to that copy. 62 | 63 | 3. Object Code Incorporating Material from Library Header Files. 
64 | 65 | The object code form of an Application may incorporate material from 66 | a header file that is part of the Library. You may convey such object 67 | code under terms of your choice, provided that, if the incorporated 68 | material is not limited to numerical parameters, data structure 69 | layouts and accessors, or small macros, inline functions and templates 70 | (ten or fewer lines in length), you do both of the following: 71 | 72 | a) Give prominent notice with each copy of the object code that the 73 | Library is used in it and that the Library and its use are 74 | covered by this License. 75 | 76 | b) Accompany the object code with a copy of the GNU GPL and this license 77 | document. 78 | 79 | 4. Combined Works. 80 | 81 | You may convey a Combined Work under terms of your choice that, 82 | taken together, effectively do not restrict modification of the 83 | portions of the Library contained in the Combined Work and reverse 84 | engineering for debugging such modifications, if you also do each of 85 | the following: 86 | 87 | a) Give prominent notice with each copy of the Combined Work that 88 | the Library is used in it and that the Library and its use are 89 | covered by this License. 90 | 91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license 92 | document. 93 | 94 | c) For a Combined Work that displays copyright notices during 95 | execution, include the copyright notice for the Library among 96 | these notices, as well as a reference directing the user to the 97 | copies of the GNU GPL and this license document. 
98 | 99 | d) Do one of the following: 100 | 101 | 0) Convey the Minimal Corresponding Source under the terms of this 102 | License, and the Corresponding Application Code in a form 103 | suitable for, and under terms that permit, the user to 104 | recombine or relink the Application with a modified version of 105 | the Linked Version to produce a modified Combined Work, in the 106 | manner specified by section 6 of the GNU GPL for conveying 107 | Corresponding Source. 108 | 109 | 1) Use a suitable shared library mechanism for linking with the 110 | Library. A suitable mechanism is one that (a) uses at run time 111 | a copy of the Library already present on the user's computer 112 | system, and (b) will operate properly with a modified version 113 | of the Library that is interface-compatible with the Linked 114 | Version. 115 | 116 | e) Provide Installation Information, but only if you would otherwise 117 | be required to provide such information under section 6 of the 118 | GNU GPL, and only to the extent that such information is 119 | necessary to install and execute a modified version of the 120 | Combined Work produced by recombining or relinking the 121 | Application with a modified version of the Linked Version. (If 122 | you use option 4d0, the Installation Information must accompany 123 | the Minimal Corresponding Source and Corresponding Application 124 | Code. If you use option 4d1, you must provide the Installation 125 | Information in the manner specified by section 6 of the GNU GPL 126 | for conveying Corresponding Source.) 127 | 128 | 5. Combined Libraries. 
129 | 130 | You may place library facilities that are a work based on the 131 | Library side by side in a single library together with other library 132 | facilities that are not Applications and are not covered by this 133 | License, and convey such a combined library under terms of your 134 | choice, if you do both of the following: 135 | 136 | a) Accompany the combined library with a copy of the same work based 137 | on the Library, uncombined with any other library facilities, 138 | conveyed under the terms of this License. 139 | 140 | b) Give prominent notice with the combined library that part of it 141 | is a work based on the Library, and explaining where to find the 142 | accompanying uncombined form of the same work. 143 | 144 | 6. Revised Versions of the GNU Lesser General Public License. 145 | 146 | The Free Software Foundation may publish revised and/or new versions 147 | of the GNU Lesser General Public License from time to time. Such new 148 | versions will be similar in spirit to the present version, but may 149 | differ in detail to address new problems or concerns. 150 | 151 | Each version is given a distinguishing version number. If the 152 | Library as you received it specifies that a certain numbered version 153 | of the GNU Lesser General Public License "or any later version" 154 | applies to it, you have the option of following the terms and 155 | conditions either of that published version or of any later version 156 | published by the Free Software Foundation. If the Library as you 157 | received it does not specify a version number of the GNU Lesser 158 | General Public License, you may choose any version of the GNU Lesser 159 | General Public License ever published by the Free Software Foundation. 
160 | 161 | If the Library as you received it specifies that a proxy can decide 162 | whether future versions of the GNU Lesser General Public License shall 163 | apply, that proxy's public statement of acceptance of any version is 164 | permanent authorization for you to choose that version for the 165 | Library. 166 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![Conda](https://img.shields.io/conda/pn/bioconda/fastmlst)![CircleCI](https://img.shields.io/circleci/build/github/EnzoAndree/FastMLST/master)![GitHub](https://img.shields.io/github/license/EnzoAndree/FastMLST)[![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat)](http://bioconda.github.io/recipes/fastmlst/README.html)[![Downloads](https://img.shields.io/conda/dn/bioconda/fastmlst.svg?style=flat)](http://bioconda.github.io/recipes/fastmlst/README.html)![Citations](https://img.shields.io/badge/citations-29-blue) 2 | 3 | # FastMLST 4 | A multi-core Multilocus Sequence Typing tool coupled with allele concatenation. 5 | # Introduction 6 | FastMLST is a high-speed standalone script written in Python 3 that takes assemblies in FASTA format (gzipped files are also allowed) and determines their sequence type (ST) according to the MLST schemes defined in [PubMLST](https://doi.org/10.12688/wellcomeopenres.14826.1). The main advantage over other ST determination programs is that FastMLST can generate a FASTA file containing the concatenated alleles for all analyzed genomes, ready to be aligned and used in phylogenetic inference. 7 | 8 | You can read a complete guide to MLST analysis in our [Wiki](https://github.com/EnzoAndree/FastMLST/wiki/The-definitive-guide-to-MLST-analysis). 9 | # Installation 10 | You can install FastMLST using either Conda or pip. If you want the most up-to-date version, you can install it directly from GitHub using pip. 
11 | 12 | ### Using Conda 13 | ```bash 14 | conda config --add channels defaults 15 | conda config --add channels conda-forge 16 | conda config --add channels bioconda 17 | conda install fastmlst 18 | ``` 19 | 20 | ### Using pip for the latest version 21 | To install the latest version directly from GitHub, use the following command: 22 | ```bash 23 | pip install git+https://github.com/EnzoAndree/FastMLST.git 24 | ``` 25 | ## Dependencies 26 | It is expected that all dependencies will be resolved when using conda for installation. 27 | * Python > 3 28 | * Biopython 29 | * tqdm 30 | * pandas 31 | * NCBI BLAST+ 32 | # Quick Start 33 | ``` 34 | $ fastmlst cdiff_refferences/RT078_CDM120.fasta 35 | RT078_CDM120.fasta,cdifficile,11,adk(5),atpA(8),dxr(5),glyA(11),recA(9),sodA(11),tpi(8),mlst_clade(5.0) 36 | 37 | $ fastmlst cdiff_refferences/RT078_CDM120.fasta.gz 38 | RT078_CDM120.fasta.gz,cdifficile,11,adk(5),atpA(8),dxr(5),glyA(11),recA(9),sodA(11),tpi(8),mlst_clade(5.0) 39 | 40 | $ fastmlst cdiff_refferences/*.fasta 41 | RT001_BI9.fasta,cdifficile,3,adk(1),atpA(1),dxr(2),glyA(1),recA(1),sodA(1),tpi(1),mlst_clade(1.0) 42 | RT001_Liv24.fasta,cdifficile,3,adk(1),atpA(1),dxr(2),glyA(1),recA(1),sodA(1),tpi(1),mlst_clade(1.0) 43 | RT002_TL178.fasta,cdifficile,8,adk(1),atpA(1),dxr(2),glyA(6),recA(1),sodA(5),tpi(1),mlst_clade(1.0) 44 | RT012_CD630_chr_V12.fasta,cdifficile,54,adk(1),atpA(4),dxr(7),glyA(1),recA(1),sodA(3),tpi(3),mlst_clade(1.0) 45 | RT014_TL176_v3.fasta,cdifficile,13,adk(1),atpA(1),dxr(6),glyA(1),recA(5),sodA(3),tpi(1),mlst_clade(1.0) 46 | RT015_TL174.fasta,cdifficile,44,adk(2),atpA(5),dxr(2),glyA(1),recA(1),sodA(3),tpi(1),mlst_clade(1.0) 47 | RT017_CF5.fasta,cdifficile,86,adk(3),atpA(7),dxr(3),glyA(8),recA(6),sodA(19),tpi(11),mlst_clade(4.0) 48 | RT017_M68.fasta,cdifficile,37,adk(3),atpA(7),dxr(3),glyA(8),recA(6),sodA(9),tpi(11),mlst_clade(4.0) 49 | RT023_CD305.fasta,cdifficile,791,adk(65),atpA(1),dxr(4),glyA(7),recA(2),sodA(8),tpi(7),mlst_clade(nan) 50 | 
RT027_CD196.fasta,cdifficile,1,adk(1),atpA(1),dxr(1),glyA(10),recA(1),sodA(3),tpi(5),mlst_clade(2.0) 51 | RT027_R20291_July2013.fasta,cdifficile,1,adk(1),atpA(1),dxr(1),glyA(10),recA(1),sodA(3),tpi(5),mlst_clade(2.0) 52 | RT078_CDM120.fasta,cdifficile,11,adk(5),atpA(8),dxr(5),glyA(11),recA(9),sodA(11),tpi(8),mlst_clade(5.0) 53 | RT106_Liv22.fasta,cdifficile,42,adk(1),atpA(1),dxr(2),glyA(1),recA(1),sodA(7),tpi(1),mlst_clade(1.0) 54 | ``` 55 | # Usage 56 | FastMLST takes as input an assembly in FASTA format. Optionally it can be compressed with gzip or bzip2. 57 | ``` 58 | $ fastmlst cdiff_refferences/RT078_CDM120.fasta 59 | RT078_CDM120.fasta,cdifficile,11,adk(5),atpA(8),dxr(5),glyA(11),recA(9),sodA(11),tpi(8),mlst_clade(5.0) 60 | ``` 61 | The output is comma-separated (CSV) by default; the separator can be changed with the `-s` option. 62 | ``` 63 | $ fastmlst -s '\t' cdiff_refferences/RT078_CDM120.fasta 64 | RT078_CDM120.fasta cdifficile 11 adk(5) atpA(8) dxr(5) glyA(11) recA(9) sodA(11) tpi(8) mlst_clade(5.0) 65 | ``` 66 | There are two options for saving the result in a text file: 67 | ``` 68 | $ fastmlst -to mlst.csv cdiff_refferences/RT078_CDM120.fasta 69 | $ fastmlst cdiff_refferences/RT078_CDM120.fasta > mlst.csv 70 | ``` 71 | Both options generate the `mlst.csv` file containing the FastMLST result. 72 | 73 | FastMLST is able to generate a file in FASTA format with the alleles concatenated in the same way as they are in PubMLST. If a genome is missing from this file, it means that (1) one of its alleles contains Ns, (2) one or more alleles are missing, or (3) the genome is contaminated (multiple alleles for one genome). The file is only written when an output file name is given with the `-fo` option: 74 | ``` 75 | $ fastmlst -fo mlst.fasta cdiff_refferences/RT078_CDM120.fasta 76 | ``` 77 | FastMLST will try to use all available cores. 
The number of threads can be set with the `-t` option: 78 | ``` 79 | $ fastmlst -t 2 cdiff_refferences/RT078_CDM120.fasta 80 | ``` 81 | You can also tell FastMLST which scheme to use with the `--scheme` option; this is particularly useful when there is more than one scheme per species. If you use this option, FastMLST generates a table in a new format (available since version 0.0.10) that is easier to use in other programs such as [phyloviz](http://www.phyloviz.net/). 82 | 83 | ``` 84 | $ fastmlst --scheme cdifficile cdiff_refferences/RT078_CDM120.fasta 85 | Genome,Scheme,ST,adk,atpA,dxr,glyA,recA,sodA,tpi,mlst_clade 86 | RT078_CDM120.fasta,cdifficile,11,5,8,5,11,9,11,8,5.0 87 | ``` 88 | 89 | If you want the old format, just add the `--legacy` option: 90 | 91 | ``` 92 | $ fastmlst --legacy --scheme cdifficile cdiff_refferences/RT078_CDM120.fasta 93 | RT078_CDM120.fasta,cdifficile,11,adk(5),atpA(8),dxr(5),glyA(11),recA(9),sodA(11),tpi(8),mlst_clade(5.0) 94 | ``` 95 | 96 | The list of supported schemes is displayed with the `--scheme-list` option, in the following format: `(n) code_name: Full species name` 97 | 98 | **Hint: You must use just the `code_name` in the `--scheme` option.** 99 | 100 | ``` 101 | $ fastmlst --scheme-list 102 | There are 153 schemes (A round of applause to @keithajolley! (Jolley, et al., 2018)): 103 | 104 | (1) achromobacter: Achromobacter spp. 105 | (2) abaumannii#1: Acinetobacter baumannii#1 106 | (3) abaumannii#2: Acinetobacter baumannii#2 107 | (n) (...) 108 | (151) xfastidiosa: Xylella fastidiosa 109 | (152) ypseudotuberculosis: Yersinia pseudotuberculosis 110 | (153) yruckeri: Yersinia ruckeri 111 | ``` 112 | 113 | Since version v0.0.14, the `--splited-output` option writes the alleles into individual FASTA files (one per gene in the scheme), ready to be used in other programs such as MLSTest. 
114 | 115 | ``` 116 | $ fastmlst --scheme cdifficile cdiff_refferences/*.fasta --splited-output splited_mlst 117 | Genome,Scheme,ST,adk,atpA,dxr,glyA,recA,sodA,tpi,mlst_clade 118 | RT001_BI9.fasta,cdifficile,3,1,1,2,1,1,1,1,1.0 119 | RT001_Liv24.fasta,cdifficile,3,1,1,2,1,1,1,1,1.0 120 | RT002_TL178.fasta,cdifficile,8,1,1,2,6,1,5,1,1.0 121 | RT012_CD630_chr_V12.fasta,cdifficile,54,1,4,7,1,1,3,3,1.0 122 | RT014_TL176_v3.fasta,cdifficile,13,1,1,6,1,5,3,1,1.0 123 | RT015_TL174.fasta,cdifficile,44,2,5,2,1,1,3,1,1.0 124 | RT017_CF5.fasta,cdifficile,86,3,7,3,8,6,19,11,4.0 125 | RT017_M68.fasta,cdifficile,37,3,7,3,8,6,9,11,4.0 126 | RT023_CD305.fasta,cdifficile,791,65,1,4,7,2,8,7, 127 | RT027_CD196.fasta,cdifficile,1,1,1,1,10,1,3,5,2.0 128 | RT027_R20291_July2013.fasta,cdifficile,1,1,1,1,10,1,3,5,2.0 129 | RT078_CDM120.fasta,cdifficile,11,5,8,5,11,9,11,8,5.0 130 | RT106_Liv22.fasta,cdifficile,42,1,1,2,1,1,7,1,1.0 131 | ``` 132 | 133 | ``` 134 | $ ls splited_mlst/ 135 | adk.fasta atpA.fasta dxr.fasta glyA.fasta recA.fasta sodA.fasta tpi.fasta 136 | $ cat splited_mlst/adk.fasta 137 | >RT001_BI9.fasta adk 138 | CATATATCAACAGGAGATATATTCAGAAAGAATATAAAAGAGGGAACAGAACTTGGAAAA 139 | AAAGCTAAAGAATACATGGACCAAGGTTTATTAGTACCAGATGAGTTAACTGTAGGTTTA 140 | GTTACTGATAGAATATCTCAAGAAGATTGTAAAAATGGATTTATGTTAGATGGATTTCCA 141 | AGAAATGTAGCACAAGGAGAACATTTAGATATCTTCTTAAAAAATGCTGGTATATCACTA 142 | GATAAAGTTGTCAATATTGAAGTTGATAAGAGTATATTAGTGTCTAGAGCAGTTGGTAGA 143 | AGAATATGTAAGTCTTGTGGAGCTACTTACCATGTTGAGTTTAATCCTCCTAAAGTAGAA 144 | GGTGTATGTGATGTATGCCAAGGAGAATTATATCAAAGAGCTGATGATAATGAAGAAACT 145 | GTATCTAAGAGAATACAAGTTTATCTAGATGAAACTAAGCCTTTAGTAGATTATTATAGC 146 | AAACAAGGTATAATAGCAGAT 147 | ... 
148 | >RT106_Liv22.fasta adk 149 | CATATATCAACAGGAGATATATTCAGAAAGAATATAAAAGAGGGAACAGAACTTGGAAAA 150 | AAAGCTAAAGAATACATGGACCAAGGTTTATTAGTACCAGATGAGTTAACTGTAGGTTTA 151 | GTTACTGATAGAATATCTCAAGAAGATTGTAAAAATGGATTTATGTTAGATGGATTTCCA 152 | AGAAATGTAGCACAAGGAGAACATTTAGATATCTTCTTAAAAAATGCTGGTATATCACTA 153 | GATAAAGTTGTCAATATTGAAGTTGATAAGAGTATATTAGTGTCTAGAGCAGTTGGTAGA 154 | AGAATATGTAAGTCTTGTGGAGCTACTTACCATGTTGAGTTTAATCCTCCTAAAGTAGAA 155 | GGTGTATGTGATGTATGCCAAGGAGAATTATATCAAAGAGCTGATGATAATGAAGAAACT 156 | GTATCTAAGAGAATACAAGTTTATCTAGATGAAACTAAGCCTTTAGTAGATTATTATAGC 157 | AAACAAGGTATAATAGCAGAT 158 | ``` 159 | 160 | ## Custom MLST Database Location 161 | 162 | FastMLST now supports configuring a custom location for the PubMLST database. By default, the tool uses a cache directory at `~/.cache/fastmlst/pubmlst`. However, if you prefer to store the database in an alternate location (for example, on a high-performance drive or in a centralized directory), you can override this default path using the `--db_path` command-line argument. 163 | 164 | ### How It Works 165 | 166 | When the `--db_path` option is provided, FastMLST calls a helper function (`set_pathdb`) that: 167 | - **Overrides the default database path:** The internal global `pathdb` variable is updated to use your specified path. 168 | - **Ensures the custom directory exists:** The directory is automatically created if it does not exist. 169 | - **Uses the custom path for all subsequent operations:** All processes (such as fetching, updating, or reading database files) use the new path. 170 | 171 | ### Usage Example 172 | 173 | To run FastMLST with a custom MLST database directory, simply use the `--db_path` option: 174 | 175 | ```bash 176 | $ fastmlst --db_path /path/to/your/custom/db [other-options] genomes... 
177 | ``` 178 | 179 | For instance, if you want the MLST database to reside in `/data/fastmlst_db`, run: 180 | 181 | ```bash 182 | $ fastmlst --db_path /data/fastmlst_db cdiff_refferences/RT078_CDM120.fasta 183 | ``` 184 | 185 | ### When to Use This Feature 186 | 187 | - **Optimizing I/O Performance:** Place the database on a disk with faster read/write speeds. 188 | - **Managing Disk Usage:** Store the database on a separate partition or drive with more available space. 189 | - **Custom Deployment Setups:** Particularly useful in multi-user or cluster environments where centralized data management is preferred. 190 | 191 | **Note:** Ensure that the directory you specify has proper write permissions. FastMLST will automatically create the directory (and any necessary parent directories) if they do not already exist. 192 | 193 | ## Output symbology 194 | 195 | Symbol | Meaning | Length | Identity 196 | --- | --- | --- | --- 197 | `n` | Exact intact allele | 100% | 100% 198 | `~n` | Novel full length allele similar to n | 100% | ≥ `-pid` 199 | `n?` | Partial match to known allele | ≥ `-cov` | ≥ `-pid` 200 | `-` | Allele missing (or allele containing Ns) | < `-cov` | < `-pid` 201 | `n,m` | Multiple alleles |   |   202 | ## Scoring system 203 | FastMLST uses a scoring system to determine the scheme to be employed similar to that proposed by [Tseemann](https://github.com/tseemann/mlst). The score for a scheme with N alleles is as follows: 204 | 205 | * +100/N points for an exact allele match _e.g._ `1` 206 | * +70/N points for a novel allele match _e.g._ `~1` 207 | * +20/N points for a partial allele match _e.g._ `1?` 208 | * 0 points for a missing allele _e.g._ `-` 209 | # Updating the Schemes 210 | You should **always, always, always keep the PubMLST database updated**. 
Fortunately there is a function to simply update the database: 211 | ``` 212 | $ fastmlst --update-mlst 213 | ``` 214 | You can indicate how many schemes will be downloaded in parallel with `-t` option if you want more download speed. 215 | ``` 216 | $ fastmlst --update-mlst -t 24 217 | ``` 218 | # Complete usage Options 219 | ``` 220 | usage: fastmlst [-h] [-t THREADS] [-v {0,1,2}] [-s SEPARATOR] [-sch SCHEME] [--scheme-list] [-fo FASTAOUTPUT] [-to TABLEOUTPUT] [-cov COVERAGE] [-pid IDENTITY] [--update-mlst] 221 | [-sp SPLITED_OUTPUT] [--fasta2line] [--longheader] [--legacy] [-n NOVEL] [-V] [--db_path DB_PATH] 222 | [genomes ...] 223 | 224 | ⚡️🧬 FastMLST: A multi-core tool for multilocus sequence typing of draft genome assemblies 225 | 226 | positional arguments: 227 | genomes 228 | 229 | options: 230 | -h, --help show this help message and exit 231 | -t THREADS, --threads THREADS 232 | Number of threads to use (default 14) 233 | -v {0,1,2}, --verbose {0,1,2} 234 | Verbose output level choices: [0, 1, 2] 235 | -s SEPARATOR, --separator SEPARATOR 236 | Choose a character to use as a separator (default ",") 237 | -sch SCHEME, --scheme SCHEME 238 | Set a scheme target (I am not dumb, let me choose a scheme by myself!) 239 | --scheme-list Show all schemes supported 240 | -fo FASTAOUTPUT, --fastaoutput FASTAOUTPUT 241 | File name of the concatenated alleles output (default "") 242 | -to TABLEOUTPUT, --tableoutput TABLEOUTPUT 243 | File name of the MLST table output (default STDOUT) 244 | -cov COVERAGE, --coverage COVERAGE 245 | DNA %Cov to report high quality partial allele [?] 
(default 99%) 246 | -pid IDENTITY, --identity IDENTITY 247 | DNA %Identity of full allelle to consider 'similar' [~] (default 95%) 248 | --update-mlst Perform an update of the PubMLST database 249 | -sp SPLITED_OUTPUT, --splited-output SPLITED_OUTPUT 250 | Directory output for splited alleles (default "") 251 | --fasta2line The fasta files will be in fasta2line format 252 | --longheader If --longheader is invoked, the header of FASTA file contain a long description 253 | --legacy If --legacy is invoked, the csv reported contain the gene name and the allele id in the row [adk(1),atpA(4),dxr(7),glyA(1),recA(1),sodA(3),tpi(3)]. This option 254 | is only available when the --scheme is defined 255 | -n NOVEL, --novel NOVEL 256 | File name of the novel alleles 257 | -V, --version Show program's version number and exit 258 | --db_path DB_PATH Custom directory for MLST database (default: ~/.cache/fastmlst/pubmlst) 259 | ``` 260 | # Citation 261 | 262 | Guerrero-Araya E, Muñoz M, Rodríguez C, Paredes-Sabja D. FastMLST: A Multi-core Tool for Multilocus Sequence Typing of Draft Genome Assemblies. Bioinform Biol Insights. 2021 Nov 27;15:11779322211059238. doi: [10.1177/11779322211059238](https://doi.org/10.1177/11779322211059238). PMID: 34866905; PMCID: [PMC8637782](http://www.ncbi.nlm.nih.gov/pmc/articles/pmc8637782/). 263 | -------------------------------------------------------------------------------- /bin/__init__.py: -------------------------------------------------------------------------------- 1 | # This file is intentionally left blank to mark the directory as a package. 
-------------------------------------------------------------------------------- /bin/fastmlst.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | import logging 5 | from sys import exit 6 | from sys import stderr 7 | from sys import stdout 8 | from Bio import SeqIO 9 | from multiprocessing import Pool 10 | from multiprocessing import cpu_count 11 | from fastmlst.update_mlst_kit import pathdb 12 | from fastmlst.update_mlst_kit import load_obj 13 | from fastmlst.update_mlst_kit import necessary_file 14 | from itertools import repeat 15 | from fastmlst.mlst import MLST 16 | from codecs import decode 17 | from tqdm import tqdm # pip3 install tqdm 18 | from pathlib import Path 19 | import pandas as pd 20 | from re import compile 21 | from collections import defaultdict 22 | import multiprocessing 23 | import fastmlst.update_mlst_kit as update_mlst_kit 24 | 25 | def unescaped_str(arg_str): 26 | return decode(str(arg_str), 'unicode_escape') 27 | 28 | 29 | def check_coverage_range(value): 30 | fvalue = float(value) 31 | if fvalue <= 0 or fvalue > 100: 32 | raise argparse.ArgumentTypeError( 33 | 'The coverage velue must to be [0..100]' 34 | ) 35 | return fvalue 36 | 37 | 38 | def check_identity_range(value): 39 | fvalue = float(value) 40 | if fvalue <= 0 or fvalue > 100: 41 | raise argparse.ArgumentTypeError( 42 | 'The identity velue must to be [0..100]' 43 | ) 44 | return fvalue 45 | 46 | 47 | def runMLST(margument): 48 | genome, cov, ident, sep, header, shcheme = margument 49 | return MLST(genome, cov, ident, sep, header, shcheme) 50 | 51 | 52 | def main(): 53 | V = '%(prog)s v0.0.19' 54 | parser = argparse.ArgumentParser( 55 | description='⚡️🧬 FastMLST: A multi-core tool for multilocus sequence typing of draft genome assemblies' 56 | ) 57 | parser.add_argument(type=str, nargs='*', dest='genomes') 58 | parser.add_argument('-t', '--threads', type=int, default=cpu_count(), 59 | 
help='Number of threads to use (default {})'. 60 | format(cpu_count())) 61 | parser.add_argument('-v', '--verbose', type=int, default=0, 62 | choices=[0, 1, 2], 63 | help='Verbose output level choices: [0, 1, 2]') 64 | parser.add_argument('-s', '--separator', type=unescaped_str, default=',', 65 | help='Choose a character to use as a separator' + 66 | ' (default ",")') 67 | parser.add_argument('-sch', '--scheme', type=str, 68 | help='Set a scheme target (I am not dumb, let me choose a scheme by myself!)') 69 | parser.add_argument('--scheme-list', action='store_true', 70 | help='Show all schemes supported') 71 | parser.add_argument('-fo', '--fastaoutput', type=str, default='', 72 | help='File name of the concatenated alleles output' + 73 | ' (default "")') 74 | parser.add_argument('-to', '--tableoutput', type=str, default=stdout, 75 | help='File name of the MLST table output' + 76 | ' (default STDOUT)') 77 | parser.add_argument('-cov', '--coverage', type=check_coverage_range, 78 | default=99, 79 | help='DNA %%Cov to report high quality partial allele [?]' + 80 | ' (default 99%%)') 81 | parser.add_argument('-pid', '--identity', type=check_identity_range, 82 | default=95, 83 | help='DNA %%Identity of full allelle to consider' + 84 | ' \'similar\' [~] (default 95%%)') 85 | parser.add_argument('--update-mlst', action='store_true', 86 | help='Perform an update of the PubMLST database') 87 | parser.add_argument('-sp', '--splited-output', type=str, default='', 88 | help='Directory output for splited alleles' + 89 | ' (default "")') 90 | parser.add_argument('--fasta2line', action='store_true', 91 | help='The fasta files will be in fasta2line format') 92 | parser.add_argument('--longheader', action='store_true', 93 | help='If --longheader is invoked, the header of FASTA' + 94 | ' file contain a long description') 95 | parser.add_argument('--legacy', action='store_true', 96 | help='If --legacy is invoked, the csv reported contain the gene name' + 97 | ' and the allele id in 
the row [adk(1),atpA(4),dxr(7),glyA(1),recA(1),sodA(3),tpi(3)].' + 98 | ' This option is only available when the --scheme is defined') 99 | parser.add_argument('-n', '--novel', type=str, 100 | help='File name of the novel alleles') 101 | parser.add_argument('-V', '--version', action='version', 102 | version=V, help='Show program\'s version number and exit') 103 | parser.add_argument( 104 | '--db_path', 105 | type=str, 106 | default=None, 107 | help='Custom directory for MLST database (default: ~/.cache/fastmlst/pubmlst)' 108 | ) 109 | args = parser.parse_args() 110 | 111 | # If the user provided a custom database path, update it 112 | if args.db_path: 113 | update_mlst_kit.set_pathdb(args.db_path) 114 | 115 | # Verbose? 116 | formatter = logging.Formatter('[%(asctime)s] %(levelname)s@%(name)s: %(message)s') 117 | ch = logging.StreamHandler() 118 | split_namefromcode = compile(r'(?P.+)\((?P~?)(?P\d+)(?P\??)\)') # OMG genename can be alphanumeric 119 | 120 | if args.verbose == 0: 121 | logging.basicConfig(level=logging.WARNING, 122 | format='[%(asctime)s] %(levelname)s@%(name)s: %(message)s', 123 | datefmt='%H:%M:%S') 124 | logger = logging.getLogger('FastMLST') 125 | elif args.verbose == 1: 126 | logging.basicConfig(level=logging.INFO, 127 | format='[%(asctime)s] %(levelname)s@%(name)s: %(message)s', 128 | datefmt='%H:%M:%S') 129 | logger = logging.getLogger('FastMLST') 130 | elif args.verbose == 2: 131 | logging.basicConfig(level=logging.DEBUG, 132 | format='[%(asctime)s] %(levelname)s@%(name)s: %(message)s', 133 | datefmt='%H:%M:%S') 134 | logger = logging.getLogger('FastMLST') 135 | # Check for pubmlst 136 | update_mlst_kit.pathdb.mkdir(exist_ok=True, parents=True) 137 | is_all_files = all((update_mlst_kit.pathdb / f).is_file() for f in necessary_file) 138 | # If update_mlst is true o any necesary file are missing update pubmlst 139 | if args.update_mlst or not is_all_files: 140 | from shutil import rmtree 141 | from fastmlst.update_mlst_kit import update_mlstdb 
142 | rmtree(str(update_mlst_kit.pathdb)) 143 | update_mlstdb(args.threads) 144 | if args.update_mlst: 145 | exit() 146 | if args.scheme_list: 147 | from fastmlst.update_mlst_kit import show_scheme_list 148 | show_scheme_list() 149 | exit() 150 | if not args.genomes: 151 | parser.print_help(stderr) 152 | exit() 153 | # Check if there are a target scheme 154 | if args.scheme != None: 155 | args.scheme = args.scheme.lower() 156 | scheme_dir = update_mlst_kit.pathdb/'schemes' 157 | if args.scheme in [d.name for d in scheme_dir.iterdir()]: 158 | logger.info('Ok my little buddy, i trust your judgment. I will '+ 159 | f'proceed with the search using only the following scheme: {args.scheme}') 160 | else: 161 | logger.error(f'Are you sure that "{args.scheme}" is a supported scheme?') 162 | logger.error('Don\'t worry my little buddy. You are a human '+ 163 | 'after all. I\'ll keep trying to choose the best scheme.') 164 | args.scheme = None 165 | genome_mlst = [] 166 | multipleargs = list(zip(args.genomes, 167 | repeat(args.coverage), 168 | repeat(args.identity), 169 | repeat(args.separator), 170 | repeat(args.longheader), 171 | repeat(args.scheme), 172 | )) 173 | with Pool(args.threads) as p: 174 | for result in tqdm(p.imap(runMLST, multipleargs), 175 | total=len(multipleargs), 176 | desc='Scanning Genomes using {} threads'. 177 | format(args.threads), unit='Genomes', leave=False): 178 | genome_mlst.append(result) 179 | # Show the shcheme list and exit 180 | fastaconcat = [] 181 | fastasplited = defaultdict(list) 182 | fastanovelconcat = [] 183 | str_alleles = '' 184 | dict_alleles = [] 185 | for genome in genome_mlst: 186 | # only export if 187 | # 1. Allele not contain Ns 188 | # 2. No alleles missing 189 | # 3. 
No contamination in genome 190 | # if genome.blastresult\ 191 | # and not genome.descarted\ 192 | # and not genome.contamination\ 193 | # and not genome.allelemissing: 194 | if genome.blastresult: 195 | if not genome.descarted\ 196 | and not genome.contamination\ 197 | and not genome.allelemissing: 198 | fastaconcat.append(genome.concat_alleles) 199 | # splited alleles 200 | if args.splited_output != '': 201 | for allele in genome.name_alleles: 202 | fasta = genome.alleles[allele] 203 | fasta.id = genome.beautiname 204 | fasta.description = f'{allele}({genome.dict_st[allele]})' 205 | if allele not in fastasplited.keys(): 206 | fastasplited[allele].append(fasta) 207 | else: 208 | fastasplited[allele].append(fasta) 209 | 210 | if args.novel and genome.novel_alleles: 211 | for novelallele in genome.novel_alleles: 212 | genenovel = split_namefromcode.search(novelallele) 213 | if genenovel: 214 | gene_name = genenovel.group('gene') 215 | try: 216 | # Debugging output 217 | seq = genome.alleles[gene_name] 218 | except KeyError as e: 219 | print(f"KeyError: {e} - The key '{gene_name}' was not found in genome.alleles.") 220 | continue # Skip this allele and continue with the next 221 | else: 222 | print(f"No match found for novelallele: {novelallele}") 223 | continue 224 | seq.id = novelallele + '@' + genome.beautiname 225 | seq.description = '' 226 | fastanovelconcat.append(seq) 227 | str_alleles += genome.str_st 228 | str_alleles += '\n' 229 | dict_alleles.append(genome.dict_st) 230 | # FastMLSTv0.0.12 by default do not write the fasta file 231 | if args.fasta2line and args.fastaoutput != '': 232 | SeqIO.write(fastaconcat, args.fastaoutput, 'fasta-2line') 233 | if args.novel: 234 | SeqIO.write(fastanovelconcat, args.novel, 'fasta-2line') 235 | elif args.fastaoutput != '': 236 | SeqIO.write(fastaconcat, args.fastaoutput, 'fasta') 237 | if args.novel: 238 | SeqIO.write(fastanovelconcat, args.novel, 'fasta') 239 | # Novel output in version v0.0.14 240 | if args.fasta2line 
and args.splited_output != '': 241 | spout = Path(args.splited_output) 242 | spout.mkdir(exist_ok=True, parents=True) 243 | for gene, fastalist in fastasplited.items(): 244 | SeqIO.write(fastalist, f'{spout.absolute()}/{gene}.fasta', 'fasta-2line') 245 | elif args.splited_output != '': 246 | spout = Path(args.splited_output) 247 | spout.mkdir(exist_ok=True, parents=True) 248 | for gene, fastalist in fastasplited.items(): 249 | SeqIO.write(fastalist, f'{spout.absolute()}/{gene}.fasta', 'fasta') 250 | # output formated 251 | if type(args.tableoutput) == str: 252 | if args.scheme != None: 253 | if args.legacy: 254 | print(str_alleles[:-1], file=open(args.tableoutput, 'w')) 255 | else: 256 | df = pd.DataFrame(dict_alleles) 257 | df.to_csv(f'{args.tableoutput}', index=False, sep=args.separator) 258 | else: 259 | print(str_alleles[:-1], file=open(args.tableoutput, 'w')) 260 | else: 261 | if args.scheme != None: 262 | if args.legacy: 263 | print(str_alleles[:-1], file=args.tableoutput) 264 | else: 265 | df = pd.DataFrame(dict_alleles) 266 | print(df.to_csv(index=False, sep=args.separator)) 267 | else: 268 | print(str_alleles[:-1], file=args.tableoutput) 269 | 270 | if __name__ == '__main__': 271 | multiprocessing.set_start_method('fork') # or 'spawn' or 'forkserver' 272 | main() 273 | -------------------------------------------------------------------------------- /fastmlst/mlst.py: -------------------------------------------------------------------------------- 1 | from operator import itemgetter 2 | import pandas as pd 3 | from fastmlst.update_mlst_kit import pathdb 4 | import logging 5 | from fastmlst.update_mlst_kit import load_obj 6 | from collections import defaultdict 7 | from sys import exit 8 | from Bio import SeqIO 9 | from Bio.SeqRecord import SeqRecord 10 | from gzip import open as gopen 11 | from io import StringIO 12 | from pathlib import Path 13 | import gzip 14 | import bz2 15 | import gc 16 | import subprocess 17 | 18 | magic_dict = { 19 | 
b"\x1f\x8b\x08": (gzip.open, 'rb'), 20 | b"\x42\x5a\x68": (bz2.BZ2File, 'r'), 21 | } 22 | 23 | max_len = max(len(x) for x in magic_dict) 24 | 25 | def open_by_magic(filename): 26 | with open(filename, "rb") as f: 27 | file_start = f.read(max_len) 28 | for magic, (fn, flag) in magic_dict.items(): 29 | if file_start.startswith(magic): 30 | return fn(filename, flag) 31 | return open(filename, 'r') 32 | 33 | 34 | logger = logging.getLogger('mlst') 35 | 36 | # Will be arbitrarily excluded. by Enzo. 37 | excluded_by_default = ['abaumannii#2', 'ecoli#2', 'leptospira#2', 'leptospira#3', 38 | 'mgallisepticum#1', 'pmultocida#1', 'sthermophilus', 39 | 'vcholerae#2'] 40 | 41 | class MLST(object): 42 | def __init__(self, fasta, coverage=75, identity=95, sep=',', 43 | longheader=False, target_scheme=None): 44 | super(MLST, self).__init__() 45 | self.target_scheme = target_scheme 46 | self.longheader = longheader 47 | self.fasta = fasta 48 | self.fasta_opened = open_by_magic(self.fasta).read() 49 | if type(self.fasta_opened) != str: 50 | self.fasta_opened = self.fasta_opened.decode() 51 | self.coverage = coverage / 100.0 52 | self.identity = identity / 100.0 53 | self.sep = sep 54 | self.scheme_number = load_obj(str(pathdb) + '/scheme_number.pkl') 55 | self.beautiname = self.fasta.strip('/').split('/')[-1] 56 | self.blastn_cli = None 57 | # QCflags 58 | self.descarted = False 59 | self.contamination = False 60 | self.allelemissing = False 61 | self.blastresult = False 62 | # QCflags 63 | self.blast = self.make_blast() 64 | if self.blastresult: 65 | self.scheme = None 66 | self.score = None 67 | self.novel_alleles = [] 68 | self.scoring() 69 | self.QCflags() 70 | if not self.allelemissing and\ 71 | not self.novel_alleles and\ 72 | not self.contamination: 73 | self.ST = self.STassignment() 74 | # if novel alleles, is new ST by default 75 | elif not self.allelemissing and\ 76 | self.novel_alleles and\ 77 | not self.contamination: 78 | self.ST = 'new_alleles' 79 | else: 80 | self.ST 
= '-' 81 | self.name_alleles = self.scheme_number[self.scheme] 82 | self.number_alleles = len(self.name_alleles) 83 | self.STnumber = None 84 | self.alleles = None 85 | self.concat_alleles = self.mlstex() 86 | if self.descarted: 87 | # If any allele has Ns or is broken in 2 contigs, do not determine STs 88 | self.ST = '-' 89 | self.str_st, self.dict_st = self.str_allelic_profile() 90 | 91 | # Release Ram! 92 | del self.fasta_opened 93 | del self.scheme_number 94 | del self.blast 95 | gc.collect() 96 | 97 | def __repr__(self,): 98 | return '{}–ST: {}'.format(self.beautiname, self.STnumber) 99 | 100 | def QCflags(self, ): 101 | for locus, value in self.score['scheme'].items(): 102 | if '-' in value: 103 | self.allelemissing = True 104 | # mlst.py must check for contamination in mlstex() 105 | # elif '|' in value: 106 | # # Check if is a duplication of same reported alleles 107 | # if '~' in value: 108 | # self.contamination = True 109 | # else: 110 | # valuelist = value.split('|') 111 | # if len(set(valuelist)) == 1: 112 | # pass 113 | # else: 114 | # self.contamination = True 115 | elif '~' in value or '?' 
in value: 116 | self.novel_alleles.append(f'{locus}({value})') 117 | 118 | 119 | def make_blast(self,): 120 | # Build the BLAST command without using the deprecated Bio.Blast.Application wrappers 121 | cmd = [ 122 | "blastn", 123 | "-db", str(pathdb) + '/mlst.fasta', 124 | "-dust", "no", 125 | "-outfmt", "6 sseqid slen sstrand sstart send length nident gaps qseqid qstart qend", 126 | "-max_target_seqs", "130000", 127 | "-evalue", "1E-20", 128 | "-ungapped" 129 | ] 130 | self.blastn_cli = " ".join(cmd) 131 | logger.debug(self.blastn_cli + ' < ' + self.fasta) 132 | result = subprocess.run(cmd, input=self.fasta_opened, text=True, 133 | stdout=subprocess.PIPE, stderr=subprocess.PIPE) 134 | out, err = result.stdout, result.stderr 135 | if out == '': 136 | logger.warning(f'There is no result for: {self.blastn_cli} < {self.fasta}') 137 | return None 138 | self.blastresult = True 139 | blastfiltred = self.blast_filter(out) 140 | del out 141 | del err 142 | return blastfiltred 143 | 144 | def blast_filter(self, blast_out): 145 | header = ['sseqid', 'slen', 'sstrand', 'sstart', 'send', 'length', 146 | 'nident', 'gaps', 'qseqid', 'qstart', 'qend'] 147 | dfblast = pd.read_csv(StringIO(blast_out), sep='\t', names=header) 148 | toint = ['slen', 'sstart', 'send', 'length', 'nident', 'gaps', 149 | 'qstart', 'qend'] 150 | dfblast['coverage'] = (dfblast.length - dfblast.gaps) / dfblast.slen 151 | dfblast['identity'] = (dfblast.nident - dfblast.gaps) / dfblast.slen # this is a 'global' %identity 152 | dfblast[toint] = dfblast[toint].astype(int) 153 | # Bug Fixed on FastMLSTv0.0.12: The contigs identifier of some assemblers like unicycler is just a number, I opted to standardize all searches using strings 154 | dfblast['qseqid'] = dfblast['qseqid'].astype(str) 155 | # don't even look at the hits below these cov's and identities 156 | dfblast = dfblast.loc[(dfblast['coverage'] <= 1) & (dfblast['coverage'] >= self.coverage) & (dfblast['identity'] >= self.identity)] # insertions can not 
be processed properly yet 157 | # dfblast = dfblast.loc[dfblast['slen'] >= dfblast['length']] # if have an insertion slen < length 158 | if len(dfblast) == 0: 159 | # there is no result 160 | logger.warning(f'There is no result for: {self.blastn_cli} < {self.fasta}') 161 | self.blastresult = False 162 | return None 163 | else: 164 | dfblast = dfblast.join( 165 | dfblast['sseqid'].str.split('.', n=1, expand=True). 166 | rename(columns={0: 'scheme', 1: 'genenumber'})) 167 | dfblast = dfblast.join( 168 | dfblast['genenumber'].str.rsplit('_', n=1, expand=True). 169 | rename(columns={0: 'gene', 1: 'number'})) 170 | dfblast = dfblast.drop(['sseqid', 'genenumber'], axis=1) 171 | dfblast['genome_id'] = self.beautiname 172 | # dfblast.index = dfblast['genome_id'] 173 | dfblast = dfblast[['genome_id', 'scheme', 'gene', 'number', 'slen', 174 | 'sstrand', 'sstart', 'send', 'length', 'nident', 175 | 'gaps', 'coverage', 'identity', 'qseqid', 176 | 'qstart', 'qend']] 177 | if self.target_scheme != None: 178 | dfblast = dfblast.loc[dfblast['scheme'] == self.target_scheme] 179 | else: 180 | dfblast = dfblast[~dfblast['scheme'].isin(excluded_by_default)] # decapitation! 
181 | if len(dfblast) == 0: 182 | # there is no result 183 | logger.warning(f'There is no result for: {self.blastn_cli} < {self.fasta}') 184 | self.blastresult = False 185 | return None 186 | 187 | # dfblast.sort_index(inplace=True) 188 | # Grup by gene and select the best hit (cov=100% high ID) 189 | # Better Timing 190 | # dfblast = dfblast.sort_values(by=['coverage', 'nident', 'gaps' ], 191 | # ascending=[False, False, True]).drop_duplicates(['gene'], keep='first') 192 | 193 | # genegrup = dfblast.groupby('gene') 194 | # blastfiltred_bygene = [] 195 | # for gene, df_group in genegrup: 196 | # df_group.sort_values(by=['coverage', 'nident', 'gaps' ], 197 | # ascending=[False, False, True], inplace=True) 198 | # blastfiltred_bygene.append(df_group.head(5)) 199 | # dfblast = pd.concat(blastfiltred_bygene, ignore_index=True) 200 | # del blastfiltred_bygene 201 | # del genegrup 202 | return dfblast 203 | 204 | 205 | def str_allelic_profile(self, ): 206 | if not isinstance(self.ST, pd.DataFrame): 207 | output = '{0}{3}{1}{3}{2}{3}'.format(self.beautiname, 208 | self.scheme, self.ST, 209 | self.sep) 210 | self.STnumber = self.ST 211 | dictofrows = {'Genome': self.beautiname, 212 | 'Scheme': self.scheme, 213 | 'ST': self.ST} 214 | for i in self.score['scheme'].keys(): 215 | out = '{0}({1}){2}'.format(i, self.score['scheme'][i], 216 | self.sep) 217 | dictofrows[i] = self.score['scheme'][i] 218 | output += out 219 | output = output.strip(self.sep) 220 | else: 221 | output = '{0}{3}{1}{3}{2}{3}'.format(self.beautiname, 222 | self.scheme, 223 | self.ST.index.values[0], 224 | self.sep) 225 | self.STnumber = self.ST.index.values[0] 226 | dictofrows = {'Genome': self.beautiname, 227 | 'Scheme': self.scheme, 228 | 'ST': self.ST.index.values[0]} 229 | for i in self.ST: 230 | out = '{0}({1}){2}'.format(i, self.ST[i].values[0], 231 | self.sep) 232 | dictofrows[i] = self.ST[i].values[0] 233 | output += out 234 | output = output.strip(self.sep) 235 | return (output, dictofrows) 236 | 
237 | def is_context_complete(self, length, start, end): 238 | if start < 0 or end < 0: 239 | return False 240 | elif start > length or end > length: 241 | return False 242 | else: 243 | return True 244 | 245 | def STassignment(self, ): 246 | scheme_dir = str(pathdb) + '/schemes' + '/' + self.scheme 247 | STlist = Path(str(scheme_dir) + '/' + self.scheme + '.txt') 248 | dfSTlist = pd.read_csv(str(STlist), sep='\t', index_col=0) 249 | for key, value in self.score['scheme'].items(): 250 | if '|' in value: 251 | value = list(set(value.split('|'))) 252 | if len(value) == 1: 253 | dfSTlist = dfSTlist.loc[dfSTlist[key] == int(value[0])] 254 | else: 255 | return '-' 256 | else: 257 | dfSTlist = dfSTlist.loc[dfSTlist[key] == int(value)] 258 | if len(dfSTlist) == 1: 259 | return dfSTlist 260 | elif len(dfSTlist) == 0: 261 | return 'new_ST' 262 | else: 263 | logger.error('If you got here, congratulations, ' + 264 | ' you found a place in maintenance STassignment()!') 265 | logger.error(dfSTlist, self.blastn_cli + ' < ' + self.fasta) 266 | logger.error(self.score['scheme']) 267 | 268 | def mlstex(self, ): 269 | fasta_output = dict() 270 | for record in SeqIO.parse(StringIO(self.fasta_opened), 'fasta'): 271 | try: 272 | pd_blast = self.blast.loc[(self.blast['qseqid'] == record.id) & 273 | (self.blast['scheme'] == self.scheme) 274 | ] 275 | except KeyError: 276 | continue 277 | if isinstance(pd_blast, pd.DataFrame): 278 | for row in pd_blast.iterrows(): 279 | if row[1]['number'] not in\ 280 | self.score['scheme'][row[1]['gene']] or\ 281 | row[1]['coverage'] < self.coverage: 282 | continue 283 | if row[1]['sstrand'] == 'plus': 284 | if row[1]['slen'] == row[1]['send']: 285 | # finish well 286 | finishmissing = 0 287 | else: 288 | # is missing some nucleotides 289 | finishmissing = row[1]['slen'] - row[1]['send'] 290 | if row[1]['sstart'] == 1: 291 | # start well 292 | startmissing = 0 293 | else: 294 | # is missing some nucleotides 295 | startmissing = row[1]['sstart'] - 1 296 | 
start = int(row[1]['qstart']) - 1 - startmissing 297 | end = int(row[1]['qend']) + finishmissing 298 | seq = record.seq[start:end] 299 | else: 300 | if row[1]['slen'] == row[1]['sstart']: 301 | # start well 302 | startmissing = 0 303 | else: 304 | # is missing some nucleotides 305 | startmissing = row[1]['slen'] - row[1]['sstart'] 306 | if row[1]['send'] == 1: 307 | # finish well 308 | finishmissing = 0 309 | else: 310 | # is missing some nucleotides 311 | finishmissing = row[1]['send'] - 1 312 | start = int(row[1]['qstart']) - 1 - startmissing 313 | end = int(row[1]['qend']) + finishmissing 314 | seq = record.seq[start:end].reverse_complement() 315 | if seq.count('N') > 0 or not \ 316 | self.is_context_complete(len(record), start, end): 317 | self.descarted = True 318 | continue 319 | identificator = '{}|{}|{}|{}_{}'.format(self.beautiname, 320 | record.id, 321 | row[1]['gene'], 322 | start, end) 323 | record_fasta = SeqRecord(seq, id=identificator) 324 | # BUG: SNP at end of aligment is detected as deletion. Probably 'fixed' in 'Grup by gene and select the best hit' commentary 325 | # Check for equal gene name and diferent sequence (e.g. 
conaminations) 326 | if row[1]['gene'] in fasta_output: 327 | # gene name already in the output 328 | if fasta_output[row[1]['gene']].seq == record_fasta.seq: 329 | # Ok, it is the same allele 330 | pass 331 | else: 332 | self.contamination = True 333 | else: 334 | fasta_output[row[1]['gene']] = record_fasta 335 | del pd_blast 336 | elif isinstance(pd_blast, pd.Series): 337 | logger.error('If you got here, congratulations, ' + 338 | ' you found a place in maintenance mlstex()!') 339 | self.alleles = fasta_output 340 | if self.longheader: 341 | header = self.beautiname + ' ' 342 | concatenatedseq = '' 343 | for genename in self.name_alleles: 344 | if genename in self.alleles.keys(): 345 | header += genename + '_' 346 | concatenatedseq += self.alleles[genename].seq 347 | record_out = SeqRecord(concatenatedseq, id=header.strip('_'), 348 | description='Concatenated Sequences of MLST ' + 349 | 'from ' + self.beautiname) 350 | else: 351 | concatenatedseq = '' 352 | for genename in self.name_alleles: 353 | if genename in self.alleles.keys(): 354 | concatenatedseq += self.alleles[genename].seq 355 | record_out = SeqRecord(concatenatedseq, id=self.beautiname, 356 | description='') 357 | return record_out 358 | 359 | def scoring(self, ): 360 | genome_query = set(self.blast['genome_id'].tolist()) 361 | if len(genome_query) == 1: 362 | genome_query = list(genome_query)[0] 363 | else: 364 | logger.warning('Warning, more than one genome as query ', 365 | genome_query) 366 | exit() 367 | # check completeness, perfect identity, snp identity, 368 | rank_list = defaultdict(dict) 369 | for scheme, group in self.blast.groupby('scheme'): 370 | rank_list[scheme] = defaultdict(dict) 371 | rank_list[scheme]['score'] = 0 372 | rank_list[scheme]['scheme'] = defaultdict() 373 | loci = self.scheme_number[scheme] 374 | N = len(loci) 375 | blast_scheme = self.blast[(self.blast.gene.isin(loci)) & 376 | (self.blast.scheme == scheme)].copy() 377 | blast_scheme.sort_values(by=['coverage', 
'identity', 'length'], 378 | ascending=[False, False, False], inplace=True) 379 | for locus in loci: 380 | row = blast_scheme[blast_scheme.gene == locus] 381 | if len(row) == 0: 382 | # allele missing 383 | rank_list[scheme]['score'] += 0 384 | rank_list[scheme]['scheme'][locus] = '-' 385 | elif len(row) == 1: 386 | # only one allele 387 | # if have an insertion slen < length 388 | # if row['slen'].values[0] < row['length'].values[0]: 389 | # rank_list[scheme]['score'] += 20.0 / N 390 | # rank_list[scheme]['scheme'][locus] = \ 391 | # '{}?'.format(row['number'].values[0]) 392 | if row['coverage'].values[0] == 1 and\ 393 | row['identity'].values[0] == 1: 394 | # perfect match 395 | rank_list[scheme]['score'] += 100.0 / N 396 | rank_list[scheme]['scheme'][locus] = \ 397 | row['number'].values[0] 398 | # BUG: Blast no make a full aligmnent, if a snp aries at the very end, the coverage is not 1 399 | # In the fasta output, this is fixed lookat the input sequence, but this no update the blast table 400 | elif row['coverage'].values[0] == 1 and\ 401 | row['identity'].values[0] >= self.identity and\ 402 | row['slen'].values[0] >= row['length'].values[0]: 403 | # full length partial match 404 | rank_list[scheme]['score'] += 70.0 / N 405 | rank_list[scheme]['scheme'][locus] = \ 406 | '~{}'.format(row['number'].values[0]) 407 | elif row['coverage'].values[0] >= self.coverage and \ 408 | row['identity'].values[0] >= self.identity: 409 | # partial length partial match 410 | rank_list[scheme]['score'] += 20.0 / N 411 | rank_list[scheme]['scheme'][locus] = \ 412 | '{}?'.format(row['number'].values[0]) 413 | else: 414 | rank_list[scheme]['score'] += 0 415 | rank_list[scheme]['scheme'][locus] = '-' 416 | else: 417 | # multiples hits 418 | for index, r in row.iterrows(): 419 | # if r['slen'] < r['length']: 420 | # if locus not in rank_list[scheme]['scheme']: 421 | # rank_list[scheme]['score'] += 20.0 / N 422 | # rank_list[scheme]['scheme'][locus] = \ 423 | # 
'{}?'.format(r['number']) 424 | # else: 425 | # rank_list[scheme]['score'] -= 20.0 / N 426 | # rank_list[scheme]['score'] += 20.0 / N / len(row) 427 | # rank_list[scheme]['scheme'][locus] += \ 428 | # '|' + '{}?'.format(r['number']) 429 | if r['coverage'] == 1 and r['identity'] == 1: 430 | # perfect match 431 | if locus not in rank_list[scheme]['scheme']: 432 | rank_list[scheme]['score'] += 100.0 / N 433 | rank_list[scheme]['scheme'][locus] = \ 434 | r['number'] 435 | else: 436 | rank_list[scheme]['score'] -= 100.0 / N 437 | rank_list[scheme]['score'] += 100.0 / N / len(row) 438 | rank_list[scheme]['scheme'][locus] += \ 439 | '|' + r['number'] 440 | elif r['coverage'] == 1 and\ 441 | r['identity'] >= self.identity and\ 442 | r['slen'] >= r['length']: 443 | # full length partia match 444 | if locus not in rank_list[scheme]['scheme']: 445 | rank_list[scheme]['score'] += 70.0 / N 446 | rank_list[scheme]['scheme'][locus] = \ 447 | '~{}'.format(r['number']) 448 | # else: 449 | # rank_list[scheme]['score'] -= 70.0 / N 450 | # rank_list[scheme]['score'] += 70.0 / N / len(row) 451 | # rank_list[scheme]['scheme'][locus] += \ 452 | # '|' + '~{}'.format(r['number']) 453 | # self.contamination = True 454 | elif r['coverage'] >= self.coverage and\ 455 | r['identity'] >= self.identity: 456 | # partial length partial match 457 | if locus not in rank_list[scheme]['scheme']: 458 | rank_list[scheme]['score'] += 20.0 / N 459 | rank_list[scheme]['scheme'][locus] = \ 460 | '{}?'.format(r['number']) 461 | # else: 462 | # rank_list[scheme]['score'] -= 20.0 / N 463 | # rank_list[scheme]['score'] += 20.0 / N / len(row) 464 | # rank_list[scheme]['scheme'][locus] += \ 465 | # '|' + '{}?'.format(r['number']) 466 | # self.contamination = True 467 | else: 468 | rank_list[scheme]['score'] += 0 469 | if locus not in rank_list[scheme]['scheme']: 470 | rank_list[scheme]['scheme'][locus] = '-' 471 | 472 | sorted_rank_list = sorted(rank_list.items(), 473 | key=lambda x: (x[1]['score']), reverse=True) 
474 | bestscore = sorted_rank_list[0] 475 | self.scheme = bestscore[0] 476 | self.score = bestscore[1] 477 | 478 | 479 | def main(): 480 | print('Get out of here you little human!') 481 | 482 | if __name__ == '__main__': 483 | main() 484 | -------------------------------------------------------------------------------- /fastmlst/update_mlst_kit.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import xml.etree.ElementTree as ET 3 | from Bio import SeqIO 4 | from pickle import dump 5 | from pickle import load 6 | from datetime import date 7 | from subprocess import PIPE 8 | from subprocess import Popen 9 | from collections import defaultdict 10 | from multiprocessing import cpu_count 11 | from multiprocessing.pool import ThreadPool 12 | from tqdm import tqdm # pip3 install tqdm 13 | from urllib.request import urlretrieve 14 | from pathlib import Path 15 | import os 16 | 17 | logger = logging.getLogger('update_mlst') 18 | 19 | # NEW: Define a function to override the default database location 20 | def set_pathdb(custom_path): 21 | """ 22 | Override the default PubMLST database directory. 23 | 24 | Parameters: 25 | custom_path (str): Custom path for the MLST database directory. 
def save_obj(obj, name):
    """Pickle *obj* into the file *name* using pickle protocol 2."""
    with Path(name).open('wb') as handle:
        dump(obj, handle, protocol=2)


def load_obj(name):
    """Return the object unpickled from the file *name*."""
    # NOTE: unpickling can execute arbitrary code; only load files written
    # by save_obj() from a trusted location.
    with Path(name).open('rb') as handle:
        restored = load(handle)
    return restored
61 | if len(text.split(' ')) == 1: 62 | # is a genus scheme 63 | if '#' in text: 64 | # sh#t, there is multiples schemes for a sigle specie :( 65 | components = text.split('#') 66 | genus = components[0] 67 | schemenumber = components[-1] 68 | codename = f'{genus}#{schemenumber}' 69 | else: 70 | codename = f'{text}' 71 | elif 'spp' in text: 72 | # is a genus scheme 73 | if '#' in text: 74 | # sh#t, there is multiples schemes for a sigle specie :( 75 | components = text.split(' ') 76 | genus = components[0] 77 | schemenumber = text.split('#')[-1] 78 | codename = f'{genus}#{schemenumber}' 79 | else: 80 | components = text.split(' ') 81 | genus = components[0] 82 | codename = f'{genus}' 83 | elif len(text.split(' ')) == 2: 84 | # is a clasic genus_species like scheme 85 | if '#' in text: 86 | # sh#t, there is multiples schemes for a sigle specie :( 87 | components = text.split(' ') 88 | genus = components[0][0] 89 | specie = components[1].split('#')[0] 90 | schemenumber = text.split('#')[-1] 91 | codename = f'{genus}{specie}#{schemenumber}' 92 | else: 93 | components = text.split(' ') 94 | genus = components[0][0] 95 | specie = components[1] 96 | codename = f'{genus}{specie}' 97 | elif len(text.split(' ')) > 2: 98 | # is a genus_species_etc... 
def update_mlstdb(threads):
    """Download all PubMLST schemes and rebuild the local BLAST database.

    Steps: fetch dbases.xml into pathdb, download every scheme with a
    thread pool, save the scheme -> loci mapping as scheme_number.pkl,
    merge all '*.tfa' files into mlst.fasta (record ids prefixed with
    '<scheme>.') and index it with makeblastdb.

    Parameters:
        threads (int): Number of download threads.
    """
    pathdb.mkdir(exist_ok=True, parents=True)
    xml_path = str(pathdb) + '/dbases.xml'
    urlretrieve('https://pubmlst.org/data/dbases.xml', xml_path)
    logger.info('https://pubmlst.org/data/dbases.xml downloaded to {}'
                .format(xml_path))
    datadb = parseXML(xml_path)
    logger.info('Starting download of all schemes')
    # A single pool, closed by the context manager. The results are only
    # consumed to drive the downloads and the progress bar.
    with ThreadPool(threads) as pool:
        progress = tqdm(pool.imap(download_fasta, datadb.items()),
                        total=len(datadb),
                        desc='Downloading Schemes using {} threads'.
                        format(threads), unit='Schemes', leave=True)
        for _status in progress:
            pass
    logger.info('Schemes were downloaded')
    scheme_number = defaultdict()
    for species, data in datadb.items():
        # data[0] is skipped; the locus name is taken from each later URL.
        scheme_number[species] = [genes.strip('/').split('/')[-2].
                                  split('.')[0] for genes in data[1:]]
    save_obj(scheme_number, str(pathdb) + '/scheme_number.pkl')
    logger.info('Schemes object was created in {}'.format(str(pathdb) +
                                                          '/scheme_number.pkl')
                )
    allfasta = []
    for fasta in Path(str(pathdb) + '/schemes').glob('*/*.tfa'):
        # The scheme is the directory holding the allele file.
        scheme = fasta.parent.name
        for record in SeqIO.parse(str(fasta), 'fasta'):
            record.id = '{}.{}'.format(scheme, record.id)
            record.description = ''
            allfasta.append(record)
    db_fasta = str(pathdb) + '/' + 'mlst.fasta'
    SeqIO.write(allfasta, db_fasta, 'fasta')
    # Argument list instead of a shell string: the database path may be
    # user supplied and may contain spaces or shell metacharacters.
    blastdb_cmd = ['makeblastdb', '-hash_index', '-in', db_fasta,
                   '-dbtype', 'nucl',
                   '-title',
                   'PubMLST_{}'.format(date.today().strftime('%d%m%y')),
                   '-parse_seqids']
    try:
        DB_process = Popen(blastdb_cmd, stdout=PIPE, stderr=PIPE)
    except OSError as error:
        logger.error('makeblastdb could not be started: %s', error)
        return
    # communicate() drains both pipes; wait() with PIPE can block forever
    # once the child fills a pipe buffer.
    _out, db_err = DB_process.communicate()
    if DB_process.returncode != 0:
        logger.error('makeblastdb failed with exit code %s: %s',
                     DB_process.returncode,
                     db_err.decode(errors='replace').strip())
        return
    logger.info('BLASTdb was created using pubmlst data')
    logger.info('Update PubMLST Complete')
from scholarly import scholarly
import requests
import re

# Look the article up on Google Scholar
search_query = scholarly.search_pubs('FastMLST: A Multi-core Tool for Multilocus Sequence Typing of Draft Genome Assemblies')
article = next(search_query, None)
if article is None:
    # Stop with a clear message instead of an unhandled StopIteration and
    # leave README.md untouched.
    raise SystemExit('No Google Scholar result found; README.md was not modified')

citas = article['num_citations']

# Build the badge URL carrying the citation count
badge_url = f"https://img.shields.io/badge/citations-{citas}-blue"

# Read README.md (explicit encoding, independent of the locale)
with open("README.md", "r", encoding="utf-8") as f:
    readme_content = f.read()

# Replace the citations badge in README.md. `[^)]*` stops at the first
# closing parenthesis, so a link wrapped around the badge or other badges on
# the same line are preserved (a greedy `.*` would swallow them).
new_readme_content = re.sub(r'!\[Citations\]\([^)]*\)', f"![Citations]({badge_url})", readme_content)

# Write the updated content back to README.md
with open("README.md", "w", encoding="utf-8") as f:
    f.write(new_readme_content)
PubMLST typing schemes', 6 | url='https://github.com/EnzoAndree/FastMLST', 7 | author='Enzo Guerrero-Araya', 8 | author_email='biologoenzo@gmail.com', 9 | license='GPLv3', 10 | packages=['fastmlst', 'bin'], 11 | install_requires=['tqdm', 12 | 'pandas', 13 | 'biopython'], 14 | entry_points={ 15 | 'console_scripts': [ 16 | 'fastmlst = bin.fastmlst:main' 17 | ] 18 | }, 19 | zip_safe=False) 20 | --------------------------------------------------------------------------------