├── pymlst ├── cla │ ├── __init__.py │ ├── commands │ │ ├── __init__.py │ │ ├── info.py │ │ ├── remove.py │ │ ├── search.py │ │ ├── create.py │ │ ├── search2.py │ │ └── import.py │ └── model.py ├── common │ ├── __init__.py │ ├── commands │ │ ├── __init__.py │ │ └── configure.py │ ├── flag.py │ ├── exceptions.py │ ├── mafft.py │ ├── blat.py │ ├── utils.py │ ├── kma.py │ ├── psl.py │ └── web.py ├── pytyper │ ├── __init__.py │ ├── commands │ │ ├── __init__.py │ │ └── search.py │ ├── method.py │ ├── url.py │ └── model.py ├── wg │ ├── commands │ │ ├── __init__.py │ │ ├── stats.py │ │ ├── recombination.py │ │ ├── gene.py │ │ ├── distance.py │ │ ├── subgraph.py │ │ ├── mlst.py │ │ ├── strain.py │ │ ├── add.py │ │ ├── msa.py │ │ ├── sequence.py │ │ ├── add2.py │ │ ├── remove.py │ │ ├── create.py │ │ └── import.py │ ├── __init__.py │ ├── model.py │ └── extractors.py ├── data │ ├── pytyper │ │ ├── clmt.txt │ │ ├── fimh.fna │ │ ├── spa.fna │ │ └── clmt.fna │ └── alembic │ │ ├── cla │ │ ├── script.py.mako │ │ ├── versions │ │ │ ├── c0f871a99d96_add_database_infos.py │ │ │ └── 21efe503d07d_initial.py │ │ └── env.py │ │ ├── wg │ │ ├── script.py.mako │ │ ├── versions │ │ │ ├── a793f8f3fd83_add_database_infos.py │ │ │ └── 52ae99cb5f33_initial.py │ │ └── env.py │ │ ├── pytyper │ │ ├── script.py.mako │ │ ├── versions │ │ │ └── 1f96d027f4aa_initial.py │ │ └── env.py │ │ └── alembic.ini ├── version.py ├── __init__.py ├── config.py └── cmd.py ├── setup.cfg ├── environment.yml ├── docs ├── source │ ├── logo.png │ ├── documentation │ │ ├── cgmlst.png │ │ ├── cgmlst │ │ │ ├── subgraph.png │ │ │ ├── check │ │ │ ├── export_seq.rst │ │ │ ├── add.rst │ │ │ ├── other_analysis.rst │ │ │ ├── export_res.rst │ │ │ ├── initialise.rst │ │ │ └── check.rst │ │ ├── pytyper.rst │ │ ├── clamlst.rst │ │ ├── cgmlst.rst │ │ ├── installation.rst │ │ ├── clamlst │ │ │ ├── search.rst │ │ │ └── initialise.rst │ │ └── pytyper │ │ │ └── search.rst │ ├── development.rst │ ├── requirements.txt │ ├── api.rst │ ├── 
development │ │ ├── getting_started.rst │ │ ├── docs.rst │ │ └── make.rst │ ├── index.rst │ └── conf.py ├── Makefile └── make.bat ├── pytest.ini ├── complete.sh ├── requirements.txt ├── .readthedocs.yaml ├── MANIFEST.in ├── LICENSE ├── .github └── workflows │ └── python-publish.yml ├── Makefile ├── .gitignore ├── README.md ├── setup.py ├── tests ├── test_cla.py ├── test_typer.py └── test_wg.py └── rcfile.rc /pymlst/cla/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pymlst/common/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pymlst/cla/commands/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pymlst/pytyper/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /pymlst/wg/commands/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pymlst/common/commands/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pymlst/pytyper/commands/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README 3 | 
-------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: pymlst 2 | dependencies: 3 | - python=3.7 4 | -------------------------------------------------------------------------------- /docs/source/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bvalot/pyMLST/HEAD/docs/source/logo.png -------------------------------------------------------------------------------- /pymlst/wg/__init__.py: -------------------------------------------------------------------------------- 1 | """A module offering tools to work with Whole Genome MLST databases.""" 2 | -------------------------------------------------------------------------------- /pymlst/pytyper/method.py: -------------------------------------------------------------------------------- 1 | # Different typing static variable 2 | FIM='fim' 3 | SPA='spa' 4 | CLMT='clmt' 5 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | filterwarnings = 3 | ignore::DeprecationWarning 4 | ignore::UserWarning 5 | -------------------------------------------------------------------------------- /docs/source/documentation/cgmlst.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bvalot/pyMLST/HEAD/docs/source/documentation/cgmlst.png -------------------------------------------------------------------------------- /docs/source/documentation/cgmlst/subgraph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bvalot/pyMLST/HEAD/docs/source/documentation/cgmlst/subgraph.png -------------------------------------------------------------------------------- 
# Bash completion handler for the `wgMLST` command.
#
# Reads the COMP_WORDS array supplied by bash and fills COMPREPLY with the
# sub-command names that start with the word being typed. Only the first
# argument after the command name is completed.
_wgmlst_completions()
{
  # COMP_WORDS holds the command plus every argument typed so far; anything
  # other than "command + one partial word" is left without suggestions.
  if [ "${#COMP_WORDS[@]}" != "2" ]; then
    return
  fi

  # "--" ends compgen's own option parsing, so a partial word that starts
  # with "-" is treated as the prefix to match and not as a compgen flag.
  COMPREPLY=($(compgen -W "add_strain create_db" -- "${COMP_WORDS[1]}"))
}
| I|II,-,-,+,- 13 | III|IV|V,-,-,-,- 14 | U,-,-,-,+ 15 | U,-,-,+,+ 16 | U,+,-,+,+ 17 | U,+,+,+,+ 18 | -------------------------------------------------------------------------------- /pymlst/version.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | This module contains project version information. 6 | 7 | .. currentmodule:: pymlst.version 8 | .. moduleauthor:: benoit_valot 9 | """ 10 | 11 | __version__ = "2.2.2" #: the working version 12 | __release__ = "2.2" #: the release version 13 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | build: 4 | os: "ubuntu-22.04" 5 | tools: 6 | python: "3.9" 7 | 8 | # Build from the docs/ directory with Sphinx 9 | sphinx: 10 | configuration: docs/source/conf.py 11 | 12 | # Explicitly set the version of Python and its requirements 13 | python: 14 | install: 15 | - requirements: docs/source/requirements.txt 16 | -------------------------------------------------------------------------------- /docs/source/development.rst: -------------------------------------------------------------------------------- 1 | .. _development: 2 | 3 | *********** 4 | Development 5 | *********** 6 | 7 | This section describes how to configure your environment for 8 | development, details the make file option, and how to build 9 | documentation. 10 | 11 | .. toctree:: 12 | :maxdepth: 1 13 | :caption: Table of contents: 14 | 15 | development/getting_started 16 | development/make 17 | development/docs 18 | -------------------------------------------------------------------------------- /pymlst/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | python Mlst Local Search Tool 6 | 7 | .. 
currentmodule:: pymlst 8 | .. moduleauthor:: benoit_valot 9 | """ 10 | 11 | from . import config 12 | from .version import __version__, __release__ # noqa 13 | from .wg.core import open_wg 14 | from .cla.core import open_cla 15 | from .pytyper.core import open_typer 16 | -------------------------------------------------------------------------------- /pymlst/data/pytyper/fimh.fna: -------------------------------------------------------------------------------- 1 | >fimh fimH1 2 | TTCGCCTGTAAAACCGCCAATGGTACTGCTATCCCTATTGGCGGTGGCAGCGCCAATGTTTATGTAAACC 3 | TTGCGCCTGCCGTGAATGTGGGGCAAAACCTGGTCGTGGATCTTTCGACGCAAATCTTTTGCCATAACGA 4 | TTACCCGGAAACCATTACAGACTATGTCACACTGCAACGAGGTTCGGCTTATGGCGGCGTGTTATCTAGT 5 | TTTTCCGGGACCGTAAAATATAATGGCAGTAGCTATCCTTTCCCTACTACCAGCGAAACGCCGCGGGTTG 6 | TTTATAATTCGAGAACGGATAAGCCGTGGCCGGTGGCGCTTTATTTGACGCCTGTGAGCAGTGCTGGCGG 7 | GGTGGCGATTAAAGCTGGTTCATTAATTGCCGTGCTTATTTTGCGACAGACCAACAACTATAACAGCGAT 8 | GATTTTCAGTTTGTGTGGAATATTTACGCCAATAATGATGTGGTGGTGCCCACTGGCGGCTGTGATGTT -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include pymlst/data/alembic/alembic.ini 2 | include pymlst/data/alembic/wg/env.py 3 | include pymlst/data/alembic/wg/script.py.mako 4 | include pymlst/data/alembic/wg/versions/*.py 5 | include pymlst/data/alembic/cla/env.py 6 | include pymlst/data/alembic/cla/script.py.mako 7 | include pymlst/data/alembic/cla/versions/*.py 8 | include pymlst/data/alembic/pytyper/env.py 9 | include pymlst/data/alembic/pytyper/script.py.mako 10 | include pymlst/data/alembic/pytyper/versions/*.py 11 | include pymlst/data/pytyper/*fna 12 | include pymlst/data/pytyper/*txt 13 | -------------------------------------------------------------------------------- /pymlst/pytyper/model.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy import MetaData, Column, Table, Integer, 
Text, ForeignKey, Index 2 | 3 | metadata = MetaData() 4 | 5 | typerSeq = Table("typerSeq", metadata, 6 | Column("id", Integer, primary_key=True), 7 | Column("sequence", Text, unique=True), 8 | Column("typing", Text), 9 | Column("allele", Text)) 10 | 11 | typerSt = Table("typerSt", metadata, 12 | Column("id", Integer, primary_key=True), 13 | Column("typing", Text), 14 | Column("st", Text), 15 | Column("allele", Text)) 16 | 17 | 18 | -------------------------------------------------------------------------------- /docs/source/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx==3.4.3 2 | sphinxcontrib-applehelp<=1.0.4 3 | sphinxcontrib-devhelp<=1.0.2 4 | sphinxcontrib-htmlhelp<=2.0.1 5 | sphinxcontrib-jsmath<=1.0.1 6 | sphinxcontrib-qthelp<=1.0.3 7 | sphinxcontrib-serializinghtml<=1.1.5 8 | sphinx_rtd_theme==0.5.1 9 | jinja2==3.0.3 10 | docutils==0.16 11 | networkx>=2.5 12 | biopython>=1.78 13 | click<=7.1 14 | pytest>=6.2 15 | pytest-cov>=2.10 16 | sqlalchemy>=1.4,<2 17 | networkx>=2.5 18 | decorator>=4.4 19 | requests>=2.23 20 | pandas>=1.2 21 | numpy>=1.20 22 | beautifulsoup4>=4.9 23 | questionary>=1.9 24 | setuptools>=44.0 25 | alembic>=1.6 26 | GitPython>=3.1 27 | -------------------------------------------------------------------------------- /docs/source/documentation/pytyper.rst: -------------------------------------------------------------------------------- 1 | .. _pytyper: 2 | 3 | ******************** 4 | Other Typing Methods 5 | ******************** 6 | 7 | Other typing methods are available using a series of 8 | Python scripts described below. 9 | 10 | 11 | .. toctree:: 12 | :maxdepth: 1 13 | 14 | pytyper/search 15 | 16 | All available commands can be listed using the help function: 17 | 18 | .. code-block:: bash 19 | 20 | pyTyper --help 21 | 22 | Usage: pyTyper [OPTIONS] COMMAND [ARGS]... 23 | 24 | Other typing commands. 
25 | 26 | Commands: 27 | search Searches strain type using specified METHOD for an assembly... 28 | -------------------------------------------------------------------------------- /pymlst/data/alembic/cla/script.py.mako: -------------------------------------------------------------------------------- 1 | """${message} 2 | 3 | Revision ID: ${up_revision} 4 | Revises: ${down_revision | comma,n} 5 | Create Date: ${create_date} 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | ${imports if imports else ""} 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = ${repr(up_revision)} 14 | down_revision = ${repr(down_revision)} 15 | branch_labels = ${repr(branch_labels)} 16 | depends_on = ${repr(depends_on)} 17 | 18 | 19 | def upgrade(): 20 | ${upgrades if upgrades else "pass"} 21 | 22 | 23 | def downgrade(): 24 | ${downgrades if downgrades else "pass"} 25 | -------------------------------------------------------------------------------- /pymlst/data/alembic/wg/script.py.mako: -------------------------------------------------------------------------------- 1 | """${message} 2 | 3 | Revision ID: ${up_revision} 4 | Revises: ${down_revision | comma,n} 5 | Create Date: ${create_date} 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | ${imports if imports else ""} 11 | 12 | # revision identifiers, used by Alembic. 
@click.command(name='stats')
@click.argument('database', type=click.Path(exists=True))
def cli(database, **kwargs):
    """Extracts stats from a wgMLST DATABASE."""
    # The docstring above doubles as the command's --help text, so it is
    # kept short and user-facing; implementation notes live in comments.
    #
    # database: path to an existing wgMLST database (click checks existence).
    # kwargs:   not read anywhere in this body.
    try:
        # The database handle is a context manager, so it is released even
        # when the extraction raises.
        with pymlst.open_wg(os.path.abspath(database)) as mlst:
            mlst.extract(StatsExtractor())
    except exceptions.PyMLSTError as err:
        # Convert library errors into a click error, chaining the original
        # exception so the root cause stays attached to the traceback.
        raise click.ClickException(str(err)) from err
/docs/source/documentation/cgmlst/check: -------------------------------------------------------------------------------- 1 | .. _cgmlst_add: 2 | 3 | .. toctree:: 4 | :glob: 5 | 6 | =========================== 7 | Add strains to the database 8 | =========================== 9 | 10 | Next, you need to add your strains iteratively to the database. 11 | A draft genome can be used (we recommend using SPAdes for assembly). 12 | You can also add a reference genome for comparison. 13 | 14 | 15 | Get the Source 16 | ============== 17 | 18 | The source code for the `PyMLST` project lives at 19 | `github <https://github.com/bvalot/pyMLST>`_. 20 | You can use `git clone` to get it. 21 | 22 | .. code-block:: bash 23 | 24 | git clone https://github.com/bvalot/pyMLST.git 25 | 26 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Copyright (c) 2021, Benoit Valot 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | (at your option) any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see <https://www.gnu.org/licenses/>. -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 
# NOTE(review): the --output help text mentions an "ST search result", which
# looks copied from the search command -- confirm the intended wording.
@click.command(name='info')
@click.option('--output', '-o',
              type=click.File('w'),
              help='Writes ST search result to (default:stdout).')
@click.argument('database',
                type=click.Path(exists=True))
def cli(database, **kwargs):
    """Output the information about a classical MLST DATABASE"""
    # The docstring above doubles as the command's --help text.
    #
    # database: path to the classical MLST database. This command only reads
    #           from it, so the path must already exist; a mistyped path is
    #           now rejected by click before the database is opened.
    # kwargs:   remaining click options (here: output), filtered through
    #           utils.clean_kwargs before being forwarded.
    try:
        with pymlst.open_cla(os.path.abspath(database)) as mlst:
            mlst.get_infos(**utils.clean_kwargs(kwargs))
    except exceptions.PyMLSTError as err:
        # Surface library errors as click errors and keep the original
        # exception chained as the explicit cause.
        raise click.ClickException(str(err)) from err
@click.command(name='recombination')
@click.option('--output', '-o',
              type=click.File('w'),
              help='Output number of variations by genes (default:stdout).')
@click.argument('genes',
                type=click.File('r'))
@click.argument('alignment',
                type=click.File('r'))
def cli(genes, alignment, **kwargs):
    """Searches potential gene recombinations from wgMLST database export."""
    # The docstring above doubles as the command's --help text.
    #
    # genes, alignment: files opened for reading by click.
    # kwargs:           remaining click options (here: output), filtered
    #                   through utils.clean_kwargs before being forwarded.
    try:
        core.find_recombination(genes, alignment, **utils.clean_kwargs(kwargs))
    except exceptions.PyMLSTError as err:
        # Surface library errors as click errors and keep the original
        # exception chained as the explicit cause.
        raise click.ClickException(str(err)) from err
def upgrade():
    """Add the nullable string columns source, species and version to mlst_type."""
    for column_name in ('source', 'species', 'version'):
        op.add_column('mlst_type',
                      sa.Column(column_name, sa.String(), nullable=True))


def downgrade():
    """Drop the columns created by upgrade(), in the order they were added."""
    for column_name in ('source', 'species', 'version'):
        op.drop_column('mlst_type', column_name)
@click.command(name='gene', cls=TableExtractorCommand)
@click.option('--output', '-o',
              type=click.File('w'),
              help='Export GENE list to (default=stdout).')
@click.argument('database', type=click.Path(exists=True))
def cli(database, **kwargs):
    """Extracts a list of genes from a wgMLST DATABASE."""
    # The docstring above doubles as the command's --help text.
    #
    # database: path to an existing wgMLST database (click checks existence).
    # kwargs:   every other click option; utils.get_output splits the cleaned
    #           options into the extractor's arguments and the arguments
    #           passed to mlst.extract itself.
    tab_kwargs, out_kwargs = utils.get_output(utils.clean_kwargs(kwargs))

    try:
        with pymlst.open_wg(os.path.abspath(database)) as mlst:
            mlst.extract(GeneExtractor(**tab_kwargs), **out_kwargs)
    except exceptions.PyMLSTError as err:
        # Surface library errors as click errors and keep the original
        # exception chained as the explicit cause.
        raise click.ClickException(str(err)) from err
| @click.argument('database', type=click.Path(exists=True)) 16 | def cli(database, **kwargs): 17 | """Extracts a distance matrix from a wgMLST DATABASE.""" 18 | 19 | tab_kwargs,out_kwargs = utils.get_output(utils.clean_kwargs(kwargs)) 20 | 21 | try: 22 | 23 | with pymlst.open_wg(os.path.abspath(database)) as mlst: 24 | mlst.extract(DistanceExtractor(**tab_kwargs), **out_kwargs) 25 | 26 | except exceptions.PyMLSTError as err: 27 | raise click.ClickException(str(err)) 28 | -------------------------------------------------------------------------------- /docs/source/api.rst: -------------------------------------------------------------------------------- 1 | .. _fasta: https://en.wikipedia.org/wiki/FASTA_format 2 | .. _fastq: https://en.wikipedia.org/wiki/FASTQ_format 3 | .. _BLAT: http://genome.ucsc.edu/cgi-bin/hgBlat 4 | .. _KMA: https://bitbucket.org/genomicepidemiology/kma/src/master/ 5 | .. _BWA: https://bio-bwa.sourceforge.net/ 6 | 7 | 8 | .. _api: 9 | 10 | .. toctree:: 11 | :glob: 12 | 13 | ================= 14 | API Documentation 15 | ================= 16 | 17 | .. automodule:: pymlst 18 | 19 | Whole Genome MLST 20 | ----------------- 21 | 22 | .. automodule:: pymlst.wg.core 23 | :members: 24 | :member-order: bysource 25 | 26 | .. automodule:: pymlst.wg.extractors 27 | :members: 28 | :member-order: bysource 29 | 30 | Classical MLST 31 | ----------------- 32 | 33 | .. automodule:: pymlst.cla.core 34 | :members: 35 | :member-order: bysource 36 | 37 | Other Typing 38 | ----------------- 39 | 40 | .. 
# SQLAlchemy metadata registry that the three tables below are attached to.
metadata = MetaData()

# One row per stored sequence; the UNIQUE constraint on ``sequence`` prevents
# the same sequence text from being inserted twice.
sequences = Table('sequences', metadata,
                  Column('id', Integer, primary_key=True),
                  Column('sequence', Text, unique=True))

# Associates a ``souche`` (French for "strain") and a ``gene`` with a row of
# ``sequences`` through the ``seqid`` foreign key. Each column is indexed on
# its own, plus one composite index over all three.
mlst = Table('mlst', metadata,
             Column('id', Integer, primary_key=True),
             Column('souche', Text),
             Column('gene', Text),
             Column('seqid', Integer, ForeignKey(sequences.c.id)),
             Index('ix_souche', 'souche'),
             Index('ix_gene', 'gene'),
             Index('ix_seqid', 'seqid'),
             # NOTE(review): the index name reads souche/gene/seqid, but the
             # indexed column order is gene, souche, seqid -- confirm which
             # one is intended.
             Index('ix_souche_gene_seqid', 'gene', 'souche', 'seqid'))

# Free-text descriptive columns about the database; no primary key is declared.
mlst_type = Table('mlst_type', metadata,
                  Column('name', Text),
                  Column('source', Text),
                  Column('species', Text),
                  Column('version', Text))
def upgrade():
    """Add the nullable string columns source, species, mlst and version to mlst_type."""
    for column_name in ('source', 'species', 'mlst', 'version'):
        op.add_column('mlst_type',
                      sa.Column(column_name, sa.String(), nullable=True))


def downgrade():
    """Drop the columns created by upgrade(), in the order they were added."""
    for column_name in ('source', 'species', 'mlst', 'version'):
        op.drop_column('mlst_type', column_name)
All available commands can be listed using the help function:
@click.command(name='mlst', cls=TableExtractorCommand)
@click.option('--form', '-f',
              type=click.Choice(["default", "grapetree"]),
              help='Specify format of output')
@click.option('--output', '-o',
              type=click.File('w'),
              help='Export strain list to (default=stdout).')
@click.argument('database', type=click.Path(exists=True))
def cli(database, **kwargs):
    """Extracts an MLST table from a wgMLST DATABASE."""
    # Split the cleaned options into extractor arguments and output arguments.
    cleaned_options = utils.clean_kwargs(kwargs)
    extractor_options, output_options = utils.get_output(cleaned_options)
    db_path = os.path.abspath(database)

    try:
        with pymlst.open_wg(db_path) as wg_db:
            extractor = MlstExtractor(**extractor_options)
            wg_db.extract(extractor, **output_options)
    except exceptions.PyMLSTError as failure:
        # Report library errors as regular click error messages.
        raise click.ClickException(str(failure))
@click.command(name='add')
@click.option('--strain', '-s',
              type=click.STRING,
              help='Name of the strain (default:genome name).')
@click.option('--identity', '-i',
              type=click.FLOAT,
              help='Minimum identity to search gene (default=0.95).')
@click.option('--coverage', '-c',
              type=click.FLOAT,
              help='Minimum coverage to search gene (default=0.9).')
@click.argument('database',
                type=click.Path(exists=True))
@click.argument('genome',
                type=click.File("r"))
def cli(genome, database, **kwargs):
    """Adds a strain GENOME to the wgMLST DATABASE."""
    db_path = os.path.abspath(database)

    try:
        with pymlst.open_wg(db_path) as wg_db:
            add_options = utils.clean_kwargs(kwargs)
            wg_db.add_strain(genome, **add_options)
    except exceptions.PyMLSTError as failure:
        # Report library errors as regular click error messages.
        raise click.ClickException(str(failure))
certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 8 | 9 | name: Upload Python Package 10 | 11 | on: 12 | release: 13 | types: [published] 14 | 15 | permissions: 16 | contents: read 17 | 18 | jobs: 19 | deploy: 20 | 21 | runs-on: ubuntu-latest 22 | 23 | steps: 24 | - uses: actions/checkout@v3 25 | - name: Set up Python 26 | uses: actions/setup-python@v3 27 | with: 28 | python-version: '3.11' 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install --upgrade pip 32 | pip install build 33 | - name: Build package 34 | run: python -m build 35 | - name: Publish package 36 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 37 | with: 38 | user: __token__ 39 | password: ${{ secrets.PYPI_API_TOKEN }} 40 | -------------------------------------------------------------------------------- /pymlst/wg/commands/msa.py: -------------------------------------------------------------------------------- 1 | """Multiple Sequence Alignment CLI command file.""" 2 | 3 | import os 4 | 5 | import click 6 | 7 | import pymlst 8 | from pymlst.common import utils, exceptions 9 | from pymlst.wg.extractors import MsaExtractor 10 | 11 | 12 | @click.command(name='msa') 13 | @click.option('--output', '-o', 14 | type=click.File('w'), 15 | help='Output result in fasta format (default:stdout).') 16 | @click.option('--file', '-f', 17 | type=click.File('r'), 18 | help='file containing list of coregenes to extract (default:all coregenes).') 19 | @click.option('--realign', '-r', 20 | is_flag=True, 21 | help='Realigns genes with same length (Default:No).') 22 | @click.argument('database', 23 | type=click.Path(exists=True)) 24 | def cli(database, **kwargs): 25 | """Computes Multiple Sequence Alignment from a wgMLST DATABASE.""" 26 | 27 | seq_kwargs, out_kwargs = utils.get_output(utils.clean_kwargs(kwargs)) 28 | 29 | try: 30 | 31 | with 
@click.command(name='sequence')
@click.option('--output', '-o',
              type=click.File('w'),
              help='Output result in fasta format (default:stdout).')
@click.option('--file', '-f',
              type=click.File('r'),
              help='File containing list of coregenes to extract (default:all coregenes).')
@click.option('--reference',
              is_flag=True,
              help='Return sequence of the reference instead of strains alleles')
@click.argument('database',
                type=click.Path(exists=True))
def cli(database, **kwargs):
    """Extracts sequences from a wgMLST DATABASE."""
    # Split the cleaned options into extractor arguments and output arguments.
    cleaned_options = utils.clean_kwargs(kwargs)
    extractor_options, output_options = utils.get_output(cleaned_options)
    db_path = os.path.abspath(database)

    try:
        with pymlst.open_wg(db_path) as wg_db:
            extractor = SequenceExtractor(**extractor_options)
            wg_db.extract(extractor, **output_options)
    except exceptions.PyMLSTError as failure:
        # Report library errors as regular click error messages.
        raise click.ClickException(str(failure))
# Every class below derives (directly or indirectly) from PyMLSTError, the
# single exception type the CLI commands catch and re-raise as
# click.ClickException. The classes carry no state; they only give each
# failure a distinct type.
class AlleleSequenceNotFound(PyMLSTError):
    pass


class WrongBaseType(PyMLSTError):
    pass


class ReferenceStrainRemoval(PyMLSTError):
    pass


# NOTE(review): presumably raised when the identity / coverage thresholds
# given by the user are out of range -- confirm against the raising code.
class BadIdentityRange(PyMLSTError):
    pass

class BadCoverageRange(PyMLSTError):
    pass


# Raised by pymlst.common.mafft.align when no MAFFT binary path is configured.
class BinaryNotFound(PyMLSTError):
    pass


class StrainAlreadyPresent(PyMLSTError):
    pass


class ChromosomeNotFound(PyMLSTError):
    pass


class CoreGenomePathNotFound(PyMLSTError):
    pass


class NothingToRemove(PyMLSTError):
    pass


class UndefinedExportType(PyMLSTError):
    pass

class EmptyDatabase(PyMLSTError):
    pass

class BadInputForCreate(PyMLSTError):
    pass


# Base class of a second level in the hierarchy; StructureError is its only
# subclass defined in this module.
class PyMLSTWebError(PyMLSTError):
    pass


class StructureError(PyMLSTWebError):
    pass
@click.command(name='search')
@click.option('--identity', '-i',
              type=click.FLOAT,
              help='Minimum identity to search gene (default=0.9).')
@click.option('--coverage', '-c',
              type=click.FLOAT,
              help='Minimum coverage to search gene (default=0.9).')
@click.option('--fasta', '-f',
              type=click.File('w'),
              help='Writes fasta file with gene allele.')
@click.option('--output', '-o',
              type=click.File('w'),
              help='Writes ST search result to (default:stdout).')
@click.argument('database',
                type=click.Path(exists=True))
@click.argument('genomes',
                type=click.File('r'), nargs=-1)
def cli(genomes, database, **kwargs):
    """Searches ST number for an assembly GENOME using an mlst DATABASE."""
    db_path = os.path.abspath(database)

    try:
        with pymlst.open_cla(db_path) as cla_db:
            search_options = utils.clean_kwargs(kwargs)
            cla_db.multi_search(genomes, **search_options)
    except exceptions.PyMLSTError as failure:
        # Report library errors as regular click error messages.
        raise click.ClickException(str(failure))
The source code for the `PyMLST` project lives on
`GitHub <https://github.com/bvalot/pyMLST>`_.
You can use `git clone` to get it.
# Remove packaging, documentation and egg-info artefacts, then reset the
# coverage data. One command per recipe line: joining the rm calls with
# backslashes would collapse them into a single invocation in which the
# later "rm" words are treated as paths to delete.
.PHONY: clean
clean:
	rm -rf dist
	rm -rf docs/build
	rm -rf *.egg-info
	coverage erase
def cli(force, species, version, database, profile, alleles):
    """Creates a classical MLST DATABASE from a txt PROFILE and fasta ALLELES files."""
    try:
        if os.path.exists(database):
            # Guard clause: an existing file is only replaced on request.
            if not force:
                raise exceptions.PyMLSTError("Database alreadly exists, use --force to override it")
            # Opening in write mode truncates the existing database file.
            open(database, "w").close()

        db_path = os.path.abspath(database)
        with pymlst.open_cla(db_path) as cla_db:
            cla_db.create(profile, alleles)
            cla_db.add_infos("Custom", species, "", version)
    except exceptions.PyMLSTError as failure:
        # Report library errors as regular click error messages.
        raise click.ClickException(str(failure))
@click.command(name='search2')
@click.option('--identity', '-i',
              type=click.FLOAT,
              help='Minimum identity to search gene (default=0.9).')
@click.option('--coverage', '-c',
              type=click.FLOAT,
              help='Minimum coverage to search gene (default=0.95).')
@click.option('--reads', '-r',
              type=click.INT,
              help='Minimum reads coverage to search gene (default=10).')
@click.option('--paired/--single', default=True,
              help= "Defines type of fastqs files.")
@click.option('--fasta', '-f',
              type=click.File('w'),
              help='Writes fasta file with gene allele.')
@click.option('--output', '-o',
              type=click.File('w'),
              help='Writes ST search result to (default:stdout).')
@click.argument('database',
                type=click.Path(exists=True))
@click.argument('fastqs',
                type=click.File('r'), nargs=-1)
def cli(fastqs, database, **kwargs):
    """Searches ST number from FASTQS(.gz) raw reads using an mlst DATABASE."""
    db_path = os.path.abspath(database)

    try:
        with pymlst.open_cla(db_path) as cla_db:
            read_options = utils.clean_kwargs(kwargs)
            cla_db.multi_read(fastqs, **read_options)
    except exceptions.PyMLSTError as failure:
        # Report library errors as regular click error messages.
        raise click.ClickException(str(failure))
def upgrade():
    """Create whichever pytyper tables are not yet present in the database.

    ``typerSeq`` and ``typerSt`` are created when missing; ``mlst_type`` is
    created when missing and seeded with a single row named ``pytyper``.
    """
    # Table names already present on the bound connection.
    existing_tables = sa.inspect(op.get_bind()).get_table_names()

    if 'typerSeq' not in existing_tables:
        op.create_table(
            'typerSeq',
            sa.Column('id', sa.Integer(), nullable=False),
            sa.Column('sequence', sa.Text(), nullable=False),
            sa.Column('typing', sa.Text(), nullable=True),
            sa.Column('allele', sa.Text(), nullable=False),
            sa.PrimaryKeyConstraint('id'),
            sa.UniqueConstraint('sequence'),
        )

    if 'typerSt' not in existing_tables:
        op.create_table(
            'typerSt',
            sa.Column('id', sa.Integer(), nullable=False),
            sa.Column('st', sa.Text(), nullable=False),
            sa.Column('typing', sa.Text(), nullable=True),
            sa.Column('allele', sa.Text(), nullable=False),
            sa.PrimaryKeyConstraint('id'),
        )

    if 'mlst_type' not in existing_tables:
        type_table = op.create_table(
            'mlst_type',
            sa.Column('name', sa.String(7), nullable=False,
                      primary_key=True),
        )
        # Record which kind of database this is.
        op.bulk_insert(type_table, [{'name': 'pytyper'}])
You can access the allele sequences present in the database and restrict
the export to a list of genes with the **-f** option.
@click.command(name="search")
@click.option("--identity", "-i",
              type=click.FLOAT,
              help="Minimum identity to search gene.")
@click.option("--coverage", "-c",
              type=click.FLOAT,
              help="Minimum coverage to search gene.")
@click.option('--fasta', '-f',
              type=click.File('w'),
              help='Writes fasta file with gene allele.')
@click.option('--output', '-o',
              type=click.File('w'),
              help='Writes search result to (default:stdout).')

# Database is initialized automatically without intervention from user
@click.argument('method',
                type=click.Choice([FIM, SPA, CLMT]),
                required=True)
@click.argument('genomes',
                type=click.File('r'),
                required=True,
                nargs=-1)

def cli(method, genomes, **kwargs):

    """Searches strain type using specified METHOD for an assembly GENOME.

    fim: fimH typing for Escherichia coli\n
    spa: spa typing for Staphylococcus aureus\n
    clmt: Phylogrouping using ClermontTyping method for Escherichia coli
    """
    # The docstring above is the help text click prints for this command;
    # the "\n" escapes are kept so the method list stays one entry per line.

    try:
        # METHOD selects the typing database; no database path is passed
        # by the user for this command.
        with pymlst.open_typer(method) as typer:
            typer.multi_search(genomes, **utils.clean_kwargs(kwargs))

    except exceptions.PyMLSTError as err:
        # Report library errors as regular click error messages.
        raise click.ClickException(str(err))
All available commands can be listed using the help function:
@click.command(name='create')
@click.option('--force', '-f',
              is_flag=True,
              help='Overwrite alrealdy existing DATABASE')
@click.option('--concatenate', '-c',
              is_flag=True,
              help='Automatically concatenates GENES with duplicated sequences.')
@click.option('--remove', '-r',
              is_flag=True,
              help='Automatically removes GENES with duplicated sequences.')
@click.option('--species', '-s',
              type=click.STRING,
              help='Name of the species')
@click.option('--version', '-V',
              type=click.STRING,
              help='Version of the database')
@click.argument('database', type=click.Path(exists=False))
@click.argument('coregene', type=click.File('r'))
def cli(force, species, version, database, **kwargs):
    """Creates a wgMLST DATABASE from a template COREGENE."""
    try:
        if os.path.exists(database):
            # Guard clause: an existing file is only replaced on request.
            if not force:
                raise exceptions.PyMLSTError("Database alreadly exists, use --force to override it")
            # Opening in write mode truncates the existing database file.
            open(database, "w").close()

        db_path = os.path.abspath(database)
        with pymlst.open_wg(db_path) as wg_db:
            create_options = utils.clean_kwargs(kwargs)
            wg_db.create(**create_options)
            wg_db.add_infos("custom", species, version)
    except exceptions.DuplicatedGeneSequence as duplicated:
        # Handled before the generic case so the user is pointed to -c / -r.
        raise click.UsageError('{}, use -c or -r options to manage it'
                               .format(str(duplicated)))
    except exceptions.PyMLSTError as failure:
        raise click.ClickException(str(failure))
def align(genes):
    """Align a set of sequences with MAFFT.

    The sequences are written to a temporary FASTA file with
    ``utils.write_genome`` and MAFFT is run on that file with ``--auto``.

    Args:
        genes: sequences to align, in the form accepted by
            ``utils.write_genome``.

    Returns:
        The first alignment parsed from MAFFT's standard output, converted
        with ``utils.records_to_dict``; an empty dict when MAFFT fails or its
        output cannot be parsed.

    Raises:
        exceptions.BinaryNotFound: if ``config.get_binary_path`` returns no
            MAFFT executable.
    """
    path = config.get_binary_path('mafft')
    if not path:
        raise exceptions.BinaryNotFound('MAFFT binary was not found')
    with tempfile.NamedTemporaryFile(mode='w+t', suffix='.fasta') as tmp:
        utils.write_genome(genes, tmp)
        # Make sure the sequences are on disk before MAFFT reads the file.
        tmp.flush()
        proc = subprocess.Popen([path, "--auto", tmp.name],
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE,
                                encoding=sys.stdout.encoding)
        # Pre-bind so the error handler can always report something, even if
        # communicate() itself fails.
        errs = ''
        try:
            outs, errs = proc.communicate()
            alignments = next(AlignIO.parse(StringIO(outs), "fasta"))
        except Exception:  # best effort: any failure yields "no alignment"
            # ``Exception`` (not a bare except) so that KeyboardInterrupt and
            # SystemExit are no longer swallowed.
            logging.error("MAFFT doesn't finish correctly\n" + errs)
            return {}
    return utils.records_to_dict(alignments)


def __first_aligned_position(sequence):
    """Return the index of the first non-gap ('-') character, or -1.

    ``sequence`` may be any iterable of characters, including a reversed
    iterator.
    """
    return next(
        (index for index, char in enumerate(sequence) if char != '-'),
        -1)


def get_aligned_area(query, target):
    """Locate the span of ``query`` inside its alignment against ``target``.

    Returns:
        ``(start_index, end_index)``: the offset of the first non-gap column
        of the aligned query, and the alignment length minus the number of
        trailing gap columns.  ``(None, None)`` is returned when the
        alignment does not contain exactly two entries or the aligned query
        consists only of gaps.
    """
    alignments = align({'query': query, 'target': target})
    if len(alignments) != 2:
        return None, None
    q_align = alignments['query']
    start_index = __first_aligned_position(q_align)
    if start_index == -1:
        return None, None
    # Scanning the reversed query gives the number of trailing gap columns.
    end_index = len(q_align) - __first_aligned_position(reversed(q_align))
    return start_index, end_index
def upgrade():
    """Create the claMLST tables that are missing and tag the database.

    This is the initial revision created after the PyMLST refactoring.
    Tables that already exist (old un-versioned databases) are left
    untouched; the alembic_version table is added automatically to enable
    versioning.
    """
    existing = sa.inspect(op.get_bind()).get_table_names()

    if 'mlst' not in existing:
        op.create_table(
            'mlst',
            sa.Column('id', sa.Integer(), nullable=False),
            sa.Column('st', sa.Integer(), nullable=True),
            sa.Column('gene', sa.Text(), nullable=True),
            sa.Column('allele', sa.Integer(), nullable=True),
            sa.PrimaryKeyConstraint('id'),
        )

    if 'sequences' not in existing:
        op.create_table(
            'sequences',
            sa.Column('id', sa.Integer(), nullable=False),
            sa.Column('sequence', sa.Text(), nullable=True),
            sa.Column('gene', sa.Text(), nullable=True),
            sa.Column('allele', sa.Integer(), nullable=True),
            sa.PrimaryKeyConstraint('id'),
            sa.UniqueConstraint('sequence'),
        )

    if 'mlst_type' not in existing:
        # Single-row table recording which kind of database this is.
        mlst_type = op.create_table(
            'mlst_type',
            sa.Column('name', sa.String(length=4), nullable=False,
                      primary_key=True),
        )
        op.bulk_insert(mlst_type, [{'name': 'cla'}])


def downgrade():
    """Drop the mlst_type table; the data tables are kept."""
    op.drop_table('mlst_type')
absolute path.') 13 | @click.option('--kma', '-k', 14 | type=click.Path(exists=True, dir_okay=False), 15 | help='Kma executable absolute path.') 16 | @click.option('--mafft', '-m', 17 | type=click.Path(exists=True, dir_okay=False), 18 | help='Mafft executable absolute path.') 19 | @click.option('--log', '-l', 20 | type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR']), 21 | help='Level of logging, default=INFO') 22 | @click.option('--reset', '-r', 23 | is_flag=True, 24 | help='Reset the configuration.') 25 | def cli(blat, kma, mafft, log, reset): 26 | """Configure executables paths and log level.""" 27 | if reset: 28 | config.reset_binary_paths() 29 | config.set_logging_level("INFO") 30 | click.echo('Resetting the configuration...') 31 | 32 | if mafft or blat or kma: 33 | paths = {} 34 | if blat: 35 | paths['blat'] = blat 36 | if kma: 37 | paths['kma'] = kma 38 | if mafft: 39 | paths['mafft'] = mafft 40 | config.update_binary_paths(paths) 41 | if log: 42 | config.set_logging_level(log) 43 | 44 | paths = config.list_binary_paths() 45 | log = config.get_logging_level() 46 | click.echo('--- Configuration ---') 47 | if len(paths) > 0: 48 | for key, value in paths: 49 | click.echo(key + ': ' + value) 50 | click.echo('---------------------') 51 | click.echo('LOG : ' + log) 52 | click.echo('---------------------') 53 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | *~ 8 | *.conf 9 | 10 | # Distribution / packaging 11 | .Python 12 | env/ 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # 
before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .coverage 43 | .coverage.* 44 | .cache 45 | nosetests.xml 46 | coverage.xml 47 | *.cover 48 | .hypothesis/ 49 | .pytest_cache 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | *.log 57 | local_settings.py 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | docs/build 69 | 70 | # PyBuilder 71 | target/ 72 | 73 | # Jupyter Notebook 74 | .ipynb_checkpoints 75 | 76 | # pyenv 77 | .python-version 78 | 79 | # celery beat schedule file 80 | celerybeat-schedule 81 | 82 | # SageMath parsed files 83 | *.sage.py 84 | 85 | # dotenv 86 | .env 87 | 88 | # virtualenv 89 | .venv 90 | venv/ 91 | ENV/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # JetBrains 107 | .idea 108 | 109 | # Sphinx documentation 110 | docs/_build/ 111 | 112 | # PyBuilder 113 | target/ 114 | 115 | #Configuration file 116 | pymlst.conf 117 | 118 | #Alembic database 119 | pymlst/data/alembic/*/*db 120 | pymlst/data/*db -------------------------------------------------------------------------------- /docs/source/documentation/installation.rst: -------------------------------------------------------------------------------- 1 | .. _installation: 2 | 3 | .. toctree:: 4 | :glob: 5 | 6 | ============ 7 | Installation 8 | ============ 9 | 10 | This section provides instructions for installation and configuration of pyMLST. 
11 | 12 | 13 | Automatic Installation 14 | ====================== 15 | 16 | You can install pyMLST and its dependencies using `bioconda `_: 17 | 18 | .. code-block:: bash 19 | 20 | conda install -c conda-forge -c bioconda pymlst 21 | 22 | Manual Installation 23 | =================== 24 | 25 | * From `pypi repository `_: 26 | 27 | .. code-block:: bash 28 | 29 | pip install pymlst 30 | 31 | * From `github source `_: 32 | 33 | .. code-block:: bash 34 | 35 | virtualenv venv 36 | source venv/bin/activate 37 | make install 38 | make build 39 | 40 | 41 | Dependencies 42 | ============ 43 | 44 | PyMLST uses 3 external tools to run alignment: 45 | 46 | * Mafft (>=7.307) 47 | 48 | .. code-block:: bash 49 | 50 | sudo apt install mafft 51 | 52 | * Blat (v35). You need to compile source or obtain executable at: 53 | https://genome.ucsc.edu/FAQ/FAQblat.html 54 | 55 | * kma (>=1.3) You need to compile source from: 56 | https://bitbucket.org/genomicepidemiology/kma/src/master/ 57 | 58 | 59 | Configuration 60 | ============= 61 | 62 | Configure the executable locations (if they are not on the PATH) and log level: 63 | 64 | .. code-block:: bash 65 | 66 | pyMLST configure --help 67 | Usage: pyMLST configure [OPTIONS] 68 | 69 | Configure executables paths and log level. 70 | 71 | Options: 72 | -b, --blat FILE Blat executable absolute path. 73 | -k, --kma FILE Kma executable absolute path. 74 | -m, --mafft FILE Mafft executable absolute path. 75 | -l, --log [DEBUG|INFO|WARNING|ERROR] 76 | Level of logging, default=INFO 77 | -r, --reset Reset the configuration. 78 | --help Show this message and exit. 79 | 80 | 81 | 82 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. PyMLST documentation master file 2 | You can adapt this file completely to your liking, but it should at least 3 | contain the root `toctree` directive.
4 | 5 | ****** 6 | pyMLST 7 | ****** 8 | 9 | .. figure:: logo.png 10 | :align: center 11 | :height: 150px 12 | :alt: pyMLST 13 | 14 | python Mlst Local Search Tool 15 | 16 | Purpose 17 | ======= 18 | 19 | 20 | Bacterial typing is critical to unraveling the spread of pathogens. 21 | For this purpose, data from next-generation sequencing are 22 | now widely used, with core genome multilocus sequence typing (cgMLST) or 23 | whole genome multilocus sequence typing (wgMLST) becoming the new 24 | standard. These methods are an extension of the traditional MLST 25 | method, which uses a short list of housekeeping genes. cgMLST and 26 | wgMLST use a large set of genes corresponding to the core or whole 27 | genome. Similar to MLST, each unique sequence corresponds to a 28 | specific allele, and the combination of alleles determines the 29 | sequence type (ST) of the strain. 30 | 31 | 32 | We have developed pyMLST to perform this task. Unlike other tools, it 33 | uses a local SQLite database to store allele sequences and MLST 34 | profiles. This allows the collection of strains to be expanded 35 | iteratively. The input can be (i) an assembler-generated draft 36 | genome, (ii) the direct raw data, or (iii) other genomes stored in the 37 | sequence database. 38 | 39 | 40 | Documentation 41 | ============= 42 | 43 | .. toctree:: 44 | :maxdepth: 2 45 | :caption: Users: 46 | 47 | documentation/installation 48 | documentation/cgmlst 49 | documentation/clamlst 50 | documentation/pytyper 51 | 52 | 53 | .. toctree:: 54 | :maxdepth: 2 55 | :caption: Developers: 56 | 57 | development 58 | api 59 | 60 | 61 | Citation 62 | ======== 63 | 64 | If you use pyMLST, please cite the following paper: 65 | 66 | Biguenet A. et al., Introduction and benchmarking of pyMLST: 67 | open-source software for assessing bacterial clonality using core 68 | genome MLST. 2023 Microbial Genomics, 9(11), 1126.
69 | doi: `10.1099/mgen.0.001126 `_ 70 | -------------------------------------------------------------------------------- /docs/source/documentation/clamlst/search.rst: -------------------------------------------------------------------------------- 1 | .. _clamlst_search: 2 | 3 | .. toctree:: 4 | :glob: 5 | 6 | =============================== 7 | Search MLST profile of a strain 8 | =============================== 9 | 10 | Similarly to wgMLST analysis, you need a draft genome or raw reads 11 | data to find the MLST profile. 12 | 13 | .. note:: 14 | 15 | You can perform MLST searches on multiple genomes or raw reads 16 | simultaneously. 17 | 18 | 19 | 20 | Genome data 21 | ^^^^^^^^^^^ 22 | 23 | You can search ST from GENOME fasta sequence files. 24 | 25 | .. code-block:: bash 26 | 27 | claMLST search --help 28 | Usage: claMLST search [OPTIONS] DATABASE GENOMES 29 | 30 | Searches ST number for assembly GENOMES using an mlst DATABASE 31 | 32 | Options: 33 | -i, --identity FLOAT Minimum identity to search gene (default=0.9) 34 | -c, --coverage FLOAT Minimum coverage to search gene (default=0.9) 35 | -f, --fasta FILENAME Writes fasta file with gene allele 36 | -o, --output FILENAME Writes ST search result to (default:stdout) 37 | 38 | 39 | Reads data 40 | ^^^^^^^^^^ 41 | 42 | Alternatively, you can search ST directly from raw reads with single 43 | or paired FASTQS(.gz) files. 44 | 45 | .. code-block:: bash 46 | 47 | claMLST search2 --help 48 | Usage: claMLST search2 [OPTIONS] DATABASE [FASTQS]... 49 | 50 | Searches ST number from FASTQS(.gz) raw reads using an mlst DATABASE. 51 | 52 | Options: 53 | -i, --identity FLOAT Minimum identity to search gene (default=0.9). 54 | -c, --coverage FLOAT Minimum coverage to search gene (default=0.95). 55 | -r, --reads INTEGER Minimum reads coverage to search gene (default=10). 56 | --paired / --single Defines type of fastqs files. 57 | -f, --fasta FILENAME Writes fasta file with gene allele. 
def run_blat(genome, tmpfile, tmpout, identity, coverage, maxintron=20):
    """Run BLAT and collect the hits that pass the coverage filter.

    Args:
        genome: file object of the genome; only its ``name`` is used.
        tmpfile: file object of the query FASTA; only its ``name`` is used.
        tmpout: file object BLAT writes its PSL output to; only its ``name``
            is used.
        identity: minimum identity as a fraction; passed to BLAT as a
            percentage (``identity * 100``).
        coverage: minimum coverage a hit must reach to be kept.
        maxintron: value passed to BLAT's ``-maxIntron`` option.

    Returns:
        dict mapping ``psl.gene_id()`` to the list of ``Psl`` hits whose
        coverage lies between ``coverage`` and 1.

    Raises:
        exceptions.BinaryNotFound: if no BLAT executable is available.
        exceptions.PyMLSTError: if BLAT wrote anything to stderr.
        exceptions.CoreGenomePathNotFound: if no hit passed the filter.
    """
    path = config.get_binary_path('blat')
    if path is None:
        raise exceptions.BinaryNotFound('BLAT binary was not found')

    command = [path, '-maxIntron=' + str(maxintron), '-fine',
               '-minIdentity=' + str(identity * 100),
               genome.name, tmpfile.name, tmpout.name]
    proc = subprocess.Popen(command, stderr=subprocess.PIPE, stdout=subprocess.PIPE)

    output, error = proc.communicate()
    for line in BytesIO(output).readlines():
        logging.debug(line.decode().rstrip())
    # Any output on stderr is treated as a failure.
    have_error = False
    for line in BytesIO(error).readlines():
        have_error = True
        logging.error(line.decode().rstrip())
    if have_error:
        raise exceptions.PyMLSTError(
            'An error occurred while running BLAT')

    genes = {}
    # Context manager so the PSL file handle is always closed.
    with open(tmpout.name, 'r') as psl_file:
        for line in psl_file:
            # Skip lines whose first field is not an integer (e.g. the PSL
            # header) as well as blank lines.
            try:
                int(line.split()[0])
            except (ValueError, IndexError):
                continue
            psl = Psl(line)
            if coverage <= psl.coverage <= 1:
                genes.setdefault(psl.gene_id(), []).append(psl)
    if len(genes) == 0:
        raise exceptions.CoreGenomePathNotFound(
            'No path was found for the core genome')
    return genes


def blat_tmp():
    """Return a FASTA and a PSL temporary file.

    Both files are created with ``delete=False``, so the caller is
    responsible for removing them.  The FASTA file is returned open in
    ``w+t`` mode; the PSL file is returned already closed.
    """
    tmpfile = tempfile.NamedTemporaryFile(mode='w+t', suffix='.fasta', delete=False)
    tmpout = tempfile.NamedTemporaryFile(mode='w+t', suffix='.psl', delete=False)
    tmpout.close()
    return tmpfile, tmpout
_ROOT = os.path.abspath(os.path.dirname(__file__))
_CONF_PATH = os.path.join(_ROOT, 'data', 'pymlst.conf')

_BIN_SECTION = 'BINARIES'
_LOG_SECTION = 'LOGGING'
_LOG_LEVEL = 'Log Level'


def get_data(path):
    """Return the path of ``path`` inside the package ``data`` directory."""
    return os.path.join(_ROOT, 'data', path)


def write_config(conf):
    """Write the ``conf`` parser to the configuration file."""
    with open(_CONF_PATH, 'w') as file:
        conf.write(file)


def get_config():
    """Retrieves the configuration file.

    Returns an empty parser when the configuration file does not exist.
    """
    conf = configparser.ConfigParser()
    if os.path.exists(_CONF_PATH):
        conf.read(_CONF_PATH)
    return conf


def update_binary_paths(paths):
    """Updates the paths stored in the configuration file.

    Args:
        paths: mapping of binary name to executable path; each path is
            stored as an absolute path.
    """
    conf = get_config()

    if not conf.has_section(_BIN_SECTION):
        conf.add_section(_BIN_SECTION)

    for key, value in paths.items():
        conf[_BIN_SECTION][key] = os.path.abspath(value)
    write_config(conf)


def reset_binary_paths():
    """Removes the binary paths section from the configuration file."""
    conf = get_config()
    if conf.has_section(_BIN_SECTION):
        conf.remove_section(_BIN_SECTION)
        write_config(conf)


def get_binary_path(bin_name):
    """Retrieves a binary path.

    The configured path wins; otherwise the executable is searched on the
    PATH.  Returns None when it is found in neither place.
    """
    conf = get_config()
    if conf.has_option(_BIN_SECTION, bin_name):
        return conf.get(_BIN_SECTION, bin_name)
    return shutil.which(bin_name)  # path research


def list_binary_paths():
    """Lists the binary paths stored in the configuration file.

    Returns a list of ``(name, path)`` pairs, empty when nothing is
    configured.
    """
    conf = get_config()
    if conf.has_section(_BIN_SECTION):
        return conf.items(_BIN_SECTION)
    return []


def get_logging_level():
    """Return the configured log level, ``"INFO"`` by default."""
    conf = get_config()
    # ``fallback`` covers both a missing section and a section that lacks
    # the option (e.g. a hand-edited file), which used to raise NoOptionError.
    return conf.get(_LOG_SECTION, _LOG_LEVEL, fallback="INFO")


def set_logging_level(levelname):
    """Store ``levelname`` as the log level in the configuration file."""
    conf = get_config()

    if not conf.has_section(_LOG_SECTION):
        conf.add_section(_LOG_SECTION)
    conf[_LOG_SECTION][_LOG_LEVEL] = levelname
    write_config(conf)
22 | 23 | spa typing 24 | Spa typing is based on the repetitions polymorphism present on the protein 25 | A gene (spa) in the species *Staphylococcus aureus* 26 | (`Frénay et al, 1996 `_). 27 | Repetitions and sequence types definition were download from 28 | `Ridom `_. 29 | 30 | Clermont typing 31 | Clermont phylogrouping is based on the presence/absence of 4 different genes 32 | in the species *Escherichia coli* 33 | (`Clermont et al, 2012 `_). 34 | 35 | 36 | Genome data 37 | ^^^^^^^^^^^ 38 | 39 | You can search typing METHOD from GENOME fasta sequence files. 40 | 41 | .. code-block:: bash 42 | 43 | pyTyper search --help 44 | Usage: pyTyper search [OPTIONS] {fim|spa|clmt} GENOMES... 45 | 46 | Searches strain type using specified METHOD for an assembly GENOME. 47 | 48 | fim: fimH typing for Escherichia coli 49 | spa: spa typing for Staphylococcus aureus 50 | clmt: Phylogouping using ClermontTyping method for Escherichia coli 51 | 52 | Options: 53 | -i, --identity FLOAT Minimum identity to search gene. 54 | -c, --coverage FLOAT Minimum coverage to search gene. 55 | -f, --fasta FILENAME Writes fasta file with gene allele. 56 | -o, --output FILENAME Writes search result to (default:stdout). 57 | 58 | 59 | .. note:: 60 | 61 | If new alleles are present or you want to have sequence target by the typing method in your strains, 62 | you can obtain their sequences with the **-f** option. 63 | 64 | .. note:: 65 | 66 | You can perform searches on multiple genomes simultaneously. 67 | -------------------------------------------------------------------------------- /docs/source/development/make.rst: -------------------------------------------------------------------------------- 1 | .. _make: 2 | 3 | .. _using-the-makefile: 4 | 5 | Using the `Makefile` 6 | ==================== 7 | 8 | This project includes a `Makefile `_ 9 | that you can use to perform common tasks such as running tests and building 10 | documentation. 
11 | 12 | Targets 13 | ------- 14 | 15 | This section contains a brief description of the targets defined in the 16 | ``Makefile``. 17 | 18 | ``clean`` 19 | ^^^^^^^^^ 20 | 21 | Removes generated packages, documentation, temporary files, *etc*. 22 | 23 | .. _make_lint: 24 | 25 | ``lint`` 26 | ^^^^^^^^ 27 | 28 | Runs `pylint `_ against the project files. 29 | 30 | .. _make_test: 31 | 32 | ``test`` 33 | ^^^^^^^^ 34 | 35 | Runs the unit tests. 36 | 37 | ``quicktest`` 38 | ^^^^^^^^^^^^^ 39 | 40 | Runs the unit tests without performing pre-test validations (like 41 | :ref:`linting `). 42 | 43 | .. _make_docs: 44 | 45 | ``docs`` 46 | ^^^^^^^^ 47 | 48 | Builds the documentation for production. 49 | 50 | .. note:: 51 | 52 | You can also build the documents directly, bypassing validations like 53 | :ref:`linting ` and :ref:`testing ` using 54 | `Sphinx Makefile `_ 55 | directly. 56 | 57 | .. code-block:: bash 58 | 59 | cd docs 60 | make clean && make html 61 | make latexpdf 62 | 63 | .. _make_answers: 64 | 65 | ``answers`` 66 | ^^^^^^^^^^^ 67 | 68 | Performs a quick build of the documentation and open it in your browser. 69 | 70 | ``package`` 71 | ^^^^^^^^^^^ 72 | 73 | Builds the package for publishing. 74 | 75 | .. _make-publish: 76 | 77 | ``publish`` 78 | ^^^^^^^^^^^ 79 | 80 | Publishes the package to your repository. 81 | 82 | ``build`` 83 | ^^^^^^^^^ 84 | 85 | Installs the current project locally so that you may run the command-line application. 86 | 87 | ``venv`` 88 | ^^^^^^^^ 89 | 90 | Creates a virtual environment. 91 | 92 | ``install`` 93 | ^^^^^^^^^^^ 94 | 95 | Installs (or updates) project dependencies. 96 | 97 | ``install_docs`` 98 | ^^^^^^^^^^^^^^^^ 99 | 100 | Installs (or updates) documentation dependencies. 101 | 102 | ``licenses`` 103 | ^^^^^^^^^^^^ 104 | 105 | Generates a report of the projects dependencies and respective licenses. 106 | 107 | .. note:: 108 | 109 | If project dependencies change, please update this documentation. 
def run_migrations_offline():
    """Run migrations in 'offline' mode.

    The context is configured from the ``sqlalchemy.url`` main option only,
    without creating an Engine, so no DBAPI has to be available.  Calls to
    context.execute() here emit the given string to the script output.
    """
    offline_options = {
        "url": config.get_main_option("sqlalchemy.url"),
        "target_metadata": target_metadata,
        "literal_binds": True,
        "dialect_opts": {"paramstyle": "named"},
    }
    context.configure(**offline_options)

    with context.begin_transaction():
        context.run_migrations()
@click.command(name='import')
@click.option('--force', '-f',
              is_flag=True,
              help='Overwrite alrealdy existing DATABASE')
@click.option('--prompt/--no-prompt',
              default=True,
              help='Do not prompt if multiple '
                   'choices are found, fail instead.')
@click.argument('database',
                type=click.Path(exists=False))
@click.argument('species',
                type=click.STRING,
                nargs=-1)
def cli(force, prompt, database, species):
    """Creates a wgMLST DATABASE from an online resource.

    The research can be filtered by adding a SPECIES name."""
    # NOTE: the docstring above is the help text click shows to the user,
    # so developer notes live in comments instead.

    utils.create_logger()

    try:

        if os.path.exists(database):
            if not force:
                raise exceptions.PyMLSTError("Database alreadly exists, use --force to override it")
            # --force: truncate the existing file so the database is rebuilt
            # from scratch.
            open(database, "w").close()

        url = web.retrieve_cgmlst(' '.join(species), prompt)

        if url is None:
            logging.info('No choice selected')
            return

        logging.info('Downloading the core genome...')

        # delete=False because the file is closed and then handed over by
        # name; it is removed explicitly in the ``finally`` below.
        with tempfile.NamedTemporaryFile('w+', delete=False) as tmp:
            try:
                skipped = web.get_cgmlst_file(url, tmp)
                tmp.close()
                if len(skipped) > 0:
                    logging.info('Skipped the following malformed file(s): %s', ', '.join(skipped))
                infos = web.get_cgmlst_info(url)
                with pymlst.open_wg(os.path.abspath(database)) as mlst:
                    mlst.create(tmp.name)
                    mlst.add_infos("cgmlst.org", infos[0], infos[1])
            finally:
                # Always remove the downloaded core genome, on success and on
                # failure alike, so temporary files do not pile up.
                tmp.close()
                os.remove(tmp.name)

    # Original causes are chained so they stay visible in tracebacks.
    except requests.exceptions.HTTPError as err:
        raise click.ClickException('Could not retrieve online data') from err
    except requests.exceptions.ConnectionError as err:
        raise click.ClickException('Could not access to the server, please verify your internet connection') from err
    except requests.exceptions.Timeout as err:
        raise click.ClickException('The server took too long to respond') from err
    except exceptions.StructureError as err:
        raise click.ClickException('It seems like the structure of the website/API changed '
                                   'since this application was developed.') from err
    except exceptions.PyMLSTError as err:
        raise click.ClickException(str(err)) from err
def run_migrations_online():
    """Run migrations in 'online' mode.

    A connection supplied through ``config.attributes['connection']`` is
    reused; otherwise an Engine is built from the ini section.  The
    migrations then run inside a transaction on a connection bound to the
    context.
    """
    target = config.attributes.get('connection', None)

    if target is None:
        # Nothing was handed in from the outside: build an Engine from the
        # alembic ini section.
        ini_section = config.get_section(config.config_ini_section)
        target = engine_from_config(
            ini_section,
            prefix='sqlalchemy.',
            poolclass=pool.NullPool)

    # when the target is already a Connection object, calling
    # connect() gives us a *branched connection*.
    with target.connect() as connection:
        context.configure(connection=connection,
                          target_metadata=target_metadata)

        with context.begin_transaction():
            context.run_migrations()
def run_migrations_online():
    """Run migrations in 'online' mode.

    A connectable stored under ``config.attributes['connection']`` is
    reused when present; otherwise an Engine is built from the
    ``sqlalchemy.``-prefixed options of the current ini section.

    """
    engine_or_connection = config.attributes.get('connection', None)

    if engine_or_connection is None:
        # Nothing was handed over from the outside: create the Engine here.
        ini_section = config.get_section(config.config_ini_section)
        engine_or_connection = engine_from_config(
            ini_section,
            prefix='sqlalchemy.',
            poolclass=pool.NullPool)

    # When the object is already a Connection, connect() gives us
    # a *branched connection*.
    with engine_or_connection.connect() as live_connection:
        context.configure(connection=live_connection,
                          target_metadata=target_metadata)

        with context.begin_transaction():
            context.run_migrations()
figure:: subgraph.png 22 | :alt: Subgraph representation 23 | :align: center 24 | :height: 400px 25 | 26 | MST representation of subgraph analysis at a threshold of 10. 27 | 28 | 29 | You need: 30 | 31 | :DISTANCE: The distance matrix obtained with :ref:`distance ` command. 32 | 33 | .. code-block:: bash 34 | 35 | wgMLST subgraph -h 36 | Usage: wgMLST subgraph [OPTIONS] DISTANCE 37 | 38 | Searches group of strains at a DISTANCE threshold. 39 | 40 | Options: 41 | -o, --output FILENAME Output group files (default:stdout). 42 | -t, --threshold INTEGER Minimum distance to conserve for extraction 43 | of group (default:50). 44 | -e, --export [list|count|group] Export type (default:list). 45 | 46 | 47 | .. _cgmlst_other_recombination: 48 | 49 | Recombination 50 | ============= 51 | 52 | The recombination command determines the number of different positions 53 | in the multiple alignment. You can use the result to define a 54 | threshold and the final list of genes without potential recombination. 55 | 56 | .. code:: 57 | 58 | #Gene Mutation Lenght mutation per 100 base 59 | PA0001 1 1545 0.064 60 | PA0002 1 1104 0.090 61 | PA0004 1 2421 0.041 62 | PA0010 1 552 0.181 63 | PA0011 0 888 0.0 64 | PA0022 1 558 0.179 65 | PA0038 1 216 0.462 66 | PA0062 1 417 0.239 67 | PA0065 1 666 0.150 68 | ... 69 | 70 | 71 | You need: 72 | 73 | :GENES: List of genes used for export MSA and obtained with :ref:`gene 74 | ` command. 75 | :ALIGNMENT: The Multiple Sequence Alignment obtained with :ref:`msa ` command. 76 | 77 | .. code-block:: bash 78 | 79 | wgMLST recombination -h 80 | Usage: wgMLST recombination [OPTIONS] GENES ALIGNMENT 81 | 82 | Searches potential gene recombinations from wgMLST database export. 83 | 84 | Options: 85 | -o, --output FILENAME Output number of variations by genes 86 | (default:stdout). 87 | 88 | .. warning:: 89 | 90 | The algorithm is designed to find recombination on closed strains 91 | and could not work correctly on more diverse ST. 
92 | 93 | -------------------------------------------------------------------------------- /docs/source/documentation/cgmlst/export_res.rst: -------------------------------------------------------------------------------- 1 | .. _cgmlst_export_res: 2 | 3 | .. toctree:: 4 | :glob: 5 | 6 | ============== 7 | Export results 8 | ============== 9 | 10 | When the database is complete and :ref:`validated `, you 11 | can export results for further analysis. 12 | 13 | .. _cgmlst_export_distance: 14 | 15 | Distance 16 | ======== 17 | 18 | A matrix of cgMLST distances can be computed from the database; each distance is 19 | defined as the number of different alleles between a pair of 20 | strains, omitting the missing data. 21 | 22 | .. code:: 23 | 24 | #Strain 33_PA 34_PA 35_PA 61_PA 84_PA 98_PA 25 | 33_PA 0 39 37 25 20 23 26 | 34_PA 39 0 5 33 35 39 27 | 35_PA 37 5 0 31 33 37 28 | 61_PA 25 33 31 0 22 27 29 | 84_PA 20 35 33 22 0 21 30 | 98_PA 23 39 37 27 21 0 31 | 32 | .. code-block:: bash 33 | 34 | wgMLST distance --help 35 | Usage: wgMLST distance [OPTIONS] DATABASE 36 | 37 | Extracts a distance matrix from a wgMLST DATABASE. 38 | Options: 39 | -m, --mincover INTEGER Minimun number of strains found to retain a gene 40 | (default:0) 41 | -k, --keep Keeps only gene with different alleles (omit 42 | missing). 43 | -d, --duplicate Keeps duplicate genes (default remove). 44 | -V, --inverse Keeps only gene that do not match the filter of 45 | mincover or keep options. 46 | 47 | .. warning:: 48 | 49 | To obtain a correct distance calculation, you need to filter out genes with 50 | low frequency observations. See :ref:`validate ` for 51 | more information on the **-m** option. 52 | 53 | .. _cgmlst_export_mlst: 54 | 55 | MLST 56 | ==== 57 | 58 | The MLST profiles can also be exported. The numbers indicate the 59 | allele *id* in the database. A formatted version compatible with grapetree 60 | can be defined. 61 | 62 | .. 
code:: 63 | 64 | #GeneId 33_PA 34_PA 35_PA 61_PA 84_PA 98_PA 65 | PA0120 3918 3918 3918 3918 3918 3918 66 | PA0527 3963 3963 3963 3963 3963 3963 67 | PA0691 3954 3954 3954 8945 3954 3954 68 | PA0935 3910 3910 3910 3910 3910 3910 69 | ... 70 | 71 | 72 | .. code-block:: bash 73 | 74 | wgMLST mlst --help 75 | Usage: wgMLST mlst [OPTIONS] DATABASE 76 | 77 | Extracts an MLST table from a wgMLST DATABASE. 78 | Options: 79 | ... 80 | -f, --form [default|grapetree] Specify format of output 81 | 82 | .. note:: 83 | 84 | Similarly to :ref:`distance `, the gene export on this mlst table can be 85 | defined with -m, -k, and -d options. 86 | 87 | -------------------------------------------------------------------------------- /pymlst/data/alembic/alembic.ini: -------------------------------------------------------------------------------- 1 | # A generic, single database configuration. 2 | 3 | [wg] 4 | script_location = wg 5 | sqlalchemy.url = sqlite:///wg/idle.db 6 | 7 | [cla] 8 | script_location = cla 9 | sqlalchemy.url = sqlite:///cla/idle.db 10 | 11 | [pytyper] 12 | script_location = pytyper 13 | sqlalchemy.url = sqlite:///pytyper/idle.db 14 | 15 | [DEFAULT] 16 | # path to migration scripts 17 | # script_location = alembic 18 | 19 | # template used to generate migration files 20 | # file_template = %%(rev)s_%%(slug)s 21 | 22 | # sys.path path, will be prepended to sys.path if present. 23 | # defaults to the current working directory. 24 | prepend_sys_path = . 25 | 26 | # timezone to use when rendering the date 27 | # within the migration file as well as the filename. 
28 | # string value is passed to dateutil.tz.gettz() 29 | # leave blank for localtime 30 | # timezone = 31 | 32 | # max length of characters to apply to the 33 | # "slug" field 34 | # truncate_slug_length = 40 35 | 36 | # set to 'true' to run the environment during 37 | # the 'revision' command, regardless of autogenerate 38 | # revision_environment = false 39 | 40 | # set to 'true' to allow .pyc and .pyo files without 41 | # a source .py file to be detected as revisions in the 42 | # versions/ directory 43 | # sourceless = false 44 | 45 | # version location specification; this defaults 46 | # to alembic/versions. When using multiple version 47 | # directories, initial revisions must be specified with --version-path 48 | # version_locations = %(here)s/bar %(here)s/bat alembic/versions 49 | 50 | # the output encoding used when revision files 51 | # are written from script.py.mako 52 | # output_encoding = utf-8 53 | 54 | sqlalchemy.url = driver://user:pass@localhost/dbname 55 | 56 | 57 | [post_write_hooks] 58 | # post_write_hooks defines scripts or Python functions that are run 59 | # on newly generated revision scripts. 
def upgrade():
    """Bring a wgMLST database to the initial versioned schema.

    Missing tables (``sequences``, ``mlst``, ``mlst_type``) are created,
    every index reflected on ``mlst`` is dropped, and the four ``ix_*``
    indexes are created in their place.
    """
    # This is the initial revision created after the PyMLST refactoring.
    # The data of old un-versioned databases are left untouched.
    # Old database indexes are dropped and replaced by new ones.
    # A new alembic_version table is added automatically to enable versioning.

    bind = op.get_bind()
    inspector = sa.inspect(bind)
    existing_tables = inspector.get_table_names()

    if 'sequences' not in existing_tables:
        op.create_table(
            'sequences',
            sa.Column('id', sa.Integer(), nullable=False),
            sa.Column('sequence', sa.Text(), nullable=True),
            sa.PrimaryKeyConstraint('id'),
            sa.UniqueConstraint('sequence'))

    if 'mlst' not in existing_tables:
        op.create_table(
            'mlst',
            sa.Column('id', sa.Integer(), nullable=False),
            sa.Column('souche', sa.Text(), nullable=True),
            sa.Column('gene', sa.Text(), nullable=True),
            sa.Column('seqid', sa.Integer(), nullable=True),
            sa.ForeignKeyConstraint(['seqid'], ['sequences.id'], ),
            sa.PrimaryKeyConstraint('id'))

    if 'mlst_type' not in existing_tables:
        # The single row tags the database as a whole-genome ('wg') one.
        mlst_type = op.create_table(
            'mlst_type',
            sa.Column('name', sa.String(length=4), nullable=False,
                      primary_key=True))
        op.bulk_insert(mlst_type, [{'name': 'wg'}])

    # Drop whatever indexes are currently reflected on 'mlst'.
    for old_index in inspector.get_indexes('mlst'):
        op.drop_index(old_index['name'])

    # (index name, indexed columns), created in this order.
    replacement_indexes = (
        ('ix_gene', ['gene']),
        ('ix_seqid', ['seqid']),
        ('ix_souche', ['souche']),
        ('ix_souche_gene_seqid', ['gene', 'souche', 'seqid']),
    )
    for index_name, columns in replacement_indexes:
        op.create_index(index_name, 'mlst', columns, unique=False)
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![PyPI version](https://badge.fury.io/py/PyMLST.svg)](https://pypi.org/project/PyMLST/) 2 | [![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat)](http://bioconda.github.io/recipes/pymlst/README.html) 3 | [![Documentation Status](https://readthedocs.org/projects/pymlst/badge/?version=latest)](https://pymlst.readthedocs.io/en/latest/?badge=latest) 4 | 5 | # pyMLST 6 | ![pyMLST](docs/source/logo.png "A Python Mlst Local Search Tool") 7 | 8 | A Python Mlst Local Search Tool. 9 | 10 | ## Purpose 11 | Bacterial typing is critical to unraveling the spread of pathogens. 12 | For this purpose, data from next-generation sequencing are now widely used, with core genome multilocus sequence typing (cgMLST) or whole genome multilocus sequence typing (wgMLST) becoming the new standard. 13 | These methods are an extension of the traditional MLST method, which uses a short list of housekeeping genes. 14 | cgMLST and wgMLST use a large set of genes corresponding to the core or whole genome. 15 | Similar to MLST, each unique sequence corresponds to a specific allele, and the combination of alleles determines the sequence type (ST) of the strain. 16 | 17 | We have developed pyMLST to perform this task. Unlike other tools, it uses a local SQLite database to store allele sequences and MLST profiles. 18 | This allows the collection of strains to be expanded iteratively. The input can be (i) an assembler-generated draft genome, (ii) the direct raw data, or (iii) other genomes stored in the sequence database. 
19 | 20 | ## New version 21 | V2.2: 22 | 23 | - Introduced new typing methods with fimH and phylogrouping for *Escherichia coli* and spa for *Staphylococcus aureus* (pyTyper search command) 24 | 25 | V2.1: 26 | 27 | - Use raw reads (FASTQ) directly with the kma integration (search2 and add2 commands) 28 | 29 | V2.0: 30 | 31 | - An automatic database import mechanism to initialise cgMLST and MLST databases. 32 | - A new process to fill incomplete genes using MAFFT alignment. 33 | - A more complete command line interface with a sub-command system. 34 | - A configuration file to define the PATH to external tools. 35 | - An easy installation from the PyPI repository. 36 | 37 | 38 | ## Documentation 39 | The details of installation, workflow and running parameters can be found in the [**documentation**](https://pymlst.readthedocs.io/en/latest/). 40 | 41 | 42 | ## Publications 43 | If you use pyMLST, please cite the following paper: 44 | 45 | Biguenet A. et al., Introduction and benchmarking of pyMLST: 46 | open-source software for assessing bacterial clonality using core 47 | genome MLST. 2023 Microbial Genomics, 9(11), 1126. 48 | doi: [10.1099/mgen.0.001126](https://doi.org/10.1099/mgen.0.001126) 49 | 50 | 51 | PyMLST v1 has already been used to analyse most clinical bacteria: 52 | 53 | - [*Escherichia coli* and *Klebsiella pneumoniae*](https://doi.org/10.1016/j.cmi.2021.07.022) 54 | - [*Acinetobacter baumannii*](https://doi.org/10.1038/s41598-023-49268-x) 55 | - [*Pseudomonas aeruginosa*](https://doi.org/10.1016/j.jhin.2020.06.013) 56 | - [*Proteus mirabilis*](https://doi.org/10.1093/jac/dkz472) 57 | -------------------------------------------------------------------------------- /pymlst/cmd.py: -------------------------------------------------------------------------------- 1 | """PyMLST entry commands and common parameters creation. 2 | 3 | Subcommands are being instantiated dynamically from their respective folders. 
class PyMlstCommand(click.MultiCommand):
    """Global PyMLST command.

    Sub-commands are discovered in a folder: every ``*.py`` file whose name
    does not start with ``__init__`` is one sub-command, and the file is
    expected to define a module-level ``cli`` object.
    """

    def __init__(self, path, help_msg):
        """Initializes the command.

        :param path: Folder holding one Python file per sub-command.
        :param help_msg: Help text stored on the command.
        """
        super().__init__(help='Subcommands are loaded from a '
                              'plugin folder dynamically')

        opt_help = dict(help_option_names=['-h', '--help'])

        # Eager flag: print_version runs during parsing and exits.
        opt_version = Option(['--version', '-v'], is_flag=True, callback=print_version,
                             expose_value=False, is_eager=True,
                             help='Prints PyMLST version.')
        self.params.append(opt_version)
        self.context_settings.update(opt_help)
        self.help = help_msg
        self.path = path

    def list_commands(self, ctx):
        """Lists the available commands.

        The commands are loaded dynamically from files within
        a directory.

        :return: Sorted sub-command names (file names without ``.py``).
        """
        return sorted(
            filename[:-3]
            for filename in os.listdir(self.path)
            if filename.endswith('.py') and not filename.startswith('__init__'))

    def get_command(self, ctx, name):
        """Gets a command by name.

        :param name: Sub-command name; ``<name>.py`` is looked up in the
            command folder.
        :return: The ``cli`` object defined by the sub-command file.
        :raises click.ClickException: If the file does not exist or does
            not define ``cli``.
        """
        name_scope = {}
        cmd_file = os.path.join(self.path, name + '.py')
        try:
            # Read bytes so compile() applies Python's own source-encoding
            # rules instead of the platform locale.
            with open(cmd_file, 'rb') as file:
                code = compile(file.read(), cmd_file, 'exec')
        except FileNotFoundError:
            raise click.ClickException(
                'Unknown sub-command \'{}\''.format(name)) from None

        # NOTE: this executes the file found in the command folder; `name`
        # comes from the command line and is joined to the path unchecked.
        # Kept outside the try block so a FileNotFoundError raised by the
        # sub-command module itself is not reported as an unknown command.
        exec(code, name_scope, name_scope)

        try:
            return name_scope['cli']
        except KeyError:
            raise click.ClickException(
                'Sub-command \'{}\' does not define a \'cli\' '
                'entry point'.format(name)) from None
@click.command(name='import')
@click.option('--force', '-f',
              is_flag=True,
              help='Overwrites alrealdy existing DATABASE')
@click.option('--prompt/--no-prompt',
              default=True,
              help='Do not prompt if multiple '
                   'choices are found, fail instead.')
@click.option('--mlst', '-m',
              type=click.STRING,
              default='',
              help='Specifies the desired MLST scheme name.')
@click.option('--repository', '-r', default='pubmlst',
              type=click.Choice(['pubmlst','pasteur'], case_sensitive=False),
              help='Choose the online repository to use')
@click.argument('database',
                type=click.Path(exists=False))
@click.argument('species',
                type=click.STRING,
                nargs=-1)
def cli(force, prompt, mlst, repository, database, species):
    """Creates a claMLST DATABASE from an online resource.

    The research can be filtered by adding a SPECIES name."""
    # The docstring above is kept as-is: it is the command's user-facing text.
    #
    # Flow: optionally truncate an existing DATABASE (--force), resolve the
    # scheme URL for SPECIES, download the files into a temporary folder,
    # then fill the database. Request errors and PyMLSTError are re-raised
    # as click.ClickException.

    # Local import: the file's import block does not provide contextlib.
    import contextlib

    utils.create_logger()

    try:

        if os.path.exists(database):
            if not force:
                raise exceptions.PyMLSTError("Database alreadly exists, use --force to override it")
            # Empty the existing file so the database starts from scratch.
            with open(database, "w"):
                pass

        url = web.retrieve_mlst(' '.join(species), prompt, mlst, repository)

        if url is None:
            logging.info('No choice selected')
            return

        logging.info('Downloading mlst...')

        # The ExitStack is entered last, so it exits first: the downloaded
        # files are closed before the temporary folder is removed.
        with tempfile.TemporaryDirectory() as tmp_dir, \
                pymlst.open_cla(os.path.abspath(database)) as mlst_db, \
                contextlib.ExitStack() as opened_files:

            version = web.get_mlst_files(url, tmp_dir)

            profile = opened_files.enter_context(
                open(os.path.join(tmp_dir, 'profiles.csv'), 'rt'))
            locus_dir = os.path.join(tmp_dir, 'locus')
            alleles = [
                opened_files.enter_context(
                    open(os.path.join(locus_dir, locus), 'r'))
                for locus in os.listdir(locus_dir)]

            mlst_db.create(profile, alleles)
            mlst_db.add_infos(repository, ' '.join(species), mlst, version)

    except requests.exceptions.HTTPError as err:
        raise click.ClickException('Could not retrieve online data') from err
    except requests.exceptions.ConnectionError as err:
        raise click.ClickException('Could not access to the server, please verify your internet connection') from err
    except requests.exceptions.Timeout as err:
        raise click.ClickException('The server took too long to respond') from err
    except exceptions.StructureError as err:
        raise click.ClickException('It seems like the structure of the website/API changed '
                                   'since this application was developed.') from err
    except exceptions.PyMLSTError as err:
        raise click.ClickException(str(err)) from err
toctree:: 4 | :glob: 5 | 6 | ===================== 7 | Initialise a database 8 | ===================== 9 | 10 | The first step of a cg/wgMLST analysis is to initialise a database 11 | by a list of genes with a reference sequence for each of them. 12 | 13 | :cgMLST: A list of genes corresponding to the coregenome of a species. 14 | 15 | :wgMLST: A list of genes corresponding to the whole genome of a 16 | species or a clone. 17 | 18 | Import from cgmlst.org 19 | ====================== 20 | 21 | You can automatically import a cgMLST resource from `cgmlst.org 22 | `_. 23 | 24 | .. code-block:: bash 25 | 26 | wgMLST import -h 27 | Usage: wgMLST import [OPTIONS] DATABASE [SPECIES]... 28 | 29 | Creates a wgMLST DATABASE from an online resource. 30 | 31 | The research can be filtered by adding a SPECIES name. 32 | 33 | Options: 34 | -f, --force Overwrite alrealdy existing DATABASE 35 | --prompt / --no-prompt Do not prompt if multiple choices are found, 36 | fail instead. 37 | 38 | 39 | Create from external scheme 40 | =========================== 41 | 42 | The cg/wgMLST database can be created using a **scheme** corresponding to 43 | a list of different genes (a multi-fasta file containing gene 44 | sequences in nucleotide format). 45 | 46 | .. code:: 47 | 48 | >ACICU_RS02500 49 | TTATTTCTTCACAACAGATGGTGCAATTGGGTCGGCAGTGATATAGCCAACTGCTGCTGC 50 | ... 51 | GTGGTTAGAAGCAGTGGTCAT 52 | >ACICU_RS11305 53 | CGCACCTAATGGAAGAAAAGGGATCCCCGTAAACCATTTTAAAATATCGCGACGTGTTGG 54 | ... 55 | TTTGGAATTGATGCAGAAATTAAATCTTAA 56 | >ACICU_RS08820 57 | ATGGCTTATCAAACTTTAGAACAGCTACAGCAGTCTAAAGCCAAGCTTCACGAAACTGTG 58 | ... 59 | TCGCAGTTACGTTAA 60 | 61 | .. warning:: 62 | 63 | At contrary to other cg/wgMLST tools, only one allele for each 64 | gene must be include on the scheme file. 65 | 66 | 67 | You can get scheme for: 68 | 69 | :cgMLST: 70 | 71 | * Using a scheme from a scientific publication and not available on 72 | `cgmlst.org `_. 
73 | 74 | * Using the annotation of the genes from the reference genome of 75 | the species. After adding your strains to the database, you can 76 | filter to core genome by removing genes absent from least 95% of 77 | the strains (see :ref:`validate `) 78 | 79 | :wgMLST: 80 | 81 | * Using gene annotations from a genome close to your strains 82 | 83 | * Using pangenome results from analysis of your strains with 84 | e.g. `Roary `_. 85 | 86 | 87 | 88 | .. code-block:: bash 89 | 90 | wgMLST create --help 91 | Usage: wgMLST create [OPTIONS] DATABASE COREGENE 92 | 93 | Creates a wgMLST DATABASE from a template COREGENE. 94 | 95 | Options: 96 | -f, --force Overwrite alrealdy existing DATABASE 97 | -c, --concatenate Automatically concatenates genes with duplicated sequences 98 | -r, --remove Automatically removes genes with duplicated sequences 99 | -s, --species TEXT Name of the species 100 | -V, --version TEXT Version of the database 101 | 102 | 103 | .. warning:: 104 | 105 | If the same sequence is used more than once in your scheme, you can 106 | specify how to handle it using the **-c** or **-r** options. 107 | 108 | 109 | -------------------------------------------------------------------------------- /docs/source/documentation/clamlst/initialise.rst: -------------------------------------------------------------------------------- 1 | .. _clamlst_initialise: 2 | 3 | .. toctree:: 4 | :glob: 5 | 6 | ========================== 7 | Initialise a MLST database 8 | ========================== 9 | 10 | A MLST database contains the different alleles for each gene of the 11 | scheme and a table of association of the alleles to determined the 12 | sequence type (ST). 13 | 14 | 15 | Import from pubMLST 16 | =================== 17 | 18 | You can automatically import a MLST resource from `pubmlst 19 | `_ or `pasteur `_. 20 | 21 | .. code-block:: bash 22 | 23 | claMLST import -h 24 | Usage: claMLST import [OPTIONS] DATABASE [SPECIES]... 
25 | 26 | Creates a claMLST DATABASE from an online resource. 27 | 28 | The research can be filtered by adding a SPECIES name. 29 | 30 | Options: 31 | --prompt / --no-prompt Do not prompt if multiple choices are found, 32 | fail instead. 33 | -f, --force Overwrites alrealdy existing DATABASE 34 | -m, --mlst TEXT Specifies the desired MLST scheme name. 35 | -r, --repository Choose the online repository to use [pubmlst|pasteur] 36 | 37 | 38 | 39 | Create from other resource 40 | ========================== 41 | 42 | Alternatively, you can create a database with the allele sequence and 43 | MLST profile of your favorite species. To create a database, pyMLST 44 | needs the gene name in the MLST profile header to match the name in 45 | the fasta file. For example, the rpoB gene in the MLST profile header 46 | must match the rpoB.fas file. You will also need to remove the 47 | additional column corresponding to the clonal complex in the MLST 48 | profile file, if present. 49 | 50 | .. code-block:: bash 51 | 52 | claMLST create --help 53 | Usage: claMLST create [OPTIONS] DATABASE PROFILE ALLELES... 54 | 55 | Creates a classical MLST DATABASE from a txt PROFILE and fasta ALLELES files. 56 | 57 | Options: 58 | -f, --force Overwrites alrealdy existing DATABASE 59 | -s, --species TEXT Name of the species 60 | -V, --version TEXT Version of the database 61 | 62 | 63 | 64 | Scheme example 65 | -------------- 66 | 67 | .. code:: 68 | 69 | ST cpn60 fusA gltA pyrG recA rplB rpoB 70 | 1 1 1 1 1 5 1 1 71 | 2 2 2 2 2 2 2 2 72 | 3 3 3 2 2 3 1 3 73 | ... 74 | 75 | Allele example 76 | -------------- 77 | 78 | .. 
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
This file is used to create the package we'll publish to PyPI.

.. currentmodule:: setup.py
.. moduleauthor:: Benoit Valot
"""

import importlib.util
import os
from pathlib import Path

from setuptools import setup, find_packages

# Directory that contains this setup.py; every path below is relative to it.
here = Path(__file__).resolve().parent

# Get the long description from the relevant file.  The built-in text I/O
# takes an explicit encoding, so the deprecated ``codecs.open`` is not needed.
long_description = (here / 'README.md').read_text(encoding='utf-8')

# Get the base version from the library.  (We'll find it in the `version.py`
# file in the package directory, but we'll bypass actually loading up the
# library.)
vspec = importlib.util.spec_from_file_location(
    "version",
    str(here / 'pymlst' / "version.py")
)
vmod = importlib.util.module_from_spec(vspec)
vspec.loader.exec_module(vmod)
version = getattr(vmod, '__version__')

# If the environment has a build number set...
buildnum = os.getenv('buildnum')
if buildnum is not None:
    # ...append it to the version.
    version = "{version}.{buildnum}".format(
        version=version,
        buildnum=buildnum
    )

setup(
    name='PyMLST',
    description="python Mlst Local Search Tool",
    long_description=long_description,
    long_description_content_type='text/markdown',
    packages=find_packages(
        exclude=["*.tests", "*.tests.*", "tests.*", "tests"]),
    version=version,
    setup_requires=['wheel'],
    install_requires=[
        # Include dependencies here
        'biopython>=1.78',
        'click>=7.1',
        # NOTE(review): pytest looks like a test-only dependency; kept here
        # so the installed dependency set does not change.
        'pytest>=6.2',
        'sqlalchemy>=1.4,<2',
        'networkx>=2.5',
        'decorator>=4.4',
        'requests>=2.23',
        'pandas>=1.2',
        'numpy>=1.20.0',
        'beautifulsoup4>=4.9',
        'questionary>=1.9',
        'setuptools>=44.0',
        'alembic>=1.6',
        'GitPython>=3.1'
    ],
    entry_points="""
        [console_scripts]
        pyMLST=pymlst.cmd:py
        wgMLST=pymlst.cmd:wg
        claMLST=pymlst.cmd:cla
        pyTyper=pymlst.cmd:pytyper
    """,
    python_requires=">=3.7.0",
    license='GPLv3',  # noqa
    author='Benoit Valot',
    author_email='benoit.valot@univ-fcomte.fr',
    # Use the URL to the github repo.
    url='https://github.com/bvalot/pyMLST.git',
    download_url=(
        f'https://github.com/bvalot/pyMLST/archive/refs/tags/{version}.tar.gz'
    ),
    keywords=[
        'cgMLST', 'MLST', 'bacterial genome'
        # Add package keywords here.
    ],
    # See https://PyPI.python.org/PyPI?%3Aaction=list_classifiers
    classifiers=[
        # How mature is this project? Common values are
        #   3 - Alpha
        #   4 - Beta
        #   5 - Production/Stable
        'Development Status :: 5 - Production/Stable',

        # Indicate who your project is intended for.
        'Intended Audience :: Developers',
        'Topic :: Software Development :: Libraries',

        # Pick your license.  (It should match "license" above.)
        'License :: OSI Approved :: GNU General Public License v3 (GPLv3)',
        # Specify the Python versions you support here. In particular, ensure
        # that you indicate whether you support Python 2, Python 3 or both.
        'Programming Language :: Python :: 3.7',
    ],
    include_package_data=True
)
def test_add_sequence(db):
    """A stored sequence is read back as one row with its gene and allele."""
    db.add_sequence('AAA', 'g1', 2)
    rows = db.connection.execute(select(model.sequences)).fetchall()
    assert len(rows) == 1
    row = rows[0]
    assert row.sequence == 'AAA'
    assert row.gene == 'g1'
    assert row.allele == 2


def test_add_mlst(db):
    """An MLST entry on allele 2 is stored and leaves the core genome empty."""
    db.add_sequence('AAA', 'g1', 2)
    db.add_mlst(5, 'g1', 2)
    rows = db.connection.execute(select(model.mlst)).fetchall()
    assert len(rows) == 1
    row = rows[0]
    assert row.st == 5
    assert row.gene == 'g1'
    assert row.allele == 2
    assert len(db.core_genome) == 0


# def test_add_mlst_no_sequence(db):
#     db.add_sequence('AAA', 'g1', 1)
#     with pytest.raises(exceptions.AlleleSequenceNotFound):
#         db.add_mlst(5, 'g1', 2)


def test_add_mlst_reference(db):
    """An MLST entry on allele 1 registers that sequence in the core genome."""
    db.add_sequence('AAA', 'g1', 1)
    db.add_mlst(5, 'g1', 1)
    core = db.core_genome
    assert len(core) == 1
    assert core['g1'] == 'AAA'


def test_get_genes_by_allele(db_many):
    """Every gene is returned with the sequence of the requested allele."""
    expected = {
        'g1': 'AAT',
        'g2': 'ATT',
        'g3': 'TCT',
        'g4': 'ACC',
        'g5': 'CTC',
    }
    assert db_many.get_genes_by_allele(2) == expected


def test_get_allele_by_sequence_and_gene(db_many):
    """A known (sequence, gene) pair resolves to its allele number."""
    assert db_many.get_allele_by_sequence_and_gene('AAT', 'g1') == 2


def test_get_allele_by_sequence_and_gene_none(db_many):
    """A sequence that belongs to another gene yields no allele."""
    assert db_many.get_allele_by_sequence_and_gene('AAT', 'g2') is None


def test_get_st_by_gene_and_allele(db_many):
    """All sequence types that carry a given gene allele are listed."""
    assert db_many.get_st_by_gene_and_allele('g3', 1) == [1, 3]
    assert db_many.get_st_by_gene_and_allele('g2', 2) == [1]
db_many.get_st_by_gene_and_allele('g5', 3) 130 | assert st == [] 131 | 132 | 133 | def test_get_sequence_by_gene_and_allele(db_many): 134 | seq = db_many.get_sequence_by_gene_and_allele('g3', 2) 135 | assert seq == 'TCT' 136 | 137 | 138 | def test_get_sequence_by_gene_and_allele_none(db_many): 139 | seq = db_many.get_sequence_by_gene_and_allele('g3', 6) 140 | assert seq is None 141 | -------------------------------------------------------------------------------- /docs/source/documentation/cgmlst/check.rst: -------------------------------------------------------------------------------- 1 | .. _cgmlst_check: 2 | 3 | .. toctree:: 4 | :glob: 5 | 6 | ============================= 7 | Check quality of the database 8 | ============================= 9 | 10 | After :ref:`loading ` all your strains to the database, 11 | you need to check allele calling quality before :ref:`export results 12 | `. 13 | 14 | .. note:: 15 | 16 | You can have information of current data in the database using 17 | **stats** command. 18 | 19 | .. code-block:: bash 20 | 21 | wgMLST stats -h 22 | Usage: wgMLST stats [OPTIONS] DATABASE 23 | 24 | Extract stats from a wgMLST DATABASE. 25 | 26 | 27 | .. _strain_check: 28 | 29 | Validate strains 30 | ================ 31 | 32 | To search potential strain with problems like bad assembly or wrong 33 | species, you can use the **strain** command with the **-c** option. 34 | 35 | 36 | .. code-block:: bash 37 | 38 | wgMLST strain -h 39 | Usage: wgMLST strain [OPTIONS] DATABASE 40 | 41 | Extracts a list of strains from a wgMLST DATABASE. 42 | 43 | Options: 44 | -m, --mincover INTEGER Minimun number of strain found to keep a gene 45 | (default:0) 46 | -k, --keep Keep only gene with different allele (omit missing). 47 | -d, --duplicate Conserve duplicate gene (default remove). 48 | -V, --inverse Keep only gene that do not meet the filter 49 | of mincover or keep options. 50 | -c, --count Count the number of gene present in the database for 51 | each strains. 
52 | -o, --output FILENAME Export strain list to (default=stdout). 53 | 54 | .. note:: 55 | 56 | If some strains show a low number of genes found in comparison to the 57 | others, you can remove them using the :ref:`remove ` 58 | command. 59 | 60 | .. note:: 61 | 62 | As with the :ref:`gene ` command or :ref:`export `, you can filter the genes 63 | that you want to keep for the search. 64 | 65 | By default, only duplicate genes are removed. 66 | 67 | .. _gene_check: 68 | 69 | Validate genes 70 | ============== 71 | 72 | As with strains, it can be useful to save the list of genes to 73 | keep for the rest of the analysis using the **gene** command. 74 | 75 | .. code-block:: bash 76 | 77 | wgMLST gene -h 78 | Usage: wgMLST gene [OPTIONS] DATABASE 79 | 80 | Extracts a list of genes from a wgMLST DATABASE. 81 | 82 | Options: 83 | -m, --mincover INTEGER Minimun number of strain found to keep a gene 84 | (default:0) 85 | -k, --keep Keep only gene with different allele (omit missing). 86 | -d, --duplicate Conserve duplicate gene (default remove). 87 | -V, --inverse Keep only gene that do not meet the filter of 88 | mincover or keep options. 89 | -o, --output FILENAME Export GENE list to (default=stdout). 90 | 91 | .. note:: 92 | 93 | A gene list that passes your threshold can be used later for :ref:`export 94 | sequence `. 95 | 96 | .. _m_option_check: 97 | 98 | .. warning:: 99 | 100 | An important parameter is the **-m** option, which defines the 101 | minimum number of strains in which a gene must be found to be kept. 102 | 103 | If you are interested in the core genome, you can set this number to 104 | correspond to **95%** of the strains in the database. 105 | (For example, if you have 100 strains in your database, you need to 106 | set this parameter to 95.) 107 | 108 | 109 | .. _remove_check: 110 | 111 | Remove strains or genes 112 | ======================= 113 | 114 | After checking the database, if some strains or genes need to be 115 | removed, you can use the **remove** command.
116 | 117 | .. code-block:: bash 118 | 119 | wgMLST remove -h 120 | Usage: wgMLST remove [OPTIONS] DATABASE [GENES_OR_STRAINS]... 121 | 122 | Removes STRAINS or GENES from a wgMLST DATABASE. 123 | 124 | Options: 125 | --strains / --genes Choose the item you wish to remove [default: strains] 126 | -f, --file FILENAME File list of genes or strains to removed on the wgMLST 127 | database. 128 | -------------------------------------------------------------------------------- /tests/test_typer.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from sqlalchemy import select 3 | 4 | import pymlst 5 | from pymlst.pytyper import model 6 | from pymlst.pytyper.method import FIM, SPA, CLMT 7 | from pymlst.pytyper.core import DatabaseTyper, TypingResult, FimH, Spa, Clmt 8 | from pymlst.common import exceptions 9 | 10 | 11 | @pytest.fixture() 12 | def fim(): 13 | with pymlst.open_typer(FIM) as fim_typer: 14 | yield fim_typer 15 | 16 | @pytest.fixture() 17 | def spa(): 18 | with pymlst.open_typer(SPA) as spa_typer: 19 | yield spa_typer 20 | 21 | @pytest.fixture() 22 | def clmt(): 23 | with pymlst.open_typer(CLMT) as clmt_typer: 24 | yield clmt_typer 25 | 26 | 27 | @pytest.fixture() 28 | def db(): 29 | db = DatabaseTyper(None) 30 | try: 31 | yield db 32 | finally: 33 | db.close() 34 | 35 | @pytest.fixture() 36 | def result(): 37 | res = TypingResult('sample1', FIM) 38 | yield res 39 | 40 | @pytest.fixture() 41 | def db_many(db): 42 | seqs = [ # seq, typing, allele 43 | ('AAA', 'fim', 'fimH1'), 44 | ('ATA', 'fim', 'fimH2'), 45 | ('TTT', 'fim', 'fimH3'), 46 | ('CCC', 'spa', '01'), 47 | ('CCT', 'spa', '02'), 48 | ('AAT', 'spa', '03'), 49 | ('ATT', 'clmt', 'arpA'), 50 | ('TCT', 'clmt', 'chuA'), 51 | ('ACC', 'clmt', 'yjaA'), 52 | ('CTC', 'clmt', 'TspE4.C2'), 53 | ] 54 | for seq, method, allele in seqs: 55 | db.add_sequence(seq, method, allele) 56 | sts = [ # st, typing, allele 57 | ('fimH1', 'fim', 'fimH1'), 58 | ('fimH2', 'fim', 
def test_check_db(db_many):
    """A populated database reports the FIM scheme as present."""
    assert db_many.check_db(FIM) == True


def test_check_new_db(db):
    """An empty database reports the CLMT scheme as absent."""
    assert db.check_db(CLMT) == False


def test_add_sequence(db):
    """A stored typing sequence is read back as one row."""
    db.add_sequence('AAA', FIM, '02')
    rows = db.connection.execute(select(model.typerSeq)).fetchall()
    assert len(rows) == 1
    row = rows[0]
    assert row.sequence == 'AAA'
    assert row.typing == FIM
    assert row.allele == '02'


def test_add_st(db):
    """A stored sequence type is read back as one row."""
    db.add_st('fimH1', FIM, 'fimH1')
    rows = db.connection.execute(select(model.typerSt)).fetchall()
    assert len(rows) == 1
    row = rows[0]
    assert row.st == 'fimH1'
    assert row.typing == FIM
    assert row.allele == 'fimH1'


def test_get_sequences(db_many):
    """Sequences are listed per typing scheme as (allele, sequence) pairs."""
    fim_seqs = db_many.get_sequences(FIM)
    clmt_seqs = db_many.get_sequences(CLMT)
    assert len(fim_seqs) == 3
    assert len(clmt_seqs) == 4
    assert fim_seqs[1] == ('fimH2', 'ATA')
    assert clmt_seqs[0] == ('arpA', 'ATT')


def test_get_sequence_by_allele(db_many):
    """A known allele returns its sequence; an unknown one raises."""
    assert db_many.get_sequence_by_allele(FIM, 'fimH1') == 'AAA'
    with pytest.raises(exceptions.AlleleSequenceNotFound):
        db_many.get_sequence_by_allele(SPA, '04')


def test_get_allele_by_sequence(db_many):
    """A known sequence returns its allele; an unknown one returns 'New'."""
    assert db_many.get_allele_by_sequence(SPA, 'CCT') == '02'
    assert db_many.get_allele_by_sequence(FIM, 'GCG') == 'New'
def records_to_dict(records):
    """Map each record's ``id`` to its upper-cased sequence.

    :param records: iterable of objects exposing ``id`` and ``seq``
        (``read_genome`` passes the records yielded by ``SeqIO.parse``).
    :return: dict ``{record.id: record.seq.upper()}``; when an id occurs
        more than once the last record wins.
    """
    return {record.id: record.seq.upper() for record in records}


def read_genome(handle):
    """Parse a FASTA handle into a ``{id: sequence}`` dictionary.

    :param handle: open file-like object; it is rewound to the start first
        when it reports itself as seekable.
    :return: the dictionary built by :func:`records_to_dict`.
    """
    if handle.seekable():
        handle.seek(0)
    return records_to_dict(SeqIO.parse(handle, 'fasta'))


def write_genome(genome_dict, handle):
    """Write ``genome_dict`` to ``handle`` as FASTA, one line per sequence.

    :param genome_dict: mapping of sequence id to sequence.
    :param handle: writable text handle.
    """
    for seq_id, seq in genome_dict.items():
        handle.write('>' + str(seq_id) + '\n'
                     + str(seq) + '\n')


def file_name(handle):
    """Return the base name of ``handle.name`` without its extension.

    A trailing ``.fasta`` or ``.fna`` is removed as a whole suffix; any
    other name is cut at its first dot.

    :param handle: object with a ``name`` attribute holding a file path.
    :return: the stripped file name.
    """
    filename = os.path.basename(handle.name)
    for extension in (".fasta", ".fna"):
        if filename.endswith(extension):
            # Slice the suffix off.  ``str.rstrip(extension)`` would strip
            # any trailing run of the characters '.', 'f', 'a', 's', 't'
            # and eat into the name itself ("data.fasta" -> "d").
            return filename[:-len(extension)]
    return filename.split('.')[0]


def strip_file(file):
    """Return the lines of ``file`` without their trailing newline.

    :param file: iterable of lines, or ``None``.
    :return: list of lines; empty when ``file`` is ``None``.
    """
    if file is None:
        return []
    return [line.rstrip('\n') for line in file]


def compar_seqs(seqs):
    """Count the positions at which the sequences disagree.

    Gap characters (``'-'``) are ignored: a position counts only when at
    least two different non-gap characters are seen.  Positions are taken
    from the first sequence, so all sequences are expected to be at least
    that long.

    :param seqs: list of indexable sequences.
    :return: number of differing positions (0 for an empty list).
    """
    if not seqs:
        return 0
    count = 0
    for index in range(len(seqs[0])):
        seqs_char = {s[index] for s in seqs}
        seqs_char.discard('-')
        if len(seqs_char) > 1:
            count += 1
    return count


def write_count(count, texte):
    """Write ``texte`` to ``count`` when a handle was supplied.

    :param count: writable handle, or a falsy value to skip the write.
    :param texte: text to write.
    """
    if count:
        count.write(texte)


def validate_sequence(sequence):
    """Tell whether ``sequence`` translates as a complete CDS (table 11).

    :param sequence: object with a ``translate(cds=..., table=...)`` method.
    :return: ``False`` when translation raises ``TranslationError``,
        ``True`` otherwise.
    """
    try:
        sequence.translate(cds=True, table=11)
    except TranslationError:
        return False
    else:
        return True


def create_logger():
    """Configure the root logger from the level stored in the configuration.

    "DEBUG", "INFO" and "WARNING" select the matching level; any other
    value falls back to ERROR.
    """
    levels = {
        "DEBUG": logging.DEBUG,
        "INFO": logging.INFO,
        "WARNING": logging.WARNING,
    }
    level = levels.get(config.get_logging_level(), logging.ERROR)
    logging.basicConfig(
        level=level,
        format='[%(levelname)s: %(asctime)s] %(message)s')


def clean_kwargs(kwargs):
    """Removes kwargs with None values produced by Click.

    Because of the way the Click library binds
    every arguments and options to kwargs entries,
    when a user doesn't specify an option, its name
    is bound to None in the kwargs dictionary.

    By removing the None entries we can pass the kwargs directly
    to the API core functions without overriding the default values.

    :param kwargs: dictionary to clean; it is modified in place.
    :return: the same dictionary.
    """
    for key in [key for key, value in kwargs.items() if value is None]:
        del kwargs[key]
    return kwargs


def get_output(kwargs):
    """Extract output from kwargs for extractor.

    :param kwargs: dictionary; its ``'output'`` entry, if any, is removed.
    :return: tuple ``(kwargs, out_kwargs)`` where ``out_kwargs`` holds the
        removed ``'output'`` entry, or is empty when there was none.
    """
    out_kwargs = {}
    if 'output' in kwargs:
        out_kwargs['output'] = kwargs.pop('output')
    return kwargs, out_kwargs


def check_type(conn, mlst_type):
    """Check that the database behind ``conn`` belongs to ``mlst_type``.

    A database without any table is accepted silently, and one without the
    ``mlst_type`` table is accepted with a warning.

    :param conn: connection that can be inspected and executed on.
    :param mlst_type: expected MLST type name.
    :raises exceptions.WrongBaseType: when the stored name differs.
    """
    inspector = inspect(conn)
    tables = inspector.get_table_names()
    if not tables:
        return
    if 'mlst_type' not in tables:
        # set_type(conn, mlst_type)
        logging.warning('The base missing mlst_type metadata, continue with %s', mlst_type)
        return
    # NOTE(review): an existing but empty mlst_type table makes fetchone()
    # return None and the attribute access below fail -- confirm whether
    # that state can occur.
    m_t = conn.execute(
        select(flag.mlst_type.c.name)
    ).fetchone()
    if m_t.name != mlst_type:
        raise exceptions.WrongBaseType(
            'The base you are attempting to perform '
            'on belongs to the wrong MLST type')


# def set_type(conn, mlst_type):
#     flag.metadata.create_all(conn)
#     conn.execute(
#         flag.mlst_type.insert(),
#         name=mlst_type)


def get_updated_engine(path, module):
    """Create an SQLite engine and upgrade its schema with alembic.

    :param path: database file path, or ``None`` for an in-memory database.
    :param module: name of the alembic sub-directory to use; also passed to
        :func:`check_type` as the expected MLST type.
    :return: the engine, after ``alembic upgrade head`` has run on it.
    """
    env_path = config.get_data(os.path.join('alembic', module))
    alembic_cfg = Config()
    alembic_cfg.set_main_option('script_location', env_path)
    logging.getLogger('alembic').setLevel(logging.CRITICAL)

    if path is None:
        engine = create_engine('sqlite://')  # creates a :memory: database
    else:
        engine = create_engine('sqlite:///' + os.path.abspath(path))

    with engine.begin() as conn:
        check_type(conn, module)
        alembic_cfg.attributes['connection'] = conn
        command.upgrade(alembic_cfg, 'head')

    return engine


def clean_geneid(geneid):
    """Replace '_' with '-' in geneid to be compatible with kma search."""
    return geneid.replace("_", "-")
# File extensions that KMA writes next to ``<basename><suffix>`` when it
# indexes a database; all of them must exist for the index to be usable.
index = [".comp.b", ".length.b", ".name", ".seq.b"]
suffix = ".kma"


def run_kma(fastq, basename, identity, coverage, reads):
    """Run kma on fastq(s) and return sequences.

    :param fastq: list of one (single-end) or two (paired-end) objects
        whose ``name`` attribute is the path of a fastq file.
    :param basename: base path of the database; the KMA index is expected
        at ``basename + suffix``.
    :param identity: minimum identity, as a fraction (see ``read_kma_res``).
    :param coverage: minimum coverage, as a fraction.
    :param reads: minimum depth.
    :return: tuple ``(kma_res, seqs)``: the template names that pass the
        filters and the sequences read from the KMA ``.fsa`` output.
    :raises exceptions.PyMLSTError: database not indexed, wrong number of
        fastq files, or KMA did not produce its result files.
    :raises exceptions.BinaryNotFound: KMA binary not configured.
    :raises exceptions.CoreGenomePathNotFound: no template passed the filters.
    """
    if is_database_indexing(basename) is False:
        raise exceptions.PyMLSTError('Dabatase must be index with KMA')

    path = config.get_binary_path('kma')
    if path is None:
        raise exceptions.BinaryNotFound('KMA binary was not found')

    with tempfile.NamedTemporaryFile('w+t') as tmp:
        # The temporary file only lends its unique name as KMA output prefix.
        baseout = tmp.name
        res_path = baseout + ".res"
        fsa_path = baseout + ".fsa"
        command = [path, '-t_db', basename + suffix, '-o', baseout, '-nf']
        if len(fastq) == 1:
            command.extend(['-i', fastq[0].name])
        elif len(fastq) == 2:
            command.extend(['-ipe', fastq[0].name, fastq[1].name])
        else:
            raise exceptions.PyMLSTError('Too many fastq files in input of run_kma')

        logging.info("Running KMA with cg/wgMLST database")
        try:
            proc = subprocess.Popen(command, stderr=subprocess.PIPE,
                                    stdout=subprocess.PIPE)
            _, error = proc.communicate()

            succeeded = os.path.exists(res_path) and os.path.exists(fsa_path)
            # KMA's stderr is debug chatter on success, the error otherwise.
            log = logging.debug if succeeded else logging.error
            for line in BytesIO(error).readlines():
                log(line.decode().rstrip())
            if not succeeded:
                raise exceptions.PyMLSTError(
                    'An error occurred while running KMA')

            with open(res_path, 'r') as kma:
                kma_res = read_kma_res(kma, coverage, identity, reads)
            # read_genome needs an open handle (it calls ``seekable()``),
            # not a path string.
            with open(fsa_path, 'r') as fasta:
                seqs = utils.read_genome(fasta)
        finally:
            # Remove KMA's side files on failures too, not only on success.
            del_kma_tmp(baseout)

    if len(kma_res) == 0:
        raise exceptions.CoreGenomePathNotFound(
            'No path was found for the core genome')
    return kma_res, seqs


def del_kma_tmp(baseout):
    """Delete temporary file create by kma"""
    for extension in (".aln", ".res", ".fsa"):
        if os.path.exists(baseout + extension):
            os.remove(baseout + extension)


def is_database_indexing(basename):
    """Verify if a pyMLST database is indexing.

    :return: ``True`` only when every KMA index file listed in ``index``
        exists next to ``basename + suffix``.
    """
    for extension in index:
        if os.path.exists(basename + suffix + extension) is False:
            return False
    return True


def index_database(basename, coregenes):
    """Index a database with kma if the base is not already indexing

    :param basename: base path of the database.
    :param coregenes: temporary file containing coregenes sequences; its
        ``name`` is given to ``kma index -i``.
    :raises exceptions.BinaryNotFound: KMA binary not configured.
    :raises exceptions.PyMLSTError: the index files are still missing
        after KMA has run.
    """
    if is_database_indexing(basename) is False:
        path = config.get_binary_path('kma')
        if path is None:
            raise exceptions.BinaryNotFound('KMA binary was not found')
        logging.info("Indexing database %s with kma",
                     os.path.basename(basename))

        command = [path, 'index', '-i', coregenes.name, '-o', basename + suffix]
        proc = subprocess.Popen(command, stderr=subprocess.PIPE,
                                stdout=subprocess.PIPE)
        _, error = proc.communicate()
        if is_database_indexing(basename) is False:
            for line in BytesIO(error).readlines():
                logging.error(line.decode().rstrip())
            raise exceptions.PyMLSTError(
                'An error occurred while indexing KMA')
        else:
            for line in BytesIO(error).readlines():
                logging.debug(line.decode().rstrip())


def delete_indexing(basename):
    """Remove indexing file"""
    for extension in index:
        if os.path.exists(basename + suffix + extension):
            os.remove(basename + suffix + extension)


def read_kma_res(kma, cover, ident, reads):
    """Return the templates of a KMA ``.res`` table that pass the filters.

    :param kma: text handle on a tab-separated KMA result file whose
        11-column header starts with ``#Template``.
    :param cover: minimum template and query coverage, as a fraction
        (compared against the percentage columns after ``* 100``).
    :param ident: minimum template and query identity, as a fraction.
    :param reads: minimum value of the ``Depth`` column.
    :return: list of ``#Template`` values of the rows that pass.
    :raises Exception: unexpected header or row width.
    """
    kmas = []
    header = kma.readline().rstrip("\n").split("\t")
    if len(header) != 11 or header[0].startswith("#Template") is False:
        # Not every stream carries a ``name`` (e.g. in-memory buffers).
        source = str(getattr(kma, "name", "<stream>"))
        raise Exception(source + " seems not to be a kma result file\n")
    for line in kma:
        values = line.rstrip("\n").split("\t")
        if len(values) != 11:
            raise Exception("Incorrect line\n" + line)
        # Values are stripped of the spaces around them before use.
        ele = {a: b.strip(" ") for a, b in zip(header, values)}
        if float(ele.get("Template_Coverage")) >= cover*100 and \
           float(ele.get("Query_Coverage")) >= cover*100 and \
           float(ele.get("Template_Identity")) >= ident*100 and \
           float(ele.get("Query_Identity")) >= ident*100 and \
           float(ele.get("Depth")) >= reads:
            kmas.append(ele.get("#Template"))
    return kmas


def index_tmpfile():
    """Return a named temporary text file with a ``.fasta`` suffix."""
    return tempfile.NamedTemporaryFile(mode='w+t', suffix='.fasta')
CCCGTCACGGGCGTGTGGTTTTTTATGATTACGATGAAATTTGCTACATGACGGAAGTGAATTTCCGCGA 13 | CATCCCGCCGCCGCGCTATCCGGAAGACGAACTTGCCAGCGAACCGTGGTATAGCGTCTCGCCGGGCGAT 14 | GTTTTCCCGGAAGAGTTTCGCCACTGGCTATGCGCCGACCCGCGTATTGGTCCGCTGTTTGAAGAGATGC 15 | ACGCCGACCTGTTCCGCGCTGATTACTGGCGCGCACTACAAAACCGCATACGTGAAGGGCATGTGGAAGA 16 | TGTTTATGCGTATCGGCGCAGGCAAAGATTTAGCGTACGGTATGGGGAGA 17 | >chuA CP054236.1:249971-250258 Escherichia coli strain EcPF5 chromosome, complete genome 18 | ATGGTACCGGACGAACCAACGGTCAGGATGTAAATATGCGTGGCTATGATCATCGCGGCGTGCTGGTTCT 19 | TGTCGATGGTGTTCGCCAGGGAACGGATACCGGACACCTGAATGGCACTTTTCTCGATCCGGCGCTGATC 20 | AAGCGTGTTGAGATTGTTCGCGGACCTTCAGCATTACTGTATGGCAGTGGCGCGCTGGGTGGAGTGATCT 21 | CCTACGATACGGTCGATGCAAAAGATTTATTGCAGGAAGGACAAAGCAGTGGTTTTCGTGTCTTTGGTAC 22 | TGGCGGCA 23 | >yjaA AP023226.1:c4557291-4557081 Escherichia coli YJ3 DNA, complete genome 24 | CAAACGTGAAGTGTCAGGAGACGCTGCCTTCAGTAACCAGCGCCTGTTAATCGCCAATTTCTTTGTTGCA 25 | GAAAAAGTTCTGCAAGATCTTGTTCTGCAACTCCACCCACGTTCAACCTGGCATTCTTTTTTGCCAGCAA 26 | AACGTATGGATATTGTTGTGAGCGCGCTGGAAATGAATGAGGGCGGTTTGTCACAGGTTGAGGAACGCAT 27 | T 28 | >TspE4.C2 CP054219.1:2890121-2890272 Escherichia coli strain EcPF18 chromosome, complete genome 29 | CACTATTCGTAAGGTCATCCCTTCAAGTTCGATAGTCTGAATATCTACCCGCGTTTCTGTCTCACCCGCA 30 | AGGACAGCGCTGGCGATATAGCCCTCTCTGCGCTGCGTAATACTTTGTTGGCGCGATGAGGGGCGACCCG 31 | CAGCGATAAACT 32 | >trpAgpC CP055256.1:2872982-2873201 Escherichia coli strain AH25 chromosome, complete genome 33 | AGTTTTATGCCCAGTGCGAGAAAGTCGGCGTCGATTCGGTGCTGGTTGCCGATGTGCCAATTGAAGAGTC 34 | CGCGCCCTTCCGCCAGGCCGCGTTGCGCCATAATGTCGCACCTATCTTCATCTGCCCGCCAAATGCCGAT 35 | GACGACCTGCTGCGCCAGATAGCCTCTTACGGTCGTGGTTACACCTATTTGCTGTCACGAGCGGGCGTGA 36 | CCGGCGCAGA 37 | >aesI CP041520.1:c2679520-2679205 Escherichia coli strain ESBL92 chromosome 38 | CCTCTACTCACCCAAAAGTCACAGCCCGGCGACACTATTTTATCTGCATGGTGGTGGTTTTATTCTCGGC 39 | AATCTTGATACCCACGATCGGATTATGCGACTGCTGGCAAATTACACCCAATGTACAGTGATTGGTATTG 40 | 
CTTACACTCTTTCGCCGGAAGCACGTTTTCCGCAGGCAATAGAGGAAATTGTGGCTGCCTGTTGCCACTT 41 | CCACCAGCAGGCAGAGGATTATCAAATCAATATGTCACGCATTGGTTTTGCCGGTGATTCTGCAGGCGCA 42 | ATGCTGGCGCTCGCCAGTGCGTTGTGGTTACGTGAT 43 | >aesII CP178317.1:c496736-496612 Escherichia coli strain ECBR1023 chromosome, complete genome 44 | TGCCTGTTGCCACTTCCACCAGCAGGCGGGAGATTATCAAATCAACATGTCCCGCATTGGCTTTGCCGGT 45 | GATTCTGCAGGTGCCATGCTGGCGCTCGCCAGTGCGTTGTGGTTGCGTGATAAAC 46 | >chuIII CP057804.1:c3018837-3018655 Escherichia coli strain RHB14-C20 chromosome, complete genome 47 | GTGTTGAGATTGTCCGTGGGCCTTCGGCATTACTGTATGGCAGTGGCGCGCTGGGAGGGGTTATCTCCTA 48 | CGATACGGTCGATGCAAAAGATTTATTGCAGGAAGGACAAAGCAGTGGTTTTCGTGTCTTTGGCACTGGC 49 | GGCACGGGAGACCATAGCCTGGGGCTGGGCGCCAGTGCTTTTG 50 | >chuIV CP089930.1:c1592281-1591821 Escherichia coli strain E69 chromosome, complete genome 51 | CTGGCGAAAGGAACCTGGAAAATTGATTCTGCCCAGGCTCTGAGCGGGTTAGTGCGTTATTACAATAACG 52 | ACGCGCGTGAACCAAAAAATCCGCAGACCGTTGAGGCTTCTGATAGCAGCAACCCGATGGTTGATCGCTC 53 | AACGATTCAACGTGATGCGCAGCTTGCTTATAAACTCGCTCCGTTGGGCAACGACTGGTTAAATGCCGAT 54 | GCAAAAGTTTACTGGTCGGAAGTCCGTATTAATGCGCAGAACACGGGGAGTTCCGGCGAGTATCGTGAAC 55 | AGACGACAAAAGGTGCCAAACTGGAGAACCGTTCCACTCTGTTTGCCGATAGTTTTGCCTCTCACCTGCT 56 | GACATATGGCGGTGAGTATTATCGTCAGGAACAGCATCCTGGCGGTGCGACGACGGGGTTCCCGCAAGCG 57 | AAAATCGATTTCAGCTCCGGTTGGCTGCAAGATGAGATAAC 58 | >chuV CP173213.1:c4016984-4016385 Escherichia marmotae strain F12YCO47 chromosome, complete genome 59 | ACTGTATGGCAGTGGCGCATTGGGAGGGGTTATCTCCTACGATACGGTCGATGCAAAAGATTTATTGCAG 60 | GAAGGACAAAGCAGCGGTTTTCGTGTCTTTGGCACTGGCGGCACGGGAGATCATAGCCTGGGGTTAGGCG 61 | CGAGTGCTTTTGGGCGAACGGAAAATCTGGATGGTATTGTGGCCTGGTCCAGCCGCGATCGTGGTGATTT 62 | ACGCCAGAGCAATGGCGAAACCGCGCCGAATGATGAGGCCATTAATAACATGTTGGCGAAAGGGACCTGG 63 | CAAATTGATTCTGCCCAGGCTCTGAGTGGATTAGTGCGTTATTACAATAACGACGCGCGCGAACCAAAAA 64 | ATCCGCAGACCGTTGAAGCTTCTGACAGCAGTAATCCGATGGTTGATCGTTCAACGATTCAACGTGATGC 65 | GCAACTTGCTTATAAACTCGCACCAGTGGGCAACGACTGGTTAAATGCCGATGCAAAAGTTTACTGGTCG 66 | 
GAAGTCCGTATTAATGCCCAGAACACGGGGAGTTCCGGCGAATATCGTGAACAGACAACAAAAGGTGCCA 67 | AACTGGAGAACCGCTCCACGCTGTTTGCCGATAGTTTTGC 68 | >trpA CP054236.1:3012950-3013734 Escherichia coli strain EcPF5 chromosome, complete genome 69 | GCTACGAATCTCTGTTTGCCCAGTTGAAGGAGCGCAAAGAAGGCGCATTCGTTCCTTTCGTCACCCTCGG 70 | TGATCCGGGCATTGAGCAGTCGTTGAAAATTATCGATACGCTAATTGAAGCCGGTGCTGACGCGCTGGAG 71 | TTAGGCATCCCCTTCTCCGACCCACTGGCGGATGGCCCGACGATTCAAAACGCCACACTGCGTGCTTTTG 72 | CGGCGGGAGTAACCCCGGCGCAGTGCTTTGAGATGCTGGCACTCATTCGCCAGAAGCACCCGACCATTCC 73 | CATCGGCCTTTTGATGTATGCCAACCTGGTGTTTAACAAAGGCATTGATGAGTTTTATGCCGAGTGCGAG 74 | AAAGTCGGCGTCGATTCGGTGCTGGTTGCCGATGTGCCCGTGGAAGAGTCCGCGCCCTTCCGCCAGGCCG 75 | CGTTGCGTCATAATGTCGCACCTATCTTTATTTGCCCGCCGAATGCCGACGATGATTTGCTGCGCCAGAT 76 | AGCCTCTTACGGTCGTGGTTACACCTATTTGCTGTCGCGAGCGGGCGTGACCGGCGCAGAAAACCGCGCC 77 | GCGTTACCCCTCAATCATCTGGTTGCGAAGCTGAAAGAGTACAACGCTGCGCCTCCATTGCAGGGATTTG 78 | GTATTTCCGCCCCGGATCAGGTAAAAGCCGCGATTGATGCAGGAGCTGCGGGCGCGATTTCTGGTTCGGC 79 | CATCGTTAAAATCATCGAGCAACATATTAATGAGCCAGAGAAAATGCTGGCGGCACTGAAAGCTTTTGTA 80 | CAACCGATGAAAGCG 81 | >trpBA CP054236.1:3012886-3013375 Escherichia coli strain EcPF5 chromosome, complete genome 82 | CGGCGATAAAGACATCTTCACCGTTCACGATATTTTGAAAGCACGAGGGGAAATCTGATGGAACGCTACG 83 | AATCTCTGTTTGCCCAGTTGAAGGAGCGCAAAGAAGGCGCATTCGTTCCTTTCGTCACCCTCGGTGATCC 84 | GGGCATTGAGCAGTCGTTGAAAATTATCGATACGCTAATTGAAGCCGGTGCTGACGCGCTGGAGTTAGGC 85 | ATCCCCTTCTCCGACCCACTGGCGGATGGCCCGACGATTCAAAACGCCACACTGCGTGCTTTTGCGGCGG 86 | GAGTAACCCCGGCGCAGTGCTTTGAGATGCTGGCACTCATTCGCCAGAAGCACCCGACCATTCCCATCGG 87 | CCTTTTGATGTATGCCAACCTGGTGTTTAACAAAGGCATTGATGAGTTTTATGCCGAGTGCGAGAAAGTC 88 | GGCGTCGATTCGGTGCTGGTTGCCGATGTGCCCGTGGAAGAGTCCGCGCCCTTCCGCCAGGCCGCGTTGC 89 | >fdm CP055251.1:1245966-1246230 Escherichia coli strain AH01 chromosome, complete genome 90 | TGGCGGCATTGTTAGCGTACCGGGCGTCTACGCTGGATTTATTCACGGTTTCCTGTTTGGCGACGCCTTT 91 | GATAAAGGGTTGACGTTTAAAATGGGACAGACCCACGTTCACGCATGGCTGGGAGAATTATTACCGTTAA 92 | 
TTGAGAAAGGATTACTGAAACCAGAAGAAATTGTTACCCACTATATGCCGTTTGAAGAGGCCGCCCGGGG 93 | ATATGAGATTTTCGAAAAACGTGAAGAGGAGTGCCGTAAGGTGATTCTGGTACCC 94 | >ybgD CP054224.1:3139367-3139543 Escherichia coli strain EcPF16 chromosome, complete genome 95 | TATGCGGCTGATGAAGGATCCGGTGAAATTCACTTTAAAGGTGAAGTTATTGAAGCACCGTGTGAAATAC 96 | ATCAGGATGATATTGATAAAGAGGTTGAACTCGGTCAGGTGACCACCAGCCACATTAATCAGTCACATCA 97 | CAGCGATGCCGTTGCTGTCGACCTGCGCTTAGTCAAC -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | import os 13 | import sys 14 | from unittest.mock import MagicMock 15 | 16 | # Determine the absolute path to the directory containing the python modules. 17 | _pysrc = os.path.abspath(os.path.join(os.path.abspath(__file__), "..", "..", "..")) 18 | 19 | # Insert it into the path. 20 | sys.path.insert(0, _pysrc) 21 | 22 | # Now we can import local modules. 23 | import pymlst # noqa 24 | 25 | # -- Document __init__ methods by default. -------------------------------- 26 | # This section was added to allow __init__() to be documented automatically. 27 | # You can comment this section out to go back to the default behavior. 
28 | # See: http://stackoverflow.com/questions/5599254 29 | 30 | 31 | def skip(app, what, name, obj, skip, options): 32 | if name == "__init__": 33 | return False 34 | return skip 35 | 36 | 37 | def setup(app): 38 | app.connect("autodoc-skip-member", skip) 39 | 40 | 41 | class Mock(MagicMock): 42 | @classmethod 43 | def __getattr__(cls, name): 44 | return MagicMock() 45 | 46 | 47 | MOCK_MODULES = [ 48 | "numpy", 49 | "networkx", 50 | "scipy", 51 | "sklearn", 52 | "matplotlib", 53 | "matplotlib.pyplot", 54 | "scipy.interpolate", 55 | "scipy.special", 56 | "math", 57 | "pandas", 58 | ] 59 | sys.modules.update((mod_name, Mock()) for mod_name in MOCK_MODULES) 60 | 61 | # -- General configuration ------------------------------------------------ 62 | 63 | # If your documentation needs a minimal Sphinx version, state it here. 64 | # 65 | # needs_sphinx = '1.0' 66 | 67 | # Add any Sphinx extension module names here, as strings. They can be 68 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 69 | # ones. 70 | extensions = ["sphinx.ext.autodoc", "sphinx.ext.viewcode", "sphinx.ext.githubpages"] 71 | 72 | # Add any paths that contain templates here, relative to this directory. 73 | templates_path = ["_templates"] 74 | 75 | # The suffix(es) of source filenames. 76 | # You can specify multiple suffix as a list of string: 77 | # 78 | # source_suffix = ['.rst', '.md'] 79 | source_suffix = ".rst" 80 | 81 | # The master toctree document. 82 | master_doc = "index" 83 | 84 | # General information about the project. 85 | project = "pymlst" 86 | copyright = "2019, Zagarwin; 2023, Bvalot" 87 | author = "Benoit Valot" 88 | 89 | # The version info for the project you're documenting, acts as replacement for 90 | # |version| and |release|, also used in various other places throughout the 91 | # built documents. 92 | # 93 | # The short X.Y version. 94 | # version = pymlst.__version__ 95 | # The full version, including alpha/beta/rc tags. 
96 | # release = pymlst.__release__ 97 | 98 | # The full version, including alpha/beta/rc tags 99 | 100 | release = pymlst.__release__ 101 | 102 | # The language for content autogenerated by Sphinx. Refer to documentation 103 | # for a list of supported languages. 104 | # 105 | # This is also used if you do content translation via gettext catalogs. 106 | # Usually you set "language" from the command line for these cases. 107 | language = None 108 | 109 | # List of patterns, relative to source directory, that match files and 110 | # directories to ignore when looking for source files. 111 | # This pattern also affects html_static_path and html_extra_path. 112 | exclude_patterns = [] 113 | 114 | # The name of the Pygments (syntax highlighting) style to use. 115 | pygments_style = "sphinx" 116 | 117 | # If true, `todo` and `todoList` produce output, else they produce nothing. 118 | todo_include_todos = False 119 | 120 | # -- Options for HTML output ---------------------------------------------- 121 | 122 | # The theme to use for HTML and HTML Help pages. See the documentation for 123 | # a list of builtin themes. 124 | 125 | # fmt: off 126 | # noqa 127 | # fmt: on 128 | html_theme = "sphinx_rtd_theme" 129 | html_logo = "logo.png" 130 | 131 | html_theme_options = { 132 | } 133 | # fmt: off 134 | # noqa 135 | # noqa 136 | # fmt: on 137 | 138 | # Add any paths that contain custom static files (such as style sheets) here, 139 | # relative to this directory. They are copied after the builtin static files, 140 | # so a file named "default.css" will overwrite the builtin "default.css". 141 | html_static_path = ["_static"] 142 | 143 | # Custom sidebar templates, must be a dictionary that maps document names 144 | # to template names. 
145 | # 146 | # This is required for the alabaster theme 147 | # refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars 148 | # html_sidebars = { 149 | # "**": [ 150 | # "about.html", 151 | # "navigation.html", 152 | # "relations.html", # needs 'show_related': True theme option to display 153 | # "searchbox.html", 154 | # "donate.html", 155 | # ] 156 | # } 157 | 158 | # -- Options for HTMLHelp output ------------------------------------------ 159 | 160 | # Output file base name for HTML help builder. 161 | htmlhelp_basename = "pymlstdoc" 162 | 163 | # -- Options for LaTeX output --------------------------------------------- 164 | 165 | latex_elements = { 166 | # The paper size ('letterpaper' or 'a4paper'). 167 | # 168 | # 'papersize': 'letterpaper', 169 | # The font size ('10pt', '11pt' or '12pt'). 170 | # 171 | # 'pointsize': '10pt', 172 | # Additional stuff for the LaTeX preamble. 173 | # 174 | # 'preamble': '', 175 | # Latex figure (float) alignment 176 | # 177 | # 'figure_align': 'htbp', 178 | } 179 | 180 | # Grouping the document tree into LaTeX files. List of tuples 181 | # (source start file, target name, title, 182 | # author, documentclass [howto, manual, or own class]). 183 | latex_documents = [ 184 | ( 185 | master_doc, 186 | "pymlst.tex", 187 | "pyMLST Documentation", 188 | author, 189 | "manual", 190 | ) 191 | ] 192 | 193 | # -- Options for manual page output --------------------------------------- 194 | 195 | # One entry per manual page. List of tuples 196 | # (source start file, name, description, authors, manual section). 197 | man_pages = [ 198 | ( 199 | master_doc, 200 | "pymlst", 201 | "pyMLST Documentation", 202 | [author], 203 | 1, 204 | ) 205 | ] 206 | 207 | # -- Options for Texinfo output ------------------------------------------- 208 | 209 | # Grouping the document tree into Texinfo files. 
class Psl:
    """One alignment record parsed from a tab-separated PSL line.

    The line must contain exactly 21 tab-separated fields. The fields
    read here are (0-based): 8 strand, 9 gene identifier, 10 query size,
    11 query start, 12 query end, 13 target name, 15 target start and
    16 target end.
    """

    def __init__(self, pslline):
        """Split ``pslline`` and keep the fields used by the search.

        :param pslline: One line of a PSL file (a trailing newline is allowed).
        :raises Exception: If the line does not hold 21 fields.
        """
        fields = pslline.rstrip("\n").split("\t")
        if len(fields) != 21:
            raise Exception("Psl line have not 21 elements:\n"+pslline)

        # Raw fields are kept so that gene_id() can read them later.
        self.pslelement = fields
        # Name of the target sequence and hit coordinates on it.
        self.chro = fields[13]
        self.start = int(fields[15])
        self.end = int(fields[16])
        self.strand = fields[8]
        # Aligned interval on the query and total query length.
        self.rstart = int(fields[11])
        self.rend = int(fields[12])
        self.rtotal = int(fields[10])
        # Fraction of the query covered by the hit.
        self.coverage = (float(self.rend) - self.rstart)/self.rtotal

    def gene_id(self):
        """Return the identifier stored in field 9 of the line."""
        return self.pslelement[9]

    def get_sequence(self, seq):
        """Return the part of ``seq`` covered by the hit.

        :param seq: The target sequence; it must support slicing and, for
                    hits that are not on the ``+`` strand,
                    ``reverse_complement()``.
        :return: ``seq[start:end]``, reverse-complemented unless the
                 strand is ``+``.
        """
        hit = seq[self.start:self.end]
        if self.strand != '+':
            return hit.reverse_complement()
        return hit

    def get_aligned_sequence(self, seq, coregene):
        """Return the region of ``seq`` that aligns with ``coregene``.

        When the hit does not reach one end of the query, the target
        window is widened by 36 positions on the matching side (clamped
        to the bounds of ``seq``) before the alignment is computed.

        :param seq: The target sequence (sliceable, with
                    ``reverse_complement()``).
        :param coregene: The reference gene sequence given to
                         ``mafft.get_aligned_area``.
        :return: The aligned slice of the window, or ``None`` when
                 ``mafft.get_aligned_area`` returns no start position.
        """
        margin = 36
        query_misses_start = self.rstart > 0
        query_misses_end = self.rend < self.rtotal

        # On the reverse strand the query ends map to the opposite
        # sides of the target window.
        if self.strand == '+':
            expand_start, expand_end = query_misses_start, query_misses_end
        else:
            expand_start, expand_end = query_misses_end, query_misses_start

        start = max(self.start - margin, 0) if expand_start else self.start
        end = min(self.end + margin, len(seq)) if expand_end else self.end

        target = seq[start:end]
        if self.strand != '+':
            target = target.reverse_complement()

        al_start, al_end = mafft.get_aligned_area(coregene, str(target))
        if al_start is None:
            return None
        return target[al_start:al_end]
@pytest.fixture()
def db_many(db):
    """Fill ``db`` with five reference genes and the genes of strains A-D.

    Rows are inserted in the listed order: the reference genes first,
    then the strain genes. Keep that order when editing, since tests in
    this module assert specific sequence ids.
    """
    reference_genes = (
        ('g1', 'AAA'),
        ('g2', 'ATA'),
        ('g3', 'TTT'),
        ('g4', 'CCC'),
        ('g5', 'GGG'),
    )
    strain_genes = (
        ('A', 'g1', 'AAA'),
        ('A', 'g2', 'ATA'),
        ('A', 'g3', 'ATT'),
        ('A', 'g4', 'CCC'),
        ('B', 'g1', 'AAT'),
        ('B', 'g2', 'ATA'),
        ('B', 'g3', 'TTT'),
        ('B', 'g4', 'CAC'),
        ('B', 'g5', 'GGG'),
        ('C', 'g1', 'AAA'),
        ('C', 'g3', 'TTT'),
        ('C', 'g4', 'CAA'),
        ('D', 'g4', 'CAC'),
    )

    for gene_name, sequence in reference_genes:
        db.add_core_genome(gene_name, sequence)
    for strain_name, gene_name, sequence in strain_genes:
        db.add_genome(gene_name, strain_name, sequence)
    return db
def test_get_core_genome(db):
    """``core_genome`` lists the reference genes only, not strain genes."""
    expected_core = {
        'g1': 'AAA',
        'g2': 'TTT',
    }
    for gene_name, sequence in expected_core.items():
        db.add_core_genome(gene_name, sequence)
    # A gene added for a strain must not show up in the core genome.
    db.add_genome('g3', 'A', 'CCC')

    assert db.core_genome == expected_core
def test_get_gene_sequences_many_strains(db_many):
    """``get_gene_sequences`` groups the strains sharing a sequence."""
    expected_by_gene = (
        ('g1', [[1, ['A', 'C'], 'AAA'],
                [7, ['B'], 'AAT']]),
        ('g2', [[2, ['A', 'B'], 'ATA']]),
    )
    for gene_name, expected in expected_by_gene:
        assert db_many.get_gene_sequences(gene_name) == expected
def test_get_strains_distances(db_many):
    """Pairwise strain distances over the five genes match the fixture."""
    strain_names = ('A', 'B', 'C', 'D')
    # One row per strain, columns in the same A-D order.
    matrix = (
        (0, 3, 2, 1),
        (3, 0, 2, 0),
        (2, 2, 0, 1),
        (1, 0, 1, 0),
    )
    expected = {
        row_name: dict(zip(strain_names, row))
        for row_name, row in zip(strain_names, matrix)
    }

    gene_names = ['g1', 'g2', 'g3', 'g4', 'g5']
    assert db_many.get_strains_distances(gene_names) == expected
def process_results(choices, query, prompt):
    """Reduce a list of search results to a single choice.

    :param choices: The candidate results.
    :param query: The search text, used in error messages only.
    :param prompt: Whether the user may be asked to pick one result
                   when several were found.
    :return: The only element when there is exactly one; otherwise the
             value returned by ``display_prompt``.
    :raises exceptions.PyMLSTWebError: If there is no result, or several
            results while prompting is disabled.
    """
    found = len(choices)

    if not found:
        raise exceptions.PyMLSTWebError('No result found for \'{}\'\n'.format(query))

    if found == 1:
        only_choice = choices[0]
        logging.info("One element found : {}".format(only_choice))
        return only_choice

    # Several candidates: without a prompt there is no way to decide.
    if not prompt:
        raise exceptions.PyMLSTWebError('More than 1 result found for \'{}\'\n'.format(query))

    logging.info("{} elements found, please choose one:".format(str(len(choices))))
    return display_prompt('({}) Results found'.format(found),
                          choices)
def retrieve_mlst(query, prompt_enabled, mlst='', repository='pubmlst'):
    """Find the URL of an MLST scheme, asking the user when allowed.

    :param query: A sub-string to filter species names.
    :param prompt_enabled: Whether or not to prompt user for actions.
                           If disabled and many choices are possible,
                           an exception is raised.
    :param mlst: A sub-string to filter schemes names. For the PASTEUR
                 repository an empty value is replaced by ``'mlst'``.
    :param repository: Name of the online repository, ``PUBMLST`` or
                       ``PASTEUR`` (case-insensitive).
    :return: A scheme URL, or ``None`` when no choice was made.
    :raises exceptions.PyMLSTWebError: If ``repository`` is neither
            PUBMLST nor PASTEUR.
    """
    repo_name = repository.upper()
    if repo_name not in ("PUBMLST", "PASTEUR"):
        raise exceptions.PyMLSTWebError("Only PUBMLST or PASTEUR repository are defined")

    if repo_name == "PUBMLST":
        species = get_mlst_species(query, PUBMLST_URL)
    else:
        species = get_mlst_species(query, PASTEUR_URL)
        if mlst == '':
            mlst = 'mlst'

    species_choice = process_results(list(species), query, prompt_enabled)
    if species_choice is None:
        return None

    schemes = get_mlst_schemes(species[species_choice], mlst)
    scheme_choice = process_results(list(schemes), mlst, prompt_enabled)
    if scheme_choice is None:
        return None

    return schemes[scheme_choice]
174 | :return: A Dictionary with species name in Key and download URL in Value. 175 | """ 176 | page = request(CGMLST_URL) 177 | 178 | soup = BeautifulSoup(page.content, 'html.parser') 179 | 180 | table = soup.find('tbody') 181 | if table is None: 182 | raise exceptions.StructureError() 183 | 184 | lines = table.find_all('a') 185 | 186 | species = {} 187 | query_low = query.lower() 188 | 189 | for line in lines: 190 | text = line.get_text() 191 | if 'cgMLST' not in text: 192 | continue 193 | name = text.replace('cgMLST', '').strip() 194 | if query_low in name.lower(): 195 | url = line.get('href') 196 | if url is None: 197 | raise exceptions.StructureError() 198 | species[name] = url 199 | 200 | return species 201 | 202 | 203 | def retrieve_cgmlst(query, prompt_enabled): 204 | """Retrieves cgMLST data, prompts user if necessary and if possible. 205 | 206 | :param query: A sub-string to filter species names. 207 | :param prompt_enabled: Whether or not to prompt user for actions. 208 | If disabled and many choices are possible, 209 | will raise an Exception. 210 | :return: A species download URL. 
def get_cgmlst_info(url):
    """Retrieve the species name and version of a cgMLST scheme.

    The page at ``url`` is downloaded and the text of each child of its
    first ``tbody`` element is inspected. Entries starting with
    ``Genus``, ``Species`` and ``Last Change`` provide the values; a
    value that is not found is left as an empty string.

    :param url: The url information page
    :return: A tuple ``(name, version)`` where ``name`` is the genus and
             the species joined by one space.
    :raises exceptions.StructureError: If the page has no ``tbody``.
    """
    page = request(url)

    soup = BeautifulSoup(page.content, 'html.parser')

    table = soup.find('tbody')
    if table is None:
        raise exceptions.StructureError()

    values = {"Genus": "", "Species": "", "Last Change": ""}
    for entry in table.contents:
        line = entry.get_text()
        for label in values:
            if line.startswith(label):
                # Remove the label as an exact prefix. str.lstrip(label)
                # would strip any leading characters found in the label,
                # truncating values such as "coli" after "Species".
                values[label] = line[len(label):]

    return (values["Genus"] + " " + values["Species"], values["Last Change"])
def clean_csv(csv_content, locus_nb):
    """Drop surplus trailing columns from the header of a profiles table.

    The header (first line, tab-separated) is expected to hold one
    identifier column followed by ``locus_nb`` locus columns. Any
    additional columns at the end of the header are removed; all other
    lines are returned untouched.

    :param csv_content: The whole table as text, lines separated by ``\\n``.
    :param locus_nb: The number of loci of the scheme.
    :return: The table with its header possibly shortened.
    """
    header, newline, body = csv_content.partition('\n')
    columns = header.split('\t')

    surplus = len(columns) - (locus_nb + 1)
    if surplus > 0:
        header = '\t'.join(columns[:-surplus])

    return header + newline + body
def read_gene_list(base, gene_file):
    """Return the genes to analyse, optionally restricted by a file.

    :param base: Database object providing ``get_core_genes()``.
    :param gene_file: Source of gene names read through
                      ``utils.strip_file``, or ``None`` to use every
                      core gene.
    :return: The core genes when ``gene_file`` is ``None``; otherwise the
             names read from ``gene_file`` that are core genes, in the
             order they were read. Unknown names are reported at debug
             level and left out.
    """
    core = base.get_core_genes()
    if gene_file is None:
        return core

    # Build the lookup set once: testing membership against the list for
    # every requested gene would cost O(len(core)) per name.
    known_genes = set(core)
    selected = []
    for gene in utils.strip_file(gene_file):
        if gene in known_genes:
            selected.append(gene)
        else:
            logging.debug("Gene %s not found in the database", gene)
    return selected
coregene sequences into fasta file.""" 30 | 31 | def __init__(self, file=None, reference=False): 32 | """ 33 | :param file: Path of the file containing the coregens to extract 34 | """ 35 | self.list_file = file 36 | self.reference = reference 37 | 38 | def extract(self, base, output): 39 | coregene = read_gene_list(base, self.list_file) 40 | logging.info("Number of gene to analyse : %s", len(coregene)) 41 | for gene in coregene: 42 | if self.reference: 43 | seq = base.get_gene_sequence_reference(gene) 44 | output.write(">" + gene + "|reference" + "\n") 45 | output.write(seq + "\n") 46 | else: 47 | seqs = base.get_gene_sequences(gene) 48 | for seq in seqs: 49 | output.write(">" + gene + "|" + str(seq[0]) + " " 50 | + ";".join(seq[1]) + "\n") 51 | output.write(seq[2] + "\n") 52 | 53 | 54 | class MsaExtractor(Extractor): 55 | """ Compute Multiple Sequence Alignment (MSA) and extracts the aligned sequences. """ 56 | 57 | def __init__(self, file=None, realign=False): 58 | """ 59 | :param file: Path of the file containing the coregens to extract 60 | :param realign: Realign genes with same length 61 | """ 62 | 63 | self.list_file = file 64 | self.realign = realign 65 | 66 | def extract(self, base, output): 67 | coregene = read_gene_list(base, self.list_file) 68 | if len(coregene) == 0: 69 | raise exceptions.PyMLSTError('No valid genes selected, verify your genes list') 70 | strains = base.get_all_strains() 71 | duplicated = base.get_duplicated_genes() 72 | 73 | sequences = {s: [] for s in strains} 74 | for index, gene in enumerate(coregene): 75 | if gene in duplicated: 76 | logging.info("%s/%s | %s %s", index + 1, len(coregene), gene, "No: Repeat gene") 77 | continue 78 | seqs = base.get_gene_sequences(gene) 79 | size = set() 80 | for seq in seqs: 81 | size.add(len(seq[2])) 82 | if len(size) == 1 and self.realign is False: 83 | self.add_sequence_strain(seqs, strains, sequences) 84 | logging.info("%s/%s | %s %s", index + 1, len(coregene), gene, "Direct") 85 | else: 86 | 
genes = {str(s[0]): s[2] for s in seqs} 87 | corrseqs = mafft.align(genes) 88 | for seq in seqs: 89 | seq[2] = corrseqs.get(str(seq[0])) 90 | self.add_sequence_strain(seqs, strains, sequences) 91 | logging.info("%s/%s | %s %s", index + 1, len(coregene), gene, "Align") 92 | 93 | # output align result 94 | for strain in strains: 95 | output.write('>' + strain + "\n") 96 | output.write("\n".join(map(str, sequences.get(strain))) + "\n") 97 | 98 | def add_sequence_strain(self, seqs, strains, sequences): 99 | """Add a sequence to multi-align, take the first gene in case of repetition""" 100 | size = 0 101 | if len(seqs) > 0: 102 | size = len(seqs[0][2]) 103 | for strain in strains: 104 | seq = [i[2] for i in seqs if strain in i[1]] 105 | if len(seq) == 0: 106 | sequences.get(strain).append('-' * size) 107 | elif len(seq) == 1: 108 | sequences.get(strain).append(seq[0]) 109 | else: 110 | raise exceptions.PyMLSTError( 111 | 'Repeated genes must be excluded in order to export alignment') 112 | 113 | 114 | 115 | 116 | class TableExtractor(Extractor): 117 | """ Extraction of cgMLST distance matrix, MLST profiles, Genes and Strains list from a wgMLST database. 
""" 118 | def __init__(self, 119 | mincover=0, 120 | keep=False, 121 | duplicate=False, 122 | inverse=False): 123 | self.mincover = mincover 124 | self.keep = keep 125 | self.duplicate = duplicate 126 | self.inverse = inverse 127 | 128 | @abc.abstractmethod 129 | def extract(self, base, output): 130 | pass 131 | 132 | def get_valid_shema(self, base): 133 | # read samples mlst 134 | strains = base.get_all_strains() 135 | # Minimun number of strain 136 | if self.mincover < 0 or self.mincover > len(strains): 137 | raise exceptions.PyMLSTError( 138 | 'Mincover must be between 0 and number of strains {}'.format(len(strains))) 139 | 140 | # allgene 141 | allgene = base.get_core_genes() 142 | # duplicate gene 143 | dupli = base.get_duplicated_genes() 144 | # cover without duplication 145 | count_souches = base.count_souches_per_gene() 146 | # Count distinct gene 147 | diff = base.count_sequences_per_gene() 148 | 149 | # filter coregene that is not sufficient mincover or keep only different or return inverse 150 | valid_shema = [] 151 | # Test different case for validation 152 | for gene in allgene: 153 | valid = [] 154 | if self.keep is True: 155 | if diff.get(gene, 0) > 1: 156 | valid.append(True) 157 | else: 158 | valid.append(False) 159 | else: 160 | valid.append(True) 161 | if count_souches.get(gene, 0) >= self.mincover: 162 | valid.append(True) 163 | else: 164 | valid.append(False) 165 | if not self.duplicate: 166 | if gene in dupli: 167 | valid.append(False) 168 | else: 169 | valid.append(True) 170 | else: 171 | valid.append(True) 172 | if self.inverse is False: 173 | if sum(valid) == 3: 174 | valid_shema.append(gene) 175 | else: 176 | if sum(valid) < 3: 177 | valid_shema.append(gene) 178 | 179 | # report 180 | logging.info("Number of coregene used : %s/%s", len(valid_shema), len(allgene)) 181 | return(valid_shema) 182 | 183 | class TableExtractorCommand(click.core.Command): 184 | """ Options supported by :class:`~pymlst.wg.extractors.TableExtractor`. 
""" 185 | def __init__(self, *args, **kwargs): 186 | super().__init__(*args, **kwargs) 187 | self.params.insert(0, click.core.Option(('--mincover', '-m'), 188 | type=click.INT, 189 | help='Minimum number of strains found to retain a gene (default:0)')) 190 | self.params.insert(1, click.core.Option(('--keep', '-k'), 191 | is_flag=True, 192 | help='Keeps only gene with different alleles (omit missing).')) 193 | self.params.insert(2, click.core.Option(('--duplicate', '-d'), 194 | is_flag=True, 195 | help='Keeps duplicate genes (default remove).')) 196 | self.params.insert(3, click.core.Option(('--inverse', '-V'), 197 | is_flag=True, 198 | help='Keeps only gene that do not ' \ 199 | 'match the filter of mincover or keep options.')) 200 | 201 | class GeneExtractor(TableExtractor): 202 | """ Extracts a list of genes from a wgMLST database. """ 203 | def __init__(self,**kwargs): 204 | super().__init__(**kwargs) 205 | 206 | def extract(self, base, output): 207 | valid_schema = super().get_valid_shema(base) 208 | output.write("\n".join(sorted(valid_schema)) + "\n") 209 | 210 | class StatsExtractor(Extractor): 211 | """ Extracts stats, number of strains, coregenes and sequences from a wgMLST database. """ 212 | def extract(self, base, output): 213 | infos = base.get_infos() 214 | for c,v in zip(['name', 'source', 'species', 'version'], infos): 215 | if v is None: 216 | v = "" 217 | output.write(c + "\t" + v + "\n") 218 | output.write("Coregenes\t" + str(len(base.get_core_genes())) + "\n") 219 | output.write("Strains\t" + str(len(base.get_all_strains())) + "\n") 220 | output.write("Sequences\t" + str(base.count_sequences()) + "\n") 221 | 222 | class StrainExtractor(TableExtractor): 223 | """ Extracts a list of strains from a wgMLST database. 
""" 224 | def __init__(self, count=False, **kwargs): 225 | super().__init__(**kwargs) 226 | self.count = count 227 | 228 | def extract(self, base, output): 229 | if self.count is False: 230 | output.write("\n".join(base.get_all_strains()) + "\n") 231 | else: 232 | tmp = base.count_genes_per_souche(super().get_valid_shema(base)) 233 | for strain in base.get_all_strains(): 234 | output.write(strain + "\t" + str(tmp.get(strain)) + "\n") 235 | 236 | class DistanceExtractor(TableExtractor): 237 | """ Extracts a distance matrix from a wgMLST database. """ 238 | def extract(self, base, output): 239 | if self.duplicate: 240 | logging.warning("Calculate distance between strains " + 241 | "using duplicate genes could reported bad result.") 242 | strains = base.get_all_strains() 243 | output.write(str(len(strains)) + "\n") 244 | distance = base.get_strains_distances(super().get_valid_shema(base)) 245 | for strain in strains: 246 | output.write(strain + "\t") 247 | dist = [str(distance.get(strain, {}).get(s2, 0)) for s2 in strains] 248 | output.write("\t".join(dist) + "\n") 249 | 250 | class MlstExtractor(TableExtractor): 251 | """ Extracts an MLST table from a wgMLST database. 
""" 252 | def __init__(self, form="default", **kwargs): 253 | super().__init__(**kwargs) 254 | self.form = form 255 | 256 | def extract(self, base, output): 257 | valid_shema = super().get_valid_shema(base) 258 | strains = base.get_all_strains() 259 | mlst = base.get_mlst(valid_shema) 260 | table = pd.DataFrame(columns=["#GeneId"] + strains) 261 | rows = [] 262 | for gene in valid_shema: 263 | row = {"#GeneId": gene} 264 | mlstg = mlst.get(gene, {}) 265 | for strain in strains: 266 | row[strain] = mlstg.get(strain, None) 267 | rows.append(row) 268 | table = pd.concat([table, pd.DataFrame.from_dict(rows)], ignore_index=True) 269 | table = table.set_index('#GeneId') 270 | 271 | if self.form == 'grapetree': 272 | if self.duplicate: 273 | logging.warnings("Export grapetree table " + 274 | "using duplicate genes is not recommended.") 275 | table = table.fillna(-1) 276 | table = table.transpose() 277 | else: 278 | table = table.fillna("") 279 | 280 | table.to_csv(output, sep='\t') 281 | -------------------------------------------------------------------------------- /rcfile.rc: -------------------------------------------------------------------------------- 1 | [MASTER] 2 | 3 | # A comma-separated list of package or module names from where C extensions may 4 | # be loaded. Extensions are loading into the active Python interpreter and may 5 | # run arbitrary code. 6 | extension-pkg-allow-list= 7 | 8 | # A comma-separated list of package or module names from where C extensions may 9 | # be loaded. Extensions are loading into the active Python interpreter and may 10 | # run arbitrary code. (This is an alternative name to extension-pkg-allow-list 11 | # for backward compatibility.) 12 | extension-pkg-whitelist= 13 | 14 | # Specify a score threshold to be exceeded before program exits with error. 15 | fail-under=10.0 16 | 17 | # Files or directories to be skipped. They should be base names, not paths. 
18 | ignore=CVS 19 | 20 | # Files or directories matching the regex patterns are skipped. The regex 21 | # matches against base names, not paths. 22 | ignore-patterns= 23 | 24 | # Python code to execute, usually for sys.path manipulation such as 25 | # pygtk.require(). 26 | #init-hook= 27 | 28 | # Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the 29 | # number of processors available to use. 30 | jobs=1 31 | 32 | # Control the amount of potential inferred values when inferring a single 33 | # object. This can help the performance when dealing with large functions or 34 | # complex, nested conditions. 35 | limit-inference-results=100 36 | 37 | # List of plugins (as comma separated values of python module names) to load, 38 | # usually to register additional checkers. 39 | load-plugins= 40 | 41 | # Pickle collected data for later comparisons. 42 | persistent=yes 43 | 44 | # When enabled, pylint would attempt to guess common misconfiguration and emit 45 | # user-friendly hints instead of false-positive error messages. 46 | suggestion-mode=yes 47 | 48 | # Allow loading of arbitrary C extensions. Extensions are imported into the 49 | # active Python interpreter and may run arbitrary code. 50 | unsafe-load-any-extension=no 51 | 52 | 53 | [MESSAGES CONTROL] 54 | 55 | # Only show warnings with the listed confidence levels. Leave empty to show 56 | # all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED. 57 | confidence= 58 | 59 | # Disable the message, report, category or checker with the given id(s). You 60 | # can either give multiple identifiers separated by comma (,) or put this 61 | # option multiple times (only on the command line, not in the configuration 62 | # file where it should appear only once). You can also use "--disable=all" to 63 | # disable everything first and then reenable specific checks. For example, if 64 | # you want to run only the similarities checker, you can use "--disable=all 65 | # --enable=similarities". 
If you want to run only the classes checker, but have 66 | # no Warning level messages displayed, use "--disable=all --enable=classes 67 | # --disable=W". 68 | disable=not-context-manager 69 | 70 | # Enable the message, report, category or checker with the given id(s). You can 71 | # either give multiple identifier separated by comma (,) or put this option 72 | # multiple time (only on the command line, not in the configuration file where 73 | # it should appear only once). See also the "--disable" option for examples. 74 | enable= 75 | 76 | 77 | [REPORTS] 78 | 79 | # Python expression which should return a score less than or equal to 10. You 80 | # have access to the variables 'error', 'warning', 'refactor', and 'convention' 81 | # which contain the number of messages in each category, as well as 'statement' 82 | # which is the total number of statements analyzed. This score is used by the 83 | # global evaluation report (RP0004). 84 | evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) 85 | 86 | # Template used to display messages. This is a python new-style format string 87 | # used to format the message information. See doc for all details. 88 | #msg-template= 89 | 90 | # Set the output format. Available formats are text, parseable, colorized, json 91 | # and msvs (visual studio). You can also give a reporter class, e.g. 92 | # mypackage.mymodule.MyReporterClass. 93 | output-format=text 94 | 95 | # Tells whether to display a full report or only the messages. 96 | reports=no 97 | 98 | # Activate the evaluation score. 99 | score=yes 100 | 101 | 102 | [REFACTORING] 103 | 104 | # Maximum number of nested blocks for function / method body 105 | max-nested-blocks=5 106 | 107 | # Complete name of functions that never returns. When checking for 108 | # inconsistent-return-statements if a never returning function is called then 109 | # it will be considered as an explicit return statement and no message will be 110 | # printed. 
111 | never-returning-functions=sys.exit 112 | 113 | 114 | [FORMAT] 115 | 116 | # Expected format of line ending, e.g. empty (any line ending), LF or CRLF. 117 | expected-line-ending-format= 118 | 119 | # Regexp for a line that is allowed to be longer than the limit. 120 | ignore-long-lines=^\s*(# )??$ 121 | 122 | # Number of spaces of indent required inside a hanging or continued line. 123 | indent-after-paren=4 124 | 125 | # String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 126 | # tab). 127 | indent-string=' ' 128 | 129 | # Maximum number of characters on a single line. 130 | max-line-length=100 131 | 132 | # Maximum number of lines in a module. 133 | max-module-lines=1000 134 | 135 | # Allow the body of a class to be on the same line as the declaration if body 136 | # contains single statement. 137 | single-line-class-stmt=no 138 | 139 | # Allow the body of an if to be on the same line as the test if there is no 140 | # else. 141 | single-line-if-stmt=no 142 | 143 | 144 | [SIMILARITIES] 145 | 146 | # Ignore comments when computing similarities. 147 | ignore-comments=yes 148 | 149 | # Ignore docstrings when computing similarities. 150 | ignore-docstrings=yes 151 | 152 | # Ignore imports when computing similarities. 153 | ignore-imports=no 154 | 155 | # Minimum lines number of a similarity. 156 | min-similarity-lines=4 157 | 158 | 159 | [LOGGING] 160 | 161 | # The type of string formatting that logging methods do. `old` means using % 162 | # formatting, `new` is for `{}` formatting. 163 | logging-format-style=old 164 | 165 | # Logging modules to check that the string format arguments are in logging 166 | # function parameter format. 167 | logging-modules=logging 168 | 169 | 170 | [BASIC] 171 | 172 | # Naming style matching correct argument names. 173 | argument-naming-style=snake_case 174 | 175 | # Regular expression matching correct argument names. Overrides argument- 176 | # naming-style. 
177 | #argument-rgx= 178 | 179 | # Naming style matching correct attribute names. 180 | attr-naming-style=snake_case 181 | 182 | # Regular expression matching correct attribute names. Overrides attr-naming- 183 | # style. 184 | #attr-rgx= 185 | 186 | # Bad variable names which should always be refused, separated by a comma. 187 | bad-names=foo, 188 | bar, 189 | baz, 190 | toto, 191 | tutu, 192 | tata 193 | 194 | # Bad variable names regexes, separated by a comma. If names match any regex, 195 | # they will always be refused 196 | bad-names-rgxs= 197 | 198 | # Naming style matching correct class attribute names. 199 | class-attribute-naming-style=any 200 | 201 | # Regular expression matching correct class attribute names. Overrides class- 202 | # attribute-naming-style. 203 | #class-attribute-rgx= 204 | 205 | # Naming style matching correct class constant names. 206 | class-const-naming-style=UPPER_CASE 207 | 208 | # Regular expression matching correct class constant names. Overrides class- 209 | # const-naming-style. 210 | #class-const-rgx= 211 | 212 | # Naming style matching correct class names. 213 | class-naming-style=PascalCase 214 | 215 | # Regular expression matching correct class names. Overrides class-naming- 216 | # style. 217 | #class-rgx= 218 | 219 | # Naming style matching correct constant names. 220 | const-naming-style=UPPER_CASE 221 | 222 | # Regular expression matching correct constant names. Overrides const-naming- 223 | # style. 224 | #const-rgx= 225 | 226 | # Minimum line length for functions/classes that require docstrings, shorter 227 | # ones are exempt. 228 | docstring-min-length=-1 229 | 230 | # Naming style matching correct function names. 231 | function-naming-style=snake_case 232 | 233 | # Regular expression matching correct function names. Overrides function- 234 | # naming-style. 235 | #function-rgx= 236 | 237 | # Good variable names which should always be accepted, separated by a comma. 
238 | good-names=i, 239 | j, 240 | k, 241 | ex, 242 | Run, 243 | _ 244 | 245 | # Good variable names regexes, separated by a comma. If names match any regex, 246 | # they will always be accepted 247 | good-names-rgxs= 248 | 249 | # Include a hint for the correct naming format with invalid-name. 250 | include-naming-hint=no 251 | 252 | # Naming style matching correct inline iteration names. 253 | inlinevar-naming-style=any 254 | 255 | # Regular expression matching correct inline iteration names. Overrides 256 | # inlinevar-naming-style. 257 | #inlinevar-rgx= 258 | 259 | # Naming style matching correct method names. 260 | method-naming-style=snake_case 261 | 262 | # Regular expression matching correct method names. Overrides method-naming- 263 | # style. 264 | #method-rgx= 265 | 266 | # Naming style matching correct module names. 267 | module-naming-style=snake_case 268 | 269 | # Regular expression matching correct module names. Overrides module-naming- 270 | # style. 271 | #module-rgx= 272 | 273 | # Colon-delimited sets of names that determine each other's naming style when 274 | # the name regexes allow several styles. 275 | name-group= 276 | 277 | # Regular expression which should only match function or class names that do 278 | # not require a docstring. 279 | no-docstring-rgx=^_ 280 | 281 | # List of decorators that produce properties, such as abc.abstractproperty. Add 282 | # to this list to register other decorators that produce valid properties. 283 | # These decorators are taken in consideration only for invalid-name. 284 | property-classes=abc.abstractproperty 285 | 286 | # Naming style matching correct variable names. 287 | variable-naming-style=snake_case 288 | 289 | # Regular expression matching correct variable names. Overrides variable- 290 | # naming-style. 
291 | #variable-rgx= 292 | 293 | 294 | [STRING] 295 | 296 | # This flag controls whether inconsistent-quotes generates a warning when the 297 | # character used as a quote delimiter is used inconsistently within a module. 298 | check-quote-consistency=no 299 | 300 | # This flag controls whether the implicit-str-concat should generate a warning 301 | # on implicit string concatenation in sequences defined over several lines. 302 | check-str-concat-over-line-jumps=no 303 | 304 | 305 | [TYPECHECK] 306 | 307 | # List of decorators that produce context managers, such as 308 | # contextlib.contextmanager. Add to this list to register other decorators that 309 | # produce valid context managers. 310 | contextmanager-decorators=contextlib.contextmanager 311 | 312 | # List of members which are set dynamically and missed by pylint inference 313 | # system, and so shouldn't trigger E1101 when accessed. Python regular 314 | # expressions are accepted. 315 | generated-members= 316 | 317 | # Tells whether missing members accessed in mixin class should be ignored. A 318 | # mixin class is detected if its name ends with "mixin" (case insensitive). 319 | ignore-mixin-members=yes 320 | 321 | # Tells whether to warn about missing members when the owner of the attribute 322 | # is inferred to be None. 323 | ignore-none=yes 324 | 325 | # This flag controls whether pylint should warn about no-member and similar 326 | # checks whenever an opaque object is returned when inferring. The inference 327 | # can return multiple potential results while evaluating a Python object, but 328 | # some branches might not be evaluated, which results in partial inference. In 329 | # that case, it might be useful to still emit no-member and other checks for 330 | # the rest of the inferred objects. 331 | ignore-on-opaque-inference=yes 332 | 333 | # List of class names for which member attributes should not be checked (useful 334 | # for classes with dynamically set attributes). 
This supports the use of 335 | # qualified names. 336 | ignored-classes=optparse.Values,thread._local,_thread._local 337 | 338 | # List of module names for which member attributes should not be checked 339 | # (useful for modules/projects where namespaces are manipulated during runtime 340 | # and thus existing member attributes cannot be deduced by static analysis). It 341 | # supports qualified module names, as well as Unix pattern matching. 342 | ignored-modules= 343 | 344 | # Show a hint with possible names when a member name was not found. The aspect 345 | # of finding the hint is based on edit distance. 346 | missing-member-hint=yes 347 | 348 | # The minimum edit distance a name should have in order to be considered a 349 | # similar match for a missing member name. 350 | missing-member-hint-distance=1 351 | 352 | # The total number of similar names that should be taken in consideration when 353 | # showing a hint for a missing member. 354 | missing-member-max-choices=1 355 | 356 | # List of decorators that change the signature of a decorated function. 357 | signature-mutators= 358 | 359 | 360 | [MISCELLANEOUS] 361 | 362 | # List of note tags to take in consideration, separated by a comma. 363 | notes=FIXME, 364 | XXX, 365 | TODO 366 | 367 | # Regular expression of note tags to take in consideration. 368 | #notes-rgx= 369 | 370 | 371 | [VARIABLES] 372 | 373 | # List of additional names supposed to be defined in builtins. Remember that 374 | # you should avoid defining new builtins when possible. 375 | additional-builtins= 376 | 377 | # Tells whether unused global variables should be treated as a violation. 378 | allow-global-unused-variables=yes 379 | 380 | # List of names allowed to shadow builtins 381 | allowed-redefined-builtins= 382 | 383 | # List of strings which can identify a callback function by name. A callback 384 | # name must start or end with one of those strings. 
385 | callbacks=cb_, 386 | _cb 387 | 388 | # A regular expression matching the name of dummy variables (i.e. expected to 389 | # not be used). 390 | dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_ 391 | 392 | # Argument names that match this expression will be ignored. Default to name 393 | # with leading underscore. 394 | ignored-argument-names=_.*|^ignored_|^unused_ 395 | 396 | # Tells whether we should check for unused import in __init__ files. 397 | init-import=no 398 | 399 | # List of qualified module names which can have objects that can redefine 400 | # builtins. 401 | redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io 402 | 403 | 404 | [SPELLING] 405 | 406 | # Limits count of emitted suggestions for spelling mistakes. 407 | max-spelling-suggestions=4 408 | 409 | # Spelling dictionary name. Available dictionaries: none. To make it work, 410 | # install the 'python-enchant' package. 411 | spelling-dict= 412 | 413 | # List of comma separated words that should not be checked. 414 | spelling-ignore-words= 415 | 416 | # A path to a file that contains the private dictionary; one word per line. 417 | spelling-private-dict-file= 418 | 419 | # Tells whether to store unknown words to the private dictionary (see the 420 | # --spelling-private-dict-file option) instead of raising a message. 421 | spelling-store-unknown-words=no 422 | 423 | 424 | [CLASSES] 425 | 426 | # Warn about protected attribute access inside special methods 427 | check-protected-access-in-special-methods=no 428 | 429 | # List of method names used to declare (i.e. assign) instance attributes. 430 | defining-attr-methods=__init__, 431 | __new__, 432 | setUp, 433 | __post_init__ 434 | 435 | # List of member names, which should be excluded from the protected access 436 | # warning. 
437 | exclude-protected=_asdict, 438 | _fields, 439 | _replace, 440 | _source, 441 | _make 442 | 443 | # List of valid names for the first argument in a class method. 444 | valid-classmethod-first-arg=cls 445 | 446 | # List of valid names for the first argument in a metaclass class method. 447 | valid-metaclass-classmethod-first-arg=cls 448 | 449 | 450 | [IMPORTS] 451 | 452 | # List of modules that can be imported at any level, not just the top level 453 | # one. 454 | allow-any-import-level= 455 | 456 | # Allow wildcard imports from modules that define __all__. 457 | allow-wildcard-with-all=no 458 | 459 | # Analyse import fallback blocks. This can be used to support both Python 2 and 460 | # 3 compatible code, which means that the block might have code that exists 461 | # only in one or another interpreter, leading to false positives when analysed. 462 | analyse-fallback-blocks=no 463 | 464 | # Deprecated modules which should not be used, separated by a comma. 465 | deprecated-modules=optparse,tkinter.tix 466 | 467 | # Output a graph (.gv or any supported image format) of external dependencies 468 | # to the given file (report RP0402 must not be disabled). 469 | ext-import-graph= 470 | 471 | # Output a graph (.gv or any supported image format) of all (i.e. internal and 472 | # external) dependencies to the given file (report RP0402 must not be 473 | # disabled). 474 | import-graph= 475 | 476 | # Output a graph (.gv or any supported image format) of internal dependencies 477 | # to the given file (report RP0402 must not be disabled). 478 | int-import-graph= 479 | 480 | # Force import order to recognize a module as part of the standard 481 | # compatibility libraries. 482 | known-standard-library= 483 | 484 | # Force import order to recognize a module as part of a third party library. 485 | known-third-party=enchant 486 | 487 | # Couples of modules and preferred modules, separated by a comma. 
488 | preferred-modules= 489 | 490 | 491 | [DESIGN] 492 | 493 | # Maximum number of arguments for function / method. 494 | max-args=5 495 | 496 | # Maximum number of attributes for a class (see R0902). 497 | max-attributes=7 498 | 499 | # Maximum number of boolean expressions in an if statement (see R0916). 500 | max-bool-expr=5 501 | 502 | # Maximum number of branch for function / method body. 503 | max-branches=12 504 | 505 | # Maximum number of locals for function / method body. 506 | max-locals=15 507 | 508 | # Maximum number of parents for a class (see R0901). 509 | max-parents=7 510 | 511 | # Maximum number of public methods for a class (see R0904). 512 | max-public-methods=20 513 | 514 | # Maximum number of return / yield for function / method body. 515 | max-returns=6 516 | 517 | # Maximum number of statements in function / method body. 518 | max-statements=50 519 | 520 | # Minimum number of public methods for a class (see R0903). 521 | min-public-methods=2 522 | 523 | 524 | [EXCEPTIONS] 525 | 526 | # Exceptions that will emit a warning when being caught. Defaults to 527 | # "BaseException, Exception". 528 | overgeneral-exceptions=BaseException, 529 | Exception 530 | --------------------------------------------------------------------------------