├── pymlst
    ├── cla
    │   ├── __init__.py
    │   ├── commands
    │   │   ├── __init__.py
    │   │   ├── info.py
    │   │   ├── remove.py
    │   │   ├── search.py
    │   │   ├── create.py
    │   │   ├── search2.py
    │   │   └── import.py
    │   └── model.py
    ├── common
    │   ├── __init__.py
    │   ├── commands
    │   │   ├── __init__.py
    │   │   └── configure.py
    │   ├── flag.py
    │   ├── exceptions.py
    │   ├── mafft.py
    │   ├── blat.py
    │   ├── utils.py
    │   ├── kma.py
    │   ├── psl.py
    │   └── web.py
    ├── pytyper
    │   ├── __init__.py
    │   ├── commands
    │   │   ├── __init__.py
    │   │   └── search.py
    │   ├── method.py
    │   ├── url.py
    │   └── model.py
    ├── wg
    │   ├── commands
    │   │   ├── __init__.py
    │   │   ├── stats.py
    │   │   ├── recombination.py
    │   │   ├── gene.py
    │   │   ├── distance.py
    │   │   ├── subgraph.py
    │   │   ├── mlst.py
    │   │   ├── strain.py
    │   │   ├── add.py
    │   │   ├── msa.py
    │   │   ├── sequence.py
    │   │   ├── add2.py
    │   │   ├── remove.py
    │   │   ├── create.py
    │   │   └── import.py
    │   ├── __init__.py
    │   ├── model.py
    │   └── extractors.py
    ├── data
    │   ├── pytyper
    │   │   ├── clmt.txt
    │   │   ├── fimh.fna
    │   │   ├── spa.fna
    │   │   └── clmt.fna
    │   └── alembic
    │   │   ├── cla
    │   │       ├── script.py.mako
    │   │       ├── versions
    │   │       │   ├── c0f871a99d96_add_database_infos.py
    │   │       │   └── 21efe503d07d_initial.py
    │   │       └── env.py
    │   │   ├── wg
    │   │       ├── script.py.mako
    │   │       ├── versions
    │   │       │   ├── a793f8f3fd83_add_database_infos.py
    │   │       │   └── 52ae99cb5f33_initial.py
    │   │       └── env.py
    │   │   ├── pytyper
    │   │       ├── script.py.mako
    │   │       ├── versions
    │   │       │   └── 1f96d027f4aa_initial.py
    │   │       └── env.py
    │   │   └── alembic.ini
    ├── version.py
    ├── __init__.py
    ├── config.py
    └── cmd.py
├── setup.cfg
├── environment.yml
├── docs
    ├── source
    │   ├── logo.png
    │   ├── documentation
    │   │   ├── cgmlst.png
    │   │   ├── cgmlst
    │   │   │   ├── subgraph.png
    │   │   │   ├── check
    │   │   │   ├── export_seq.rst
    │   │   │   ├── add.rst
    │   │   │   ├── other_analysis.rst
    │   │   │   ├── export_res.rst
    │   │   │   ├── initialise.rst
    │   │   │   └── check.rst
    │   │   ├── pytyper.rst
    │   │   ├── clamlst.rst
    │   │   ├── cgmlst.rst
    │   │   ├── installation.rst
    │   │   ├── clamlst
    │   │   │   ├── search.rst
    │   │   │   └── initialise.rst
    │   │   └── pytyper
    │   │   │   └── search.rst
    │   ├── development.rst
    │   ├── requirements.txt
    │   ├── api.rst
    │   ├── development
    │   │   ├── getting_started.rst
    │   │   ├── docs.rst
    │   │   └── make.rst
    │   ├── index.rst
    │   └── conf.py
    ├── Makefile
    └── make.bat
├── pytest.ini
├── complete.sh
├── requirements.txt
├── .readthedocs.yaml
├── MANIFEST.in
├── LICENSE
├── .github
    └── workflows
    │   └── python-publish.yml
├── Makefile
├── .gitignore
├── README.md
├── setup.py
├── tests
    ├── test_cla.py
    ├── test_typer.py
    └── test_wg.py
└── rcfile.rc


/pymlst/cla/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/pymlst/common/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/pymlst/cla/commands/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/pymlst/pytyper/__init__.py:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/pymlst/wg/commands/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/pymlst/common/commands/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/pymlst/pytyper/commands/__init__.py:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | description-file = README
3 | 


--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
1 | name: pymlst
2 | dependencies:
3 |   - python=3.7
4 | 


--------------------------------------------------------------------------------
/docs/source/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bvalot/pyMLST/HEAD/docs/source/logo.png


--------------------------------------------------------------------------------
/pymlst/wg/__init__.py:
--------------------------------------------------------------------------------
1 | """A module offering tools to work with Whole Genome MLST databases."""
2 | 


--------------------------------------------------------------------------------
/pymlst/pytyper/method.py:
--------------------------------------------------------------------------------
1 | # Static identifiers of the different typing methods
2 | FIM='fim'
3 | SPA='spa'
4 | CLMT='clmt'
5 | 


--------------------------------------------------------------------------------
/pytest.ini:
--------------------------------------------------------------------------------
1 | [pytest]
2 | filterwarnings =
3 |     ignore::DeprecationWarning
4 |     ignore::UserWarning
5 | 


--------------------------------------------------------------------------------
/docs/source/documentation/cgmlst.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bvalot/pyMLST/HEAD/docs/source/documentation/cgmlst.png


--------------------------------------------------------------------------------
/docs/source/documentation/cgmlst/subgraph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bvalot/pyMLST/HEAD/docs/source/documentation/cgmlst/subgraph.png


--------------------------------------------------------------------------------
/pymlst/pytyper/url.py:
--------------------------------------------------------------------------------
1 | SPA_URL_TYPE = "http://spa.ridom.de/dynamic/spatypes.txt"  # spa types list (text file)
2 | SPA_URL_SEQ = "http://spa.ridom.de/dynamic/sparepeats.fasta"  # spa repeat sequences (FASTA file)
3 | FIM_URL = "https://bitbucket.org/genomicepidemiology/fimtyper_db.git"  # fimtyper_db git repository
4 | 


--------------------------------------------------------------------------------
/pymlst/common/flag.py:
--------------------------------------------------------------------------------
1 | from sqlalchemy import MetaData, Table, String, Column
2 | 
3 | metadata = MetaData()
4 | 
5 | mlst_type = Table('mlst_type', metadata,  # single-column table keyed by 'name'
6 |                   Column('name', String(length=4), primary_key=True))  # declared with length 4
7 | 


--------------------------------------------------------------------------------
/complete.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | # Bash completion for the wgMLST command: completes only the first
 4 | # argument (the sub-command name) and offers nothing for later words.
 5 | _wgmlst_completions()
 6 | {
 7 | 	# Two words means "command name + the word being completed"; any other
 8 | 	# count is outside the sub-command position, so no candidates are set.
 9 | 	if [ "${#COMP_WORDS[@]}" -ne 2 ]; then
10 | 		return
11 | 	fi
12 | 
13 | 	# "--" stops compgen from parsing a word that starts with "-" as one of
14 | 	# its own options.
15 | 	COMPREPLY=($(compgen -W "add_strain create_db" -- "${COMP_WORDS[1]}"))
16 | }
17 | 
18 | complete -F _wgmlst_completions wgMLST
19 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | biopython>=1.78
 2 | click>=7.1
 3 | pytest>=6.2
 4 | pytest-cov>=2.10
 5 | sqlalchemy>=1.4,<2
 6 | networkx>=2.5
 7 | decorator>=4.4
 8 | requests>=2.23
 9 | pandas>=1.2
10 | numpy>=1.20
11 | beautifulsoup4>=4.9
12 | questionary>=1.9
13 | setuptools>=44.0
14 | alembic>=1.6
15 | GitPython>=3.1
16 | 


--------------------------------------------------------------------------------
/pymlst/data/pytyper/clmt.txt:
--------------------------------------------------------------------------------
 1 | Allele,arpA,chuA,yjaA,TspE4.C2
 2 | A,+,-,-,-
 3 | B1,+,-,-,+
 4 | G|F,-,+,-,-
 5 | H|B2,-,+,+,-
 6 | B2,-,+,+,+
 7 | G|B2,-,+,-,+
 8 | I|A|C,+,-,+,-
 9 | D|E,+,+,-,-
10 | D|E,+,+,-,+
11 | E|I,+,+,+,-
12 | I|II,-,-,+,-
13 | III|IV|V,-,-,-,-
14 | U,-,-,-,+
15 | U,-,-,+,+
16 | U,+,-,+,+
17 | U,+,+,+,+
18 | 


--------------------------------------------------------------------------------
/pymlst/version.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | """
 5 | This module contains project version information.
 6 | 
 7 | .. currentmodule:: pymlst.version
 8 | .. moduleauthor:: benoit_valot <benoit.valot@univ-fcomte.fr>
 9 | """
10 | 
11 | __version__ = "2.2.2"  #: the working version
12 | __release__ = "2.2"  #: the release version
13 | 


--------------------------------------------------------------------------------
/.readthedocs.yaml:
--------------------------------------------------------------------------------
 1 | version: 2
 2 | 
 3 | build:
 4 |   os: "ubuntu-22.04"
 5 |   tools:
 6 |     python: "3.9"
 7 | 
 8 | # Build from the docs/ directory with Sphinx
 9 | sphinx:
10 |   configuration: docs/source/conf.py
11 | 
12 | # Explicitly set the version of Python and its requirements
13 | python:
14 |   install:
15 |     - requirements: docs/source/requirements.txt
16 | 


--------------------------------------------------------------------------------
/docs/source/development.rst:
--------------------------------------------------------------------------------
 1 | .. _development:
 2 | 
 3 | ***********
 4 | Development
 5 | ***********
 6 | 
 7 | This section describes how to configure your environment for
 8 | development, details the Makefile options, and explains how to build
 9 | the documentation.
10 | 
11 | .. toctree::
12 |    :maxdepth: 1
13 |    :caption: Table of contents:
14 | 
15 |    development/getting_started
16 |    development/make
17 |    development/docs
18 | 


--------------------------------------------------------------------------------
/pymlst/__init__.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | """
 5 | python Mlst Local Search Tool
 6 | 
 7 | .. currentmodule:: pymlst
 8 | .. moduleauthor:: benoit_valot <benoit.valot@univ-fcomte.fr>
 9 | """
10 | 
11 | from . import config
12 | from .version import __version__, __release__  # noqa
13 | from .wg.core import open_wg
14 | from .cla.core import open_cla
15 | from .pytyper.core import open_typer
16 | 


--------------------------------------------------------------------------------
/pymlst/data/pytyper/fimh.fna:
--------------------------------------------------------------------------------
1 | >fimh fimH1
2 | TTCGCCTGTAAAACCGCCAATGGTACTGCTATCCCTATTGGCGGTGGCAGCGCCAATGTTTATGTAAACC
3 | TTGCGCCTGCCGTGAATGTGGGGCAAAACCTGGTCGTGGATCTTTCGACGCAAATCTTTTGCCATAACGA
4 | TTACCCGGAAACCATTACAGACTATGTCACACTGCAACGAGGTTCGGCTTATGGCGGCGTGTTATCTAGT
5 | TTTTCCGGGACCGTAAAATATAATGGCAGTAGCTATCCTTTCCCTACTACCAGCGAAACGCCGCGGGTTG
6 | TTTATAATTCGAGAACGGATAAGCCGTGGCCGGTGGCGCTTTATTTGACGCCTGTGAGCAGTGCTGGCGG
7 | GGTGGCGATTAAAGCTGGTTCATTAATTGCCGTGCTTATTTTGCGACAGACCAACAACTATAACAGCGAT
8 | GATTTTCAGTTTGTGTGGAATATTTACGCCAATAATGATGTGGTGGTGCCCACTGGCGGCTGTGATGTT


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
 1 | include pymlst/data/alembic/alembic.ini
 2 | include pymlst/data/alembic/wg/env.py
 3 | include pymlst/data/alembic/wg/script.py.mako
 4 | include pymlst/data/alembic/wg/versions/*.py
 5 | include pymlst/data/alembic/cla/env.py
 6 | include pymlst/data/alembic/cla/script.py.mako
 7 | include pymlst/data/alembic/cla/versions/*.py
 8 | include pymlst/data/alembic/pytyper/env.py
 9 | include pymlst/data/alembic/pytyper/script.py.mako
10 | include pymlst/data/alembic/pytyper/versions/*.py
11 | include pymlst/data/pytyper/*fna
12 | include pymlst/data/pytyper/*txt
13 | 


--------------------------------------------------------------------------------
/pymlst/pytyper/model.py:
--------------------------------------------------------------------------------
 1 | from sqlalchemy import MetaData, Column, Table, Integer, Text, ForeignKey, Index
 2 | 
 3 | metadata = MetaData()
 4 | 
 5 | typerSeq = Table("typerSeq", metadata,
 6 | 	      Column("id", Integer, primary_key=True),
 7 | 	      Column("sequence", Text, unique=True),
 8 | 	      Column("typing", Text),
 9 | 	      Column("allele", Text))
10 | 
11 | typerSt = Table("typerSt", metadata,
12 | 	      Column("id", Integer, primary_key=True),
13 | 	      Column("typing", Text),
14 | 	      Column("st", Text),              
15 | 	      Column("allele", Text))
16 | 
17 | 
18 | 


--------------------------------------------------------------------------------
/docs/source/requirements.txt:
--------------------------------------------------------------------------------
 1 | sphinx==3.4.3
 2 | sphinxcontrib-applehelp<=1.0.4
 3 | sphinxcontrib-devhelp<=1.0.2
 4 | sphinxcontrib-htmlhelp<=2.0.1
 5 | sphinxcontrib-jsmath<=1.0.1
 6 | sphinxcontrib-qthelp<=1.0.3
 7 | sphinxcontrib-serializinghtml<=1.1.5
 8 | sphinx_rtd_theme==0.5.1
 9 | jinja2==3.0.3
10 | docutils==0.16
11 | networkx>=2.5
12 | biopython>=1.78
13 | click<=7.1
14 | pytest>=6.2
15 | pytest-cov>=2.10
16 | sqlalchemy>=1.4,<2
17 | networkx>=2.5
18 | decorator>=4.4
19 | requests>=2.23
20 | pandas>=1.2
21 | numpy>=1.20
22 | beautifulsoup4>=4.9
23 | questionary>=1.9
24 | setuptools>=44.0
25 | alembic>=1.6
26 | GitPython>=3.1
27 | 


--------------------------------------------------------------------------------
/docs/source/documentation/pytyper.rst:
--------------------------------------------------------------------------------
 1 | .. _pytyper:
 2 | 
 3 | ********************
 4 | Other Typing Methods
 5 | ********************
 6 | 
 7 | Other typing methods are available using a series of
 8 | Python scripts described below.
 9 | 
10 | 
11 | .. toctree::
12 |    :maxdepth: 1
13 | 
14 |    pytyper/search
15 | 
16 | All available commands can be listed using the help function:
17 | 
18 | .. code-block:: bash
19 |    
20 |    pyTyper --help
21 |    
22 |    Usage: pyTyper [OPTIONS] COMMAND [ARGS]...
23 | 
24 |    Other typing commands.
25 | 
26 |    Commands:
27 |    search  Searches strain type using specified METHOD for an assembly...
28 | 


--------------------------------------------------------------------------------
/pymlst/data/alembic/cla/script.py.mako:
--------------------------------------------------------------------------------
 1 | """${message}
 2 | 
 3 | Revision ID: ${up_revision}
 4 | Revises: ${down_revision | comma,n}
 5 | Create Date: ${create_date}
 6 | 
 7 | """
 8 | from alembic import op
 9 | import sqlalchemy as sa
10 | ${imports if imports else ""}
11 | 
12 | # revision identifiers, used by Alembic.
13 | revision = ${repr(up_revision)}
14 | down_revision = ${repr(down_revision)}
15 | branch_labels = ${repr(branch_labels)}
16 | depends_on = ${repr(depends_on)}
17 | 
18 | 
19 | def upgrade():
20 |     ${upgrades if upgrades else "pass"}
21 | 
22 | 
23 | def downgrade():
24 |     ${downgrades if downgrades else "pass"}
25 | 


--------------------------------------------------------------------------------
/pymlst/data/alembic/wg/script.py.mako:
--------------------------------------------------------------------------------
 1 | """${message}
 2 | 
 3 | Revision ID: ${up_revision}
 4 | Revises: ${down_revision | comma,n}
 5 | Create Date: ${create_date}
 6 | 
 7 | """
 8 | from alembic import op
 9 | import sqlalchemy as sa
10 | ${imports if imports else ""}
11 | 
12 | # revision identifiers, used by Alembic.
13 | revision = ${repr(up_revision)}
14 | down_revision = ${repr(down_revision)}
15 | branch_labels = ${repr(branch_labels)}
16 | depends_on = ${repr(depends_on)}
17 | 
18 | 
19 | def upgrade():
20 |     ${upgrades if upgrades else "pass"}
21 | 
22 | 
23 | def downgrade():
24 |     ${downgrades if downgrades else "pass"}
25 | 


--------------------------------------------------------------------------------
/pymlst/data/alembic/pytyper/script.py.mako:
--------------------------------------------------------------------------------
 1 | """${message}
 2 | 
 3 | Revision ID: ${up_revision}
 4 | Revises: ${down_revision | comma,n}
 5 | Create Date: ${create_date}
 6 | 
 7 | """
 8 | from alembic import op
 9 | import sqlalchemy as sa
10 | ${imports if imports else ""}
11 | 
12 | # revision identifiers, used by Alembic.
13 | revision = ${repr(up_revision)}
14 | down_revision = ${repr(down_revision)}
15 | branch_labels = ${repr(branch_labels)}
16 | depends_on = ${repr(depends_on)}
17 | 
18 | 
19 | def upgrade():
20 |     ${upgrades if upgrades else "pass"}
21 | 
22 | 
23 | def downgrade():
24 |     ${downgrades if downgrades else "pass"}
25 | 


--------------------------------------------------------------------------------
/pymlst/wg/commands/stats.py:
--------------------------------------------------------------------------------
 1 | """extract stats CLI command file."""
 2 | 
 3 | import os
 4 | 
 5 | import click
 6 | 
 7 | import pymlst
 8 | from pymlst.common import utils, exceptions
 9 | from pymlst.wg.extractors import StatsExtractor
10 | 
11 | @click.command(name='stats')
12 | 
13 | @click.argument('database', type=click.Path(exists=True))
14 | def cli(database, **kwargs):
15 |     """Extracts stats from a wgMLST DATABASE."""
16 | 
17 |     try:
18 | 
19 |         with pymlst.open_wg(os.path.abspath(database)) as mlst:
20 |             mlst.extract(StatsExtractor())
21 | 
22 |     except exceptions.PyMLSTError as err:
23 |         raise click.ClickException(str(err))
24 | 


--------------------------------------------------------------------------------
/docs/source/documentation/cgmlst/check:
--------------------------------------------------------------------------------
 1 | .. _cgmlst_add:
 2 | 
 3 | .. toctree::
 4 |     :glob:
 5 | 
 6 | ===========================
 7 | Add strains to the database
 8 | ===========================
 9 | 
10 | Next, you need to add your strains iteratively to the database.
11 | A draft genome can be used (we recommend using SPAdes for assembly).
12 | You can also add a reference genome for comparison.
13 | 
14 | 
15 | Get the Source
16 | ==============
17 | 
18 | The source code for the `PyMLST` project lives at
19 | `github <https://github.com/bvalot/pyMLST.git>`_.  
20 | You can use `git clone` to get it.
21 | 
22 | .. code-block:: bash
23 | 
24 |    git clone https://github.com/bvalot/pyMLST.git
25 | 
26 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | 
 2 | Copyright (c) 2021, Benoit Valot
 3 | 
 4 | This program is free software: you can redistribute it and/or modify
 5 | it under the terms of the GNU General Public License as published by
 6 | the Free Software Foundation, either version 3 of the License, or
 7 | (at your option) any later version.
 8 | 
 9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 | GNU General Public License for more details.
13 | 
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see <http://www.gnu.org/licenses/>.


--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
 1 | # Minimal makefile for Sphinx documentation
 2 | #
 3 | 
 4 | # You can set these variables from the command line, and also
 5 | # from the environment for the first two.
 6 | SPHINXOPTS    ?=
 7 | SPHINXBUILD   ?= sphinx-build
 8 | SOURCEDIR     = source
 9 | BUILDDIR      = build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 | 
15 | .PHONY: help Makefile
16 | 
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 | 


--------------------------------------------------------------------------------
/pymlst/cla/commands/info.py:
--------------------------------------------------------------------------------
 1 | """Info CLI command file."""
 2 | 
 3 | import os
 4 | import click
 5 | 
 6 | import pymlst
 7 | from pymlst.common import exceptions, utils
 8 | 
 9 | 
10 | @click.command(name='info')
11 | @click.option('--output', '-o',
12 |               type=click.File('w'),
13 |               help='Writes ST search result to (default:stdout).')
14 | @click.argument('database',
15 |                 type=click.Path(exists=False))
16 | 
17 | 
18 | def cli(database, **kwargs):
19 |     """Output the information about  a classical MLST DATABASE"""
20 | 
21 |     try:
22 |         with pymlst.open_cla(os.path.abspath(database)) as mlst:
23 |             mlst.get_infos(**utils.clean_kwargs(kwargs))
24 |             
25 |     except exceptions.PyMLSTError as err:
26 |         raise click.ClickException(str(err))
27 | 


--------------------------------------------------------------------------------
/pymlst/cla/commands/remove.py:
--------------------------------------------------------------------------------
 1 | """remove CLI command file."""
 2 | 
 3 | import os
 4 | 
 5 | import click
 6 | 
 7 | import pymlst
 8 | from pymlst.common import utils, exceptions
 9 | 
10 | 
11 | @click.command(name='remove')
12 | @click.argument('database',
13 |                 type=click.Path(exists=True))
14 | @click.argument('gene',
15 |                 type=click.STRING)
16 | @click.argument('allele',
17 |                 type=click.INT)
18 | 
19 | 
20 | def cli(database, **kwargs):
21 |     """Removes ALLELE sequence from the GENE on a mlst DATABASE."""
22 |     
23 |     try:
24 |         with pymlst.open_cla(os.path.abspath(database)) as mlst:
25 |             mlst.remove_allele(**utils.clean_kwargs(kwargs))
26 |                 
27 |     except exceptions.PyMLSTError as err:
28 |         raise click.ClickException(str(err))
29 | 


--------------------------------------------------------------------------------
/pymlst/wg/commands/recombination.py:
--------------------------------------------------------------------------------
 1 | """recombination CLI command file."""
 2 | 
 3 | import click
 4 | 
 5 | from pymlst.common import utils, exceptions
 6 | from pymlst.wg import core
 7 | 
 8 | 
 9 | @click.command(name='recombination')
10 | @click.option('--output', '-o',
11 |               type=click.File('w'),
12 |               help='Output number of variations by genes (default:stdout).')
13 | @click.argument('genes',
14 |                 type=click.File('r'))
15 | @click.argument('alignment',
16 |                 type=click.File('r'))
17 | def cli(genes, alignment, **kwargs):
18 |     """Searches potential gene recombinations from wgMLST database export."""
19 | 
20 |     try:
21 | 
22 |         core.find_recombination(genes, alignment, **utils.clean_kwargs(kwargs))
23 | 
24 |     except exceptions.PyMLSTError as err:
25 |         raise click.ClickException(str(err))
26 | 


--------------------------------------------------------------------------------
/pymlst/cla/model.py:
--------------------------------------------------------------------------------
 1 | from sqlalchemy import MetaData, Table, Column, Integer, Text
 2 | 
 3 | metadata = MetaData()
 4 | 
 5 | sequences = Table('sequences', metadata,  # allele sequences; 'sequence' values are unique
 6 |                   Column('id', Integer, primary_key=True),
 7 |                   Column('sequence', Text, unique=True),
 8 |                   Column('gene', Text),
 9 |                   Column('allele', Integer))
10 | 
11 | mlst = Table('mlst', metadata,  # rows associating an 'st' with a gene and an allele number
12 |              Column('id', Integer, primary_key=True),
13 |              Column('st', Integer),
14 |              Column('gene', Text),
15 |              Column('allele', Integer))
16 | 
17 | mlst_type = Table('mlst_type', metadata,  # free-text database information (no primary key)
18 |                   Column('name', Text),
19 |                   Column('source', Text),
20 |                   Column('species', Text),
21 |                   Column('mlst', Text),
22 |                   Column('version', Text))
23 | 


--------------------------------------------------------------------------------
/pymlst/data/alembic/wg/versions/a793f8f3fd83_add_database_infos.py:
--------------------------------------------------------------------------------
 1 | """Add database infos
 2 | 
 3 | Revision ID: a793f8f3fd83
 4 | Revises: 52ae99cb5f33
 5 | Create Date: 2025-03-14 15:38:05.090257
 6 | 
 7 | """
 8 | from alembic import op
 9 | import sqlalchemy as sa
10 | 
11 | 
12 | # revision identifiers, used by Alembic.
13 | revision = 'a793f8f3fd83'
14 | down_revision = '52ae99cb5f33'
15 | branch_labels = None
16 | depends_on = None
17 | 
18 | 
19 | def upgrade():
20 |     op.add_column('mlst_type', sa.Column('source', sa.String(), nullable=True)) 
21 |     op.add_column('mlst_type', sa.Column('species', sa.String(), nullable=True))
22 |     op.add_column('mlst_type', sa.Column('version', sa.String(), nullable=True))
23 |     
24 | 
25 | def downgrade():
26 |     op.drop_column('mlst_type', 'source')
27 |     op.drop_column('mlst_type', 'species')
28 |     op.drop_column('mlst_type', 'version')
29 | 


--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
 1 | @ECHO OFF
 2 | 
 3 | pushd %~dp0
 4 | 
 5 | REM Command file for Sphinx documentation
 6 | 
 7 | if "%SPHINXBUILD%" == "" (
 8 | 	set SPHINXBUILD=sphinx-build
 9 | )
10 | set SOURCEDIR=source
11 | set BUILDDIR=build
12 | 
13 | if "%1" == "" goto help
14 | 
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | 	echo.
18 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | 	echo.installed, then set the SPHINXBUILD environment variable to point
20 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | 	echo.may add the Sphinx directory to PATH.
22 | 	echo.
23 | 	echo.If you don't have Sphinx installed, grab it from
24 | 	echo.http://sphinx-doc.org/
25 | 	exit /b 1
26 | )
27 | 
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 | 
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 | 
34 | :end
35 | popd
36 | 


--------------------------------------------------------------------------------
/pymlst/wg/commands/gene.py:
--------------------------------------------------------------------------------
 1 | """extract gene CLI command file."""
 2 | 
 3 | import os
 4 | 
 5 | import click
 6 | 
 7 | import pymlst
 8 | from pymlst.common import utils, exceptions
 9 | from pymlst.wg.extractors import GeneExtractor, TableExtractorCommand
10 | 
11 | @click.command(name='gene', cls=TableExtractorCommand)
12 | @click.option('--output', '-o',
13 |               type=click.File('w'),
14 |               help='Export GENE list to (default=stdout).')
15 | @click.argument('database', type=click.Path(exists=True))
16 | def cli(database, **kwargs):
17 |     """Extracts a list of genes from a wgMLST DATABASE."""
18 | 
19 |     tab_kwargs,out_kwargs = utils.get_output(utils.clean_kwargs(kwargs))
20 | 
21 |     try:
22 | 
23 |         with pymlst.open_wg(os.path.abspath(database)) as mlst:
24 |             mlst.extract(GeneExtractor(**tab_kwargs), **out_kwargs)
25 | 
26 |     except exceptions.PyMLSTError as err:
27 |         raise click.ClickException(str(err))
28 | 


--------------------------------------------------------------------------------
/pymlst/wg/commands/distance.py:
--------------------------------------------------------------------------------
 1 | """extract cgMLST distance CLI command file."""
 2 | 
 3 | import os
 4 | 
 5 | import click
 6 | 
 7 | import pymlst
 8 | from pymlst.common import utils, exceptions
 9 | from pymlst.wg.extractors import DistanceExtractor, TableExtractorCommand
10 | 
11 | @click.command(name='distance', cls=TableExtractorCommand)
12 | @click.option('--output', '-o',
13 |               type=click.File('w'),
14 |               help='Export distance to (default=stdout).')
15 | @click.argument('database', type=click.Path(exists=True))
16 | def cli(database, **kwargs):
17 |     """Extracts a distance matrix from a wgMLST DATABASE."""
18 | 
19 |     tab_kwargs,out_kwargs = utils.get_output(utils.clean_kwargs(kwargs))
20 | 
21 |     try:
22 | 
23 |         with pymlst.open_wg(os.path.abspath(database)) as mlst:
24 |             mlst.extract(DistanceExtractor(**tab_kwargs), **out_kwargs)
25 | 
26 |     except exceptions.PyMLSTError as err:
27 |         raise click.ClickException(str(err))
28 | 


--------------------------------------------------------------------------------
/docs/source/api.rst:
--------------------------------------------------------------------------------
 1 | .. _fasta: https://en.wikipedia.org/wiki/FASTA_format
 2 | .. _fastq: https://en.wikipedia.org/wiki/FASTQ_format
 3 | .. _BLAT: http://genome.ucsc.edu/cgi-bin/hgBlat
 4 | .. _KMA: https://bitbucket.org/genomicepidemiology/kma/src/master/
 5 | .. _BWA: https://bio-bwa.sourceforge.net/
 6 | 
 7 | 
 8 | .. _api:
 9 | 
10 | .. toctree::
11 |     :glob:
12 | 
13 | =================
14 | API Documentation
15 | =================
16 | 
17 | .. automodule:: pymlst
18 | 
19 | Whole Genome MLST
20 | -----------------
21 | 
22 | .. automodule:: pymlst.wg.core
23 |     :members:
24 |     :member-order: bysource
25 | 
26 | .. automodule:: pymlst.wg.extractors
27 |     :members:
28 |     :member-order: bysource
29 | 
30 | Classical MLST
31 | -----------------
32 | 
33 | .. automodule:: pymlst.cla.core
34 |     :members:
35 |     :member-order: bysource
36 | 
37 | Other Typing
38 | -----------------
39 | 
40 | .. automodule:: pymlst.pytyper.core
41 |     :members:
42 |     :member-order: bysource
43 | 


--------------------------------------------------------------------------------
/pymlst/wg/model.py:
--------------------------------------------------------------------------------
 1 | from sqlalchemy import MetaData, Column, Table, Integer, Text, ForeignKey, Index
 2 | 
 3 | metadata = MetaData()
 4 | 
 5 | sequences = Table('sequences', metadata,  # distinct sequences ('sequence' is unique)
 6 |                   Column('id', Integer, primary_key=True),
 7 |                   Column('sequence', Text, unique=True))
 8 | 
 9 | mlst = Table('mlst', metadata,  # associates a souche and a gene with a sequence id
10 |              Column('id', Integer, primary_key=True),
11 |              Column('souche', Text),  # "souche" is French for "strain"
12 |              Column('gene', Text),
13 |              Column('seqid', Integer, ForeignKey(sequences.c.id)),  # references sequences.id
14 |              Index('ix_souche', 'souche'),
15 |              Index('ix_gene', 'gene'),
16 |              Index('ix_seqid', 'seqid'),
17 |              Index('ix_souche_gene_seqid', 'gene', 'souche', 'seqid'))  # NOTE(review): columns are ordered gene, souche, seqid despite the index name
18 | 
19 | mlst_type = Table('mlst_type', metadata,  # free-text database information (no primary key)
20 |                   Column('name', Text),
21 |                   Column('source', Text),
22 |                   Column('species', Text),
23 |                   Column('version', Text))
24 | 


--------------------------------------------------------------------------------
/pymlst/data/pytyper/spa.fna:
--------------------------------------------------------------------------------
 1 | >spa CP127590.1:c70231-69071 Staphylococcus aureus strain C867 chromosome, complete genome
 2 | TTGAAAAAGAAAAACATTTATTCAATTCGTAAACTAGGTGTAGGTATTGCATCTGTAACTTTAGGTACAT
 3 | TACTTATATCTGGTGGCGTAACACCTGCTGCAAATGCTGCGCAACACGATGAAGCTCAACAAAATGCTTT
 4 | TTATCAAGTGTTAAATATGCCTAACTTAAACGCTGATCAACGTAATGGTTTTATCCAAAGCCTTAAAGAT
 5 | GATCCAAGCCAAAGTGCTAACGTTTTAGGTGAAGCTCAAAAACTTAATGACTCTCAAGCTCCAAAAGCTG
 6 | ATGCGCAACAAAATAACTTCAACAAAGATCAACAAAGCGCCTTCTATGAAATCTTGAACATGCCTAACTT
 7 | AAACGAAGAACAACGCAATGGTTTCATCCAAAGCTTAAAAGATGACCCAAGCCAAAGTGCTAACCTATTG
 8 | TCAGAAGCTAAAAAGTTAAATGAATCTCAAGCACCGAAAGCGGATAACAAATTCAACAAAGAACAACAAA
 9 | ATGCTTTCTATGAAATCTTACATTTACCTAACTTAAACGAAGAACAACGCAATGGTTTCATCCAAAGCTT
10 | CGTTAAACCTGGTGATACAGTAAATGACATTGCAAAAGCAAACGGCACTACTGCTGACAAAATTGCTGCA
11 | GATAACAAATTAGCTGATAAAAACATGATCAAACCTGGTCAAGAACTTGTTGTTGATAAGAAGCAACCAG
12 | CAAACCATGCAGATGCTAACAAAGCTCAAGCATTACCAGAAACTGGTGAAGAAAATCCATTCATCGGTAC
13 | AACTGTATTTGGTGGATTATCATTAGCCTTAGGTGCAGCGTTATTAGCTGGACGTCGTCGCGAACTATAA
14 | 


--------------------------------------------------------------------------------
/pymlst/data/alembic/cla/versions/c0f871a99d96_add_database_infos.py:
--------------------------------------------------------------------------------
 1 | """Add database infos
 2 | 
 3 | Revision ID: c0f871a99d96
 4 | Revises: 21efe503d07d
 5 | Create Date: 2025-03-14 09:29:25.322104
 6 | 
 7 | """
 8 | from alembic import op
 9 | import sqlalchemy as sa
10 | 
11 | 
12 | # revision identifiers, used by Alembic.
13 | revision = 'c0f871a99d96'
14 | down_revision = '21efe503d07d'
15 | branch_labels = None
16 | depends_on = None
17 | 
18 | 
19 | def upgrade():
20 |     op.add_column('mlst_type', sa.Column('source', sa.String(), nullable=True)) 
21 |     op.add_column('mlst_type', sa.Column('species', sa.String(), nullable=True))
22 |     op.add_column('mlst_type', sa.Column('mlst', sa.String(), nullable=True)) 
23 |     op.add_column('mlst_type', sa.Column('version', sa.String(), nullable=True))
24 |     
25 | 
26 | 
27 | def downgrade():
28 |     op.drop_column('mlst_type', 'source')
29 |     op.drop_column('mlst_type', 'species')
30 |     op.drop_column('mlst_type', 'mlst')    
31 |     op.drop_column('mlst_type', 'version')
32 | 


--------------------------------------------------------------------------------
/docs/source/documentation/clamlst.rst:
--------------------------------------------------------------------------------
 1 | .. _clamlst:
 2 | 
 3 | ***********************
 4 | classical MLST analysis
 5 | ***********************
 6 | 
 7 | A workflow analysis of classical MLST is performed using a series of
 8 | Python scripts described below.
 9 | 
10 | 
11 | .. toctree::
12 |    :maxdepth: 1
13 | 
14 |    clamlst/initialise
15 |    clamlst/search
16 | 
17 | All available commands can be listed using the help option:
18 | 
19 | .. code-block:: bash
20 |    
21 |    claMLST --help
22 |    
23 |    Usage: claMLST [OPTIONS] COMMAND [ARGS]...
24 |    
25 |    Classical MLST commands.
26 |    
27 |    Commands:
28 |    create   Creates a classical MLST DATABASE from a SCHEME csv and ALLELES...
29 |    import   Creates a claMLST DATABASE from an online resource.
30 |    info     Output the informations about a classical MLST DATABASE
31 |    remove   Removes ALLELE sequence from the GENE on a mlst DATABASE.
32 |    search   Searches ST number for an assembly GENOME using an mlst DATABASE.
33 |    search2  Searches ST number from FASTQS(.gz) raw reads using an mlst...
34 | 


--------------------------------------------------------------------------------
/pymlst/wg/commands/subgraph.py:
--------------------------------------------------------------------------------
 1 | """subgraph CLI command file."""
 2 | 
 3 | import click
 4 | 
 5 | from pymlst.common import utils, exceptions
 6 | from pymlst.wg import core
 7 | 
 8 | 
 9 | @click.command(name='subgraph')
10 | @click.option('--output', '-o',
11 |               type=click.File('w'),
12 |               help='Output group files (default:stdout).')
13 | @click.option('--threshold', '-t',
14 |               type=click.INT,
15 |               help='Minimum distance to conserve '
16 |                    'for extraction of group (default:50).')
17 | @click.option('--export', '-e',
18 |               type=click.Choice(['list', 'count', 'group'], case_sensitive=False),
19 |               help='Export type (default:list).')
20 | @click.argument('distance',
21 |                 type=click.File('r'))
22 | def cli(distance, **kwargs):
23 |     """Searches group of strains at a DISTANCE threshold."""
24 | 
25 |     try:
26 | 
27 |         core.find_subgraph(distance, **utils.clean_kwargs(kwargs))
28 | 
29 |     except exceptions.PyMLSTError as err:
30 |         raise click.ClickException(str(err))
31 | 


--------------------------------------------------------------------------------
/pymlst/wg/commands/mlst.py:
--------------------------------------------------------------------------------
 1 | """extract MLST table CLI command file."""
 2 | 
 3 | import os
 4 | 
 5 | import click
 6 | 
 7 | import pymlst
 8 | from pymlst.common import utils, exceptions
 9 | from pymlst.wg.extractors import MlstExtractor, TableExtractorCommand
10 | 
11 | @click.command(name='mlst', cls=TableExtractorCommand)
12 | @click.option('--form', '-f',
13 |               type=click.Choice(["default", "grapetree"]),
14 |               help='Specify format of output')
15 | @click.option('--output', '-o',
16 |               type=click.File('w'),
17 |               help='Export strain list to (default=stdout).')
18 | @click.argument('database', type=click.Path(exists=True))
19 | def cli(database, **kwargs):
20 |     """Extracts an MLST table from a wgMLST DATABASE."""
21 | 
22 |     tab_kwargs,out_kwargs = utils.get_output(utils.clean_kwargs(kwargs))
23 |     
24 |     try:
25 | 
26 |         with pymlst.open_wg(os.path.abspath(database)) as mlst:
27 |             mlst.extract(MlstExtractor(**tab_kwargs), **out_kwargs)
28 | 
29 |     except exceptions.PyMLSTError as err:
30 |         raise click.ClickException(str(err))
31 | 


--------------------------------------------------------------------------------
/pymlst/wg/commands/strain.py:
--------------------------------------------------------------------------------
 1 | """extract strains CLI command file."""
 2 | 
 3 | import os
 4 | 
 5 | import click
 6 | 
 7 | import pymlst
 8 | from pymlst.common import utils, exceptions
 9 | from pymlst.wg.extractors import StrainExtractor, TableExtractorCommand
10 | 
11 | @click.command(name='strain', cls=TableExtractorCommand)
12 | @click.option('--count', '-c',
13 |               is_flag=True,
14 |               help='Count the number of gene present in the database for each strains.')
15 | @click.option('--output', '-o',
16 |               type=click.File('w'),
17 |               help='Export strain list to (default=stdout).')
18 | @click.argument('database', type=click.Path(exists=True))
19 | def cli(database, **kwargs):
20 |     """Extracts a list of strains from a wgMLST DATABASE."""
21 | 
22 |     tab_kwargs,out_kwargs = utils.get_output(utils.clean_kwargs(kwargs))
23 | 
24 |     try:
25 | 
26 |         with pymlst.open_wg(os.path.abspath(database)) as mlst:
27 |             mlst.extract(StrainExtractor(**tab_kwargs), **out_kwargs)
28 | 
29 |     except exceptions.PyMLSTError as err:
30 |         raise click.ClickException(str(err))
31 | 


--------------------------------------------------------------------------------
/pymlst/wg/commands/add.py:
--------------------------------------------------------------------------------
 1 | """add CLI command file."""
 2 | 
 3 | import os
 4 | 
 5 | import click
 6 | 
 7 | import pymlst
 8 | from pymlst.common import utils, exceptions
 9 | 
10 | @click.command(name='add')
11 | @click.option('--strain', '-s',
12 |               type=click.STRING,
13 |               help='Name of the strain (default:genome name).')
14 | @click.option('--identity', '-i',
15 |               type=click.FLOAT,
16 |               help='Minimum identity to search gene (default=0.95).')
17 | @click.option('--coverage', '-c',
18 |               type=click.FLOAT,
19 |               help='Minimum coverage to search gene (default=0.9).')
20 | @click.argument('database',
21 |                 type=click.Path(exists=True))
22 | @click.argument('genome',
23 |                 type=click.File("r"))
24 | 
25 | def cli(genome, database, **kwargs):
26 |     """Adds a strain GENOME to the wgMLST DATABASE."""
27 | 
28 |     try:
29 | 
30 |         with pymlst.open_wg(os.path.abspath(database)) as mlst:
31 |             mlst.add_strain(genome, **utils.clean_kwargs(kwargs))
32 | 
33 |     except exceptions.PyMLSTError as err:
34 |         raise click.ClickException(str(err))
35 | 


--------------------------------------------------------------------------------
/.github/workflows/python-publish.yml:
--------------------------------------------------------------------------------
 1 | # This workflow will upload a Python Package using Twine when a release is created
 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries
 3 | 
 4 | # This workflow uses actions that are not certified by GitHub.
 5 | # They are provided by a third-party and are governed by
 6 | # separate terms of service, privacy policy, and support
 7 | # documentation.
 8 | 
 9 | name: Upload Python Package
10 | 
11 | on:
12 |   release:
13 |     types: [published]
14 | 
15 | permissions:
16 |   contents: read
17 | 
18 | jobs:
19 |   deploy:
20 | 
21 |     runs-on: ubuntu-latest
22 | 
23 |     steps:
24 |     - uses: actions/checkout@v3
25 |     - name: Set up Python
26 |       uses: actions/setup-python@v3
27 |       with:
28 |         python-version: '3.11'
29 |     - name: Install dependencies
30 |       run: |
31 |         python -m pip install --upgrade pip
32 |         pip install build
33 |     - name: Build package
34 |       run: python -m build
35 |     - name: Publish package
36 |       uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
37 |       with:
38 |         user: __token__
39 |         password: ${{ secrets.PYPI_API_TOKEN }}
40 | 


--------------------------------------------------------------------------------
/pymlst/wg/commands/msa.py:
--------------------------------------------------------------------------------
 1 | """Multiple Sequence Alignment CLI command file."""
 2 | 
 3 | import os
 4 | 
 5 | import click
 6 | 
 7 | import pymlst
 8 | from pymlst.common import utils, exceptions
 9 | from pymlst.wg.extractors import MsaExtractor
10 | 
11 | 
12 | @click.command(name='msa')
13 | @click.option('--output', '-o',
14 |               type=click.File('w'),
15 |               help='Output result in fasta format (default:stdout).')
16 | @click.option('--file', '-f',
17 |               type=click.File('r'),
18 |               help='file containing list of coregenes to extract (default:all coregenes).')
19 | @click.option('--realign', '-r',
20 |               is_flag=True,
21 |               help='Realigns genes with same length (Default:No).')
22 | @click.argument('database',
23 |                 type=click.Path(exists=True))
24 | def cli(database, **kwargs):
25 |     """Computes Multiple Sequence Alignment from a wgMLST DATABASE."""
26 | 
27 |     seq_kwargs, out_kwargs = utils.get_output(utils.clean_kwargs(kwargs))
28 | 
29 |     try:
30 | 
31 |         with pymlst.open_wg(os.path.abspath(database)) as mlst:
32 |             mlst.extract(MsaExtractor(**seq_kwargs), **out_kwargs)
33 | 
34 |     except exceptions.PyMLSTError as err:
35 |         raise click.ClickException(str(err))
36 | 


--------------------------------------------------------------------------------
/pymlst/wg/commands/sequence.py:
--------------------------------------------------------------------------------
 1 | """extract sequence CLI command file."""
 2 | 
 3 | import os
 4 | 
 5 | import click
 6 | 
 7 | import pymlst
 8 | from pymlst.common import utils, exceptions
 9 | from pymlst.wg.extractors import SequenceExtractor
10 | 
11 | 
12 | @click.command(name='sequence')
13 | @click.option('--output', '-o',
14 |               type=click.File('w'),
15 |               help='Output result in fasta format (default:stdout).')
16 | @click.option('--file', '-f',
17 |               type=click.File('r'),
18 |               help='File containing list of coregenes to extract (default:all coregenes).')
19 | @click.option('--reference',
20 |               is_flag=True,
21 |               help='Return sequence of the reference instead of strains alleles')
22 | @click.argument('database',
23 |                 type=click.Path(exists=True))
24 | def cli(database, **kwargs):
25 |     """Extracts sequences from a wgMLST DATABASE."""
26 | 
27 |     seq_kwargs, out_kwargs = utils.get_output(utils.clean_kwargs(kwargs))
28 | 
29 |     try:
30 | 
31 |         with pymlst.open_wg(os.path.abspath(database)) as mlst:
32 |             mlst.extract(SequenceExtractor(**seq_kwargs), **out_kwargs)
33 | 
34 |     except exceptions.PyMLSTError as err:
35 |         raise click.ClickException(str(err))
36 | 


--------------------------------------------------------------------------------
/pymlst/common/exceptions.py:
--------------------------------------------------------------------------------
 1 | class PyMLSTError(Exception):
 2 |     pass
 3 | 
 4 | 
 5 | class GeneError(PyMLSTError):
 6 |     pass
 7 | 
 8 | 
 9 | class DuplicatedGeneSequence(GeneError):
10 |     pass
11 | 
12 | 
13 | class DuplicatedGeneName(GeneError):
14 |     pass
15 | 
16 | 
17 | class InvalidGeneName(GeneError):
18 |     pass
19 | 
20 | 
21 | class AlleleSequenceNotFound(PyMLSTError):
22 |     pass
23 | 
24 | 
25 | class WrongBaseType(PyMLSTError):
26 |     pass
27 | 
28 | 
29 | class ReferenceStrainRemoval(PyMLSTError):
30 |     pass
31 | 
32 | 
33 | class BadIdentityRange(PyMLSTError):
34 |     pass
35 | 
36 | class BadCoverageRange(PyMLSTError):
37 |     pass
38 | 
39 | 
40 | class BinaryNotFound(PyMLSTError):
41 |     pass
42 | 
43 | 
44 | class StrainAlreadyPresent(PyMLSTError):
45 |     pass
46 | 
47 | 
48 | class ChromosomeNotFound(PyMLSTError):
49 |     pass
50 | 
51 | 
52 | class CoreGenomePathNotFound(PyMLSTError):
53 |     pass
54 | 
55 | 
56 | class NothingToRemove(PyMLSTError):
57 |     pass
58 | 
59 | 
60 | class UndefinedExportType(PyMLSTError):
61 |     pass
62 | 
63 | class EmptyDatabase(PyMLSTError):
64 |     pass
65 | 
66 | class BadInputForCreate(PyMLSTError):
67 |     pass
68 | 
69 | 
70 | class PyMLSTWebError(PyMLSTError):
71 |     pass
72 | 
73 | 
74 | class StructureError(PyMLSTWebError):
75 |     pass
76 | 


--------------------------------------------------------------------------------
/pymlst/wg/commands/add2.py:
--------------------------------------------------------------------------------
 1 | """add CLI command file."""
 2 | 
 3 | import os
 4 | 
 5 | import click
 6 | 
 7 | import pymlst
 8 | from pymlst.common import utils, exceptions
 9 | 
10 | @click.command(name='add2')
11 | @click.option('--strain', '-s',
12 |               type=click.STRING,
13 |               help='Name of the strain (default:genome name).')
14 | @click.option('--identity', '-i',
15 |               type=click.FLOAT,
16 |               help='Minimum identity to search gene (default=0.95).')
17 | @click.option('--coverage', '-c',
18 |               type=click.FLOAT,
19 |               help='Minimum coverage to search gene (default=0.9).')
20 | @click.option('--reads', '-r',
21 |               type=click.INT,
22 |               help='Minimum reads coverage to search a gene (default=10).')
23 | @click.argument('database', nargs=1, 
24 |                 type=click.Path(exists=True))
25 | @click.argument('fastqs', nargs=-1, 
26 |                 type=click.File("r"))
27 | 
28 | def cli(fastqs, database, **kwargs):
29 |     """Adds a strain from FASTQS(.gz) reads to the wgMLST DATABASE."""
30 | 
31 |     try:
32 | 
33 |         with pymlst.open_wg(os.path.abspath(database)) as mlst:
34 |             mlst.add_reads(fastqs, **utils.clean_kwargs(kwargs))
35 | 
36 |     except exceptions.PyMLSTError as err:
37 |         raise click.ClickException(str(err))
38 | 


--------------------------------------------------------------------------------
/pymlst/cla/commands/search.py:
--------------------------------------------------------------------------------
 1 | """search CLI command file."""
 2 | 
 3 | import os
 4 | 
 5 | import click
 6 | 
 7 | import pymlst
 8 | from pymlst.common import utils, exceptions
 9 | 
10 | 
11 | @click.command(name='search')
12 | @click.option('--identity', '-i',
13 |               type=click.FLOAT,
14 |               help='Minimum identity to search gene (default=0.9).')
15 | @click.option('--coverage', '-c',
16 |               type=click.FLOAT,
17 |               help='Minimum coverage to search gene (default=0.9).')
18 | @click.option('--fasta', '-f',
19 |               type=click.File('w'),
20 |               help='Writes fasta file with gene allele.')
21 | @click.option('--output', '-o',
22 |               type=click.File('w'),
23 |               help='Writes ST search result to (default:stdout).')
24 | @click.argument('database',
25 |                 type=click.Path(exists=True))
26 | @click.argument('genomes',
27 |                 type=click.File('r'), nargs=-1)
28 | 
29 | 
30 | def cli(genomes, database, **kwargs):
31 |     """Searches ST number for an assembly GENOME using an mlst DATABASE."""
32 |     
33 |     try:
34 |         with pymlst.open_cla(os.path.abspath(database)) as mlst:
35 |             mlst.multi_search(genomes, **utils.clean_kwargs(kwargs))
36 |                 
37 |     except exceptions.PyMLSTError as err:
38 |         raise click.ClickException(str(err))
39 | 


--------------------------------------------------------------------------------
/docs/source/development/getting_started.rst:
--------------------------------------------------------------------------------
 1 | .. _getting_started_dev:
 2 | 
 3 | .. toctree::
 4 |     :glob:
 5 | 
 6 | ***************
 7 | Getting Started
 8 | ***************
 9 | 
10 | This section provides instructions for setting up your development environment.  If you follow the
11 | steps from top to bottom you should be ready to roll by the end.
12 | 
13 | 
14 | Get the Source
15 | ==============
16 | 
17 | The source code for the `PyMLST` project lives at
18 | `github <https://github.com/bvalot/pyMLST.git>`_.  
19 | You can use `git clone` to get it.
20 | 
21 | .. code-block:: bash
22 | 
23 |    git clone https://github.com/bvalot/pyMLST.git
24 | 
25 | Create the Virtual Environment
26 | ==============================
27 | 
28 | You can create a virtual environment and install the project's dependencies using :ref:`make <make>`.
29 | 
30 | .. code-block:: bash
31 | 
32 |     make venv
33 |     source venv/bin/activate
34 |     make install
35 | 
36 | 
37 | Try It Out
38 | ==========
39 | 
40 | We recommend that you test your environment by running appropriate
41 | tests.
42 | You can do this with the `make test` target.
43 | 
44 | .. code-block:: bash
45 | 
46 |     make test
47 | 
48 | If the tests run and pass, you're ready to roll.
49 | 
50 | Getting Answers
51 | ===============
52 | 
53 | Once the environment is set up, you can perform a quick build of this project
54 | documentation using the `make answers` target.
55 | 
56 | .. code-block:: bash
57 | 
58 |     make answers
59 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | .DEFAULT_GOAL := build
 2 | .PHONY: build run submit freeze publish package coverage test quicktest lint docs answers clean venv venv_docs install install_docs licenses
 3 | PROJ_SLUG = pymlst
 4 | CLI_NAME = pymlst
 5 | PY_VERSION = 3.9
 6 | LINTER = pylint
 7 | 
 8 | 
 9 | 
10 | build:
11 | 	pip install --editable .
12 | 
13 | run:
14 | 	$(CLI_NAME) run
15 | 
16 | submit:
17 | 	$(CLI_NAME) submit
18 | 
19 | freeze:
20 | 	pip freeze > requirements.txt
21 | 
22 | test:
23 | 	py.test --cov-report term --cov=$(PROJ_SLUG) tests/
24 | 
25 | quicktest:
26 | 	py.test --cov-report term --cov=$(PROJ_SLUG) tests/
27 | 
28 | coverage:
29 | 	py.test --cov-report html --cov=$(PROJ_SLUG) tests/
30 | 
31 | docs: 
32 | 	mkdir -p docs/source/_static
33 | 	mkdir -p docs/source/_templates
34 | 	cd docs && $(MAKE) html
35 | 
36 | answers:
37 | 	cd docs && $(MAKE) html
38 | 	xdg-open docs/build/html/index.html
39 | 
40 | package: clean
41 | 	python setup.py sdist
42 | 
43 | publish: package
44 | 	twine upload --repository-url https://test.pypi.org/legacy/ dist/*
45 | 
46 | # Remove build artefacts and coverage data. One rm call replaces the former
47 | # backslash-joined lines, which ran as a single "rm -rf dist rm -rf ..." command.
48 | clean :
49 | 	rm -rf dist docs/build *.egg-info
50 | 	coverage erase
51 | 
52 | venv :
53 | 	virtualenv --python python$(PY_VERSION) venv
54 | 
55 | venv_docs :
56 | 	virtualenv --python python$(PY_VERSION) venv_docs
57 | 
58 | install:
59 | 	pip install -r requirements.txt
60 | 
61 | install_docs:
62 | 	pip install -r docs/source/requirements.txt
63 | 
64 | licenses:
65 | 	pip-licenses --with-url --format=rst \
66 | 	--ignore-packages $(shell cat .pip-lic-ignore | awk '{$$1=$$1};1')
67 | 


--------------------------------------------------------------------------------
/pymlst/cla/commands/create.py:
--------------------------------------------------------------------------------
 1 | """create CLI command file."""
 2 | 
 3 | import os
 4 | import click
 5 | 
 6 | import pymlst
 7 | from pymlst.common import exceptions
 8 | 
 9 | 
10 | @click.command(name='create')
11 | @click.option('--force', '-f',
12 |               is_flag=True,
13 |               help='Overwrites alrealdy existing DATABASE')
14 | @click.option('--species', '-s',
15 |               type=click.STRING,
16 |               help='Name of the species')
17 | @click.option('--version', '-V',
18 |               type=click.STRING,
19 |               help='Version of the database')
20 | @click.argument('database',
21 |                 type=click.Path(exists=False))
22 | @click.argument('profile',
23 |                 type=click.File('r'))
24 | @click.argument('alleles',
25 |                 type=click.File('r'), nargs=-1, required=True)
26 | 
27 | 
28 | def cli(force, species, version, database, profile, alleles):
29 |     """Creates a classical MLST DATABASE from a txt PROFILE and fasta ALLELES files."""
30 | 
31 |     try:
32 | 
33 |         if os.path.exists(database):
34 |             if force:
35 |                 open(database, "w").close()
36 |             else:
37 |                 raise exceptions.PyMLSTError("Database alreadly exists, use --force to override it")
38 | 
39 |         with pymlst.open_cla(os.path.abspath(database)) as mlst:
40 |             mlst.create(profile, alleles)
41 |             mlst.add_infos("Custom", species, "", version)
42 |             
43 |     except exceptions.PyMLSTError as err:
44 |         raise click.ClickException(str(err))
45 | 


--------------------------------------------------------------------------------
/pymlst/wg/commands/remove.py:
--------------------------------------------------------------------------------
 1 | 
 2 | import os
 3 | import click
 4 | import logging
 5 | 
 6 | import pymlst
 7 | from pymlst.common import utils, exceptions
 8 | 
 9 | 
10 | @click.command(name="remove")
11 | # @click.option('--item', '-i', default='strains', show_default=True,
12 | #               type=click.Choice(['strains','genes'], case_sensitive=False),
13 | #               help= "Choose the item you wish to remove : strain or genes")
14 | 
15 | @click.option('--strains/--genes',
16 |               default=True, show_default="strains", 
17 |               help= "Choose the item you wish to remove")
18 | @click.option('--file', '-f',type=click.File('r'),
19 |               help='File list of genes or strains to removed on the wgMLST database.')
20 | @click.argument('database', type=click.Path(exists=True), nargs=1)
21 | @click.argument('genes_or_strains', required=False, type=str, nargs=-1)
22 | 
23 | 
24 | def cli(database, strains, genes_or_strains, **kwargs):
25 |     """Removes STRAINS or GENES from a wgMLST DATABASE."""
26 | 
27 |     utils.create_logger()
28 | 
29 |     try:
30 |         with pymlst.open_wg(os.path.abspath(database)) as mlst:
31 |             if strains:
32 |                 logging.info("We will remove one or more strain(s)")
33 |                 mlst.remove_strain(genes_or_strains, **utils.clean_kwargs(kwargs))
34 |        
35 |             else :
36 |                 logging.info("We will remove one or more gene(s)")
37 |                 mlst.remove_gene(genes_or_strains, **utils.clean_kwargs(kwargs))
38 | 
39 |     except exceptions.PyMLSTError as err:
40 |         raise click.ClickException(str(err))
41 | 


--------------------------------------------------------------------------------
/pymlst/cla/commands/search2.py:
--------------------------------------------------------------------------------
 1 | """search CLI command file."""
 2 | 
 3 | import os
 4 | 
 5 | import click
 6 | 
 7 | import pymlst
 8 | from pymlst.common import utils, exceptions
 9 | 
10 | 
11 | @click.command(name='search2')
12 | @click.option('--identity', '-i',
13 |               type=click.FLOAT,
14 |               help='Minimum identity to search gene (default=0.9).')
15 | @click.option('--coverage', '-c',
16 |               type=click.FLOAT,
17 |               help='Minimum coverage to search gene (default=0.95).')
18 | @click.option('--reads', '-r',
19 |               type=click.INT,
20 |               help='Minimum reads coverage to search gene (default=10).')
21 | @click.option('--paired/--single', default=True, 
22 |               help= "Defines type of fastqs files.")
23 | @click.option('--fasta', '-f',
24 |               type=click.File('w'),
25 |               help='Writes fasta file with gene allele.')
26 | @click.option('--output', '-o',
27 |               type=click.File('w'),
28 |               help='Writes ST search result to (default:stdout).')
29 | @click.argument('database',
30 |                 type=click.Path(exists=True))
31 | @click.argument('fastqs',
32 |                 type=click.File('r'), nargs=-1)
33 | 
34 | 
35 | def cli(fastqs, database, **kwargs):
36 |     """Searches ST number from FASTQS(.gz) raw reads using an mlst DATABASE."""
37 |     
38 |     try:
39 |         with pymlst.open_cla(os.path.abspath(database)) as mlst:
40 |             mlst.multi_read(fastqs, **utils.clean_kwargs(kwargs))
41 |                 
42 |     except exceptions.PyMLSTError as err:
43 |         raise click.ClickException(str(err))
44 | 


--------------------------------------------------------------------------------
/pymlst/data/alembic/pytyper/versions/1f96d027f4aa_initial.py:
--------------------------------------------------------------------------------
 1 | """Initial
 2 | 
 3 | Revision ID: 1f96d027f4aa
 4 | Revises: 
 5 | Create Date: 2024-04-29 10:11:29.815236
 6 | 
 7 | """
 8 | from alembic import op
 9 | import sqlalchemy as sa
10 | 
11 | 
12 | # revision identifiers, used by Alembic.
13 | revision = '1f96d027f4aa'
14 | down_revision = None
15 | branch_labels = None
16 | depends_on = None
17 | 
18 | 
19 | def upgrade():
20 |     
21 |     engine = op.get_bind()
22 |     inspector = sa.inspect(engine)
23 |     tables = inspector.get_table_names()
24 | 
25 |     if 'typerSeq' not in tables:
26 |         op.create_table('typerSeq',
27 |             sa.Column('id', sa.Integer(), nullable=False),
28 |             sa.Column('sequence', sa.Text(), nullable=False),
29 |             sa.Column('typing', sa.Text(), nullable=True),
30 |             sa.Column('allele', sa.Text(), nullable=False),
31 |             sa.PrimaryKeyConstraint('id'),
32 |             sa.UniqueConstraint('sequence'))
33 | 
34 |     if 'typerSt' not in tables:
35 |         op.create_table('typerSt',
36 |             sa.Column('id', sa.Integer(), nullable=False),
37 |             sa.Column('st', sa.Text(), nullable=False),
38 |             sa.Column('typing', sa.Text(), nullable=True),
39 |             sa.Column('allele', sa.Text(), nullable=False),
40 |             sa.PrimaryKeyConstraint('id'))
41 | 
42 |     if 'mlst_type' not in tables:
43 |         table = op.create_table('mlst_type',
44 |             sa.Column('name', sa.String(7), nullable=False,
45 |                       primary_key=True))
46 |         data = [ { 'name' : 'pytyper'}]
47 |         op.bulk_insert(table, data)
48 | 
49 | 
50 | def downgrade():
51 |     pass
52 | 


--------------------------------------------------------------------------------
/docs/source/documentation/cgmlst/export_seq.rst:
--------------------------------------------------------------------------------
 1 | .. _cgmlst_export_seq:
 2 | 
 3 | .. toctree::
 4 |     :glob:
 5 | 
 6 | ================
 7 | Export sequences
 8 | ================
 9 | 
10 | You can access the allele sequences present in the database and restrict
11 | the export to a list of genes with the **-f** option.
12 | 
13 | .. note::
14 | 
15 |    The gene list can be obtained with the :ref:`gene <gene_check>` command.
16 | 
17 | .. _cgmlst_export_sequence:
18 | 
19 | Sequence
20 | ========
21 | 
22 | A simple export of the different sequences.
23 | 
24 | .. code-block:: bash
25 | 
26 |    wgMLST sequence -h
27 |    Usage: wgMLST sequence [OPTIONS] DATABASE
28 | 
29 |    Extracts sequences from a wgMLST DATABASE.
30 |    
31 |    Options:
32 |    -o, --output FILENAME  Output result in fasta format (default:stdout).
33 |    -f, --file FILENAME    File containing list of coregenes to extract
34 |                           (default:all coregenes).
35 |    --reference            Returns reference sequence instead of strain alleles.
36 | 
37 | .. _cgmlst_export_msa:
38 | 
39 | MSA
40 | ===
41 | 
42 | A multialign fasta file with concatenated genes. The file can be used
43 | directly for phylogenetic analysis using maximum likelihood or
44 | Bayesian approaches.
45 | 
46 | .. code-block:: bash
47 |    
48 |    wgMLST msa -h
49 |    Usage: wgMLST msa [OPTIONS] DATABASE
50 |    
51 |    Computes Multiple Sequence Alignment from a wgMLST DATABASE.
52 |    
53 |    Options:
54 |    ...
55 |    -r, --realign          Realigns genes with same length (Default:No).
56 | 
57 | 
58 | .. warning::
59 | 
60 |    It is highly recommended to define a limited list of genes to be
61 |    exported for the phylogenetic approach.
62 |    
63 | 
64 | 


--------------------------------------------------------------------------------
/docs/source/development/docs.rst:
--------------------------------------------------------------------------------
 1 | .. _docs:
 2 | 
 3 | **************************
 4 | Building the Documentation
 5 | **************************
 6 | 
 7 | Sphinx
 8 | ------
 9 | 
10 | The documentation in this project is generated by
11 | `Sphinx <http://www.sphinx-doc.org/en/master/>`_
12 | from `reStructuredText <http://docutils.sourceforge.net/rst.html>`_.
13 | 
14 | Ubuntu/Debian
15 | -------------
16 | 
17 | This project started with `Debian 11 <https://www.debian.org/distrib/>`_.
18 | This does not mean that you cannot use another distribution, or even
19 | another operating system, but you may need to perform additional setup
20 | steps to make your builds work.
21 | 
22 | Prerequisites
23 | ^^^^^^^^^^^^^
24 | 
25 | You need to install the Sphinx dependencies using the :ref:`Makefile <make>`
26 | file.
27 | 
28 | .. code-block:: bash
29 | 
30 | 	make install_docs
31 | 
32 | 
33 | The project uses the Sphinx
34 | `LatexBuilder <http://www.sphinx-doc.org/en/master/usage/builders/index.html#sphinx.builders.latex.LaTeXBuilder>`_
35 | to generate a `PDF <https://acrobat.adobe.com/us/en/acrobat/about-adobe-pdf.html>`_
36 | document.  If you are using Debian you will need to install
37 | `texlive <https://www.tug.org/texlive/>`_ and
38 | `latexmk <https://mg.readthedocs.io/latexmk.html>`_.
39 | 
40 | .. code-block:: bash
41 | 	
42 |     sudo apt-get install texlive-latex-recommended \
43 |         texlive-latex-extra \
44 |         texlive-fonts-recommended \
45 |         latexmk
46 | 
47 | 
48 | make
49 | ----
50 | 
51 | Once everything is in place, you can build the documentation using the
52 | :ref:`make docs <make_docs>` target defined in the project's
53 | :ref:`Makefile <make>`.
54 | 
55 | .. code-block::
56 | 
57 |     make docs
58 | 


--------------------------------------------------------------------------------
/pymlst/pytyper/commands/search.py:
--------------------------------------------------------------------------------
 1 | """ search CLI command file. """
 2 | 
 3 | import os
 4 | import click
 5 | 
 6 | import pymlst
 7 | from pymlst.pytyper import model
 8 | from pymlst.pytyper.method import FIM, SPA, CLMT
 9 | from pymlst.common import utils, exceptions
10 | 
@click.command(name="search")
@click.option("--identity", "-i",
              type=click.FLOAT,
              help="Minimum identity to search gene.")
@click.option("--coverage", "-c",
              type=click.FLOAT,
              help="Minimum coverage to search gene.")
@click.option('--fasta', '-f',
              type=click.File('w'),
              help='Writes fasta file with gene allele.')
@click.option('--output', '-o',
              type=click.File('w'),
              help='Writes search result to (default:stdout).')

# Database is initialized automatically without intervention from user
@click.argument('method',
                type=click.Choice([FIM, SPA, CLMT]),
                required=True)
@click.argument('genomes',
                type=click.File('r'),
                required=True,
                nargs=-1)

def cli(method, genomes, **kwargs):
    
    """Searches strain type using specified METHOD for an assembly GENOME.

    fim: fimH typing for Escherichia coli\n
    spa: spa typing for Staphylococcus aureus\n
    clmt: Phylogouping using ClermontTyping method for Escherichia coli
    """
    # NOTE: the docstring above is rendered by click as the command help, so
    # developer documentation lives in comments instead.
    #
    # method  -- one of the FIM / SPA / CLMT choices, passed to open_typer.
    # genomes -- tuple of opened genome files (nargs=-1, at least one).
    # kwargs  -- the identity / coverage / fasta / output options; they are
    #            filtered through utils.clean_kwargs before being forwarded.
    try:
        with pymlst.open_typer(method) as typer:
            typer.multi_search(genomes, **utils.clean_kwargs(kwargs))

    except exceptions.PyMLSTError as err:
        # Surface library errors as a regular CLI error while keeping the
        # original exception chained for debugging.
        raise click.ClickException(str(err)) from err
49 | 
50 | 
51 | 


--------------------------------------------------------------------------------
/docs/source/documentation/cgmlst.rst:
--------------------------------------------------------------------------------
 1 | .. _cgmlst:
 2 | 
 3 | ******************
 4 | cg/wgMLST analysis
 5 | ******************
 6 | 
 7 | A workflow analysis of cg/wgMLST is performed using a series of Python
 8 | scripts described below.
 9 | 
10 | 
11 | .. toctree::
12 |    :maxdepth: 1
13 | 
14 |    cgmlst/initialise
15 |    cgmlst/add
16 |    cgmlst/check
17 |    cgmlst/export_res
18 |    cgmlst/export_seq
19 |    cgmlst/other_analysis
20 | 
21 | 
22 | .. figure:: cgmlst.png
23 |    :alt: pymlst architecture
24 |    :align: center
25 | 	
26 |    pyMLST architecture for cg/wgMLST analysis
27 | 
28 | 
29 | All available commands can be listed using the help function:
30 | 
31 | .. code-block:: bash
32 | 
33 |    wgMLST --help
34 | 
35 |    Usage: wgMLST [OPTIONS] COMMAND [ARGS]...
36 | 
37 |    Whole/Core genome MLST analysis.
38 |    
39 |    Commands:
40 |    add            Adds a strain GENOME to the wgMLST DATABASE.
41 |    add2           Adds a strain from FASTQS(.gz) reads to the wgMLST...
42 |    create         Creates a wgMLST DATABASE from a template COREGENE.
43 |    distance       Extracts a distance matrix from a wgMLST DATABASE.
44 |    gene           Extracts a list of genes from a wgMLST DATABASE.
45 |    import         Creates a wgMLST DATABASE from an online resource.
46 |    mlst           Extracts an MLST table from a wgMLST DATABASE.
47 |    msa            Computes Multiple Sequence Alignment from a wgMLST...
48 |    recombination  Searches potential gene recombinations from wgMLST...
49 |    remove         Removes STRAINS or GENES from a wgMLST DATABASE.
50 |    sequence       Extracts sequences from a wgMLST DATABASE.
51 |    stats          Extracts stats from a wgMLST DATABASE.
52 |    strain         Extracts a list of strains from a wgMLST DATABASE.
53 |    subgraph       Searches group of strains at a DISTANCE threshold.
54 | 


--------------------------------------------------------------------------------
/pymlst/wg/commands/create.py:
--------------------------------------------------------------------------------
 1 | """create CLI command file."""
 2 | 
 3 | import os
 4 | import click
 5 | 
 6 | import pymlst
 7 | from pymlst.common import exceptions, utils
 8 | 
 9 | @click.command(name='create')
10 | @click.option('--force', '-f',
11 |               is_flag=True,
12 |               help='Overwrite alrealdy existing DATABASE')
13 | @click.option('--concatenate', '-c',
14 |               is_flag=True,
15 |               help='Automatically concatenates GENES with duplicated sequences.')
16 | @click.option('--remove', '-r',
17 |               is_flag=True,
18 |               help='Automatically removes GENES with duplicated sequences.')
19 | @click.option('--species', '-s',
20 |               type=click.STRING,
21 |               help='Name of the species')
22 | @click.option('--version', '-V',
23 |               type=click.STRING,
24 |               help='Version of the database')
25 | @click.argument('database', type=click.Path(exists=False))
26 | @click.argument('coregene', type=click.File('r'))
27 | 
28 | def cli(force, species, version, database, **kwargs):
29 |     """Creates a wgMLST DATABASE from a template COREGENE."""
30 |        
31 |     try:
32 | 
33 |         if os.path.exists(database):
34 |             if force:
35 |                 open(database, "w").close()
36 |             else:
37 |                 raise exceptions.PyMLSTError("Database alreadly exists, use --force to override it")
38 | 
39 |         with pymlst.open_wg(os.path.abspath(database)) as mlst:
40 |             mlst.create(**utils.clean_kwargs(kwargs))
41 |             mlst.add_infos("custom", species, version)
42 | 
43 |     except exceptions.DuplicatedGeneSequence as err:
44 |         raise click.UsageError('{}, use -c or -r options to manage it'
45 |                                .format(str(err)))
46 |     except exceptions.PyMLSTError as err:
47 |         raise click.ClickException(str(err))
48 | 


--------------------------------------------------------------------------------
/pymlst/common/mafft.py:
--------------------------------------------------------------------------------
 1 | import tempfile
 2 | import logging
 3 | import sys
 4 | import subprocess
 5 | 
 6 | from Bio import AlignIO
 7 | from io import StringIO
 8 | 
 9 | from pymlst import config
10 | from pymlst.common import utils, exceptions
11 | 
12 | 
def align(genes):
    """Run MAFFT (``--auto``) on *genes* and return the aligned records.

    The sequences are written to a temporary FASTA file with
    ``utils.write_genome``, MAFFT is executed on that file, and the first
    alignment parsed from its standard output is converted with
    ``utils.records_to_dict``.

    :param genes: sequences to align, in the form accepted by
        ``utils.write_genome`` (callers in this module pass a dict).
    :return: the result of ``utils.records_to_dict`` on the alignment, or an
        empty dict when MAFFT could not be run or produced no parsable
        alignment (the failure is logged).
    :raises exceptions.BinaryNotFound: when no MAFFT binary is configured or
        found on the PATH.
    """
    path = config.get_binary_path('mafft')
    if not path:
        raise exceptions.BinaryNotFound('MAFFT binary was not found')

    # sys.stdout can be replaced by an object without a usable encoding
    # (or be None); fall back to UTF-8 so the pipes are always decoded to str.
    encoding = getattr(sys.stdout, 'encoding', None) or 'utf-8'

    # Pre-bind so the error handler can always report something, even when
    # the process could not be started at all.
    errs = ''
    with tempfile.NamedTemporaryFile(mode='w+t', suffix='.fasta') as tmp:
        utils.write_genome(genes, tmp)
        tmp.flush()  # make sure MAFFT sees the full content on disk
        try:
            with subprocess.Popen([path, "--auto", tmp.name],
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE,
                                  encoding=encoding) as proc:
                outs, errs = proc.communicate()
            alignments = next(AlignIO.parse(StringIO(outs), "fasta"))
        except Exception:  # best-effort: log and report "no alignment"
            # Unlike a bare ``except``, this lets KeyboardInterrupt and
            # SystemExit propagate.
            logging.error("MAFFT doesn't finish correctly\n" + (errs or ''))
            return {}
        return utils.records_to_dict(alignments)
33 | 
34 | 
def __first_aligned_position(sequence):
    """Return the index of the first character of *sequence* that is not ``-``.

    *sequence* may be any iterable of characters (including a one-shot
    iterator).  ``-1`` is returned when every character is a gap or the
    iterable is empty.
    """
    non_gap_indexes = (index
                       for index, symbol in enumerate(sequence)
                       if symbol != '-')
    return next(non_gap_indexes, -1)
42 | 
43 | 
def get_aligned_area(query, target):
    """Locate the span of the aligned query row that is not leading/trailing gaps.

    *query* and *target* are aligned together through :func:`align`.

    :return: ``(start, end)`` where ``start`` is the index of the first
        non-gap character of the aligned query and ``end`` is the aligned
        length minus the number of trailing gaps; ``(None, None)`` when the
        alignment does not contain exactly two entries or the aligned query
        consists only of gaps.
    """
    aligned = align({'query': query, 'target': target})
    if len(aligned) != 2:
        return None, None

    query_row = aligned['query']
    first = __first_aligned_position(query_row)
    if first == -1:
        return None, None

    # Scanning the reversed row yields the count of trailing gap characters.
    trailing_gaps = __first_aligned_position(reversed(query_row))
    return first, len(query_row) - trailing_gaps
55 | 


--------------------------------------------------------------------------------
/pymlst/data/alembic/cla/versions/21efe503d07d_initial.py:
--------------------------------------------------------------------------------
 1 | """initial
 2 | 
 3 | Revision ID: 21efe503d07d
 4 | Revises: 
 5 | Create Date: 2021-05-21 15:55:22.181990
 6 | 
 7 | """
 8 | from alembic import op
 9 | import sqlalchemy as sa
10 | 
11 | 
12 | # revision identifiers, used by Alembic.
13 | revision = '21efe503d07d'
14 | down_revision = None
15 | branch_labels = None
16 | depends_on = None
17 | 
18 | 
def upgrade():
    """Create whichever of the ``mlst``, ``sequences`` and ``mlst_type``
    tables are missing from the bound database.

    This is the initial revision created after the PyMLST refactoring.
    Tables that already exist are skipped, so the data of old un-versioned
    databases is left untouched.  A new ``alembic_version`` table is added
    automatically by Alembic to enable versioning.
    """
    existing = sa.inspect(op.get_bind()).get_table_names()

    if 'mlst' not in existing:
        mlst_columns = (
            sa.Column('id', sa.Integer(), nullable=False),
            sa.Column('st', sa.Integer(), nullable=True),
            sa.Column('gene', sa.Text(), nullable=True),
            sa.Column('allele', sa.Integer(), nullable=True),
        )
        op.create_table('mlst', *mlst_columns, sa.PrimaryKeyConstraint('id'))

    if 'sequences' not in existing:
        sequence_columns = (
            sa.Column('id', sa.Integer(), nullable=False),
            sa.Column('sequence', sa.Text(), nullable=True),
            sa.Column('gene', sa.Text(), nullable=True),
            sa.Column('allele', sa.Integer(), nullable=True),
        )
        op.create_table('sequences',
                        *sequence_columns,
                        sa.PrimaryKeyConstraint('id'),
                        sa.UniqueConstraint('sequence'))

    if 'mlst_type' not in existing:
        # Single-row table created with the value 'cla'.
        type_table = op.create_table(
            'mlst_type',
            sa.Column('name', sa.String(length=4), nullable=False,
                      primary_key=True))
        op.bulk_insert(type_table, [{'name': 'cla'}])
51 | 
52 | 
def downgrade():
    """Revert this revision by dropping the ``mlst_type`` table.

    Only ``mlst_type`` is dropped; the ``mlst`` and ``sequences`` tables are
    not touched by this function.
    """
    # Remove the mlst_type table.
    op.drop_table('mlst_type')
56 | 


--------------------------------------------------------------------------------
/pymlst/common/commands/configure.py:
--------------------------------------------------------------------------------
 1 | """configure CLI command file."""
 2 | 
 3 | import click
 4 | 
 5 | from pymlst import config
 6 | 
 7 | CONTEXT_SETTINGS = dict(help_option_names=['-h', '--help'])
 8 | 
 9 | @click.command(context_settings=CONTEXT_SETTINGS)
10 | @click.option('--blat', '-b',
11 |               type=click.Path(exists=True, dir_okay=False),
12 |               help='Blat executable absolute path.')
13 | @click.option('--kma', '-k',
14 |               type=click.Path(exists=True, dir_okay=False),
15 |               help='Kma executable absolute path.')
16 | @click.option('--mafft', '-m',
17 |               type=click.Path(exists=True, dir_okay=False),
18 |               help='Mafft executable absolute path.')
19 | @click.option('--log', '-l',
20 |               type=click.Choice(['DEBUG', 'INFO', 'WARNING', 'ERROR']),
21 |               help='Level of logging, default=INFO')
22 | @click.option('--reset', '-r',
23 |               is_flag=True,
24 |               help='Reset the configuration.')
25 | def cli(blat, kma, mafft, log, reset):
26 |     """Configure executables paths and log level."""
27 |     if reset:
28 |         config.reset_binary_paths()
29 |         config.set_logging_level("INFO")
30 |         click.echo('Resetting the configuration...')
31 | 
32 |     if mafft or blat or kma:
33 |         paths = {}
34 |         if blat:
35 |             paths['blat'] = blat
36 |         if kma:
37 |             paths['kma'] = kma
38 |         if mafft:
39 |             paths['mafft'] = mafft
40 |         config.update_binary_paths(paths)
41 |     if log:
42 |         config.set_logging_level(log)
43 | 
44 |     paths = config.list_binary_paths()
45 |     log = config.get_logging_level()
46 |     click.echo('--- Configuration ---')
47 |     if len(paths) > 0:
48 |         for key, value in paths:
49 |             click.echo(key + ': ' + value)
50 |         click.echo('---------------------')
51 |     click.echo('LOG : ' + log)
52 |     click.echo('---------------------')
53 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | 
  5 | # C extensions
  6 | *.so
  7 | *~
  8 | *.conf
  9 | 
 10 | # Distribution / packaging
 11 | .Python
 12 | env/
 13 | build/
 14 | develop-eggs/
 15 | dist/
 16 | downloads/
 17 | eggs/
 18 | .eggs/
 19 | lib/
 20 | lib64/
 21 | parts/
 22 | sdist/
 23 | var/
 24 | wheels/
 25 | *.egg-info/
 26 | .installed.cfg
 27 | *.egg
 28 | 
 29 | # PyInstaller
 30 | #  Usually these files are written by a python script from a template
 31 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 32 | *.manifest
 33 | *.spec
 34 | 
 35 | # Installer logs
 36 | pip-log.txt
 37 | pip-delete-this-directory.txt
 38 | 
 39 | # Unit test / coverage reports
 40 | htmlcov/
 41 | .tox/
 42 | .coverage
 43 | .coverage.*
 44 | .cache
 45 | nosetests.xml
 46 | coverage.xml
 47 | *.cover
 48 | .hypothesis/
 49 | .pytest_cache
 50 | 
 51 | # Translations
 52 | *.mo
 53 | *.pot
 54 | 
 55 | # Django stuff:
 56 | *.log
 57 | local_settings.py
 58 | 
 59 | # Flask stuff:
 60 | instance/
 61 | .webassets-cache
 62 | 
 63 | # Scrapy stuff:
 64 | .scrapy
 65 | 
 66 | # Sphinx documentation
 67 | docs/_build/
 68 | docs/build
 69 | 
 70 | # PyBuilder
 71 | target/
 72 | 
 73 | # Jupyter Notebook
 74 | .ipynb_checkpoints
 75 | 
 76 | # pyenv
 77 | .python-version
 78 | 
 79 | # celery beat schedule file
 80 | celerybeat-schedule
 81 | 
 82 | # SageMath parsed files
 83 | *.sage.py
 84 | 
 85 | # dotenv
 86 | .env
 87 | 
 88 | # virtualenv
 89 | .venv
 90 | venv/
 91 | ENV/
 92 | 
 93 | # Spyder project settings
 94 | .spyderproject
 95 | .spyproject
 96 | 
 97 | # Rope project settings
 98 | .ropeproject
 99 | 
100 | # mkdocs documentation
101 | /site
102 | 
103 | # mypy
104 | .mypy_cache/
105 | 
106 | # JetBrains
107 | .idea
108 | 
109 | # Sphinx documentation
110 | docs/_build/
111 | 
112 | # PyBuilder
113 | target/
114 | 
115 | #Configuration file
116 | pymlst.conf
117 | 
118 | #Alembic database
119 | pymlst/data/alembic/*/*db
120 | pymlst/data/*db


--------------------------------------------------------------------------------
/docs/source/documentation/installation.rst:
--------------------------------------------------------------------------------
 1 | .. _installation:
 2 | 
 3 | .. toctree::
 4 |     :glob:
 5 | 
 6 | ============
 7 | Installation
 8 | ============
 9 | 
10 | This section provides instructions for installation and configuration of pyMLST.
11 | 
12 | 
13 | Automatic Installation
14 | ======================
15 | 
16 | You can install pyMLST and its dependencies using `bioconda <https://anaconda.org/bioconda/pymlst>`_:
17 | 
18 | .. code-block:: bash
19 | 
20 |    conda install -c conda-forge -c bioconda pymlst
21 | 
22 | Manual Installation
23 | ===================
24 | 
25 | * From `pypi repository <https://pypi.org/project/PyMLST/>`_:
26 | 
27 |   .. code-block:: bash
28 | 
29 | 	 pip install pymlst
30 | 
31 | * From `github source <https://github.com/bvalot/pyMLST/>`_:
32 | 
33 |   .. code-block:: bash
34 | 
35 | 	 virtualenv venv
36 | 	 source venv/bin/activate
37 | 	 make install
38 | 	 make build
39 | 
40 | 
41 | Dependencies
42 | ============
43 | 
44 | PyMLST uses 3 external tools to run alignment:
45 | 
46 | * Mafft (>=7.307)
47 |   
48 |   .. code-block:: bash
49 | 
50 | 	 sudo apt install mafft
51 | 	 
52 | * Blat (v35). You need to compile source or obtain executable at:
53 |   https://genome.ucsc.edu/FAQ/FAQblat.html
54 |   
55 | * kma (>=1.3) You need to compile source from:
56 |   https://bitbucket.org/genomicepidemiology/kma/src/master/
57 | 
58 | 
59 | Configuration
60 | =============
61 | 
62 | Configure the executable locations (if they are not on the PATH) and log level :
63 | 
64 | .. code-block:: bash
65 | 				
66 |    pyMLST configure --help
67 |    Usage: pyMLST configure [OPTIONS]
68 | 
69 |    Configure executables paths and log level.
70 | 
71 |    Options:
72 |    -b, --blat FILE   Blat executable absolute path.
73 |    -k, --kma FILE    Kma executable absolute path.
74 |    -m, --mafft FILE  Mafft executable absolute path.
75 |    -l, --log [DEBUG|INFO|WARNING|ERROR]
76 |                      Level of logging, default=INFO  
77 |    -r, --reset       Reset the configuration.
78 |    --help            Show this message and exit.
79 | 
80 | 
81 | 
82 | 


--------------------------------------------------------------------------------
/docs/source/index.rst:
--------------------------------------------------------------------------------
 1 | .. PyMLST documentation master file
 2 |    You can adapt this file completely to your liking, but it should at least
 3 |    contain the root `toctree` directive.
 4 | 
 5 | ******
 6 | pyMLST
 7 | ******
 8 | 
 9 | .. figure:: logo.png
10 |    :align: center
11 |    :height: 150px
12 |    :alt: pyMLST
13 | 
14 |    python Mlst Local Search Tool
15 | 
16 | Purpose
17 | =======
18 | 
19 | 
20 | Bacterial typing is critical to unraveling the spread of pathogens.
21 | For this purpose, data from next-generation sequencing are
22 | now widely used, with core multilocus sequence typing (cgMLST) or
23 | whole genome multilocus sequence typing (wgMLST) becoming the new
24 | standard. These methods are an extension of the traditional MLST
25 | method, which uses a short list of housekeeping genes. cgMLST and
26 | wgMLST use a large set of genes corresponding to the core or whole
27 | genome. Similar to MLST, each unique sequence corresponds to a
28 | specific allele, and the combination of alleles determines the
29 | sequence type (ST) of the strain.
30 | 
31 | 
32 | We have developed pyMLST to perform this task. Unlike other tools, it
33 | uses a local SQLite database to store allele sequences and MLST
34 | profiles. This allows the collection of strains to be expanded
35 | iteratively. The input can be (i) an assembler-generated draft
36 | genome, (ii) the direct raw data, or (iii) other genomes stored in the
37 | sequence database.
38 | 
39 | 
40 | Documentation
41 | =============
42 | 
43 | .. toctree::
44 |    :maxdepth: 2
45 |    :caption: Users:
46 |    
47 |    documentation/installation
48 |    documentation/cgmlst
49 |    documentation/clamlst
50 |    documentation/pytyper
51 | 
52 |    
53 | .. toctree::
54 |    :maxdepth: 2
55 |    :caption: Developers:
56 | 
57 |    development
58 |    api
59 | 
60 | 
61 | Citation
62 | ========
63 | 
64 | If you use pyMLST, please cite the following paper:
65 | 
66 | Biguenet A. et al., Introduction and benchmarking of pyMLST:
67 | open-source software for assessing bacterial clonality using core
68 | genome MLST. 2023 Microbial Genomics, 9(11), 001126.
69 | doi: `10.1099/mgen.0.001126 <https://doi.org/10.1099/mgen.0.001126>`_
70 | 


--------------------------------------------------------------------------------
/docs/source/documentation/clamlst/search.rst:
--------------------------------------------------------------------------------
 1 | .. _clamlst_search:
 2 | 
 3 | .. toctree::
 4 |     :glob:
 5 | 
 6 | ===============================
 7 | Search MLST profile of a strain
 8 | ===============================
 9 | 
10 | Similarly to wgMLST analysis, you need a draft genome or raw reads
11 | data to find the MLST profile. 
12 | 
13 | .. note::
14 | 
15 |    You can perform MLST searches on multiple genomes or raw reads
16 |    simultaneously.
17 |    
18 |   
19 | 
20 | Genome data
21 | ^^^^^^^^^^^
22 | 
23 | You can search ST from GENOME fasta sequence files.
24 | 
25 | .. code-block:: bash
26 |    
27 |    claMLST search --help
28 |    Usage: claMLST search [OPTIONS] DATABASE GENOMES
29 |    
30 |    Searches ST number for assembly GENOMES using an mlst DATABASE
31 |    
32 |    Options:
33 |    -i, --identity FLOAT   Minimum identity to search gene (default=0.9)
34 |    -c, --coverage FLOAT   Minimum coverage to search gene (default=0.9)
35 |    -f, --fasta FILENAME   Writes fasta file with gene allele
36 |    -o, --output FILENAME  Writes ST search result to (default:stdout)
37 | 
38 | 
39 | Reads data
40 | ^^^^^^^^^^
41 | 
42 | Alternatively, you can search ST directly from raw reads with single
43 | or paired FASTQS(.gz) files.
44 | 
45 | .. code-block:: bash
46 |    
47 |    claMLST search2 --help
48 |    Usage: claMLST search2 [OPTIONS] DATABASE [FASTQS]...
49 |    
50 |    Searches ST number from FASTQS(.gz) raw reads using an mlst DATABASE.
51 |    
52 |    Options:
53 |    -i, --identity FLOAT   Minimum identity to search gene (default=0.9).
54 |    -c, --coverage FLOAT   Minimum coverage to search gene (default=0.95).
55 |    -r, --reads INTEGER    Minimum reads coverage to search gene (default=10).
56 |    --paired / --single    Defines type of fastqs files.
57 |    -f, --fasta FILENAME   Writes fasta file with gene allele.
58 |    -o, --output FILENAME  Writes ST search result to (default:stdout).
59 | 
60 | 
61 | .. note::
62 | 
63 |    The default identity and coverage thresholds are set to 0.9 (0.95 for
64 |    coverage with ``search2``) and can be modulated using the **-i** and **-c** options.
65 | 
66 | .. note::
67 | 
68 |    If new alleles are present, you can obtain their sequences with
69 |    the **-f** option.
70 | 
71 | 


--------------------------------------------------------------------------------
/pymlst/common/blat.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | ##Copyright (c) 2019 Benoit Valot
 5 | ##benoit.valot@univ-fcomte.fr
 6 | ##UMR 6249 Chrono-Environnement, Besançon, France
 7 | ##Licence GPL
 8 | import logging
 9 | 
10 | import subprocess
11 | import tempfile
12 | from io import BytesIO
13 | 
14 | from pymlst import config
15 | from pymlst.common.psl import Psl
16 | from pymlst.common import exceptions
17 | 
18 | 
def run_blat(genome, tmpfile, tmpout, identity, coverage, maxintron=20):
    """Run BLAT and collect the reported alignments per gene.

    BLAT is invoked with ``-fine``, ``-maxIntron`` and ``-minIdentity`` on
    ``genome.name`` / ``tmpfile.name`` and writes its result to
    ``tmpout.name``, which is then parsed line by line into ``Psl`` objects.

    :param genome: object whose ``name`` attribute is passed to BLAT as the
        first file argument.
    :param tmpfile: object whose ``name`` attribute is passed to BLAT as the
        second file argument.
    :param tmpout: object whose ``name`` is the PSL output path.
    :param identity: minimum identity; multiplied by 100 for ``-minIdentity``.
    :param coverage: minimum ``Psl.coverage`` for a hit to be kept (hits with
        a coverage above 1 are discarded as well).
    :param maxintron: value given to BLAT's ``-maxIntron`` option.
    :return: dict mapping ``psl.gene_id()`` to the list of kept ``Psl`` hits.
    :raises exceptions.BinaryNotFound: when the BLAT binary cannot be located.
    :raises exceptions.PyMLSTError: when BLAT writes anything to stderr.
    :raises exceptions.CoreGenomePathNotFound: when no hit passes the filter.
    """
    path = config.get_binary_path('blat')
    if path is None:
        raise exceptions.BinaryNotFound('BLAT binary was not found')

    command = [path, '-maxIntron=' + str(maxintron), '-fine',
               '-minIdentity=' + str(identity * 100),
               genome.name, tmpfile.name, tmpout.name]
    proc = subprocess.Popen(command, stderr=subprocess.PIPE, stdout=subprocess.PIPE)

    output, error = proc.communicate()
    # Tool output is only logged, so undecodable bytes are replaced instead
    # of aborting the run with a UnicodeDecodeError.
    for line in BytesIO(output):
        logging.debug(line.decode(errors='replace').rstrip())
    have_error = False
    for line in BytesIO(error):
        # Any stderr output is treated as a failure.
        have_error = True
        logging.error(line.decode(errors='replace').rstrip())
    if have_error:
        raise exceptions.PyMLSTError(
            'An error occurred while running BLAT')

    genes = {}
    # Context manager so the PSL handle is closed deterministically.
    with open(tmpout.name, 'r') as psl_file:
        for line in psl_file:
            # Skip every line whose first field is not an integer
            # (including blank lines).
            try:
                int(line.split()[0])
            except (ValueError, IndexError):
                continue
            psl = Psl(line)
            if coverage <= psl.coverage <= 1:
                genes.setdefault(psl.gene_id(), []).append(psl)
    if len(genes) == 0:
        raise exceptions.CoreGenomePathNotFound(
            'No path was found for the core genome')
    return genes
53 | 
54 | 
def blat_tmp():
    """Create the two temporary files used for a BLAT run.

    :return: ``(fasta_file, psl_file)``: an open, writable ``.fasta``
        temporary file and an already closed ``.psl`` temporary file.  Both
        are created with ``delete=False``, so they stay on disk after being
        closed.
    """
    shared_options = {'mode': 'w+t', 'delete': False}
    fasta_handle = tempfile.NamedTemporaryFile(suffix='.fasta', **shared_options)
    # Leaving the ``with`` block closes the PSL file, which is only needed
    # for its name at this point.
    with tempfile.NamedTemporaryFile(suffix='.psl', **shared_options) as psl_handle:
        pass
    return fasta_handle, psl_handle
61 | 


--------------------------------------------------------------------------------
/docs/source/documentation/cgmlst/add.rst:
--------------------------------------------------------------------------------
 1 | .. _cgmlst_add:
 2 | 
 3 | .. toctree::
 4 |     :glob:
 5 | 
 6 | ===========================
 7 | Add strains to the database
 8 | ===========================
 9 | 
10 | Next, you need to iteratively add your strains to the database. You
11 | can use a draft genome (we recommend using `Spades
12 | <http://cab.spbu.ru/software/spades/>`_ for assembly).
13 | You can also add a reference genome for comparison.
14 | 
15 | 
16 | .. note::
17 | 
18 |    You need to add each strain one by one to the database. You can
19 |    specify strain name using **-s** option.
20 | 
21 | 
22 | Genome data
23 | ^^^^^^^^^^^
24 | 
25 | You can add strains using GENOME fasta sequence file.
26 | 
27 | .. code-block:: bash
28 |    
29 |    wgMLST add --help
30 |    Usage: wgMLST add [OPTIONS] DATABASE GENOME
31 |    
32 |    Adds a strain GENOME to the wgMLST DATABASE.
33 |    
34 |    Options:
35 |    -s, --strain TEXT     Name of the strain (default:genome name)
36 |    -i, --identity FLOAT  Minimum identity to search gene (default=0.95)
37 |    -c, --coverage FLOAT  Minimum coverage to search gene (default=0.9)
38 | 
39 | 
40 | Reads data
41 | ^^^^^^^^^^
42 | 
43 | Alternatively, you can also add strains from raw reads directly with
44 | single or paired FASTQS(.gz) files.
45 | 
46 | .. code-block:: bash
47 |    
48 |    wgMLST add2 --help
49 |    Usage: wgMLST add2 [OPTIONS] DATABASE [FASTQS]...
50 |    
51 |    Adds a strain from FASTQS(.gz) reads to the wgMLST DATABASE.
52 |    
53 |    Options:
54 |    -s, --strain TEXT     Name of the strain (default:genome name).
55 |    -i, --identity FLOAT  Minimum identity to search gene (default=0.95).
56 |    -c, --coverage FLOAT  Minimum coverage to search gene (default=0.9).
57 |    -r, --reads INTEGER   Minimum reads coverage to search a gene
58 |                          (default=10).
59 | 
60 | 
61 | .. note::
62 | 
63 |    The default identity and coverage thresholds are set to 0.95 and 0.9,
64 |    respectively, and can be modulated with the **-i** and **-c** options.
65 | 
66 | 
67 | .. warning::
68 | 
69 |    Carefully check that the allele calling has been performed
70 |    correctly for each genome. Check the number of genes found for each
71 |    strain using the :ref:`strain command <strain_check>`.
72 |    
73 | 


--------------------------------------------------------------------------------
/pymlst/config.py:
--------------------------------------------------------------------------------
 1 | import configparser
 2 | import os
 3 | import shutil
 4 | 
# Absolute path of the directory containing this module.
_ROOT = os.path.abspath(os.path.dirname(__file__))
# Location of the configuration file, inside the package ``data`` directory.
_CONF_PATH = os.path.join(_ROOT, 'data', 'pymlst.conf')

# Section of the configuration file holding the binary paths.
_BIN_SECTION = 'BINARIES'
# Section and option names used to store the logging level.
_LOG_SECTION = 'LOGGING'
_LOG_LEVEL = 'Log Level'
11 | 
def get_data(path):
    """Return *path* joined onto the package ``data`` directory."""
    data_dir = os.path.join(_ROOT, 'data')
    return os.path.join(data_dir, path)
14 | 
def write_config(conf):
    """Write *conf* to the configuration file, replacing its previous content."""
    with open(_CONF_PATH, mode='w') as handle:
        conf.write(handle)
18 | 
def get_config():
    """Load the configuration file into a new ``ConfigParser``.

    An empty parser is returned when the file does not exist.
    """
    parser = configparser.ConfigParser()
    # ConfigParser.read() silently skips files it cannot open, so a missing
    # configuration file simply leaves the parser empty.
    parser.read(_CONF_PATH)
    return parser
25 | 
26 | 
def update_binary_paths(paths):
    """Store the given binary paths in the configuration file.

    :param paths: mapping of binary name to path; each path is stored in its
        absolute form.
    """
    conf = get_config()

    if not conf.has_section(_BIN_SECTION):
        conf.add_section(_BIN_SECTION)

    binaries = conf[_BIN_SECTION]
    for name, location in paths.items():
        binaries[name] = os.path.abspath(location)
    write_config(conf)
37 | 
38 | 
def reset_binary_paths():
    """Drop the binaries section from the configuration and save the file.

    Other sections (such as the logging one) are kept as they are.
    """
    conf = get_config()
    # remove_section() is a no-op returning False when the section is absent.
    conf.remove_section(_BIN_SECTION)
    write_config(conf)
45 | 
46 | 
def get_binary_path(bin_name):
    """Return the path of the binary *bin_name*.

    The path stored in the configuration file takes precedence; otherwise the
    binary is looked up on the PATH with ``shutil.which``, which returns
    ``None`` when nothing is found.
    """
    conf = get_config()
    if not conf.has_option(_BIN_SECTION, bin_name):
        return shutil.which(bin_name)
    return conf.get(_BIN_SECTION, bin_name)
53 | 
54 | 
def list_binary_paths():
    """Return the ``(name, path)`` pairs stored in the binaries section.

    An empty list is returned when the configuration has no such section.
    """
    conf = get_config()
    return conf.items(_BIN_SECTION) if conf.has_section(_BIN_SECTION) else []
61 | 
def get_logging_level():
    """Return the configured log level name.

    ``"INFO"`` is returned when the configuration has no logging section or
    when that section lacks the log-level option (instead of letting
    ``configparser.NoOptionError`` escape).
    """
    conf = get_config()
    return conf.get(_LOG_SECTION, _LOG_LEVEL, fallback="INFO")
68 |     
def set_logging_level(levelname):
    """Store *levelname* as the log level in the configuration file.

    The logging section is created first when it does not exist yet.
    """
    conf = get_config()
    if not conf.has_section(_LOG_SECTION):
        conf.add_section(_LOG_SECTION)
    conf.set(_LOG_SECTION, _LOG_LEVEL, levelname)
    write_config(conf)
77 | 


--------------------------------------------------------------------------------
/docs/source/documentation/pytyper/search.rst:
--------------------------------------------------------------------------------
 1 | .. _pytyper_search:
 2 | 
 3 | .. toctree::
 4 |     :glob:
 5 | 
 6 | =======================================
 7 | Search other typing profile of a strain
 8 | =======================================
 9 | 
10 | In addition to classical MLST, you can search for other typing METHOD
11 | using a draft genome or raw reads.
12 | 
13 | 
14 | Typing method
15 | ^^^^^^^^^^^^^
16 | 
17 | fimH typing
18 |    FimH typing is based on the allelic sequence of the fimH gene in the
19 |    species *Escherichia coli*
20 |    (`Dias et al, 2010 <https://journals.asm.org/doi/10.1128/jcm.01858-09>`_).
21 |    Allelic sequences were downloaded from `CGE <https://bitbucket.org/genomicepidemiology/fimtyper_db/>`_.
22 | 
23 | spa typing
24 |    Spa typing is based on the repetitions polymorphism present on the protein
25 |    A gene (spa) in the species *Staphylococcus aureus*
26 |    (`Frénay et al, 1996 <https://link.springer.com/article/10.1007/BF01586186>`_).
27 |    Repetition and sequence type definitions were downloaded from
28 |    `Ridom <https://spa.ridom.de/>`_.
29 | 
30 | Clermont typing
31 |    Clermont phylogrouping is based on the presence/absence of 4 different genes
32 |    in the species *Escherichia coli*
33 |    (`Clermont et al, 2012 <https://enviromicro-journals.onlinelibrary.wiley.com/doi/10.1111/1758-2229.12019>`_).   
34 | 
35 | 
36 | Genome data
37 | ^^^^^^^^^^^
38 | 
39 | You can search typing METHOD from GENOME fasta sequence files.
40 | 
41 | .. code-block:: bash
42 | 
43 |    pyTyper search --help
44 |    Usage: pyTyper search [OPTIONS] {fim|spa|clmt} GENOMES...
45 | 
46 |    Searches strain type using specified METHOD for an assembly GENOME.
47 | 
48 |    fim: fimH typing for Escherichia coli
49 |    spa: spa typing for Staphylococcus aureus
50 |    clmt: Phylogouping using ClermontTyping method for Escherichia coli
51 | 
52 |    Options:
53 |    -i, --identity FLOAT   Minimum identity to search gene.
54 |    -c, --coverage FLOAT   Minimum coverage to search gene.
55 |    -f, --fasta FILENAME   Writes fasta file with gene allele.
56 |    -o, --output FILENAME  Writes search result to (default:stdout).
57 | 
58 | 
59 | .. note::
60 | 
61 |    If new alleles are present, or if you want the sequences targeted by the typing method in your strains,
62 |    you can obtain their sequences with the **-f** option.
63 | 
64 | .. note::
65 | 
66 |    You can perform searches on multiple genomes simultaneously.
67 | 


--------------------------------------------------------------------------------
/docs/source/development/make.rst:
--------------------------------------------------------------------------------
  1 | .. _make:
  2 | 
  3 | .. _using-the-makefile:
  4 | 
  5 | Using the `Makefile`
  6 | ====================
  7 | 
  8 | This project includes a `Makefile <https://www.gnu.org/software/make/>`_
  9 | that you can use to perform common tasks such as running tests and building
 10 | documentation.
 11 | 
 12 | Targets
 13 | -------
 14 | 
 15 | This section contains a brief description of the targets defined in the
 16 | ``Makefile``.
 17 | 
 18 | ``clean``
 19 | ^^^^^^^^^
 20 | 
 21 | Removes generated packages, documentation, temporary files, *etc*.
 22 | 
 23 | .. _make_lint:
 24 | 
 25 | ``lint``
 26 | ^^^^^^^^
 27 | 
 28 | Runs `pylint <https://www.pylint.org/>`_ against the project files.
 29 | 
 30 | .. _make_test:
 31 | 
 32 | ``test``
 33 | ^^^^^^^^
 34 | 
 35 | Runs the unit tests.
 36 | 
 37 | ``quicktest``
 38 | ^^^^^^^^^^^^^
 39 | 
 40 | Runs the unit tests without performing pre-test validations (like
 41 | :ref:`linting <make_lint>`).
 42 | 
 43 | .. _make_docs:
 44 | 
 45 | ``docs``
 46 | ^^^^^^^^
 47 | 
 48 | Builds the documentation for production.
 49 | 
 50 | .. note::
 51 | 
 52 |     You can also build the documents directly, bypassing validations like
 53 |     :ref:`linting <make_lint>` and :ref:`testing <make_test>` using
 54 |     `Sphinx Makefile <https://github.com/mapnik/sphinx-docs/blob/master/Makefile>`_
 55 |     directly.
 56 | 
 57 |     .. code-block:: bash
 58 | 
 59 |         cd docs
 60 |         make clean && make html
 61 |         make latexpdf
 62 | 
 63 | .. _make_answers:
 64 | 
 65 | ``answers``
 66 | ^^^^^^^^^^^
 67 | 
 68 | Performs a quick build of the documentation and opens it in your browser.
 69 | 
 70 | ``package``
 71 | ^^^^^^^^^^^
 72 | 
 73 | Builds the package for publishing.
 74 | 
 75 | .. _make-publish:
 76 | 
 77 | ``publish``
 78 | ^^^^^^^^^^^
 79 | 
 80 | Publishes the package to your repository.
 81 | 
 82 | ``build``
 83 | ^^^^^^^^^
 84 | 
 85 | Installs the current project locally so that you may run the command-line application.
 86 | 
 87 | ``venv``
 88 | ^^^^^^^^
 89 | 
 90 | Creates a virtual environment.
 91 | 
 92 | ``install``
 93 | ^^^^^^^^^^^
 94 | 
 95 | Installs (or updates) project dependencies.
 96 | 
 97 | ``install_docs``
 98 | ^^^^^^^^^^^^^^^^
 99 | 
100 | Installs (or updates) documentation dependencies.
101 | 
102 | ``licenses``
103 | ^^^^^^^^^^^^
104 | 
105 | Generates a report of the project's dependencies and their respective licenses.
106 | 
107 | .. note::
108 | 
109 |     If project dependencies change, please update this documentation.
110 | 


--------------------------------------------------------------------------------
/pymlst/data/alembic/wg/env.py:
--------------------------------------------------------------------------------
 1 | from logging.config import fileConfig
 2 | 
 3 | from sqlalchemy import engine_from_config
 4 | from sqlalchemy import pool
 5 | 
 6 | from alembic import context
 7 | 
 8 | from pymlst.wg import model
 9 | 
# Alembic Config object: gives access to the values of the .ini file in use.

config = context.config

# Python logging is not configured from the .ini file by this script:
# the fileConfig call below is left commented out.
# fileConfig(config.config_file_name)

# MetaData object of pymlst.wg.model, handed to Alembic as the migration
# target (this is what 'autogenerate' support compares against).
# from myapp import mymodel
# target_metadata = mymodel.Base.metadata
target_metadata = model.metadata

# other values from the config, defined by the needs of env.py,
# can be acquired:
# my_important_option = config.get_main_option("my_important_option")
# ... etc.
29 | 
30 | 
def run_migrations_offline():
    """Run migrations in 'offline' mode.

    Only the database URL from the .ini file is given to the context; no
    Engine is created, so no DBAPI has to be available.

    Calls to context.execute() here emit the given string to the
    script output.

    """
    offline_options = dict(
        url=config.get_main_option("sqlalchemy.url"),
        target_metadata=target_metadata,
        literal_binds=True,
        dialect_opts={"paramstyle": "named"},
    )
    context.configure(**offline_options)

    with context.begin_transaction():
        context.run_migrations()
53 | 
54 | 
def _run_migrations_with(connection):
    """Configure the Alembic context on ``connection`` and run the migrations."""
    context.configure(
        connection=connection,
        target_metadata=target_metadata
    )

    with context.begin_transaction():
        context.run_migrations()


def run_migrations_online():
    """Run migrations in 'online' mode.

    The caller may hand over a connectable through
    ``config.attributes['connection']``; otherwise an Engine is created
    from the ``sqlalchemy.*`` options of the active .ini section.

    The supplied object may be an Engine or an already opened Connection.
    SQLAlchemy 2.0 removed ``Connection.connect()`` (branched connections),
    so an object without a ``connect`` method is used directly instead of
    being connected again.

    """
    connectable = config.attributes.get('connection', None)

    if connectable is None:
        # only create Engine if we don't have a Connection
        # from the outside
        connectable = engine_from_config(
            config.get_section(config.config_ini_section),
            prefix='sqlalchemy.',
            poolclass=pool.NullPool)

    if hasattr(connectable, 'connect'):
        # Engine, or a Connection on SQLAlchemy versions that still support
        # branching: same behaviour as before.
        with connectable.connect() as connection:
            _run_migrations_with(connection)
    else:
        # Connection without connect(): run on it as-is and leave
        # closing it to whoever supplied it.
        _run_migrations_with(connectable)
83 | 
84 | 
# Pick the migration runner matching the mode Alembic reports.
if not context.is_offline_mode():
    run_migrations_online()
else:
    run_migrations_offline()
89 | 


--------------------------------------------------------------------------------
/pymlst/wg/commands/import.py:
--------------------------------------------------------------------------------
 1 | """import CLI command file."""
 2 | 
 3 | import logging
 4 | import os
 5 | import tempfile
 6 | 
 7 | import click
 8 | import requests
 9 | 
10 | import pymlst
11 | from pymlst.common import utils, web, exceptions
12 | 
13 | 
@click.command(name='import')
@click.option('--force', '-f',
              is_flag=True,
              help='Overwrite alrealdy existing DATABASE')
@click.option('--prompt/--no-prompt',
              default=True,
              help='Do not prompt if multiple '
                   'choices are found, fail instead.')
@click.argument('database',
                type=click.Path(exists=False))
@click.argument('species',
                type=click.STRING,
                nargs=-1)
def cli(force, prompt, database, species):
    """Creates a wgMLST DATABASE from an online resource.

    The research can be filtered by adding a SPECIES name."""
    # NOTE: the docstring above is the help text Click displays, so it is
    # kept verbatim.
    #
    # Flow: refuse (or truncate, with --force) an existing database file,
    # resolve the download URL, download the core genome into a temporary
    # file, then create the database from it and record where it came from.
    # The temporary file is removed whatever the outcome, and known
    # network / structure / PyMLST errors become click.ClickException.

    utils.create_logger()

    tmp_path = None  # set once the temporary download file exists
    try:

        if os.path.exists(database):
            if not force:
                raise exceptions.PyMLSTError("Database alreadly exists, use --force to override it")
            # --force: empty the existing file before rebuilding it.
            with open(database, "w"):
                pass

        url = web.retrieve_cgmlst(' '.join(species), prompt)

        if url is None:
            logging.info('No choice selected')
            return

        logging.info('Downloading the core genome...')

        # delete=False because the file is used by name after being closed;
        # it is removed explicitly in the ``finally`` clause below.
        with tempfile.NamedTemporaryFile('w+', delete=False) as tmp:
            tmp_path = tmp.name
            skipped = web.get_cgmlst_file(url, tmp)

        if len(skipped) > 0:
            logging.info('Skipped the following malformed file(s): %s', ', '.join(skipped))
        infos = web.get_cgmlst_info(url)
        with pymlst.open_wg(os.path.abspath(database)) as mlst:
            mlst.create(tmp_path)
            mlst.add_infos("cgmlst.org", infos[0], infos[1])

    except requests.exceptions.HTTPError as err:
        raise click.ClickException('Could not retrieve online data') from err
    except requests.exceptions.ConnectionError as err:
        raise click.ClickException('Could not access to the server, please verify your internet connection') from err
    except requests.exceptions.Timeout as err:
        raise click.ClickException('The server took too long to respond') from err
    except exceptions.StructureError as err:
        raise click.ClickException('It seems like the structure of the website/API changed '
                                   'since this application was developed.') from err
    except exceptions.PyMLSTError as err:
        raise click.ClickException(str(err)) from err
    finally:
        # Do not leave the downloaded core genome behind in the temp dir.
        if tmp_path is not None and os.path.exists(tmp_path):
            os.remove(tmp_path)
72 | 


--------------------------------------------------------------------------------
/pymlst/data/alembic/cla/env.py:
--------------------------------------------------------------------------------
 1 | from logging.config import fileConfig
 2 | 
 3 | from sqlalchemy import engine_from_config
 4 | from sqlalchemy import pool
 5 | 
 6 | from alembic import context
 7 | 
 8 | from pymlst.cla import model
 9 | 
# Alembic Config object: gives access to the values of the .ini file in use.

config = context.config

# Python logging is not configured from the .ini file by this script:
# the fileConfig call below is left commented out.
# fileConfig(config.config_file_name)

# MetaData object of pymlst.cla.model, handed to Alembic as the migration
# target (this is what 'autogenerate' support compares against).
# from myapp import mymodel
# target_metadata = mymodel.Base.metadata
target_metadata = model.metadata

# other values from the config, defined by the needs of env.py,
# can be acquired:
# my_important_option = config.get_main_option("my_important_option")
# ... etc.
29 | 
30 | 
def run_migrations_offline():
    """Run migrations in 'offline' mode.

    Only the database URL from the .ini file is given to the context; no
    Engine is created, so no DBAPI has to be available.

    Calls to context.execute() here emit the given string to the
    script output.

    """
    offline_options = dict(
        url=config.get_main_option("sqlalchemy.url"),
        target_metadata=target_metadata,
        literal_binds=True,
        dialect_opts={"paramstyle": "named"},
    )
    context.configure(**offline_options)

    with context.begin_transaction():
        context.run_migrations()
53 | 
54 | 
def _run_migrations_with(connection):
    """Configure the Alembic context on ``connection`` and run the migrations."""
    context.configure(
        connection=connection,
        target_metadata=target_metadata
    )

    with context.begin_transaction():
        context.run_migrations()


def run_migrations_online():
    """Run migrations in 'online' mode.

    The caller may hand over a connectable through
    ``config.attributes['connection']``; otherwise an Engine is created
    from the ``sqlalchemy.*`` options of the active .ini section.

    The supplied object may be an Engine or an already opened Connection.
    SQLAlchemy 2.0 removed ``Connection.connect()`` (branched connections),
    so an object without a ``connect`` method is used directly instead of
    being connected again.

    """
    connectable = config.attributes.get('connection', None)

    if connectable is None:
        # only create Engine if we don't have a Connection
        # from the outside
        connectable = engine_from_config(
            config.get_section(config.config_ini_section),
            prefix='sqlalchemy.',
            poolclass=pool.NullPool)

    if hasattr(connectable, 'connect'):
        # Engine, or a Connection on SQLAlchemy versions that still support
        # branching: same behaviour as before.
        with connectable.connect() as connection:
            _run_migrations_with(connection)
    else:
        # Connection without connect(): run on it as-is and leave
        # closing it to whoever supplied it.
        _run_migrations_with(connectable)
83 | 
84 | 
# Pick the migration runner matching the mode Alembic reports.
if not context.is_offline_mode():
    run_migrations_online()
else:
    run_migrations_offline()
89 | 


--------------------------------------------------------------------------------
/pymlst/data/alembic/pytyper/env.py:
--------------------------------------------------------------------------------
 1 | from logging.config import fileConfig
 2 | 
 3 | from sqlalchemy import engine_from_config
 4 | from sqlalchemy import pool
 5 | 
 6 | from alembic import context
 7 | 
 8 | from pymlst.pytyper import model
 9 | 
# Alembic Config object: gives access to the values of the .ini file in use.

config = context.config

# Python logging is not configured from the .ini file by this script:
# the fileConfig call below is left commented out.
# fileConfig(config.config_file_name)

# MetaData object of pymlst.pytyper.model, handed to Alembic as the
# migration target (this is what 'autogenerate' support compares against).
# from myapp import mymodel
# target_metadata = mymodel.Base.metadata
target_metadata = model.metadata

# other values from the config, defined by the needs of env.py,
# can be acquired:
# my_important_option = config.get_main_option("my_important_option")
# ... etc.
29 | 
30 | 
def run_migrations_offline():
    """Run migrations in 'offline' mode.

    Only the database URL from the .ini file is given to the context; no
    Engine is created, so no DBAPI has to be available.

    Calls to context.execute() here emit the given string to the
    script output.

    """
    offline_options = dict(
        url=config.get_main_option("sqlalchemy.url"),
        target_metadata=target_metadata,
        literal_binds=True,
        dialect_opts={"paramstyle": "named"},
    )
    context.configure(**offline_options)

    with context.begin_transaction():
        context.run_migrations()
53 | 
54 | 
def _run_migrations_with(connection):
    """Configure the Alembic context on ``connection`` and run the migrations."""
    context.configure(
        connection=connection,
        target_metadata=target_metadata
    )

    with context.begin_transaction():
        context.run_migrations()


def run_migrations_online():
    """Run migrations in 'online' mode.

    The caller may hand over a connectable through
    ``config.attributes['connection']``; otherwise an Engine is created
    from the ``sqlalchemy.*`` options of the active .ini section.

    The supplied object may be an Engine or an already opened Connection.
    SQLAlchemy 2.0 removed ``Connection.connect()`` (branched connections),
    so an object without a ``connect`` method is used directly instead of
    being connected again.

    """
    connectable = config.attributes.get('connection', None)

    if connectable is None:
        # only create Engine if we don't have a Connection
        # from the outside
        connectable = engine_from_config(
            config.get_section(config.config_ini_section),
            prefix='sqlalchemy.',
            poolclass=pool.NullPool)

    if hasattr(connectable, 'connect'):
        # Engine, or a Connection on SQLAlchemy versions that still support
        # branching: same behaviour as before.
        with connectable.connect() as connection:
            _run_migrations_with(connection)
    else:
        # Connection without connect(): run on it as-is and leave
        # closing it to whoever supplied it.
        _run_migrations_with(connectable)
83 | 
84 | 
# Pick the migration runner matching the mode Alembic reports.
if not context.is_offline_mode():
    run_migrations_online()
else:
    run_migrations_offline()
89 | 


--------------------------------------------------------------------------------
/docs/source/documentation/cgmlst/other_analysis.rst:
--------------------------------------------------------------------------------
 1 | .. _cgmlst_other_analysis:
 2 | 
 3 | .. toctree::
 4 |     :glob:
 5 | 
 6 | ========================
 7 | Other analysis available
 8 | ========================
 9 | 
10 | From the results obtained with cg/wgMLST analysis, you can proceed to
11 | further analysis.
12 | 
13 | .. _cgmlst_other_subgraph:
14 | 
15 | Subgraph
16 | ========
17 | 
18 | The subgraph command performs a simple hierarchical clustering to
19 | group strains with a distance below the threshold.
20 | 
21 | .. figure:: subgraph.png
22 |    :alt: Subgraph representation
23 |    :align: center
24 |    :height: 400px
25 | 	
26 |    MST representation of subgraph analysis at a threshold of 10.
27 | 
28 | 
29 | You need:
30 | 
31 | :DISTANCE: The distance matrix obtained with :ref:`distance <cgmlst_export_distance>` command.
32 | 
33 | .. code-block:: bash
34 | 
35 |    wgMLST subgraph -h
36 |    Usage: wgMLST subgraph [OPTIONS] DISTANCE
37 |    
38 |    Searches group of strains at a DISTANCE threshold.
39 |    
40 |    Options:
41 |    -o, --output FILENAME             Output group files (default:stdout).
42 |    -t, --threshold INTEGER           Minimum distance to conserve for extraction
43 |                                      of group (default:50).
44 |    -e, --export [list|count|group]   Export type (default:list).
45 | 
46 |    
47 | .. _cgmlst_other_recombination:
48 | 
49 | Recombination
50 | =============
51 | 
52 | The recombination command determines the number of different positions
53 | in the multiple alignment. You can use the result to define a
54 | threshold and the final list of genes without potential recombination.
55 | 
56 | .. code::
57 | 
58 |    #Gene   Mutation  Lenght  mutation per 100 base
59 |    PA0001     1       1545    0.064
60 |    PA0002     1       1104    0.090
61 |    PA0004     1       2421    0.041
62 |    PA0010     1       552     0.181
63 |    PA0011     0       888     0.0
64 |    PA0022     1       558     0.179
65 |    PA0038     1       216     0.462
66 |    PA0062     1       417     0.239
67 |    PA0065     1       666     0.150
68 |    ...
69 | 
70 | 
71 | You need:
72 | 
73 | :GENES: List of genes used for export MSA and obtained with :ref:`gene
74 | 		<gene_check>` command.
75 | :ALIGNMENT: The Multiple Sequence Alignment obtained with :ref:`msa <cgmlst_export_msa>` command.
76 | 
77 | .. code-block:: bash
78 |    
79 |    wgMLST recombination -h
80 |    Usage: wgMLST recombination [OPTIONS] GENES ALIGNMENT
81 |    
82 |    Searches potential gene recombinations from wgMLST database export.
83 |    
84 |    Options:
85 |    -o, --output FILENAME  Output number of variations by genes
86 |                           (default:stdout).
87 |    
88 | .. warning::
89 | 
90 |    The algorithm is designed to find recombination in closely related strains
91 |    and may not work correctly on more diverse STs.
92 | 
93 | 


--------------------------------------------------------------------------------
/docs/source/documentation/cgmlst/export_res.rst:
--------------------------------------------------------------------------------
 1 | .. _cgmlst_export_res:
 2 | 
 3 | .. toctree::
 4 |     :glob:
 5 | 
 6 | ==============
 7 | Export results
 8 | ==============
 9 | 
10 | When the database is complete and :ref:`validated <cgmlst_check>`, you
11 | can export results for further analysis.
12 | 
13 | .. _cgmlst_export_distance:
14 | 
15 | Distance
16 | ========
17 | 
18 | A matrix of cgMLST distances can be computed from the database and
19 | defined as the number of different alleles between each pair of two
20 | strains, omitting the missing data.
21 | 
22 | .. code::
23 |    
24 |    #Strain  33_PA   34_PA   35_PA   61_PA   84_PA   98_PA 
25 |    33_PA     0       39      37      25      20      23   
26 |    34_PA     39      0       5       33      35      39   
27 |    35_PA     37      5       0       31      33      37   
28 |    61_PA     25      33      31      0       22      27   
29 |    84_PA     20      35      33      22      0       21   
30 |    98_PA     23      39      37      27      21      0  
31 | 
32 | .. code-block:: bash
33 | 				
34 |    wgMLST distance --help
35 |    Usage: wgMLST distance [OPTIONS] DATABASE
36 |    
37 |    Extracts a distance matrix from a wgMLST DATABASE.
38 |    Options:
39 |    -m, --mincover INTEGER  Minimun number of strains found to retain a gene
40 |                            (default:0)
41 |    -k, --keep              Keeps only gene with different alleles (omit
42 |                            missing).
43 |    -d, --duplicate         Keeps duplicate genes (default remove).
44 |    -V, --inverse           Keeps only gene that do not match the filter of
45 |                            mincover or keep options.
46 |    
47 | .. warning::
48 | 
49 |    To have correct distance calculation, you need to filter genes with
50 |    low frequency observations. See :ref:`validate <m_option_check>` for
51 |    more information on the **-m** option.
52 | 
53 | .. _cgmlst_export_mlst:
54 |   
55 | MLST
56 | ====
57 | 
58 | The MLST profiles can also be exported. Each number indicates the
59 | allele *id* in the database. A formatted version compatible with GrapeTree
60 | can also be requested.
61 | 
62 | .. code::
63 |    
64 |    #GeneId 33_PA   34_PA   35_PA   61_PA   84_PA   98_PA
65 |    PA0120  3918    3918    3918    3918    3918    3918 
66 |    PA0527  3963    3963    3963    3963    3963    3963 
67 |    PA0691  3954    3954    3954    8945    3954    3954
68 |    PA0935  3910    3910    3910    3910    3910    3910
69 |    ...
70 | 
71 | 
72 | .. code-block:: bash
73 | 				
74 |    wgMLST mlst --help
75 |    Usage: wgMLST mlst [OPTIONS] DATABASE
76 | 
77 |    Extracts an MLST table from a wgMLST DATABASE.
78 |    Options:
79 |    ...
80 |    -f, --form [default|grapetree]  Specify format of output
81 | 
82 | .. note::
83 | 
84 |    Similarly to :ref:`distance <cgmlst_export_distance>`, the gene export on this mlst table can be
85 |    defined with -m, -k, and -d options.
86 | 
87 | 


--------------------------------------------------------------------------------
/pymlst/data/alembic/alembic.ini:
--------------------------------------------------------------------------------
  1 | # A generic, single database configuration.
  2 | 
  3 | [wg]
  4 | script_location = wg
  5 | sqlalchemy.url = sqlite:///wg/idle.db
  6 | 
  7 | [cla]
  8 | script_location = cla
  9 | sqlalchemy.url = sqlite:///cla/idle.db
 10 | 
 11 | [pytyper]
 12 | script_location = pytyper
 13 | sqlalchemy.url = sqlite:///pytyper/idle.db
 14 | 
 15 | [DEFAULT]
 16 | # path to migration scripts
 17 | # script_location = alembic
 18 | 
 19 | # template used to generate migration files
 20 | # file_template = %%(rev)s_%%(slug)s
 21 | 
 22 | # sys.path path, will be prepended to sys.path if present.
 23 | # defaults to the current working directory.
 24 | prepend_sys_path = .
 25 | 
 26 | # timezone to use when rendering the date
 27 | # within the migration file as well as the filename.
 28 | # string value is passed to dateutil.tz.gettz()
 29 | # leave blank for localtime
 30 | # timezone =
 31 | 
 32 | # max length of characters to apply to the
 33 | # "slug" field
 34 | # truncate_slug_length = 40
 35 | 
 36 | # set to 'true' to run the environment during
 37 | # the 'revision' command, regardless of autogenerate
 38 | # revision_environment = false
 39 | 
 40 | # set to 'true' to allow .pyc and .pyo files without
 41 | # a source .py file to be detected as revisions in the
 42 | # versions/ directory
 43 | # sourceless = false
 44 | 
 45 | # version location specification; this defaults
 46 | # to alembic/versions.  When using multiple version
 47 | # directories, initial revisions must be specified with --version-path
 48 | # version_locations = %(here)s/bar %(here)s/bat alembic/versions
 49 | 
 50 | # the output encoding used when revision files
 51 | # are written from script.py.mako
 52 | # output_encoding = utf-8
 53 | 
 54 | sqlalchemy.url = driver://user:pass@localhost/dbname
 55 | 
 56 | 
 57 | [post_write_hooks]
 58 | # post_write_hooks defines scripts or Python functions that are run
 59 | # on newly generated revision scripts.  See the documentation for further
 60 | # detail and examples
 61 | 
 62 | # format using "black" - use the console_scripts runner, against the "black" entrypoint
 63 | # hooks = black
 64 | # black.type = console_scripts
 65 | # black.entrypoint = black
 66 | # black.options = -l 79 REVISION_SCRIPT_FILENAME
 67 | 
 68 | # Logging configuration
 69 | [loggers]
 70 | keys = root,sqlalchemy,alembic
 71 | 
 72 | [handlers]
 73 | keys = console
 74 | 
 75 | [formatters]
 76 | keys = generic
 77 | 
 78 | [logger_root]
 79 | level = WARN
 80 | handlers = console
 81 | qualname =
 82 | 
 83 | [logger_sqlalchemy]
 84 | level = WARN
 85 | handlers =
 86 | qualname = sqlalchemy.engine
 87 | 
 88 | [logger_alembic]
 89 | level = INFO
 90 | handlers =
 91 | qualname = alembic
 92 | 
 93 | [handler_console]
 94 | class = StreamHandler
 95 | args = (sys.stderr,)
 96 | level = NOTSET
 97 | formatter = generic
 98 | 
 99 | [formatter_generic]
100 | format = %(levelname)-5.5s [%(name)s] %(message)s
101 | datefmt = %H:%M:%S
102 | 


--------------------------------------------------------------------------------
/pymlst/data/alembic/wg/versions/52ae99cb5f33_initial.py:
--------------------------------------------------------------------------------
 1 | """initial
 2 | 
 3 | Revision ID: 52ae99cb5f33
 4 | Revises: 
 5 | Create Date: 2021-05-21 10:23:49.557993
 6 | 
 7 | """
 8 | from alembic import op
 9 | import sqlalchemy as sa
10 | 
11 | 
12 | # revision identifiers, used by Alembic.
13 | revision = '52ae99cb5f33'
14 | down_revision = None
15 | branch_labels = None
16 | depends_on = None
17 | 
18 | 
def upgrade():
    """Bring a database to the initial versioned wgMLST schema."""
    # This is the initial revision created after the PyMLST refactoring.
    # Data of old un-versioned databases is left untouched.
    # Old database indexes are dropped and replaced by new ones.
    # A new alembic_version table is added automatically to enable versioning.

    bind = op.get_bind()
    inspector = sa.inspect(bind)
    existing_tables = inspector.get_table_names()

    # Create only the tables that are not already present.
    if 'sequences' not in existing_tables:
        op.create_table(
            'sequences',
            sa.Column('id', sa.Integer(), nullable=False),
            sa.Column('sequence', sa.Text(), nullable=True),
            sa.PrimaryKeyConstraint('id'),
            sa.UniqueConstraint('sequence'))

    if 'mlst' not in existing_tables:
        op.create_table(
            'mlst',
            sa.Column('id', sa.Integer(), nullable=False),
            sa.Column('souche', sa.Text(), nullable=True),
            sa.Column('gene', sa.Text(), nullable=True),
            sa.Column('seqid', sa.Integer(), nullable=True),
            sa.ForeignKeyConstraint(['seqid'], ['sequences.id'], ),
            sa.PrimaryKeyConstraint('id'))

    if 'mlst_type' not in existing_tables:
        mlst_type = op.create_table(
            'mlst_type',
            sa.Column('name', sa.String(length=4), nullable=False,
                      primary_key=True))
        op.bulk_insert(mlst_type, [{'name': 'wg'}])

    # Drop every index currently reported for 'mlst' ...
    for legacy_index in inspector.get_indexes('mlst'):
        op.drop_index(legacy_index['name'])

    # ... and create the new set.
    new_indexes = (
        ('ix_gene', ['gene']),
        ('ix_seqid', ['seqid']),
        ('ix_souche', ['souche']),
        ('ix_souche_gene_seqid', ['gene', 'souche', 'seqid']),
    )
    for index_name, columns in new_indexes:
        op.create_index(index_name, 'mlst', columns, unique=False)
60 | 
61 | 
def downgrade():
    """Drop the new indexes and mlst_type, then recreate the older indexes."""
    # Remove the indexes and the mlst_type table.
    for index_name in ('ix_souche_gene_seqid', 'ix_souche', 'ix_seqid', 'ix_gene'):
        op.drop_index(index_name, table_name='mlst')
    op.drop_table('mlst_type')

    # Rebuild the older indexes.
    older_indexes = (
        ('ID_gene', ['gene']),
        ('ID_seqid', ['seqid']),
        ('ID_souche', ['souche']),
        ('ID_index', ['souche', 'gene']),
    )
    for index_name, columns in older_indexes:
        op.create_index(index_name, 'mlst', columns, unique=False)
75 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | [![PyPI version](https://badge.fury.io/py/PyMLST.svg)](https://pypi.org/project/PyMLST/)
 2 | [![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat)](http://bioconda.github.io/recipes/pymlst/README.html)
 3 | [![Documentation Status](https://readthedocs.org/projects/pymlst/badge/?version=latest)](https://pymlst.readthedocs.io/en/latest/?badge=latest)
 4 | 
 5 | # pyMLST
 6 | ![pyMLST](docs/source/logo.png "A Python Mlst Local Search Tool")
 7 | 
 8 | A Python Mlst Local Search Tool.
 9 | 
10 | ## Purpose
11 | Bacterial typing is critical to unraveling the spread of pathogens.
12 | For this purpose, data from next-generation sequencing are now widely used, with core multilocus sequence typing (cgMLST) or whole genome multilocus sequence typing (wgMLST) becoming the new standard.
13 | These methods are an extension of the traditional MLST method, which uses a short list of housekeeping genes.
14 | cgMLST and wgMLST use a large set of genes corresponding to the core or whole genome.
15 | Similar to MLST, each unique sequence corresponds to a specific allele, and the combination of alleles determines the sequence type (ST) of the strain.
16 | 
17 | We have developed pyMLST to perform this task. Unlike other tools, it uses a local SQLite database to store allele sequences and MLST profiles.
18 | This allows the collection of strains to be expanded iteratively. The input can be (i) an assembler-generated draft genome, (ii) the direct raw data, or (iii) other genomes stored in the sequence database.
19 | 
20 | ## New version
21 | V2.2:
22 | 
23 | - Introduced new typing methods with fimH and phylogrouping for *Escherichia coli* and spa for *Staphylococcus aureus* (pyTyper search command)
24 | 
25 | V2.1:
26 | 
27 | - Use raw reads (FASTQ) directly with the kma integration (search2 and add2 command)
28 | 
29 | V2.0:
30 | 
31 | - An automatic database import mechanism to initialize cgMLST and MLST databases.
32 | - A new process to fill incomplete genes using MAFFT alignment.
33 | - A more complete command line interface with a sub-command system.
34 | - A configuration file to define the PATH to external tools.
35 | - An easy installation with pypi repository.
36 | 
37 | 
38 | ## Documentation
39 | Details on installation, workflow and running parameters can be found in the [**documentation**](https://pymlst.readthedocs.io/en/latest/).
40 | 
41 | 
42 | ## Publications
43 | If you use pyMLST, please cite the following paper:
44 | 
45 | Biguenet A. et al., Introduction and benchmarking of pyMLST:
46 | open-source software for assessing bacterial clonality using core
47 | genome MLST. 2023 Microbial Genomics, 9(11), 1126.
48 | doi: [10.1099/mgen.0.001126](https://doi.org/10.1099/mgen.0.001126)
49 | 
50 | 
51 | PyMLST v1 has already been used to analyse a range of clinically important bacteria:
52 | 
53 |  - [*Escherichia coli* and *Klebsiella pneumoniae*](https://doi.org/10.1016/j.cmi.2021.07.022)
54 |  - [*Acinetobacter baumannii*](https://doi.org/10.1038/s41598-023-49268-x)
55 |  - [*Pseudomonas aeruginosa*](https://doi.org/10.1016/j.jhin.2020.06.013)
56 |  - [*Proteus mirabilis*](https://doi.org/10.1093/jac/dkz472)
57 | 


--------------------------------------------------------------------------------
/pymlst/cmd.py:
--------------------------------------------------------------------------------
 1 | """PyMLST entry commands and common parameters creation.
 2 | 
 3 | Subcommands are being instantiated dynamically from their respective folders.
 4 | """
 5 | 
 6 | import os
 7 | import click
 8 | 
 9 | from click import Option
10 | 
11 | from pymlst import version
12 | from pymlst.common import utils
13 | 
14 | 
class PyMlstCommand(click.MultiCommand):
    """Global PyMLST command.

    Every ``*.py`` file found in ``path`` (``__init__.py`` excepted) is
    exposed as a sub-command named after the file.  The file is executed
    on demand and must define a click command object called ``cli``.
    """

    def __init__(self, path, help_msg):
        """Initializes the command.

        Args:
            path: Directory holding one Python file per sub-command.
            help_msg: Help text displayed for this command.
        """
        opt_version = Option(['--version', '-v'], is_flag=True,
                             callback=print_version,
                             expose_value=False, is_eager=True,
                             help='Prints PyMLST version.')
        # Hand everything to click in one go instead of patching the
        # instance after construction.
        super().__init__(
            help=help_msg,
            params=[opt_version],
            context_settings={'help_option_names': ['-h', '--help']})
        self.path = path

    def list_commands(self, ctx):
        """Lists the available commands.

        The commands are loaded dynamically from files within
        a directory.

        Returns:
            The sorted sub-command names (file names without ``.py``).
        """
        return sorted(
            filename[:-3]
            for filename in os.listdir(self.path)
            if filename.endswith('.py')
            and not filename.startswith('__init__'))

    def get_command(self, ctx, name):
        """Gets a command by name.

        Args:
            ctx: The current click context (unused).
            name: Name of the sub-command, i.e. the file name without
                its ``.py`` extension.

        Returns:
            The ``cli`` object defined by the sub-command file.

        Raises:
            click.ClickException: If no file matches ``name``.
            KeyError: If the file does not define ``cli``.
        """
        cmd_file = os.path.join(self.path, name + '.py')
        # Only the read is guarded: a FileNotFoundError raised while the
        # sub-command module itself runs must not be reported as an
        # unknown sub-command.
        try:
            # Read bytes so compile() applies Python's own source
            # encoding rules rather than the locale encoding.
            with open(cmd_file, 'rb') as file:
                source = file.read()
        except FileNotFoundError:
            raise click.ClickException(
                'Unknown sub-command \'{}\''.format(name)) from None

        name_scope = {}
        exec(compile(source, cmd_file, 'exec'), name_scope)
        return name_scope['cli']
58 | 
59 | 
def print_version(ctx, param, value):
    """Eager ``--version`` callback: shows version and release, then exits.

    Nothing happens when the flag was not given or when click is parsing
    resiliently.
    """
    del param  # required by click's callback signature, not used here
    if value and not ctx.resilient_parsing:
        for label, text in (('Version: ', version.__version__),
                            ('Release: ', version.__release__)):
            click.echo(label + text)
        ctx.exit()
68 | 
69 | 
# Directory of this package; each entry point loads its sub-commands from
# the ``commands`` folder of the matching sub-package.
_PACKAGE_DIR = os.path.dirname(__file__)

# ``pyMLST``: utilities shared by all pipelines.  The help text keeps
# click's ``\b`` marker so the first paragraph is not re-wrapped.
py = PyMlstCommand(
    os.path.join(_PACKAGE_DIR, 'common', 'commands'),
    '''
    \b
    Common utility commands.
    
    Three pipelines are available:\n
      claMLST     for classical MLST analysis\n
      wgMLST      for Whole/Core genome MLST analysis\n
      pyTyper     for other typing analysis''')

# ``wgMLST`` entry point.
wg = PyMlstCommand(
    os.path.join(_PACKAGE_DIR, 'wg', 'commands'),
    'Whole/Core genome MLST commands.')

# ``claMLST`` entry point.
cla = PyMlstCommand(
    os.path.join(_PACKAGE_DIR, 'cla', 'commands'),
    'Classical MLST commands.')

# ``pyTyper`` entry point.
pytyper = PyMlstCommand(
    os.path.join(_PACKAGE_DIR, 'pytyper', 'commands'),
    'Other typing commands.')
92 | 


--------------------------------------------------------------------------------
/pymlst/cla/commands/import.py:
--------------------------------------------------------------------------------
 1 | """import CLI command file."""
 2 | 
 3 | import logging
 4 | import os
 5 | import sys
 6 | import tempfile
 7 | 
 8 | import click
 9 | 
10 | import requests
11 | 
12 | import pymlst
13 | 
14 | from pymlst.common import web, exceptions
15 | from pymlst.common import utils
16 | 
17 | 
@click.command(name='import')
@click.option('--force', '-f',
              is_flag=True,
              help='Overwrites alrealdy existing DATABASE')
@click.option('--prompt/--no-prompt',
              default=True,
              help='Do not prompt if multiple '
                   'choices are found, fail instead.')
@click.option('--mlst', '-m',
              type=click.STRING,
              default='',
              help='Specifies the desired MLST scheme name.')
@click.option('--repository', '-r', default='pubmlst',
              type=click.Choice(['pubmlst', 'pasteur'], case_sensitive=False),
              help='Choose the online repository to use')
@click.argument('database',
                type=click.Path(exists=False))
@click.argument('species',
                type=click.STRING,
                nargs=-1)
def cli(force, prompt, mlst, repository, database, species):
    """Creates a claMLST DATABASE from an online resource.

    The research can be filtered by adding a SPECIES name."""
    # NOTE: the docstring above is the help text click shows to the user,
    # so implementation notes live in comments instead.
    #
    # force:      truncate DATABASE first if it already exists.
    # prompt:     forwarded to web.retrieve_mlst.
    # mlst:       scheme name, forwarded to web.retrieve_mlst and stored
    #             with the database infos.
    # repository: 'pubmlst' or 'pasteur'.
    # species:    tuple of words, joined with spaces into the species query.
    #
    # Every expected failure is converted into a click.ClickException.
    from contextlib import ExitStack

    utils.create_logger()

    try:
        if os.path.exists(database):
            if not force:
                raise exceptions.PyMLSTError("Database alreadly exists, use --force to override it")
            # Truncate the existing file so the database starts empty.
            with open(database, "w"):
                pass

        url = web.retrieve_mlst(' '.join(species), prompt, mlst, repository)

        if url is None:
            logging.info('No choice selected')
            return

        logging.info('Downloading mlst...')

        # The ExitStack comes last so the downloaded files are closed
        # before the database is closed and the temporary directory is
        # removed.
        with tempfile.TemporaryDirectory() as tmp_dir, \
                pymlst.open_cla(os.path.abspath(database)) as mlst_db, \
                ExitStack() as opened:

            version = web.get_mlst_files(url, tmp_dir)

            locus_dir = os.path.join(tmp_dir, 'locus')
            profile = opened.enter_context(
                open(os.path.join(tmp_dir, 'profiles.csv'), 'rt'))
            alleles = [
                opened.enter_context(
                    open(os.path.join(locus_dir, locus), 'r'))
                for locus in os.listdir(locus_dir)]

            mlst_db.create(profile, alleles)
            mlst_db.add_infos(repository, ' '.join(species), mlst, version)

    except requests.exceptions.HTTPError as err:
        raise click.ClickException('Could not retrieve online data') from err
    except requests.exceptions.ConnectionError as err:
        raise click.ClickException('Could not access to the server, please verify your internet connection') from err
    except requests.exceptions.Timeout as err:
        raise click.ClickException('The server took too long to respond') from err
    except exceptions.StructureError as err:
        raise click.ClickException('It seems like the structure of the website/API changed '
                                   'since this application was developed.') from err
    except exceptions.PyMLSTError as err:
        raise click.ClickException(str(err)) from err
87 | 


--------------------------------------------------------------------------------
/docs/source/documentation/cgmlst/initialise.rst:
--------------------------------------------------------------------------------
  1 | .. _cgmlst_initialise:
  2 | 
  3 | .. toctree::
  4 |     :glob:
  5 | 
  6 | =====================
  7 | Initialise a database
  8 | =====================
  9 | 
 10 | The first step of a cg/wgMLST analysis is to initialise a database
 11 | with a list of genes and a reference sequence for each of them.
 12 | 
 13 | :cgMLST: A list of genes corresponding to the core genome of a species.
 14 | 
 15 | :wgMLST: A list  of genes corresponding to the whole genome of a
 16 | 		 species or a clone.
 17 | 
 18 | Import from cgmlst.org
 19 | ======================
 20 | 
 21 | You can automatically import a cgMLST resource from `cgmlst.org
 22 | <https://www.cgmlst.org/>`_.
 23 | 
 24 | .. code-block:: bash
 25 | 				
 26 |    wgMLST import -h
 27 |    Usage: wgMLST import [OPTIONS] DATABASE [SPECIES]...
 28 | 
 29 |    Creates a wgMLST DATABASE from an online resource.
 30 |    
 31 |    The research can be filtered by adding a SPECIES name.
 32 |    
 33 |    Options:
 34 |    -f, --force             Overwrite alrealdy existing DATABASE
 35 |    --prompt / --no-prompt  Do not prompt if multiple choices are found,
 36 | 				           fail instead.
 37 | 
 38 | 
 39 | Create from external scheme
 40 | ===========================
 41 | 
 42 | The cg/wgMLST database can be created using a **scheme** corresponding to
 43 | a list of different genes (a multi-fasta file containing gene
 44 | sequences in nucleotide format).
 45 | 
 46 | .. code::
 47 | 
 48 |    >ACICU_RS02500
 49 |    TTATTTCTTCACAACAGATGGTGCAATTGGGTCGGCAGTGATATAGCCAACTGCTGCTGC
 50 |    ...
 51 |    GTGGTTAGAAGCAGTGGTCAT
 52 |    >ACICU_RS11305
 53 |    CGCACCTAATGGAAGAAAAGGGATCCCCGTAAACCATTTTAAAATATCGCGACGTGTTGG
 54 |    ...
 55 |    TTTGGAATTGATGCAGAAATTAAATCTTAA
 56 |    >ACICU_RS08820
 57 |    ATGGCTTATCAAACTTTAGAACAGCTACAGCAGTCTAAAGCCAAGCTTCACGAAACTGTG
 58 |    ...
 59 |    TCGCAGTTACGTTAA
 60 | 
 61 | .. warning::
 62 | 
 63 |    Contrary to other cg/wgMLST tools, only one allele for each
 64 |    gene must be included in the scheme file.
 65 | 
 66 | 
 67 | You can get scheme for:
 68 | 
 69 | :cgMLST:
 70 | 
 71 | * Using a scheme from a scientific publication and not available on
 72 |   `cgmlst.org <https://www.cgmlst.org/>`_.
 73 | 	 
 74 | * Using the annotation of the genes from the reference genome of
 75 |   the species. After adding your strains to the database, you can
 76 |   restrict it to the core genome by removing genes found in fewer than
 77 |   95% of the strains (see :ref:`validate <m_option_check>`)
 78 | 	 
 79 | :wgMLST:
 80 | 
 81 | * Using gene annotations from a genome close to your strains
 82 | 
 83 | * Using pangenome results from analysis of your strains with
 84 |   e.g. `Roary <https://sanger-pathogens.github.io/Roary/>`_.
 85 | 	
 86 | 	   
 87 | 
 88 | .. code-block:: bash
 89 | 
 90 |    wgMLST create --help
 91 |    Usage: wgMLST create [OPTIONS] DATABASE COREGENE
 92 |    
 93 |    Creates a wgMLST DATABASE from a template COREGENE.
 94 |    
 95 |    Options:
 96 |    -f, --force        Overwrite alrealdy existing DATABASE
 97 |    -c, --concatenate  Automatically concatenates genes with duplicated sequences
 98 |    -r, --remove       Automatically removes genes with duplicated sequences
 99 |    -s, --species TEXT  Name of the species
100 |    -V, --version TEXT  Version of the database
101 | 
102 | 
103 | .. warning::
104 | 
105 |    If the same sequence is used more than once in your scheme, you can
106 |    specify how to handle it using the **-c** or **-r** options.
107 |    
108 | 
109 | 


--------------------------------------------------------------------------------
/docs/source/documentation/clamlst/initialise.rst:
--------------------------------------------------------------------------------
 1 | .. _clamlst_initialise:
 2 | 
 3 | .. toctree::
 4 |    :glob:
 5 | 
 6 | ==========================
 7 | Initialise a MLST database
 8 | ==========================
 9 | 
10 | An MLST database contains the different alleles for each gene of the
11 | scheme and a table associating each combination of alleles with its
12 | sequence type (ST).
13 | 
14 | 
15 | Import from pubMLST
16 | ===================
17 | 
18 | You can automatically import a MLST resource from `pubmlst
19 | <https://pubmlst.org/data/>`_ or `pasteur <https://bigsdb.pasteur.fr/>`_.
20 | 
21 | .. code-block:: bash
22 | 
23 |    claMLST import -h
24 |    Usage: claMLST import [OPTIONS] DATABASE [SPECIES]...
25 | 
26 |    Creates a claMLST DATABASE from an online resource.
27 | 
28 |    The research can be filtered by adding a SPECIES name.
29 | 
30 |    Options:
31 |    --prompt / --no-prompt  Do not prompt if multiple choices are found,
32 | 				           fail instead.
33 |    -f, --force        	   Overwrites alrealdy existing DATABASE
34 |    -m, --mlst TEXT         Specifies the desired MLST scheme name.
35 |    -r, --repository        Choose the online repository to use [pubmlst|pasteur]
36 | 
37 | 
38 | 
39 | Create from other resource
40 | ==========================
41 | 
42 | Alternatively, you can create a database with the allele sequence and
43 | MLST profile of your favorite species. To create a database, pyMLST
44 | needs the gene name in the MLST profile header to match the name in
45 | the fasta file. For example, the rpoB gene in the MLST profile header
46 | must match the rpoB.fas file. You will also need to remove the
47 | additional column corresponding to the clonal complex in the MLST
48 | profile file, if present.
49 | 
50 | .. code-block:: bash
51 |    
52 |    claMLST create --help
53 |    Usage: claMLST create [OPTIONS] DATABASE PROFILE ALLELES...
54 | 
55 |    Creates a classical MLST DATABASE from a txt PROFILE and fasta ALLELES files.
56 | 
57 |    Options:
58 |    -f, --force        	  Overwrites alrealdy existing DATABASE
59 |    -s, --species TEXT     Name of the species
60 |    -V, --version TEXT     Version of the database
61 | 
62 | 
63 |    
64 | Scheme example
65 | --------------
66 | 
67 | .. code::
68 |    
69 |    ST      cpn60   fusA    gltA    pyrG    recA    rplB    rpoB
70 |    1       1       1       1       1       5       1       1
71 |    2       2       2       2       2       2       2       2
72 |    3       3       3       2       2       3       1       3
73 |    ...
74 | 		  
75 | Allele example
76 | --------------
77 | 
78 | .. code::
79 |    
80 |    >cpn60_1
81 |    ATGAACCCAATGGATTTAAAACGCGGTATCGACATTGCAGTAAAAACTGTAGTTGAAAAT
82 |    ATCCGTTCTATTGCTAAACCAGCTGATGATTTCAAAGCAATTGAACAAGTAGGTTCAATC
83 |    TCTGCTAACTCTGATACTACTGTTGGTAAACTTATTGCTCAAGCAATGGAAAAAGTAGGT
84 |    AAAGAAGGCGTAATCACTGTAGAAGAAGGTTCTGGCTTCGAAGACGCATTAGACGTTGTA
85 |    GAAGGTATGCAGTTTGACCGTGGTTATATCTCTCCGTACTTTGCAAACAAACAAGATACT
86 |    TTAACTGCTGAACTTGAAAATCCGTTCATTCTTCTTGTTGATAAAAAAATCAGCAACATT
87 |    CGTGAATTGATTTCTGTTTTAGAAGCAGTTGCTAAAACTGGTAAA
88 |    >cpn60_2
89 |    ATGAACCCAATGGATTTAAAACGCGGTATCGACATTGCAGTAAAAACTGTAGTTGAAAAT
90 |    ATCCGTTCTATTGCTAAACCAGCTGATGATTTCAAAGCAATTGAACAAGTAGGTTCAATC
91 |    TCTGCTAACTCTGATACTACTGTTGGTAAACTTATTGCTCAAGCAATGGAAAAAGTAGGT
92 |    AAAGAAGGCGTAATCACTGTAGAAGAAGGCTCAGGCTTCGAAGACGCATTAGACGTTGTA
93 |    GAAGGTATGCAGTTTGACCGTGGTTATATCTCTCCGTACTTTGCAAACAAACAAGATACT
94 |    TTAACTGCTGAACTTGAAAATCCGTTCATCCTTCTTGTTGATAAAAAAATCAGCAACATT
95 |    CGTGAATTGATTTCTGTTTTAGAAGCAGTTGCTAAAACTGGTAAA
96 |    ...
97 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | """
  5 | This file is used to create the package we'll publish to PyPI.
  6 | 
  7 | .. currentmodule:: setup.py
  8 | .. moduleauthor:: Benoit Valot <benoit.valot@univ-fcomte.fr>
  9 | """
 10 | 
 11 | import importlib.util
 12 | import os
 13 | from pathlib import Path
 14 | from setuptools import setup, find_packages
 15 | from codecs import open  # Use a consistent encoding.
 16 | from os import path
 17 | 
 18 | here = path.abspath(path.dirname(__file__))
 19 | 
 20 | # Get the long description from the relevant file
 21 | with open(path.join(here, 'README.md'), encoding='utf-8') as f:
 22 |     long_description = f.read()
 23 | 
 24 | # Get the base version from the library.  (We'll find it in the `version.py`
 25 | # file in the src directory, but we'll bypass actually loading up the library.)
 26 | vspec = importlib.util.spec_from_file_location(
 27 |   "version",
 28 |   str(Path(__file__).resolve().parent /
 29 |       'pymlst'/"version.py")
 30 | )
 31 | vmod = importlib.util.module_from_spec(vspec)
 32 | vspec.loader.exec_module(vmod)
 33 | version = getattr(vmod, '__version__')
 34 | 
 35 | # If the environment has a build number set...
 36 | if os.getenv('buildnum') is not None:
 37 |     # ...append it to the version.
 38 |     version = "{version}.{buildnum}".format(
 39 |         version=version,
 40 |         buildnum=os.getenv('buildnum')
 41 |     )
 42 | 
 43 | setup(
 44 |     name='PyMLST',
 45 |     description="python Mlst Local Search Tool",
 46 |     long_description=long_description,
 47 |     long_description_content_type='text/markdown',
 48 |     packages=find_packages(
 49 |         exclude=["*.tests", "*.tests.*", "tests.*", "tests"]),
 50 |     version=version,
 51 |     setup_requires=['wheel'],
 52 |     install_requires=[
 53 |         # Include dependencies here
 54 |         'biopython>=1.78',
 55 |         'click>=7.1',
 56 |         'pytest>=6.2',
 57 |         'sqlalchemy>=1.4,<2',
 58 |         'networkx>=2.5',
 59 |         'decorator>=4.4',
 60 |         'requests>=2.23',
 61 |         'pandas>=1.2',
 62 |         'numpy>=1.20.0',
 63 |         'beautifulsoup4>=4.9',
 64 |         'questionary>=1.9',
 65 |         'setuptools>=44.0',
 66 |         'alembic>=1.6',
 67 |         'GitPython>=3.1'
 68 |     ],
 69 |     entry_points="""
 70 |     [console_scripts]
 71 |     pyMLST=pymlst.cmd:py
 72 |     wgMLST=pymlst.cmd:wg
 73 |     claMLST=pymlst.cmd:cla
 74 |     pyTyper=pymlst.cmd:pytyper
 75 |     """,
 76 |     python_requires=">=3.7.0",
 77 |     license='GPLv3',  # noqa
 78 |     author='Benoit Valot',
 79 |     author_email='benoit.valot@univ-fcomte.fr',
 80 |     # Use the URL to the github repo.
 81 |     url='https://github.com/bvalot/pyMLST.git',
 82 |     download_url=(
 83 |         f'https://github.com/bvalot/pyMLST/archive/refs/tags/{version}.tar.gz'
 84 |     ),
 85 |     keywords=[
 86 |         'cgMLST', 'MLST', 'bacterial genome'
 87 |         # Add package keywords here.
 88 |     ],
 89 |     # See https://PyPI.python.org/PyPI?%3Aaction=list_classifiers
 90 |     classifiers=[
 91 |       # How mature is this project? Common values are
 92 |       #   3 - Alpha
 93 |       #   4 - Beta
 94 |       #   5 - Production/Stable
 95 |       'Development Status :: 5 - Production/Stable',
 96 | 
 97 |       # Indicate who your project is intended for.
 98 |       'Intended Audience :: Developers',
 99 |       'Topic :: Software Development :: Libraries',
100 | 
101 |       # Pick your license.  (It should match "license" above.)
102 |         # noqa
103 |       '''License :: OSI Approved :: GNU General Public License v3 (GPLv3)''',
104 |         # noqa
105 |       # Specify the Python versions you support here. In particular, ensure
106 |       # that you indicate whether you support Python 2, Python 3 or both.
107 |       'Programming Language :: Python :: 3.7',
108 |     ],
109 |     include_package_data=True
110 | )
111 | 


--------------------------------------------------------------------------------
/tests/test_cla.py:
--------------------------------------------------------------------------------
  1 | import pytest
  2 | from sqlalchemy import select
  3 | 
  4 | import pymlst
  5 | from pymlst.cla import model
  6 | from pymlst.cla.core import DatabaseCLA
  7 | from pymlst.common import exceptions
  8 | 
  9 | 
 10 | @pytest.fixture()
 11 | def cla():
 12 |     with pymlst.open_cla() as cla_mlst:
 13 |         yield cla_mlst
 14 | 
 15 | 
 16 | @pytest.fixture()
 17 | def db():
 18 |     db = DatabaseCLA(None, 1)
 19 |     try:
 20 |         yield db
 21 |     finally:
 22 |         db.close()
 23 | 
 24 | 
 25 | @pytest.fixture()
 26 | def db_many(db):
 27 |     seqs = [  # gene, seq, allele
 28 |         ('g1', 'AAA', 1),
 29 |         ('g2', 'ATA', 1),
 30 |         ('g3', 'TTT', 1),
 31 |         ('g4', 'CCC', 1),
 32 |         ('g5', 'CCT', 1),
 33 |         ('g1', 'AAT', 2),
 34 |         ('g2', 'ATT', 2),
 35 |         ('g3', 'TCT', 2),
 36 |         ('g4', 'ACC', 2),
 37 |         ('g5', 'CTC', 2),
 38 |     ]
 39 |     for gene, seq, allele in seqs:
 40 |         db.add_sequence(seq, gene, allele)
 41 |     mlst = [  # st, gene, allele
 42 |         (1, 'g1', 1),
 43 |         (1, 'g2', 2),
 44 |         (1, 'g3', 1),
 45 |         (1, 'g4', 2),
 46 |         (1, 'g5', 2),
 47 |         (2, 'g1', 1),
 48 |         (2, 'g2', 1),
 49 |         (2, 'g3', 2),
 50 |         (2, 'g4', 1),
 51 |         (2, 'g5', 1),
 52 |         (3, 'g1', 2),
 53 |         (3, 'g2', 1),
 54 |         (3, 'g3', 1),
 55 |         (3, 'g4', 1),
 56 |         (3, 'g5', 2),
 57 |     ]
 58 |     for st, gene, allele in mlst:
 59 |         db.add_mlst(st, gene, allele)
 60 |     return db
 61 | 
 62 | 
 63 | def test_add_sequence(db):
 64 |     db.add_sequence('AAA', 'g1', 2)
 65 |     seq = db.connection.execute(
 66 |         select(model.sequences)
 67 |     ).fetchall()
 68 |     assert len(seq) == 1
 69 |     assert (seq[0].sequence == 'AAA'
 70 |             and seq[0].gene == 'g1'
 71 |             and seq[0].allele == 2)
 72 | 
 73 | 
 74 | def test_add_mlst(db):
 75 |     db.add_sequence('AAA', 'g1', 2)
 76 |     db.add_mlst(5, 'g1', 2)
 77 |     mlst = db.connection.execute(
 78 |         select(model.mlst)
 79 |     ).fetchall()
 80 |     assert len(mlst) == 1
 81 |     assert (mlst[0].st == 5
 82 |             and mlst[0].gene == 'g1'
 83 |             and mlst[0].allele == 2)
 84 |     assert len(db.core_genome) == 0
 85 | 
 86 | 
 87 | # def test_add_mlst_no_sequence(db):
 88 | #     db.add_sequence('AAA', 'g1', 1)
 89 | #     with pytest.raises(exceptions.AlleleSequenceNotFound):
 90 | #         db.add_mlst(5, 'g1', 2)
 91 | 
 92 | 
 93 | def test_add_mlst_reference(db):
 94 |     db.add_sequence('AAA', 'g1', 1)
 95 |     db.add_mlst(5, 'g1', 1)
 96 |     assert len(db.core_genome) == 1
 97 |     assert db.core_genome['g1'] == 'AAA'
 98 | 
 99 | 
100 | def test_get_genes_by_allele(db_many):
101 |     genes = db_many.get_genes_by_allele(2)
102 |     assert genes == {
103 |         'g1': 'AAT',
104 |         'g2': 'ATT',
105 |         'g3': 'TCT',
106 |         'g4': 'ACC',
107 |         'g5': 'CTC',
108 |     }
109 | 
110 | 
111 | def test_get_allele_by_sequence_and_gene(db_many):
112 |     allele = db_many.get_allele_by_sequence_and_gene('AAT', 'g1')
113 |     assert allele == 2
114 | 
115 | 
116 | def test_get_allele_by_sequence_and_gene_none(db_many):
117 |     allele = db_many.get_allele_by_sequence_and_gene('AAT', 'g2')
118 |     assert allele is None
119 | 
120 | 
121 | def test_get_st_by_gene_and_allele(db_many):
122 |     st = db_many.get_st_by_gene_and_allele('g3', 1)
123 |     assert st == [1, 3]
124 |     st = db_many.get_st_by_gene_and_allele('g2', 2)
125 |     assert st == [1]
126 | 
127 | 
128 | def test_get_st_by_gene_and_allele_none(db_many):
129 |     st = db_many.get_st_by_gene_and_allele('g5', 3)
130 |     assert st == []
131 | 
132 | 
133 | def test_get_sequence_by_gene_and_allele(db_many):
134 |     seq = db_many.get_sequence_by_gene_and_allele('g3', 2)
135 |     assert seq == 'TCT'
136 | 
137 | 
138 | def test_get_sequence_by_gene_and_allele_none(db_many):
139 |     seq = db_many.get_sequence_by_gene_and_allele('g3', 6)
140 |     assert seq is None
141 | 


--------------------------------------------------------------------------------
/docs/source/documentation/cgmlst/check.rst:
--------------------------------------------------------------------------------
  1 | .. _cgmlst_check:
  2 | 
  3 | .. toctree::
  4 |     :glob:
  5 | 
  6 | =============================
  7 | Check quality of the database
  8 | =============================
  9 | 
 10 | After :ref:`loading <cgmlst_add>` all your strains into the database,
 11 | you need to check the allele calling quality before :ref:`exporting results
 12 | <cgmlst_export_res>`.
 13 | 
 14 | .. note::
 15 | 
 16 |    You can get information about the current data in the database using
 17 |    the **stats** command.
 18 | 
 19 |    .. code-block:: bash
 20 | 
 21 | 	  wgMLST stats -h
 22 | 	  Usage: wgMLST stats [OPTIONS] DATABASE
 23 | 
 24 | 	  Extract stats from a wgMLST DATABASE.
 25 | 
 26 | 
 27 | .. _strain_check:
 28 | 
 29 | Validate strains
 30 | ================
 31 | 
 32 | To search for strains with potential problems, such as a bad assembly or
 33 | the wrong species, you can use the **strain** command with the **-c** option.
 34 | 
 35 | 
 36 | .. code-block:: bash
 37 | 				
 38 |    wgMLST strain -h
 39 |    Usage: wgMLST strain [OPTIONS] DATABASE
 40 | 
 41 |    Extracts a list of strains from a wgMLST DATABASE.
 42 |    
 43 |    Options:
 44 |    -m, --mincover INTEGER  Minimun number of strain found to keep a gene
 45 |                            (default:0)
 46 |    -k, --keep              Keep only gene with different allele (omit missing).
 47 |    -d, --duplicate         Conserve duplicate gene (default remove).
 48 |    -V, --inverse           Keep only gene that do not meet the filter
 49 |                            of mincover or keep options.
 50 |    -c, --count             Count the number of gene present in the database for
 51 |                            each strains.
 52 |    -o, --output FILENAME   Export strain list to (default=stdout).
 53 | 
 54 | ..  note::
 55 | 	
 56 | 	If some strains show a low number of genes found in comparison to the
 57 | 	others, you can remove them using the :ref:`remove <remove_check>`
 58 | 	command.
 59 | 
 60 | .. note::
 61 | 
 62 |    As with the :ref:`gene <gene_check>` or :ref:`export <cgmlst_export_res>` commands, you can filter the genes
 63 |    that you want to keep for the search.
 64 | 
 65 |    By default, only duplicate genes are removed.
 66 | 
 67 | .. _gene_check:
 68 | 
 69 | Validate genes
 70 | ==============
 71 | 
 72 | Similarly to strains, it can be useful to save the list of genes to
 73 | keep for the rest of the analysis using the **gene** command.
 74 | 
 75 | .. code-block:: bash
 76 | 				
 77 |    wgMLST gene -h
 78 |    Usage: wgMLST gene [OPTIONS] DATABASE
 79 |    
 80 |    Extracts a list of genes from a wgMLST DATABASE.
 81 |    
 82 |    Options:
 83 |    -m, --mincover INTEGER  Minimun number of strain found to keep a gene
 84 | 				           (default:0)
 85 |    -k, --keep              Keep only gene with different allele (omit missing).
 86 |    -d, --duplicate         Conserve duplicate gene (default remove).
 87 |    -V, --inverse           Keep only gene that do not meet the filter of
 88 |                            mincover or keep options.
 89 |    -o, --output FILENAME   Export GENE list to (default=stdout).
 90 | 
 91 | .. note::
 92 | 
 93 |    The list of genes that pass your threshold can then be used for :ref:`export
 94 |    sequence <cgmlst_export_seq>`.
 95 |    
 96 | .. _m_option_check:
 97 |    
 98 | .. warning::
 99 | 
100 |    An important parameter is the **-m** option, which defines the
101 |    minimum number of strains in which a gene must be found to be kept.
102 | 
103 |    If you are interested in core genes, you can set this number to
104 |    correspond to **95%** of the strains in the database.
105 |    (For example, if you have 100 strains in your database, you need to
106 |    set this parameter to 95.)
107 |    
108 |    
109 | .. _remove_check:
110 | 
111 | Remove strains or genes
112 | =======================
113 | 
114 | After checking the database, if some strains or genes need to be
115 | removed, you can use the **remove** command.
116 | 
117 | .. code-block:: bash
118 | 	  
119 |    wgMLST remove -h
120 |    Usage: wgMLST remove [OPTIONS] DATABASE [GENES_OR_STRAINS]...
121 | 	  
122 |    Removes STRAINS or GENES from a wgMLST DATABASE.
123 |    
124 |    Options:
125 |    --strains / --genes    Choose the item you wish to remove  [default: strains]
126 |    -f, --file FILENAME    File list of genes or strains to removed on the wgMLST
127 | 				          database.
128 | 


--------------------------------------------------------------------------------
/tests/test_typer.py:
--------------------------------------------------------------------------------
  1 | import pytest
  2 | from sqlalchemy import select
  3 | 
  4 | import pymlst
  5 | from pymlst.pytyper import model
  6 | from pymlst.pytyper.method import FIM, SPA, CLMT
  7 | from pymlst.pytyper.core import DatabaseTyper, TypingResult, FimH, Spa, Clmt
  8 | from pymlst.common import exceptions
  9 | 
 10 | 
 11 | @pytest.fixture()
 12 | def fim():
 13 |     with pymlst.open_typer(FIM) as fim_typer:
 14 |         yield fim_typer
 15 | 
 16 | @pytest.fixture()
 17 | def spa():
 18 |     with pymlst.open_typer(SPA) as spa_typer:
 19 |         yield spa_typer
 20 | 
 21 | @pytest.fixture()
 22 | def clmt():
 23 |     with pymlst.open_typer(CLMT) as clmt_typer:
 24 |         yield clmt_typer
 25 | 
 26 | 
 27 | @pytest.fixture()
 28 | def db():
 29 |     db = DatabaseTyper(None)
 30 |     try:
 31 |         yield db
 32 |     finally:
 33 |         db.close()
 34 | 
 35 | @pytest.fixture()
 36 | def result():
 37 |     res = TypingResult('sample1', FIM)
 38 |     yield res
 39 | 
 40 | @pytest.fixture()
 41 | def db_many(db):
 42 |     seqs = [  # seq, typing, allele
 43 |         ('AAA', 'fim', 'fimH1'),
 44 |         ('ATA', 'fim', 'fimH2'),
 45 |         ('TTT', 'fim', 'fimH3'),
 46 |         ('CCC', 'spa', '01'),
 47 |         ('CCT', 'spa', '02'),
 48 |         ('AAT', 'spa', '03'),
 49 |         ('ATT', 'clmt', 'arpA'),
 50 |         ('TCT', 'clmt', 'chuA'),
 51 |         ('ACC', 'clmt', 'yjaA'),
 52 |         ('CTC', 'clmt', 'TspE4.C2'),
 53 |     ]
 54 |     for seq, method, allele in seqs:
 55 |         db.add_sequence(seq, method, allele)
 56 |     sts = [  # st, typing, allele
 57 |         ('fimH1', 'fim', 'fimH1'),
 58 |         ('fimH2', 'fim', 'fimH2'),
 59 |         ('fimH3', 'fim', 'fimH3'),
 60 |         ('t1', 'spa', '01-02-02-01'),
 61 |         ('t2', 'spa', '02-02-03-01'),
 62 |         ('t3', 'spa', '01-01-02-01'),
 63 |         ('A', 'clmt', 'arpA|+,chuA|-,yjaA|-,TspE4.C2|-'),
 64 |         ('B1', 'clmt', 'arpA|+,chuA|-,yjaA|-,TspE4.C2|+'),
 65 |         ('B2', 'clmt', 'arpA|-,chuA|+,yjaA|+,TspE4.C2|+'),
 66 |         ('D|E', 'clmt', 'arpA|+,chuA|+,yjaA|-,TspE4.C2|-'),
 67 |     ]
 68 |     for st, method, allele in sts:
 69 |         db.add_st(st, method, allele)
 70 |     return db
 71 | 
 72 | def test_check_db(db_many):
 73 |     res = db_many.check_db(FIM)
 74 |     assert(res) == True
 75 | 
 76 | def test_check_new_db(db):
 77 |     res = db.check_db(CLMT)
 78 |     assert(res) == False
 79 | 
 80 | def test_add_sequence(db):
 81 |     db.add_sequence('AAA', FIM, '02')    
 82 |     seq = db.connection.execute(
 83 |         select(model.typerSeq)
 84 |     ).fetchall()
 85 |     assert len(seq) == 1
 86 |     assert (seq[0].sequence == 'AAA'
 87 |             and seq[0].typing == FIM
 88 |             and seq[0].allele == '02')
 89 | 
 90 | def test_add_st(db):
 91 |     db.add_st('fimH1', FIM, 'fimH1')    
 92 |     st = db.connection.execute(
 93 |         select(model.typerSt)
 94 |     ).fetchall()
 95 |     assert len(st) == 1
 96 |     assert (st[0].st == 'fimH1'
 97 |             and st[0].typing == FIM
 98 |             and st[0].allele == 'fimH1')
 99 | 
100 | def test_get_sequences(db_many):
101 |     seqs = db_many.get_sequences(FIM)
102 |     seqs2 = db_many.get_sequences(CLMT)
103 |     assert len(seqs) == 3
104 |     assert len(seqs2) == 4
105 |     assert seqs[1] == ('fimH2', 'ATA')
106 |     assert seqs2[0] == ('arpA','ATT')
107 | 
108 | def test_get_sequence_by_allele(db_many):
109 |     seq = db_many.get_sequence_by_allele(FIM, 'fimH1')
110 |     assert seq == 'AAA'
111 |     with pytest.raises(exceptions.AlleleSequenceNotFound):
112 |         db_many.get_sequence_by_allele(SPA, '04')
113 | 
def test_get_allele_by_sequence(db_many):
    """A stored sequence maps to its allele; an unknown one maps to 'New'."""
    known = db_many.get_allele_by_sequence(SPA, 'CCT')
    assert known == '02'
    unknown = db_many.get_allele_by_sequence(FIM, 'GCG')
    assert unknown == 'New'
119 | 
def test_get_st(db_many):
    """get_st returns the matching ST, or an empty string when none matches."""
    matched = db_many.get_st(FIM, 'fimH2')
    assert matched == 'fimH2'
    unmatched = db_many.get_st(CLMT, '02')
    assert unmatched == ''
125 |     
def test_pyTyper_instance(fim, spa, clmt):
    """Each fixture must be an instance of its typing class."""
    for instance, expected_class in ((fim, FimH), (spa, Spa), (clmt, Clmt)):
        assert isinstance(instance, expected_class)
130 | 
def test_pyTyper_check_input(fim):
    """check_input accepts (0.9, 0.9) and rejects out-of-range arguments."""
    # This pair must be accepted without raising.
    fim.check_input(0.9, 0.9)
    with pytest.raises(exceptions.BadCoverageRange):
        fim.check_input(0.9, 18)
    with pytest.raises(exceptions.BadIdentityRange):
        fim.check_input(12, 0.2)
137 | 
def test_typingResult_full(result):
    """str(result) shows the ST and allele once they are set, not the notes."""
    result.set_allele('12')
    result.set_st('t1235')
    result.set_notes('Some informations')
    rendered = str(result)
    assert rendered == 'sample1 fim t1235 12'
143 | 
def test_typingResult_empty(result):
    """With nothing set, the ST and allele columns render as empty strings."""
    rendered = str(result)
    assert rendered == 'sample1 fim  '
146 | 


--------------------------------------------------------------------------------
/pymlst/common/utils.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | import os
  3 | import time
  4 | from pathlib import Path
  5 | 
  6 | from Bio import SeqIO
  7 | from Bio.Data.CodonTable import TranslationError
  8 | from alembic import command
  9 | from alembic.config import Config
 10 | from sqlalchemy import create_engine, inspect, select
 11 | 
 12 | from pymlst import config
 13 | from pymlst.common import flag, exceptions
 14 | 
 15 | 
 16 | def records_to_dict(records):
 17 |     seq_dict = {}
 18 |     for seq in records:
 19 |         seq_dict[seq.id] = seq.seq.upper()
 20 |     return seq_dict
 21 | 
 22 | 
 23 | def read_genome(handle):
 24 |     if handle.seekable():
 25 |         handle.seek(0)
 26 |     records = SeqIO.parse(handle, 'fasta')
 27 |     return records_to_dict(records)
 28 | 
 29 | 
 30 | def write_genome(genome_dict, handle):
 31 |     for seq_id, seq in genome_dict.items():
 32 |         handle.write('>' + str(seq_id) + '\n'
 33 |                      + str(seq) + '\n')
 34 | 
 35 | def file_name(handle):
 36 |     filename = os.path.basename(handle.name)
 37 |     if filename.endswith(".fasta"):
 38 |         return filename.rstrip(".fasta")
 39 |     if filename.endswith(".fna"):
 40 |         return filename.rstrip(".fna")
 41 |     else:
 42 |         return filename.split('.')[0]
 43 |     
 44 | 
 45 | def strip_file(file):
 46 |     found = []  
 47 |     if file is not None:
 48 |         for line in file:
 49 |             found.append(line.rstrip('\n'))
 50 |     return found
 51 | 
 52 | 
 53 | def compar_seqs(seqs):
 54 |     count = 0
 55 |     for index in range(0, len(seqs[0])):
 56 |         seqs_char = {s[index] for s in seqs}
 57 |         if '-' in seqs_char:
 58 |             seqs_char.remove('-')
 59 |         if len(seqs_char) > 1:
 60 |             count += 1
 61 |     return count
 62 | 
 63 | 
 64 | def write_count(count, texte):
 65 |     if count:
 66 |         count.write(texte)
 67 | 
 68 | 
 69 | def validate_sequence(sequence):
 70 |     try:
 71 |         sequence.translate(cds=True, table=11)
 72 |     except TranslationError:
 73 |         return False
 74 |     else:
 75 |         return True
 76 | 
 77 | 
 78 | def create_logger():
 79 |     log = config.get_logging_level()
 80 |     if log == "DEBUG":
 81 |         level = logging.DEBUG
 82 |     elif log == "INFO":
 83 |         level = logging.INFO
 84 |     elif log == "WARNING":
 85 |         level = logging.WARNING
 86 |     else:
 87 |         level = logging.ERROR
 88 |     logging.basicConfig(
 89 |         level=level,
 90 |         format='[%(levelname)s: %(asctime)s] %(message)s')
 91 | 
 92 | 
 93 | def clean_kwargs(kwargs):
 94 |     """Removes kwargs with None values produced by Click.
 95 | 
 96 |     Because of the way the Click library binds
 97 |     every arguments and options to kwargs entries,
 98 |     when a user doesn't specify an option, its name
 99 |     is bound to None in the kwargs dictionary.
100 | 
101 |     By removing the None entries we can pass the kwargs directly
102 |     to the API core functions without overriding the default values.
103 |     """
104 |     for key, value in kwargs.copy().items():
105 |         if value is None:
106 |             kwargs.pop(key)
107 |     return kwargs
108 | 
def get_output(kwargs):
    """Split the 'output' entry out of kwargs for the extractor.

    Returns ``(kwargs, out_kwargs)``. ``kwargs`` is the same dictionary
    with 'output' removed. ``out_kwargs`` holds only the 'output' entry,
    or is empty when kwargs had none.
    """
    if 'output' not in kwargs:
        return kwargs, {}
    out_kwargs = {'output': kwargs.pop('output')}
    return kwargs, out_kwargs
118 | 
119 | 
def check_type(conn, mlst_type):
    """Check that the database behind conn was created for mlst_type.

    A database without any table passes silently. A database without an
    ``mlst_type`` table only triggers a warning. Otherwise the stored type
    name must equal mlst_type.

    :raises exceptions.WrongBaseType: when the stored type name differs.
    """
    table_names = inspect(conn).get_table_names()
    if not table_names:
        return
    if 'mlst_type' not in table_names:
        logging.warning('The base missing mlst_type metadata, continue with %s', mlst_type)
        return
    # NOTE(review): fetchone() returns None when the table holds no row,
    # which would make the attribute access below fail. Confirm it cannot.
    stored = conn.execute(select(flag.mlst_type.c.name)).fetchone()
    if stored.name == mlst_type:
        return
    raise exceptions.WrongBaseType(
        'The base you are attempting to perform '
        'on belongs to the wrong MLST type')
136 | 
137 | 
138 | # def set_type(conn, mlst_type):
139 | #     flag.metadata.create_all(conn)
140 | #     conn.execute(
141 | #         flag.mlst_type.insert(),
142 | #         name=mlst_type)
143 | 
144 | 
def get_updated_engine(path, module):
    """Return a SQLite engine whose schema is upgraded to alembic 'head'.

    :param path: database file path, or None for an in-memory database.
    :param module: name of the alembic sub-directory holding the migration
        scripts. It is also the type passed to check_type.
    """
    alembic_cfg = Config()
    alembic_cfg.set_main_option(
        'script_location', config.get_data(os.path.join('alembic', module)))
    logging.getLogger('alembic').setLevel(logging.CRITICAL)

    if path is None:
        url = 'sqlite://'  # creates a :memory: database
    else:
        url = 'sqlite:///' + os.path.abspath(path)
    engine = create_engine(url)

    # The type check and the migration share one connection and transaction.
    with engine.begin() as conn:
        check_type(conn, module)
        alembic_cfg.attributes['connection'] = conn
        command.upgrade(alembic_cfg, 'head')

    return engine
162 | 
def clean_geneid(geneid):
    """Replace every '_' in geneid with '-' to be compatible with kma search."""
    return "-".join(geneid.split("_"))
166 | 


--------------------------------------------------------------------------------
/pymlst/common/kma.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | ##Copyright (c) 2021 Benoit Valot
  5 | ##benoit.valot@univ-fcomte.fr
  6 | ##UMR 6249 Chrono-Environnement, Besançon, France
  7 | ##Licence GPL
  8 | import logging
  9 | 
 10 | import subprocess
 11 | import tempfile
 12 | from io import BytesIO
 13 | import os
 14 | 
 15 | from pymlst import config
 16 | from pymlst.common import utils
 17 | from pymlst.common.psl import Psl
 18 | from pymlst.common import exceptions
 19 | 
 20 | index = [".comp.b", ".length.b", ".name", ".seq.b"]
 21 | suffix = ".kma"
 22 | 
 23 | def run_kma(fastq, basename, identity, coverage, reads):
 24 |     """Run kma on fastq(s) and return sequences"""
 25 |     if is_database_indexing(basename) is False:
 26 |         raise exceptions.PyMLSTError('Dabatase must be index with KMA')
 27 |     
 28 |     path = config.get_binary_path('kma')
 29 |     if path is None:
 30 |         raise exceptions.BinaryNotFound('KMA binary was not found')
 31 | 
 32 |     with tempfile.NamedTemporaryFile('w+t') as tmp:
 33 |         baseout = tmp.name
 34 |     command = [path, '-t_db', basename+suffix, '-o', baseout, '-nf']
 35 |     if len(fastq) == 1:
 36 |         command.extend(['-i', fastq[0].name])
 37 |     elif len(fastq) == 2:
 38 |         command.extend(['-ipe', fastq[0].name, fastq[1].name])
 39 |     else:
 40 |         raise exceptions.PyMLSTError('Too many fastq files in input of run_kma')
 41 | 
 42 |     logging.info("Running KMA with cg/wgMLST database")
 43 |     proc = subprocess.Popen(command, stderr=subprocess.PIPE, \
 44 |                             stdout=subprocess.PIPE)
 45 | 
 46 |     output, error = proc.communicate()
 47 |     if os.path.exists(baseout + ".res") and os.path.exists(baseout + ".fsa"):
 48 |         for line in BytesIO(error).readlines():
 49 |             logging.debug(line.decode().rstrip())
 50 |     else:
 51 |         for line in BytesIO(error).readlines():
 52 |             logging.error(line.decode().rstrip())
 53 |         raise exceptions.PyMLSTError(
 54 |             'An error occurred while running KMA')   
 55 | 
 56 |     with open(baseout + ".res", 'r') as kma:
 57 |         kma_res = read_kma_res(kma, coverage, identity, reads)
 58 |     seqs = utils.read_genome(baseout + ".fsa")
 59 | 
 60 |     del_kma_tmp(baseout)
 61 |     if len(kma_res) == 0:
 62 |         raise exceptions.CoreGenomePathNotFound(
 63 |             'No path was found for the core genome')
 64 |     return kma_res,seqs
 65 | 
 66 | 
 67 | def del_kma_tmp(baseout):
 68 |     """Delete temporary file create by kma"""
 69 |     for a in [".aln", ".res", ".fsa"]:
 70 |         if os.path.exists(baseout + a):
 71 |             os.remove(baseout + a)
 72 |     
 73 | def is_database_indexing(basename):
 74 |     """Verify if a pyMLST database is indexing"""
 75 |     for i in index:
 76 |         if os.path.exists(basename + suffix + i) is False:
 77 |             return False
 78 |     return True
 79 | 
 80 | def index_database(basename, coregenes):
 81 |     """Index a database with kma if the base is not already indexing
 82 |     
 83 |     :coregene is a temporary file containing coregenes sequences
 84 |     """
 85 |     if is_database_indexing(basename) is False:
 86 |         path = config.get_binary_path('kma')
 87 |         if path is None:
 88 |             raise exceptions.BinaryNotFound('KMA binary was not found')
 89 |         logging.info("Indexing database %s with kma", \
 90 |                      os.path.basename(basename))
 91 |         
 92 |         command = [path, 'index', '-i', coregenes.name, '-o', basename + suffix]
 93 |         proc = subprocess.Popen(command, stderr=subprocess.PIPE, \
 94 |                                 stdout=subprocess.PIPE)
 95 |         output, error = proc.communicate()
 96 |         if is_database_indexing(basename) is False:
 97 |             for line in BytesIO(error).readlines():
 98 |                 logging.error(line.decode().rstrip())
 99 |             raise exceptions.PyMLSTError(
100 |                 'An error occurred while indexing KMA')
101 |         else:
102 |             for line in BytesIO(error).readlines():
103 |                 logging.debug(line.decode().rstrip())
104 | 
105 | 
def delete_indexing(basename):
    """Remove the KMA index files of the database basename that exist."""
    candidates = (basename + suffix + ext for ext in index)
    for index_file in filter(os.path.exists, candidates):
        os.remove(index_file)
111 | 
def read_kma_res(kma, cover, ident, reads):
    """Return the templates of a KMA result file that pass the thresholds.

    :param kma: open, tab-separated KMA ``.res`` file; its first line must
        hold 11 columns starting with ``#Template``.
    :param cover: minimum coverage as a fraction. It is multiplied by 100
        and compared with both Template_Coverage and Query_Coverage.
    :param ident: minimum identity as a fraction. It is multiplied by 100
        and compared with both Template_Identity and Query_Identity.
    :param reads: minimum value of the Depth column.
    :return: list of the ``#Template`` values of the rows kept.
    :raises Exception: if the header or any row does not have 11 columns.
    """
    columns = kma.readline().rstrip("\n").split("\t")
    if not (len(columns) == 11 and columns[0].startswith("#Template")):
        raise Exception(kma.name + " seems not to be a kma result file\n")

    selected = []
    for row in kma:
        fields = row.rstrip("\n").split("\t")
        if len(fields) != 11:
            raise Exception("Incorrect line\n" + row)
        record = dict(zip(columns, (field.strip(" ") for field in fields)))
        # Reject the row at the first threshold it misses.
        if not float(record.get("Template_Coverage")) >= cover*100:
            continue
        if not float(record.get("Query_Coverage")) >= cover*100:
            continue
        if not float(record.get("Template_Identity")) >= ident*100:
            continue
        if not float(record.get("Query_Identity")) >= ident*100:
            continue
        if not float(record.get("Depth")) >= reads:
            continue
        selected.append(record.get("#Template"))
    return selected
129 | 
def index_tmpfile():
    """Create a text-mode ('w+t') named temporary file ending in '.fasta'."""
    options = {'mode': 'w+t', 'suffix': '.fasta'}
    return tempfile.NamedTemporaryFile(**options)
132 | 


--------------------------------------------------------------------------------
/pymlst/data/pytyper/clmt.fna:
--------------------------------------------------------------------------------
 1 | >chuAalbertii CP024282.1:1722782-1722917 Escherichia albertii strain 2014C-4356 chromosome, complete genome
 2 | GCCACCGATTTATTGCGCTCTGTCCCCGGGATTACCCTTGACGGCACTGGGCGGACGAACGGCCAGGATG
 3 | TGAACATGCGTGGCTATGACCATCGCGGCGTGCTGATTCTTGTTGACGGCGTTCGTCAGGGAACAG
 4 | >citPfergus CP042945.1:1413338-1413637 Escherichia fergusonii strain ATCC 35471 chromosome, complete genome
 5 | AAAAAGGCGCGAATGCCTTGCTGAGTTTTAAAATATTCGGCATGCCGCTTCCGCTTTACGCGTTTGCGTT
 6 | AATTACACTATTACTTTCGCATTTTAATGATTCATTGCCAAACGACCTGGTCGGTGGTTTCGCAATCATG
 7 | TTTATTATTGGCGCTATTTTTGGTGAGATTGGTAAGCGTCTGCCGATATTTAACAAATATATTGGTGGCG
 8 | CACCAGTAATGATATTTCTGGTAGCGGCATATTTTGTTTATGCTGGTATTTTTACTCAGAAAGAAATCGA
 9 | CACCATTAGCAATGTTATGG
10 | >arpA CP054940.1:4413854-4414253 Escherichia coli strain MS6192 chromosome, complete genome
11 | AACGCTATTCGCCAGCTTGCCGCTGCTAACATTTTTCCTGGCGACATGCTGTTTAAAAACTTCGGTGTCA
12 | CCCGTCACGGGCGTGTGGTTTTTTATGATTACGATGAAATTTGCTACATGACGGAAGTGAATTTCCGCGA
13 | CATCCCGCCGCCGCGCTATCCGGAAGACGAACTTGCCAGCGAACCGTGGTATAGCGTCTCGCCGGGCGAT
14 | GTTTTCCCGGAAGAGTTTCGCCACTGGCTATGCGCCGACCCGCGTATTGGTCCGCTGTTTGAAGAGATGC
15 | ACGCCGACCTGTTCCGCGCTGATTACTGGCGCGCACTACAAAACCGCATACGTGAAGGGCATGTGGAAGA
16 | TGTTTATGCGTATCGGCGCAGGCAAAGATTTAGCGTACGGTATGGGGAGA
17 | >chuA CP054236.1:249971-250258 Escherichia coli strain EcPF5 chromosome, complete genome
18 | ATGGTACCGGACGAACCAACGGTCAGGATGTAAATATGCGTGGCTATGATCATCGCGGCGTGCTGGTTCT
19 | TGTCGATGGTGTTCGCCAGGGAACGGATACCGGACACCTGAATGGCACTTTTCTCGATCCGGCGCTGATC
20 | AAGCGTGTTGAGATTGTTCGCGGACCTTCAGCATTACTGTATGGCAGTGGCGCGCTGGGTGGAGTGATCT
21 | CCTACGATACGGTCGATGCAAAAGATTTATTGCAGGAAGGACAAAGCAGTGGTTTTCGTGTCTTTGGTAC
22 | TGGCGGCA
23 | >yjaA AP023226.1:c4557291-4557081 Escherichia coli YJ3 DNA, complete genome
24 | CAAACGTGAAGTGTCAGGAGACGCTGCCTTCAGTAACCAGCGCCTGTTAATCGCCAATTTCTTTGTTGCA
25 | GAAAAAGTTCTGCAAGATCTTGTTCTGCAACTCCACCCACGTTCAACCTGGCATTCTTTTTTGCCAGCAA
26 | AACGTATGGATATTGTTGTGAGCGCGCTGGAAATGAATGAGGGCGGTTTGTCACAGGTTGAGGAACGCAT
27 | T
28 | >TspE4.C2 CP054219.1:2890121-2890272 Escherichia coli strain EcPF18 chromosome, complete genome
29 | CACTATTCGTAAGGTCATCCCTTCAAGTTCGATAGTCTGAATATCTACCCGCGTTTCTGTCTCACCCGCA
30 | AGGACAGCGCTGGCGATATAGCCCTCTCTGCGCTGCGTAATACTTTGTTGGCGCGATGAGGGGCGACCCG
31 | CAGCGATAAACT
32 | >trpAgpC CP055256.1:2872982-2873201 Escherichia coli strain AH25 chromosome, complete genome
33 | AGTTTTATGCCCAGTGCGAGAAAGTCGGCGTCGATTCGGTGCTGGTTGCCGATGTGCCAATTGAAGAGTC
34 | CGCGCCCTTCCGCCAGGCCGCGTTGCGCCATAATGTCGCACCTATCTTCATCTGCCCGCCAAATGCCGAT
35 | GACGACCTGCTGCGCCAGATAGCCTCTTACGGTCGTGGTTACACCTATTTGCTGTCACGAGCGGGCGTGA
36 | CCGGCGCAGA
37 | >aesI CP041520.1:c2679520-2679205 Escherichia coli strain ESBL92 chromosome
38 | CCTCTACTCACCCAAAAGTCACAGCCCGGCGACACTATTTTATCTGCATGGTGGTGGTTTTATTCTCGGC
39 | AATCTTGATACCCACGATCGGATTATGCGACTGCTGGCAAATTACACCCAATGTACAGTGATTGGTATTG
40 | CTTACACTCTTTCGCCGGAAGCACGTTTTCCGCAGGCAATAGAGGAAATTGTGGCTGCCTGTTGCCACTT
41 | CCACCAGCAGGCAGAGGATTATCAAATCAATATGTCACGCATTGGTTTTGCCGGTGATTCTGCAGGCGCA
42 | ATGCTGGCGCTCGCCAGTGCGTTGTGGTTACGTGAT
43 | >aesII CP178317.1:c496736-496612 Escherichia coli strain ECBR1023 chromosome, complete genome
44 | TGCCTGTTGCCACTTCCACCAGCAGGCGGGAGATTATCAAATCAACATGTCCCGCATTGGCTTTGCCGGT
45 | GATTCTGCAGGTGCCATGCTGGCGCTCGCCAGTGCGTTGTGGTTGCGTGATAAAC
46 | >chuIII CP057804.1:c3018837-3018655 Escherichia coli strain RHB14-C20 chromosome, complete genome
47 | GTGTTGAGATTGTCCGTGGGCCTTCGGCATTACTGTATGGCAGTGGCGCGCTGGGAGGGGTTATCTCCTA
48 | CGATACGGTCGATGCAAAAGATTTATTGCAGGAAGGACAAAGCAGTGGTTTTCGTGTCTTTGGCACTGGC
49 | GGCACGGGAGACCATAGCCTGGGGCTGGGCGCCAGTGCTTTTG
50 | >chuIV CP089930.1:c1592281-1591821 Escherichia coli strain E69 chromosome, complete genome
51 | CTGGCGAAAGGAACCTGGAAAATTGATTCTGCCCAGGCTCTGAGCGGGTTAGTGCGTTATTACAATAACG
52 | ACGCGCGTGAACCAAAAAATCCGCAGACCGTTGAGGCTTCTGATAGCAGCAACCCGATGGTTGATCGCTC
53 | AACGATTCAACGTGATGCGCAGCTTGCTTATAAACTCGCTCCGTTGGGCAACGACTGGTTAAATGCCGAT
54 | GCAAAAGTTTACTGGTCGGAAGTCCGTATTAATGCGCAGAACACGGGGAGTTCCGGCGAGTATCGTGAAC
55 | AGACGACAAAAGGTGCCAAACTGGAGAACCGTTCCACTCTGTTTGCCGATAGTTTTGCCTCTCACCTGCT
56 | GACATATGGCGGTGAGTATTATCGTCAGGAACAGCATCCTGGCGGTGCGACGACGGGGTTCCCGCAAGCG
57 | AAAATCGATTTCAGCTCCGGTTGGCTGCAAGATGAGATAAC
58 | >chuV CP173213.1:c4016984-4016385 Escherichia marmotae strain F12YCO47 chromosome, complete genome
59 | ACTGTATGGCAGTGGCGCATTGGGAGGGGTTATCTCCTACGATACGGTCGATGCAAAAGATTTATTGCAG
60 | GAAGGACAAAGCAGCGGTTTTCGTGTCTTTGGCACTGGCGGCACGGGAGATCATAGCCTGGGGTTAGGCG
61 | CGAGTGCTTTTGGGCGAACGGAAAATCTGGATGGTATTGTGGCCTGGTCCAGCCGCGATCGTGGTGATTT
62 | ACGCCAGAGCAATGGCGAAACCGCGCCGAATGATGAGGCCATTAATAACATGTTGGCGAAAGGGACCTGG
63 | CAAATTGATTCTGCCCAGGCTCTGAGTGGATTAGTGCGTTATTACAATAACGACGCGCGCGAACCAAAAA
64 | ATCCGCAGACCGTTGAAGCTTCTGACAGCAGTAATCCGATGGTTGATCGTTCAACGATTCAACGTGATGC
65 | GCAACTTGCTTATAAACTCGCACCAGTGGGCAACGACTGGTTAAATGCCGATGCAAAAGTTTACTGGTCG
66 | GAAGTCCGTATTAATGCCCAGAACACGGGGAGTTCCGGCGAATATCGTGAACAGACAACAAAAGGTGCCA
67 | AACTGGAGAACCGCTCCACGCTGTTTGCCGATAGTTTTGC
68 | >trpA CP054236.1:3012950-3013734 Escherichia coli strain EcPF5 chromosome, complete genome
69 | GCTACGAATCTCTGTTTGCCCAGTTGAAGGAGCGCAAAGAAGGCGCATTCGTTCCTTTCGTCACCCTCGG
70 | TGATCCGGGCATTGAGCAGTCGTTGAAAATTATCGATACGCTAATTGAAGCCGGTGCTGACGCGCTGGAG
71 | TTAGGCATCCCCTTCTCCGACCCACTGGCGGATGGCCCGACGATTCAAAACGCCACACTGCGTGCTTTTG
72 | CGGCGGGAGTAACCCCGGCGCAGTGCTTTGAGATGCTGGCACTCATTCGCCAGAAGCACCCGACCATTCC
73 | CATCGGCCTTTTGATGTATGCCAACCTGGTGTTTAACAAAGGCATTGATGAGTTTTATGCCGAGTGCGAG
74 | AAAGTCGGCGTCGATTCGGTGCTGGTTGCCGATGTGCCCGTGGAAGAGTCCGCGCCCTTCCGCCAGGCCG
75 | CGTTGCGTCATAATGTCGCACCTATCTTTATTTGCCCGCCGAATGCCGACGATGATTTGCTGCGCCAGAT
76 | AGCCTCTTACGGTCGTGGTTACACCTATTTGCTGTCGCGAGCGGGCGTGACCGGCGCAGAAAACCGCGCC
77 | GCGTTACCCCTCAATCATCTGGTTGCGAAGCTGAAAGAGTACAACGCTGCGCCTCCATTGCAGGGATTTG
78 | GTATTTCCGCCCCGGATCAGGTAAAAGCCGCGATTGATGCAGGAGCTGCGGGCGCGATTTCTGGTTCGGC
79 | CATCGTTAAAATCATCGAGCAACATATTAATGAGCCAGAGAAAATGCTGGCGGCACTGAAAGCTTTTGTA
80 | CAACCGATGAAAGCG
81 | >trpBA CP054236.1:3012886-3013375 Escherichia coli strain EcPF5 chromosome, complete genome
82 | CGGCGATAAAGACATCTTCACCGTTCACGATATTTTGAAAGCACGAGGGGAAATCTGATGGAACGCTACG
83 | AATCTCTGTTTGCCCAGTTGAAGGAGCGCAAAGAAGGCGCATTCGTTCCTTTCGTCACCCTCGGTGATCC
84 | GGGCATTGAGCAGTCGTTGAAAATTATCGATACGCTAATTGAAGCCGGTGCTGACGCGCTGGAGTTAGGC
85 | ATCCCCTTCTCCGACCCACTGGCGGATGGCCCGACGATTCAAAACGCCACACTGCGTGCTTTTGCGGCGG
86 | GAGTAACCCCGGCGCAGTGCTTTGAGATGCTGGCACTCATTCGCCAGAAGCACCCGACCATTCCCATCGG
87 | CCTTTTGATGTATGCCAACCTGGTGTTTAACAAAGGCATTGATGAGTTTTATGCCGAGTGCGAGAAAGTC
88 | GGCGTCGATTCGGTGCTGGTTGCCGATGTGCCCGTGGAAGAGTCCGCGCCCTTCCGCCAGGCCGCGTTGC
89 | >fdm CP055251.1:1245966-1246230 Escherichia coli strain AH01 chromosome, complete genome
90 | TGGCGGCATTGTTAGCGTACCGGGCGTCTACGCTGGATTTATTCACGGTTTCCTGTTTGGCGACGCCTTT
91 | GATAAAGGGTTGACGTTTAAAATGGGACAGACCCACGTTCACGCATGGCTGGGAGAATTATTACCGTTAA
92 | TTGAGAAAGGATTACTGAAACCAGAAGAAATTGTTACCCACTATATGCCGTTTGAAGAGGCCGCCCGGGG
93 | ATATGAGATTTTCGAAAAACGTGAAGAGGAGTGCCGTAAGGTGATTCTGGTACCC
94 | >ybgD CP054224.1:3139367-3139543 Escherichia coli strain EcPF16 chromosome, complete genome
95 | TATGCGGCTGATGAAGGATCCGGTGAAATTCACTTTAAAGGTGAAGTTATTGAAGCACCGTGTGAAATAC
96 | ATCAGGATGATATTGATAAAGAGGTTGAACTCGGTCAGGTGACCACCAGCCACATTAATCAGTCACATCA
97 | CAGCGATGCCGTTGCTGTCGACCTGCGCTTAGTCAAC


--------------------------------------------------------------------------------
/docs/source/conf.py:
--------------------------------------------------------------------------------
  1 | # Configuration file for the Sphinx documentation builder.
  2 | #
  3 | # This file only contains a selection of the most common options. For a full
  4 | # list see the documentation:
  5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
  6 | 
  7 | # -- Path setup --------------------------------------------------------------
  8 | 
  9 | # If extensions (or modules to document with autodoc) are in another directory,
 10 | # add these directories to sys.path here. If the directory is relative to the
 11 | # documentation root, use os.path.abspath to make it absolute, like shown here.
 12 | import os
 13 | import sys
 14 | from unittest.mock import MagicMock
 15 | 
 16 | # Determine the absolute path to the directory containing the python modules.
 17 | _pysrc = os.path.abspath(os.path.join(os.path.abspath(__file__), "..", "..", ".."))
 18 | 
 19 | # Insert it into the path.
 20 | sys.path.insert(0, _pysrc)
 21 | 
 22 | # Now we can import local modules.
 23 | import pymlst  # noqa
 24 | 
 25 | # -- Document __init__ methods by default. --------------------------------
 26 | # This section was added to allow __init__() to be documented automatically.
 27 | # You can comment this section out to go back to the default behavior.
 28 | # See: http://stackoverflow.com/questions/5599254
 29 | 
 30 | 
 31 | def skip(app, what, name, obj, skip, options):
 32 |     if name == "__init__":
 33 |         return False
 34 |     return skip
 35 | 
 36 | 
 37 | def setup(app):
 38 |     app.connect("autodoc-skip-member", skip)
 39 | 
 40 | 
class Mock(MagicMock):
    # Stand-in object placed in sys.modules for the names in MOCK_MODULES.
    # The __getattr__ override below answers with a brand-new MagicMock.
    @classmethod
    def __getattr__(cls, name):
        return MagicMock()
 45 | 
 46 | 
# Module names that are replaced by Mock instances in sys.modules.
# NOTE(review): "math" is a standard-library module; confirm that mocking
# it is intended.
MOCK_MODULES = [
    "numpy",
    "networkx",
    "scipy",
    "sklearn",
    "matplotlib",
    "matplotlib.pyplot",
    "scipy.interpolate",
    "scipy.special",
    "math",
    "pandas",
]
# NOTE(review): this runs after the `import pymlst` statement earlier in this
# file, so it cannot affect the imports that statement already performed.
# Confirm the ordering is intended.
sys.modules.update((mod_name, Mock()) for mod_name in MOCK_MODULES)
 60 | 
 61 | # -- General configuration ------------------------------------------------
 62 | 
 63 | # If your documentation needs a minimal Sphinx version, state it here.
 64 | #
 65 | # needs_sphinx = '1.0'
 66 | 
 67 | # Add any Sphinx extension module names here, as strings. They can be
 68 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
 69 | # ones.
 70 | extensions = ["sphinx.ext.autodoc", "sphinx.ext.viewcode", "sphinx.ext.githubpages"]
 71 | 
 72 | # Add any paths that contain templates here, relative to this directory.
 73 | templates_path = ["_templates"]
 74 | 
 75 | # The suffix(es) of source filenames.
 76 | # You can specify multiple suffix as a list of string:
 77 | #
 78 | # source_suffix = ['.rst', '.md']
 79 | source_suffix = ".rst"
 80 | 
 81 | # The master toctree document.
 82 | master_doc = "index"
 83 | 
 84 | # General information about the project.
 85 | project = "pymlst"
 86 | copyright = "2019, Zagarwin; 2023, Bvalot"
 87 | author = "Benoit Valot"
 88 | 
 89 | # The version info for the project you're documenting, acts as replacement for
 90 | # |version| and |release|, also used in various other places throughout the
 91 | # built documents.
 92 | #
 93 | # The short X.Y version.
 94 | # version = pymlst.__version__
 95 | # The full version, including alpha/beta/rc tags.
 96 | # release = pymlst.__release__
 97 | 
 98 | # The full version, including alpha/beta/rc tags
 99 | 
100 | release = pymlst.__release__
101 | 
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
#
# Sphinx 5 and later reject ``None`` with an "Invalid configuration value"
# warning and fall back to "en", so the language code is given explicitly.
language = "en"
108 | 
109 | # List of patterns, relative to source directory, that match files and
110 | # directories to ignore when looking for source files.
111 | # This pattern also affects html_static_path and html_extra_path.
112 | exclude_patterns = []
113 | 
114 | # The name of the Pygments (syntax highlighting) style to use.
115 | pygments_style = "sphinx"
116 | 
117 | # If true, `todo` and `todoList` produce output, else they produce nothing.
118 | todo_include_todos = False
119 | 
120 | # -- Options for HTML output ----------------------------------------------
121 | 
122 | # The theme to use for HTML and HTML Help pages.  See the documentation for
123 | # a list of builtin themes.
124 | 
125 | # fmt: off
126 |   # noqa
127 | # fmt: on
128 | html_theme = "sphinx_rtd_theme"
129 | html_logo = "logo.png"
130 | 
131 | html_theme_options = {
132 | }
133 | # fmt: off
134 |   # noqa
135 |   # noqa
136 | # fmt: on
137 | 
138 | # Add any paths that contain custom static files (such as style sheets) here,
139 | # relative to this directory. They are copied after the builtin static files,
140 | # so a file named "default.css" will overwrite the builtin "default.css".
141 | html_static_path = ["_static"]
142 | 
143 | # Custom sidebar templates, must be a dictionary that maps document names
144 | # to template names.
145 | #
146 | # This is required for the alabaster theme
147 | # refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars
148 | # html_sidebars = {
149 | #     "**": [
150 | #         "about.html",
151 | #         "navigation.html",
152 | #         "relations.html",  # needs 'show_related': True theme option to display
153 | #         "searchbox.html",
154 | #         "donate.html",
155 | #     ]
156 | # }
157 | 
158 | # -- Options for HTMLHelp output ------------------------------------------
159 | 
160 | # Output file base name for HTML help builder.
161 | htmlhelp_basename = "pymlstdoc"
162 | 
163 | # -- Options for LaTeX output ---------------------------------------------
164 | 
165 | latex_elements = {
166 |     # The paper size ('letterpaper' or 'a4paper').
167 |     #
168 |     # 'papersize': 'letterpaper',
169 |     # The font size ('10pt', '11pt' or '12pt').
170 |     #
171 |     # 'pointsize': '10pt',
172 |     # Additional stuff for the LaTeX preamble.
173 |     #
174 |     # 'preamble': '',
175 |     # Latex figure (float) alignment
176 |     #
177 |     # 'figure_align': 'htbp',
178 | }
179 | 
180 | # Grouping the document tree into LaTeX files. List of tuples
181 | # (source start file, target name, title,
182 | #  author, documentclass [howto, manual, or own class]).
183 | latex_documents = [
184 |     (
185 |         master_doc,
186 |         "pymlst.tex",
187 |         "pyMLST Documentation",
188 |         author,
189 |         "manual",
190 |     )
191 | ]
192 | 
193 | # -- Options for manual page output ---------------------------------------
194 | 
195 | # One entry per manual page. List of tuples
196 | # (source start file, name, description, authors, manual section).
197 | man_pages = [
198 |     (
199 |         master_doc,
200 |         "pymlst",
201 |         "pyMLST Documentation",
202 |         [author],
203 |         1,
204 |     )
205 | ]
206 | 
207 | # -- Options for Texinfo output -------------------------------------------
208 | 
209 | # Grouping the document tree into Texinfo files. List of tuples
210 | # (source start file, target name, title, author,
211 | #  dir menu entry, description, category)
212 | texinfo_documents = [
213 |     (
214 |         master_doc,
215 |         "pymlst",
216 |         "pyMLST Documentation",
217 |         author,
218 |         "cg/wgMLST analysis using pyMLST",
219 |         "O",
220 |         "bioinformatic",
221 |     )
222 | ]
223 | 


--------------------------------------------------------------------------------
/pymlst/common/psl.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | ##Copyright (c) 2019 Benoit Valot
  5 | ##benoit.valot@univ-fcomte.fr
  6 | ##UMR 6249 Chrono-Environnement, Besançon, France
  7 | ##Licence GPL
  8 | 
  9 | from Bio.Data.CodonTable import TranslationError
 10 | 
 11 | from pymlst.common import mafft
 12 | 
 13 | 
 14 | def test_cds(seq):
 15 |     try:
 16 |         seq.translate(table="Bacterial", cds=True)
 17 |     except TranslationError:
 18 |         return False
 19 |     else:
 20 |         return True
 21 | 
 22 | class Psl:
 23 |     """A simple Psl class"""
 24 |     def __init__(self, pslline):
 25 |         pslelement = pslline.rstrip("\n").split("\t")
 26 |         if len(pslelement) != 21:
 27 |             raise Exception("Psl line have not 21 elements:\n"+pslline)
 28 |         self.pslelement = pslelement
 29 |         self.chro = pslelement[13]
 30 |         self.start = int(pslelement[15])
 31 |         self.end = int(pslelement[16])
 32 |         self.strand = pslelement[8]
 33 |         self.rstart = int(pslelement[11])
 34 |         self.rend = int(pslelement[12])
 35 |         self.rtotal = int(pslelement[10])
 36 |         self.coverage = (float(self.rend) - self.rstart)/self.rtotal
 37 | 
 38 |     def gene_id(self):
 39 |         return self.pslelement[9]
 40 | 
 41 |     def get_sequence(self, seq):
 42 |         if self.strand == '+':
 43 |             return seq[self.start:self.end]
 44 |         return seq[self.start:self.end].reverse_complement()
 45 | 
 46 |     # def searchCorrect(self):
 47 |     #     if int(self.pslelement[11]) != 0:
 48 |     #         diff = int(self.pslelement[11])
 49 |     #         if self.strand == "+":
 50 |     #             self.start = self.start - diff
 51 |     #         else:
 52 |     #             self.end = self.end + diff
 53 |     #     elif int(self.pslelement[10]) != int(self.pslelement[12]):
 54 |     #         diff = int(self.pslelement[10]) - int(self.pslelement[12])
 55 |     #         if self.strand == "+":
 56 |     #             self.end = self.end + diff
 57 |     #         else:
 58 |     #             self.start = self.start - diff
 59 |     #     self.coverage = 1
 60 |     #
 61 |     # def searchCorrectCDS(self, seq, coverage):
 62 |     #     prot = self.get_sequence(seq)
 63 |     #     ##modifs start and stop not create
 64 |     #     if prot.startswith("M") is False and prot.endswith("*") is False:
 65 |     #         return False
 66 |     #     windows = int((1-coverage)*self.rtotal)
 67 |     #     if prot.startswith("M") is False:
 68 |     #         return self.__searchCDS(seq, True, False, windows, 0)
 69 |     #     elif prot.endswith("*") is False:
 70 |     #         return self.__searchCDS(seq, False, True, windows, 0)
 71 |     #     else:
 72 |     #         raise Exception("A problem of start/stop  for gene " + self.gene_id())
 73 | 
 74 |     # def searchPartialCDS(self, seq, coverage):
 75 |     #     ##modifs start and stop not create
 76 |     #     if self.rstart !=0 and self.rend != self.rtotal:
 77 |     #         return False
 78 |     #     windows = int((1-coverage)*self.rtotal)
 79 |     #     if self.rstart !=0:
 80 |     #         diff = self.rstart
 81 |     #         return self.__searchCDS(seq, True, False, windows, diff)
 82 |     #     elif self.rend != self.rtotal:
 83 |     #         diff = self.rtotal - self.rend
 84 |     #         return self.__searchCDS(seq, False, True, windows, diff)
 85 |     #     else:
 86 |     #         raise Exception("A problem of start/stop for gene " + self.gene_id())
 87 | 
 88 |     def get_aligned_sequence(self, seq, coregene):
 89 |         if self.strand == '+':
 90 |             expand_start = self.rstart > 0
 91 |             expand_end = self.rend < self.rtotal
 92 |         else:
 93 |             expand_start = self.rend < self.rtotal
 94 |             expand_end = self.rstart > 0
 95 | 
 96 |         if expand_start:
 97 |             start = self.start - 36
 98 |             if start < 0:
 99 |                 start = 0
100 |         else:
101 |             start = self.start
102 | 
103 |         if expand_end:
104 |             end = self.end + 36
105 |             if end > len(seq):
106 |                 end = len(seq)
107 |         else:
108 |             end = self.end
109 | 
110 |         target = seq[start:end]
111 |         if self.strand != '+':
112 |             target = target.reverse_complement()
113 | 
114 |         al_start, al_end = mafft.get_aligned_area(coregene, str(target))
115 |         if al_start is not None:
116 |             return target[al_start:al_end]
117 | 
118 |         return None
119 | 
120 |     # def __searchCDS(self, seq, start, stop, windows, diff):
121 |     #     ##correct windows/diff multiple of 3
122 |     #     windows = windows - windows%3
123 |     #     diff = diff - diff%3
124 |     #     ##modifs start and stop not create
125 |     #     if start and stop:
126 |     #         return False
127 |     #     ##modifs start
128 |     #     if start:
129 |     #         ##modulo = (self.end-self.start)%3
130 |     #         if self.strand == "+":
131 |     #             theoStart = self.__getTheoricStart(diff)
132 |     #             val = [i for i in range(theoStart+windows, theoStart-windows, -3) \
133 |     #                    if test_cds(seq.seq[i:self.end], False)]
134 |     #             if len(val)==1:
135 |     #                 self.start=val[0]
136 |     #                 return True
137 |     #             elif len(val) >1:
138 |     #                 best = self.__getBest(val)
139 |     #                 self.logger.info("Choosing best start for gene " + self.gene_id() + " " \
140 |     #                                  + str(best) + " " + str(val))
141 |     #                 self.start = best
142 |     #                 return True
143 |     #             else:
144 |     #                 return False
145 |     #         else:
146 |     #             theoEnd = self.__getTheoricEnd(diff)
147 |     #             val = [i for i in range(theoEnd-windows, theoEnd+windows, 3) \
148 |     #                    if test_cds(seq.seq[self.start:i], True)]
149 |     #             if len(val) == 1:
150 |     #                 self.end = val[0]
151 |     #                 return True
152 |     #             elif len(val) >1:
153 |     #                 best = self.__getBest(val)
154 |     #                 self.logger.info("Choosing best start for gene " + self.gene_id() + " " \
155 |     #                                  + str(best) + " " + str(val))
156 |     #                 self.end = best
157 |     #                 return True
158 |     #             else:
159 |     #                 return False
160 |     #     ##modifs end
161 |     #     elif stop:
162 |     #         ##modulo = (self.end-self.start)%3
163 |     #         if self.strand == "+":
164 |     #             theoEnd = self.__getTheoricEnd(diff)
165 |     #             val = [i for i in range(theoEnd-windows, theoEnd+windows, 3) \
166 |     #                    if test_cds(seq.seq[self.start:i], False)]
167 |     #             if len(val) == 1:
168 |     #                 self.end = val[0]
169 |     #                 return True
170 |     #             else:
171 |     #                 return False
172 |     #         else:
173 |     #             theoStart = self.__getTheoricStart(diff)
174 |     #             val = [i for i in range(theoStart+windows, theoStart-windows, -3) \
175 |     #                    if test_cds(seq.seq[i:self.end], True)]
176 |     #             if len(val) == 1:
177 |     #                 self.start = val[0]
178 |     #                 return True
179 |     #             else:
180 |     #                 return False
181 |     #
182 |     # def __getTheoricStart(self, diff):
183 |     #     modulo = (self.end-self.start)%3
184 |     #     return self.start + modulo - diff
185 |     #
186 |     # def __getTheoricEnd(self, diff):
187 |     #     modulo = (self.end-self.start)%3
188 |     #     return self.end - modulo + diff
189 |     #
190 |     # def __getBest(self, val):
191 |     #     best = val[0]
192 |     #     for v in val[1:]:
193 |     #         if self.strand == "+":
194 |     #             if abs(abs(self.end - v) - self.rtotal) < abs(abs(self.end - best) - self.rtotal):
195 |     #                 best = v
196 |     #         else:
197 |     #             if abs(abs(v - self.start) - self.rtotal)
198 |     #             < abs(abs(best - self.start) - self.rtotal):
199 |     #                 best = v
200 |     #     return best
201 | 


--------------------------------------------------------------------------------
/tests/test_wg.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import pytest
  3 | from sqlalchemy import select, exists
  4 | from sqlalchemy.sql.functions import count
  5 | from sqlalchemy.sql.operators import in_op as in_
  6 | 
  7 | import pymlst
  8 | from pymlst.common import exceptions
  9 | from pymlst.wg import model
 10 | from pymlst.wg.core import DatabaseWG, DuplicationHandling
 11 | 
# Directory holding the test data, located next to this test module.
data_path = os.path.join(os.path.dirname(__file__), 'data')
# Sub-directory of the test data used by the wg tests (see fasta()).
wg_path = os.path.join(data_path, 'wg')
 14 | 
 15 | 
 16 | def fasta(name):
 17 |     return open(os.path.join(wg_path, name + '.fasta'))
 18 | 
 19 | 
 20 | @pytest.fixture()
 21 | def wg():
 22 |     with pymlst.open_wg() as wg_mlst:
 23 |         yield wg_mlst
 24 | 
 25 | 
 26 | @pytest.fixture()
 27 | def db():
 28 |     db = DatabaseWG(None, 'ref')
 29 |     try:
 30 |         yield db
 31 |     finally:
 32 |         db.close()
 33 | 
 34 | 
 35 | @pytest.fixture()
 36 | def db_simple(db):
 37 |     seqs = [
 38 |         ('g1', 'AAA'),
 39 |         ('g1', 'ATA'),
 40 |         ('g2', 'TTT'),
 41 |         ('g3', 'CCC'),
 42 |         ('g4', 'CCC'),
 43 |     ]
 44 |     for gene, seq in seqs:
 45 |         db.add_genome(gene, 'A', seq)
 46 |     return db
 47 | 
 48 | 
 49 | @pytest.fixture()
 50 | def db_many(db):
 51 |     seqs_ref = [
 52 |         ('g1', 'AAA'),
 53 |         ('g2', 'ATA'),
 54 |         ('g3', 'TTT'),
 55 |         ('g4', 'CCC'),
 56 |         ('g5', 'GGG'),
 57 |     ]
 58 |     for gene, seq in seqs_ref:
 59 |         db.add_core_genome(gene, seq)
 60 |     seqs = [
 61 |         ('A', 'g1', 'AAA'),
 62 |         ('A', 'g2', 'ATA'),
 63 |         ('A', 'g3', 'ATT'),
 64 |         ('A', 'g4', 'CCC'),
 65 |         ('B', 'g1', 'AAT'),
 66 |         ('B', 'g2', 'ATA'),
 67 |         ('B', 'g3', 'TTT'),
 68 |         ('B', 'g4', 'CAC'),
 69 |         ('B', 'g5', 'GGG'),
 70 |         ('C', 'g1', 'AAA'),
 71 |         ('C', 'g3', 'TTT'),
 72 |         ('C', 'g4', 'CAA'),
 73 |         ('D', 'g4', 'CAC'),
 74 |     ]
 75 |     for strain, gene, seq in seqs:
 76 |         db.add_genome(gene, strain, seq)
 77 |     return db
 78 | 
 79 | 
 80 | def test_add_genome(db):
 81 |     db.add_genome('g1', 'A', 'AAA')
 82 |     seq = db.connection.execute(
 83 |         select([model.sequences])
 84 |     ).fetchall()
 85 |     assert len(seq) == 1
 86 |     assert seq[0].sequence == 'AAA'
 87 |     mlst = db.connection.execute(
 88 |         select([model.mlst])
 89 |     ).fetchall()
 90 |     assert len(mlst) == 1
 91 |     assert (mlst[0].gene == 'g1'
 92 |             and mlst[0].souche == 'A'
 93 |             and mlst[0].seqid == seq[0].id)
 94 | 
 95 | 
 96 | def test_add_core_genome(db):
 97 |     added = db.add_core_genome('g1', 'AAA')
 98 |     assert added
 99 |     seq = db.connection.execute(
100 |         select([model.sequences])
101 |     ).fetchone()
102 |     assert seq.sequence == 'AAA'
103 |     mlst = db.connection.execute(
104 |         select([model.mlst])
105 |     ).fetchone()
106 |     assert mlst.souche == db.ref == 'ref'
107 |     assert mlst.gene == 'g1' and mlst.seqid == seq.id
108 | 
109 | 
def test_add_core_genome_exist_no_duplication_handle(db):
    """Re-using a core sequence under another gene name raises by default."""
    db.add_core_genome('g1', 'AAA')
    expected_error = exceptions.DuplicatedGeneSequence
    with pytest.raises(expected_error):
        db.add_core_genome('g2', 'AAA')
114 | 
115 | 
def test_add_core_genome_exist_concatenate_handle(db):
    """With CONCATENATE, a duplicated sequence merges the gene names with ';'."""
    db.add_core_genome('g1', 'AAA')
    was_added = db.add_core_genome('g2', 'AAA', DuplicationHandling.CONCATENATE)
    assert not was_added

    sequence_query = (select([model.sequences.c.id])
                      .where(model.sequences.c.sequence == 'AAA'))
    sequence_ids = db.connection.execute(sequence_query).fetchall()
    assert len(sequence_ids) == 1

    gene_rows = db.connection.execute(select([model.mlst.c.gene])).fetchall()
    assert len(gene_rows) == 1
    assert gene_rows[0].gene == 'g1;g2'
130 | 
131 | 
def test_add_core_genome_exist_remove_handle(db):
    """With REMOVE, a duplicated sequence leaves no sequence and no mlst row."""
    db.add_core_genome('g1', 'AAA')
    was_added = db.add_core_genome('g2', 'AAA', DuplicationHandling.REMOVE)
    assert not was_added

    sequence_rows = db.connection.execute(select([model.sequences])).fetchall()
    assert len(sequence_rows) == 0

    mlst_rows = db.connection.execute(select([model.mlst])).fetchall()
    assert len(mlst_rows) == 0
144 | 
145 | 
def test_add_core_genome_gene_exist(db):
    """Adding a second sequence under an existing core gene name raises."""
    db.add_core_genome('g1', 'AAA')
    expected_error = exceptions.DuplicatedGeneName
    with pytest.raises(expected_error):
        db.add_core_genome('g1', 'AAT')
150 | 
151 | 
def test_add_genome_with_invalid_gene_name(db):
    """A gene name containing ';' is rejected by ``add_core_genome``."""
    expected_error = exceptions.InvalidGeneName
    with pytest.raises(expected_error):
        db.add_core_genome('g1;', 'AAA')
155 | 
156 | 
def test_get_core_genome(db):
    """``core_genome`` lists only genes added as core, not strain genes."""
    db.add_core_genome('g1', 'AAA')
    db.add_core_genome('g2', 'TTT')
    db.add_genome('g3', 'A', 'CCC')
    expected = dict(g1='AAA', g2='TTT')
    assert db.core_genome == expected
166 | 
167 | 
def test_remove_gene(db_simple):
    """Removing ``g1`` drops its mlst rows and both of its sequences."""
    db_simple.remove_gene('g1')

    mlst_query = select([model.mlst]).where(model.mlst.c.gene == 'g1')
    assert db_simple.connection.execute(mlst_query).fetchone() is None

    removed_sequences = ['AAA', 'ATA']
    sequence_query = (select([model.sequences])
                      .where(in_(model.sequences.c.sequence, removed_sequences)))
    assert db_simple.connection.execute(sequence_query).fetchone() is None
181 | 
182 | 
def test_remove_gene_sequence_still_referenced(db_simple):
    """Sequence ``CCC`` survives the removal of ``g3`` because ``g4`` also uses it."""
    db_simple.remove_gene('g3')
    query = (select([model.sequences])
             .where(model.sequences.c.sequence == 'CCC'))
    remaining = db_simple.connection.execute(query).fetchone()
    assert remaining is not None
190 | 
191 | 
def test_remove_gene_from_core_genome_dict(db):
    """A removed gene also disappears from the ``core_genome`` mapping."""
    gene_name = 'g1'
    db.add_core_genome(gene_name, 'AAA')
    assert gene_name in db.core_genome
    db.remove_gene(gene_name)
    assert gene_name not in db.core_genome
197 | 
198 | 
def test_remove_strain(db_many):
    """Removing strain ``B`` deletes its mlst rows and its one unshared sequence."""
    db_many.remove_strain('B')

    mlst_query = select([model.mlst]).where(model.mlst.c.souche == 'B')
    assert db_many.connection.execute(mlst_query).fetchone() is None

    count_query = select([count(model.sequences.c.id)])
    remaining = db_many.connection.execute(count_query).fetchone()[0]
    # Only one sequence was exclusive to strain B, so 8 are left.
    assert remaining == 8
210 | 
211 | 
def test_remove_reference_strain_attempt(db):
    """The reference strain cannot be removed."""
    db.add_core_genome('g1', 'AAA')
    expected_error = exceptions.ReferenceStrainRemoval
    with pytest.raises(expected_error):
        db.remove_strain('ref')
216 | 
217 | 
def test_contains_souche(db_many):
    """``contains_souche`` reflects whether mlst rows exist for the strain."""
    strain = 'B'
    assert db_many.contains_souche(strain)
    delete_strain = model.mlst.delete().where(model.mlst.c.souche == strain)
    db_many.connection.execute(delete_strain)
    assert not db_many.contains_souche(strain)
224 | 
225 | 
def test_get_gene_sequences_many_strains(db_many):
    """Each returned row is ``[sequence id, strains sharing it, sequence]``."""
    expected_g1 = [
        [1, ['A', 'C'], 'AAA'],
        [7, ['B'], 'AAT'],
    ]
    assert db_many.get_gene_sequences('g1') == expected_g1

    expected_g2 = [[2, ['A', 'B'], 'ATA']]
    assert db_many.get_gene_sequences('g2') == expected_g2
235 | 
236 | 
def test_get_gene_sequences_one_strain_duplicated_gene(db_simple):
    """A gene present twice in one strain yields one row per sequence."""
    expected = [
        [1, ['A'], 'AAA'],
        [2, ['A'], 'ATA'],
    ]
    assert db_simple.get_gene_sequences('g1') == expected
242 | 
243 | 
def test_get_duplicated_genes(db_simple):
    """Only ``g1`` is reported as duplicated in ``db_simple``."""
    assert db_simple.get_duplicated_genes() == {'g1'}
247 | 
248 | 
def test_get_all_strains(db_many):
    """The strain list holds A-D and leaves out the reference strain."""
    expected = ['A', 'B', 'C', 'D']
    assert db_many.get_all_strains() == expected
252 | 
253 | 
def test_get_core_genes(db_many):
    """All five core genes are listed in order."""
    expected = ['g1', 'g2', 'g3', 'g4', 'g5']
    assert db_many.get_core_genes() == expected
257 | 
258 | 
def test_count_sequences_per_gene(db_many):
    """Check the per-gene sequence counts reported for ``db_many``."""
    expected = dict(g1=2, g2=1, g3=2, g4=3, g5=1)
    assert db_many.count_sequences_per_gene() == expected
268 | 
269 | 
def test_count_souches_per_gene(db_many):
    """Check the per-gene strain counts reported for ``db_many``."""
    expected = dict(g1=3, g2=2, g3=3, g4=4, g5=1)
    assert db_many.count_souches_per_gene() == expected
279 | 
280 | 
def test_count_genes_per_souche(db_many):
    """Check the per-strain gene counts, reference strain included."""
    genes = ['g1', 'g2', 'g3', 'g4', 'g5']
    expected = dict(A=4, B=5, C=3, D=1, ref=5)
    assert db_many.count_genes_per_souche(genes) == expected
290 | 
291 | 
def test_count_sequences(db_many):
    """``db_many`` holds nine distinct sequences in total."""
    assert db_many.count_sequences() == 9
295 | 
296 | 
def test_get_strains_distances(db_many):
    """Check the full pairwise distance matrix between strains A-D."""
    genes = ['g1', 'g2', 'g3', 'g4', 'g5']
    expected = {
        'A': dict(A=0, B=3, C=2, D=1),
        'B': dict(A=3, B=0, C=2, D=0),
        'C': dict(A=2, B=2, C=0, D=1),
        'D': dict(A=1, B=0, C=1, D=0),
    }
    assert db_many.get_strains_distances(genes) == expected
325 | 
326 | 
def test_get_mlst(db_many):
    """``get_mlst`` maps gene -> strain -> sequence id (as a string)."""
    genes = ['g1', 'g2', 'g3', 'g4', 'g5']
    expected = {
        'g1': dict(A='1', B='7', C='1'),
        'g2': dict(A='2', B='2'),
        'g3': dict(A='6', B='3', C='3'),
        'g4': dict(A='4', B='8', C='9', D='8'),
        'g5': dict(B='5'),
    }
    assert db_many.get_mlst(genes) == expected
354 | 


--------------------------------------------------------------------------------
/pymlst/common/web.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import tempfile
  3 | import logging
  4 | 
  5 | import zipfile
  6 | import requests
  7 | import questionary
  8 | 
  9 | from git import Repo
 10 | from bs4 import BeautifulSoup
 11 | from Bio import SeqIO
 12 | 
 13 | import urllib3
 14 | urllib3.disable_warnings()
 15 | 
 16 | from pymlst.common import exceptions
 17 | 
# Database listing endpoint of the PubMLST REST API (used by retrieve_mlst).
PUBMLST_URL = 'https://rest.pubmlst.org/db'
# Database listing endpoint of the Institut Pasteur BIGSdb API (used by retrieve_mlst).
PASTEUR_URL = 'https://bigsdb.pasteur.fr/api/db'
# cgmlst.org HTML page parsed by get_cgmlst_species to list cgMLST schemes.
CGMLST_URL = 'https://www.cgmlst.org/ncs'
 21 | 
 22 | 
 23 | 
 24 | def request(query):
 25 |     result = requests.get(query, timeout=600, verify=False)
 26 |     result.raise_for_status()
 27 |     return result
 28 | 
 29 | 
 30 | def display_prompt(message, choices):
 31 |     style = questionary.Style([
 32 |         ('qmark', 'fg:#673ab7 bold'),
 33 |         ('question', 'bold'),
 34 |         ('answer', 'fg:#f44336 bold'),
 35 |         ('pointer', 'fg:#673ab7 bold'),
 36 |         ('highlighted', 'fg:#673ab7 bold'),
 37 |         ('selected', 'fg:#cc5454'),
 38 |         ('separator', 'fg:#cc5454'),
 39 |         ('instruction', ''),
 40 |         ('text', ''),
 41 |     ])
 42 | 
 43 |     return questionary.select(message,
 44 |                               choices,
 45 |                               style=style) \
 46 |                       .ask()
 47 | 
 48 | 
 49 | def is_mlst_scheme(url, description):
 50 |     desc_lower = description.lower()
 51 |     blacklist = ['cgmlst', 'wgmlst', 'extended mlst']
 52 |     for word in blacklist:
 53 |         if word in desc_lower:
 54 |             return False
 55 |     scheme_json = request(url).json()
 56 |     if 'profiles_csv' not in scheme_json:
 57 |         return False
 58 |     return len(scheme_json['loci']) < 10
 59 | 
 60 | 
 61 | def process_results(choices, query, prompt):
 62 |     choices_length = len(choices)
 63 |     if choices_length == 0:
 64 |         raise exceptions.PyMLSTWebError('No result found for \'{}\'\n'.format(query))
 65 |     if choices_length == 1:
 66 |         logging.info("One element found : {}".format(choices[0]))
 67 |         return choices[0]
 68 |     if prompt:
 69 |         logging.info("{} elements found, please choose one:".format(str(len(choices))))       
 70 |         return display_prompt('({}) Results found'.format(choices_length),
 71 |                               choices)
 72 |     raise exceptions.PyMLSTWebError('More than 1 result found for \'{}\'\n'.format(query))
 73 | 
 74 | 
 75 | def get_mlst_species(query, repo_url):
 76 |     """Gets MLST species from pubmlst.org.
 77 | 
 78 |     :param query: A sub-string to filter species names.
 79 |     :param repo_url: An online repository url
 80 |     :return: A Dictionary with species name in Key and URL in Value.
 81 |     """
 82 |     try:
 83 |         whole_base = request(repo_url).json()
 84 |     except ValueError as error:
 85 |         raise exceptions.StructureError() from error
 86 | 
 87 |     species = {}
 88 |     species_all = {}
 89 |     query_low = query.lower()
 90 | 
 91 |     try:
 92 |         for record in whole_base:
 93 |             if record['name'] == 'test':
 94 |                 continue
 95 |             for database in record['databases']:
 96 |                 des = database['description'].replace('sequence/profile definitions', '').lower()
 97 |                 if database['name'].endswith('seqdef'):
 98 |                     if query_low in des:
 99 |                         species[des] = database['href']
100 |                     for sub_query in query_low.split(' '):
101 |                         if sub_query in des:
102 |                             species_all[des] = database['href']
103 |     except KeyError as error:
104 |         raise exceptions.StructureError() from error
105 |     if len(species) > 0:
106 |         return species
107 |     logging.info("No elements found for {}, search for each individual term".format(query))
108 |     return species_all
109 | 
110 | 
def get_mlst_schemes(species_url, query):
    """List the classical MLST schemes available for a species database.

    Each scheme found under ``<species_url>/schemes`` is checked with
    ``is_mlst_scheme`` and kept when *query* appears in its lower-cased
    description.

    :param species_url: The species URL (see get_mlst_species()).
    :param query: A sub-string to filter schemes names.
    :return: A Dictionary with schemes name in Key and URL in Value.
    :raises exceptions.StructureError: When an expected key is missing.
    """
    schemes_json = request(species_url + '/schemes').json()

    wanted = query.lower()
    schemes = {}

    try:
        for scheme in schemes_json['schemes']:
            if is_mlst_scheme(scheme['scheme'], scheme['description']):
                des = scheme['description'].lower()
                if wanted in des:
                    schemes[des] = scheme['scheme']
    except KeyError as error:
        raise exceptions.StructureError() from error

    return schemes
135 | 
136 | 
def retrieve_mlst(query, prompt_enabled, mlst='', repository='pubmlst'):
    """Find the URL of an MLST scheme, asking the user when allowed and needed.

    :param query: A sub-string to filter species names.
    :param prompt_enabled: Whether or not to prompt user for actions.
                           If disabled and many choices are possible,
                           will raise an Exception.
    :param mlst: A sub-string to filter schemes names. For the PASTEUR
                 repository an empty value is replaced by ``'mlst'``.
    :param repository: The online repository to use, PUBMLST or PASTEUR
                       (case-insensitive).
    :return: A scheme URL, or ``None`` when a selection returns ``None``.
    :raises exceptions.PyMLSTWebError: For an unknown repository.
    """
    repo_key = repository.upper()
    if repo_key not in ("PUBMLST", "PASTEUR"):
        raise exceptions.PyMLSTWebError("Only PUBMLST or PASTEUR repository are defined")

    if repo_key == "PUBMLST":
        species = get_mlst_species(query, PUBMLST_URL)
    else:
        species = get_mlst_species(query, PASTEUR_URL)
        if mlst == '':
            mlst = 'mlst'

    species_choice = process_results(list(species.keys()), query, prompt_enabled)
    if species_choice is None:
        return None

    schemes = get_mlst_schemes(species[species_choice], mlst)
    scheme_choice = process_results(list(schemes.keys()), mlst, prompt_enabled)
    if scheme_choice is None:
        return None
    return schemes[scheme_choice]
168 | 
169 | 
def get_cgmlst_species(query):
    """Collect the cgMLST schemes listed on cgmlst.org that match *query*.

    The links of the first ``tbody`` of the page are scanned; a link is kept
    when its text contains ``cgMLST`` and the remaining name contains
    *query* (case-insensitive).

    :param query: A sub-string to filter species names.
    :return: A Dictionary with species name in Key and download URL in Value.
    :raises exceptions.StructureError: When the table is missing or a
        matching link has no ``href``.
    """
    page = request(CGMLST_URL)
    soup = BeautifulSoup(page.content, 'html.parser')

    table = soup.find('tbody')
    if table is None:
        raise exceptions.StructureError()

    anchors = table.find_all('a')
    wanted = query.lower()
    species = {}

    for anchor in anchors:
        label = anchor.get_text()
        if 'cgMLST' in label:
            name = label.replace('cgMLST', '').strip()
            if wanted in name.lower():
                href = anchor.get('href')
                if href is None:
                    raise exceptions.StructureError()
                species[name] = href

    return species
201 | 
202 | 
def retrieve_cgmlst(query, prompt_enabled):
    """Find the download URL of a cgMLST scheme, asking the user when needed.

    :param query: A sub-string to filter species names.
    :param prompt_enabled: Whether or not to prompt user for actions.
                           If disabled and many choices are possible,
                           will raise an Exception.
    :return: A species download URL, or ``None`` when the selection
             returns ``None``.
    """
    species = get_cgmlst_species(query)
    candidates = list(species.keys())
    choice = process_results(candidates, query, prompt_enabled)
    return None if choice is None else species[choice]
220 | 
def get_cgmlst_info(url):
    """Retrieve information about a cgMLST scheme from its information page.

    The text of each child of the first ``tbody`` is inspected; the rows
    starting with ``Genus``, ``Species`` and ``Last Change`` provide the
    values, which are whatever follows the label on that row.

    :param url: The url information page
    :return: A tuple ``(genus + " " + species, version)``; a value is an
             empty string when its row is not found.
    :raises exceptions.StructureError: When the page has no ``tbody``.
    """
    page = request(url)

    soup = BeautifulSoup(page.content, 'html.parser')

    table = soup.find('tbody')
    if table is None:
        raise exceptions.StructureError()

    values = {"Genus": "", "Species": "", "Last Change": ""}
    for line in [v.get_text() for v in table.contents]:
        for label in values:
            if line.startswith(label):
                # Slice the label off: str.lstrip(label) strips a *set of
                # characters* and would also eat the start of the value
                # (e.g. "Speciesepidermidis" -> "dermidis").
                values[label] = line[len(label):]
    return (values["Genus"] + " " + values["Species"], values["Last Change"])
247 | 
248 |     
def get_cgmlst_file(url, handle):
    """Download cgMLST data and use them to initialize a fasta file.

    The alleles archive (``url + 'alleles'``) is downloaded and extracted in
    a temporary directory. For each extracted file, the first fasta record
    is written to *handle* under the file name without ``.fasta``.

    :param url: The download URL.
    :param handle: The file handle.
    :return: The names of the extracted files that could not be parsed.
    """
    with tempfile.TemporaryDirectory() as tmp_dir:
        zip_req = request(url + 'alleles')
        zip_tmp = os.path.join(tmp_dir, 'tmp.zip')
        # Close (and flush) the archive explicitly before reading it back,
        # instead of relying on garbage collection of the file object.
        with open(zip_tmp, 'wb') as zip_handle:
            zip_handle.write(zip_req.content)

        fas_tmp = os.path.join(tmp_dir, 'fas')
        os.mkdir(fas_tmp)
        with zipfile.ZipFile(zip_tmp) as z_file:
            z_file.extractall(fas_tmp)
        skipped = []
        for fasta in os.listdir(fas_tmp):
            try:
                # Only the first record of each file is used.
                record = next(SeqIO.parse(os.path.join(fas_tmp, fasta), 'fasta'))
            except (StopIteration, ValueError, TypeError):
                skipped.append(fasta)
                continue
            handle.write('> ' + fasta.replace('.fasta', '') + '\n')
            handle.write(str(record.seq) + '\n')
        return skipped
275 | 
276 | 
def clean_csv(csv_content, locus_nb):
    """Drop the surplus trailing columns from the header of a tab-separated text.

    The header is expected to hold ``locus_nb + 1`` columns; any extra
    column at the end of the first line is removed. The other lines are
    left untouched.

    :param csv_content: The whole tab-separated content.
    :param locus_nb: The number of loci of the scheme.
    :return: The content with the shortened header.
    """
    rows = csv_content.split('\n')
    columns = rows[0].split('\t')
    extra = len(columns) - (locus_nb + 1)
    if extra > 0:
        kept = columns[:-extra]
        rows[0] = '\t'.join(kept)
    return '\n'.join(rows)
284 | 
285 | 
def get_mlst_files(url, directory):
    """Download MLST data and puts them in the given directory.

    One ``<locus>.fasta`` file per locus is written in a new ``locus``
    sub-directory, and the profiles are written to ``profiles.csv``.

    :param url: The scheme URL.
    :param directory: The directory.
    :return: The ``last_added`` value of the scheme, or ``"Not found"``.
    """
    mlst_scheme = request(url).json()
    version = mlst_scheme.get('last_added', "Not found")
    logging.info("Database version : {}".format(version))

    # Downloading the locus files in a directory :
    locus_dir = os.path.join(directory, 'locus')
    os.mkdir(locus_dir)
    for loci in mlst_scheme['loci']:
        name = loci.split('/')[-1]
        loci_fasta = request(loci + '/alleles_fasta')
        loci_file_name = os.path.join(locus_dir, name + '.fasta')
        # Context managers guarantee the files are flushed and closed.
        with open(loci_file_name, 'wb') as loci_handle:
            loci_handle.write(loci_fasta.content)

    # Downloading the profiles CSV :
    profiles = request(url + '/profiles_csv')
    with open(os.path.join(directory, 'profiles.csv'), 'wt') as profiles_handle:
        profiles_handle.write(profiles.text)
    return version
310 |     
311 | 
def clone_repo(url, directory):
    """Clone a git repository and puts the content in the given directory.

    :param url: The git URL.
    :param directory: The directory.
    """
    # The returned Repo object is not needed, so it is not kept.
    Repo.clone_from(url, directory)
    logging.debug("Clone database from %s", url)
320 |     
321 | 


--------------------------------------------------------------------------------
/pymlst/wg/extractors.py:
--------------------------------------------------------------------------------
  1 | """Set of methods to extract different types of results from wgMLST"""
  2 | import abc
  3 | import importlib
  4 | import logging
  5 | import click
  6 | 
  7 | from abc import ABC
  8 | import pandas as pd
  9 | 
 10 | from pymlst.common import mafft, exceptions, utils
 11 | from pymlst.wg.core import Extractor
 12 | 
 13 | 
 14 | def read_gene_list(base, gene_file):
 15 |     core = base.get_core_genes()
 16 |     if gene_file is None:
 17 |         return core
 18 |     else:
 19 |         select = []
 20 |         for g in utils.strip_file(gene_file):
 21 |             if g in core:
 22 |                 select.append(g)
 23 |             else:
 24 |                 logging.debug("Gene {} not found in the database".format(g))
 25 |         return select
 26 | 
 27 | 
 28 | class SequenceExtractor(Extractor):
 29 |     """ Extracts coregene sequences into fasta file."""
 30 | 
 31 |     def __init__(self, file=None, reference=False):
 32 |         """    
 33 |         :param file: Path of the file containing the coregens to extract 
 34 |         """
 35 |         self.list_file = file
 36 |         self.reference = reference
 37 | 
 38 |     def extract(self, base, output):
 39 |         coregene = read_gene_list(base, self.list_file)
 40 |         logging.info("Number of gene to analyse : %s", len(coregene))
 41 |         for gene in coregene:
 42 |             if self.reference:
 43 |                 seq = base.get_gene_sequence_reference(gene)
 44 |                 output.write(">" + gene + "|reference" + "\n")
 45 |                 output.write(seq + "\n")                
 46 |             else:
 47 |                 seqs = base.get_gene_sequences(gene)
 48 |                 for seq in seqs:
 49 |                     output.write(">" + gene + "|" + str(seq[0]) + " "
 50 |                                  + ";".join(seq[1]) + "\n")
 51 |                     output.write(seq[2] + "\n")
 52 | 
 53 |                     
 54 | class MsaExtractor(Extractor):
 55 |     """ Compute Multiple Sequence Alignment (MSA) and extracts the aligned sequences. """
 56 | 
 57 |     def __init__(self, file=None, realign=False):
 58 |         """
 59 |         :param file: Path of the file containing the coregens to extract
 60 |         :param realign: Realign genes with same length 
 61 |         """
 62 |                 
 63 |         self.list_file = file
 64 |         self.realign = realign
 65 | 
 66 |     def extract(self, base, output):
 67 |         coregene = read_gene_list(base, self.list_file)
 68 |         if len(coregene) == 0:
 69 |             raise exceptions.PyMLSTError('No valid genes selected, verify your genes list')
 70 |         strains = base.get_all_strains()
 71 |         duplicated = base.get_duplicated_genes()
 72 | 
 73 |         sequences = {s: [] for s in strains}
 74 |         for index, gene in enumerate(coregene):
 75 |             if gene in duplicated:
 76 |                 logging.info("%s/%s | %s     %s", index + 1, len(coregene), gene, "No: Repeat gene")
 77 |                 continue
 78 |             seqs = base.get_gene_sequences(gene)
 79 |             size = set()
 80 |             for seq in seqs:
 81 |                 size.add(len(seq[2]))
 82 |             if len(size) == 1 and self.realign is False:
 83 |                 self.add_sequence_strain(seqs, strains, sequences)
 84 |                 logging.info("%s/%s | %s     %s", index + 1, len(coregene), gene, "Direct")
 85 |             else:
 86 |                 genes = {str(s[0]): s[2] for s in seqs}
 87 |                 corrseqs = mafft.align(genes)
 88 |                 for seq in seqs:
 89 |                     seq[2] = corrseqs.get(str(seq[0]))
 90 |                 self.add_sequence_strain(seqs, strains, sequences)
 91 |                 logging.info("%s/%s | %s     %s", index + 1, len(coregene), gene, "Align")
 92 | 
 93 |         # output align result
 94 |         for strain in strains:
 95 |             output.write('>' + strain + "\n")
 96 |             output.write("\n".join(map(str, sequences.get(strain))) + "\n")
 97 | 
 98 |     def add_sequence_strain(self, seqs, strains, sequences):
 99 |         """Add a sequence to multi-align, take the first gene in case of repetition"""
100 |         size = 0
101 |         if len(seqs) > 0:
102 |             size = len(seqs[0][2])
103 |         for strain in strains:
104 |             seq = [i[2] for i in seqs if strain in i[1]]
105 |             if len(seq) == 0:
106 |                 sequences.get(strain).append('-' * size)
107 |             elif len(seq) == 1:
108 |                 sequences.get(strain).append(seq[0])
109 |             else:
110 |                 raise exceptions.PyMLSTError(
111 |                     'Repeated genes must be excluded in order to export alignment')
112 | 
113 | 
114 |             
115 | 
class TableExtractor(Extractor):
    """Base class for extractors working on a filtered list of core genes.

    Extraction of cgMLST distance matrix, MLST profiles, Genes and Strains
    list from a wgMLST database. Subclasses implement :meth:`extract` and
    call :meth:`get_valid_shema` to obtain the genes to export.
    """

    def __init__(self,
                 mincover=0,
                 keep=False,
                 duplicate=False,
                 inverse=False):
        """Store the gene-filtering options.

        :param mincover: Minimum per-gene count (from
            ``base.count_souches_per_gene()``) required to retain a gene.
        :param keep: When ``True``, retain only genes whose count from
            ``base.count_sequences_per_gene()`` is greater than one.
        :param duplicate: When truthy, genes listed by
            ``base.get_duplicated_genes()`` are not filtered out.
        :param inverse: When anything other than ``False``, return the
            genes failing at least one filter instead of those passing all.
        """
        self.mincover = mincover
        self.keep = keep
        self.duplicate = duplicate
        self.inverse = inverse

    @abc.abstractmethod
    def extract(self, base, output):
        """Export data from ``base`` to ``output``; defined by subclasses."""

    def get_valid_shema(self, base):
        """Return the core genes selected by the configured filters.

        :param base: Database object providing strains, core genes,
            duplicated genes and the per-gene counts.
        :returns: List of genes, in the order of ``base.get_core_genes()``.
        :raises exceptions.PyMLSTError: If ``mincover`` is negative or
            larger than the number of strains.
        """
        strain_total = len(base.get_all_strains())
        if self.mincover < 0 or self.mincover > strain_total:
            raise exceptions.PyMLSTError(
                'Mincover must be between 0 and number of strains {}'.format(strain_total))

        core_genes = base.get_core_genes()
        duplicated = base.get_duplicated_genes()
        # Per-gene strain coverage, counted without duplication.
        strains_per_gene = base.count_souches_per_gene()
        # Per-gene number of distinct sequences.
        sequences_per_gene = base.count_sequences_per_gene()

        want_passing = self.inverse is False
        selected = []
        for gene in core_genes:
            varies = self.keep is not True or sequences_per_gene.get(gene, 0) > 1
            covered = strains_per_gene.get(gene, 0) >= self.mincover
            unique = bool(self.duplicate) or gene not in duplicated
            passes = varies and covered and unique
            # Normal mode keeps genes passing every test; inverse mode keeps
            # the ones failing at least one.
            if passes == want_passing:
                selected.append(gene)

        logging.info("Number of coregene used : %s/%s", len(selected), len(core_genes))
        return selected
182 | 
class TableExtractorCommand(click.core.Command):
    """Click command carrying the gene-filtering options.

    Options supported by :class:`~pymlst.wg.extractors.TableExtractor`.
    """

    def __init__(self, *args, **kwargs):
        """Build the command, then prepend the four filtering options."""
        super().__init__(*args, **kwargs)
        filter_options = (
            click.core.Option(
                ('--mincover', '-m'),
                type=click.INT,
                help='Minimum number of strains found to retain a gene (default:0)'),
            click.core.Option(
                ('--keep', '-k'),
                is_flag=True,
                help='Keeps only gene with different alleles (omit missing).'),
            click.core.Option(
                ('--duplicate', '-d'),
                is_flag=True,
                help='Keeps duplicate genes (default remove).'),
            click.core.Option(
                ('--inverse', '-V'),
                is_flag=True,
                help='Keeps only gene that do not '
                     'match the filter of mincover or keep options.'),
        )
        # Insert at positions 0..3 so these options come before any
        # parameter the command already declares.
        for position, option in enumerate(filter_options):
            self.params.insert(position, option)
200 | 
class GeneExtractor(TableExtractor):
    """Extracts a list of genes from a wgMLST database."""

    def __init__(self, **kwargs):
        """Forward all keyword options unchanged to the parent constructor."""
        super().__init__(**kwargs)

    def extract(self, base, output):
        """Write the selected genes to ``output``, sorted, one per line.

        :param base: Database object handed to ``get_valid_shema``.
        :param output: Writable object receiving the newline-joined names
            followed by a final newline.
        """
        gene_names = sorted(super().get_valid_shema(base))
        output.write("\n".join(gene_names) + "\n")
209 | 
class StatsExtractor(Extractor):
    """Extracts stats, number of strains, coregenes and sequences from a wgMLST database."""

    def extract(self, base, output):
        """Write tab-separated ``label<TAB>value`` lines describing ``base``.

        The values of ``base.get_infos()`` are paired in order with the
        labels name, source, species and version (``None`` is written as an
        empty string), followed by the core gene, strain and sequence
        counts.

        :param base: Database object to summarise.
        :param output: Writable object receiving the lines.
        """
        labels = ['name', 'source', 'species', 'version']
        for label, value in zip(labels, base.get_infos()):
            text = "" if value is None else value
            output.write(label + "\t" + text + "\n")

        core_total = len(base.get_core_genes())
        output.write("Coregenes\t" + str(core_total) + "\n")
        strain_total = len(base.get_all_strains())
        output.write("Strains\t" + str(strain_total) + "\n")
        sequence_total = base.count_sequences()
        output.write("Sequences\t" + str(sequence_total) + "\n")
221 | 
class StrainExtractor(TableExtractor):
    """Extracts a list of strains from a wgMLST database."""

    def __init__(self, count=False, **kwargs):
        """Store ``count`` and pass the remaining options to the parent.

        :param count: When not ``False``, each strain is written together
            with the value ``base.count_genes_per_souche`` reports for it.
        """
        super().__init__(**kwargs)
        self.count = count

    def extract(self, base, output):
        """Write the strains of ``base`` to ``output``.

        With ``count`` left at ``False`` the names are written one per
        line. Otherwise each line is ``strain<TAB>value``, where the value
        comes from ``base.count_genes_per_souche`` applied to the genes
        selected by ``get_valid_shema``.
        """
        if self.count is False:
            output.write("\n".join(base.get_all_strains()) + "\n")
            return

        selected_genes = super().get_valid_shema(base)
        per_strain = base.count_genes_per_souche(selected_genes)
        for strain in base.get_all_strains():
            output.write(strain + "\t" + str(per_strain.get(strain)) + "\n")
235 | 
class DistanceExtractor(TableExtractor):
    """Extracts a distance matrix from a wgMLST database."""

    def extract(self, base, output):
        """Write the strain count, then one tab-separated row per strain.

        Each row starts with the strain name, followed by its distance to
        every strain in the order of ``base.get_all_strains()``. Pairs
        missing from ``base.get_strains_distances`` are written as 0.

        :param base: Database object providing strains and distances.
        :param output: Writable object receiving the matrix.
        """
        if self.duplicate:
            logging.warning("Calculate distance between strains "
                            "using duplicate genes could reported bad result.")

        strains = base.get_all_strains()
        output.write(str(len(strains)) + "\n")

        selected_genes = super().get_valid_shema(base)
        distance = base.get_strains_distances(selected_genes)
        for source in strains:
            output.write(source + "\t")
            row = distance.get(source, {})
            cells = [str(row.get(target, 0)) for target in strains]
            output.write("\t".join(cells) + "\n")
249 | 
class MlstExtractor(TableExtractor):
    """Extracts an MLST table from a wgMLST database."""

    def __init__(self, form="default", **kwargs):
        """Store the output layout and pass other options to the parent.

        :param form: ``'grapetree'`` selects the transposed layout with
            missing values written as -1; any other value keeps one row per
            gene with missing values written as empty strings.
        """
        super().__init__(**kwargs)
        self.form = form

    def extract(self, base, output):
        """Write the MLST table of the selected genes as tab-separated text.

        :param base: Database object providing the strains and the values
            returned by ``base.get_mlst`` for the selected genes.
        :param output: Target handed to ``DataFrame.to_csv``.
        """
        valid_shema = super().get_valid_shema(base)
        strains = base.get_all_strains()
        mlst = base.get_mlst(valid_shema)
        # Start from an empty frame so the column layout is fixed even when
        # no gene is selected.
        table = pd.DataFrame(columns=["#GeneId"] + strains)
        rows = []
        for gene in valid_shema:
            row = {"#GeneId": gene}
            mlstg = mlst.get(gene, {})
            for strain in strains:
                row[strain] = mlstg.get(strain, None)
            rows.append(row)
        table = pd.concat([table, pd.DataFrame.from_dict(rows)], ignore_index=True)
        table = table.set_index('#GeneId')

        if self.form == 'grapetree':
            if self.duplicate:
                # logging.warnings does not exist; the call used to raise
                # AttributeError instead of emitting the warning.
                logging.warning("Export grapetree table " +
                                "using duplicate genes is not recommended.")
            table = table.fillna(-1)
            table = table.transpose()
        else:
            table = table.fillna("")

        table.to_csv(output, sep='\t')
281 | 


--------------------------------------------------------------------------------
/rcfile.rc:
--------------------------------------------------------------------------------
  1 | [MASTER]
  2 | 
  3 | # A comma-separated list of package or module names from where C extensions may
  4 | # be loaded. Extensions are loaded into the active Python interpreter and may
  5 | # run arbitrary code.
  6 | extension-pkg-allow-list=
  7 | 
  8 | # A comma-separated list of package or module names from where C extensions may
  9 | # be loaded. Extensions are loaded into the active Python interpreter and may
 10 | # run arbitrary code. (This is an alternative name to extension-pkg-allow-list
 11 | # for backward compatibility.)
 12 | extension-pkg-whitelist=
 13 | 
 14 | # Specify a score threshold to be exceeded before program exits with error.
 15 | fail-under=10.0
 16 | 
 17 | # Files or directories to be skipped. They should be base names, not paths.
 18 | ignore=CVS
 19 | 
 20 | # Files or directories matching the regex patterns are skipped. The regex
 21 | # matches against base names, not paths.
 22 | ignore-patterns=
 23 | 
 24 | # Python code to execute, usually for sys.path manipulation such as
 25 | # pygtk.require().
 26 | #init-hook=
 27 | 
 28 | # Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the
 29 | # number of processors available to use.
 30 | jobs=1
 31 | 
 32 | # Control the amount of potential inferred values when inferring a single
 33 | # object. This can help the performance when dealing with large functions or
 34 | # complex, nested conditions.
 35 | limit-inference-results=100
 36 | 
 37 | # List of plugins (as comma separated values of python module names) to load,
 38 | # usually to register additional checkers.
 39 | load-plugins=
 40 | 
 41 | # Pickle collected data for later comparisons.
 42 | persistent=yes
 43 | 
 44 | # When enabled, pylint would attempt to guess common misconfiguration and emit
 45 | # user-friendly hints instead of false-positive error messages.
 46 | suggestion-mode=yes
 47 | 
 48 | # Allow loading of arbitrary C extensions. Extensions are imported into the
 49 | # active Python interpreter and may run arbitrary code.
 50 | unsafe-load-any-extension=no
 51 | 
 52 | 
 53 | [MESSAGES CONTROL]
 54 | 
 55 | # Only show warnings with the listed confidence levels. Leave empty to show
 56 | # all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED.
 57 | confidence=
 58 | 
 59 | # Disable the message, report, category or checker with the given id(s). You
 60 | # can either give multiple identifiers separated by comma (,) or put this
 61 | # option multiple times (only on the command line, not in the configuration
 62 | # file where it should appear only once). You can also use "--disable=all" to
 63 | # disable everything first and then reenable specific checks. For example, if
 64 | # you want to run only the similarities checker, you can use "--disable=all
 65 | # --enable=similarities". If you want to run only the classes checker, but have
 66 | # no Warning level messages displayed, use "--disable=all --enable=classes
 67 | # --disable=W".
 68 | disable=not-context-manager
 69 | 
 70 | # Enable the message, report, category or checker with the given id(s). You can
 71 | # either give multiple identifiers separated by comma (,) or put this option
 72 | # multiple times (only on the command line, not in the configuration file where
 73 | # it should appear only once). See also the "--disable" option for examples.
 74 | enable=
 75 | 
 76 | 
 77 | [REPORTS]
 78 | 
 79 | # Python expression which should return a score less than or equal to 10. You
 80 | # have access to the variables 'error', 'warning', 'refactor', and 'convention'
 81 | # which contain the number of messages in each category, as well as 'statement'
 82 | # which is the total number of statements analyzed. This score is used by the
 83 | # global evaluation report (RP0004).
 84 | evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
 85 | 
 86 | # Template used to display messages. This is a python new-style format string
 87 | # used to format the message information. See doc for all details.
 88 | #msg-template=
 89 | 
 90 | # Set the output format. Available formats are text, parseable, colorized, json
 91 | # and msvs (visual studio). You can also give a reporter class, e.g.
 92 | # mypackage.mymodule.MyReporterClass.
 93 | output-format=text
 94 | 
 95 | # Tells whether to display a full report or only the messages.
 96 | reports=no
 97 | 
 98 | # Activate the evaluation score.
 99 | score=yes
100 | 
101 | 
102 | [REFACTORING]
103 | 
104 | # Maximum number of nested blocks for function / method body
105 | max-nested-blocks=5
106 | 
107 | # Complete name of functions that never returns. When checking for
108 | # inconsistent-return-statements if a never returning function is called then
109 | # it will be considered as an explicit return statement and no message will be
110 | # printed.
111 | never-returning-functions=sys.exit
112 | 
113 | 
114 | [FORMAT]
115 | 
116 | # Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
117 | expected-line-ending-format=
118 | 
119 | # Regexp for a line that is allowed to be longer than the limit.
120 | ignore-long-lines=^\s*(# )?<?https?://\S+>?$
121 | 
122 | # Number of spaces of indent required inside a hanging or continued line.
123 | indent-after-paren=4
124 | 
125 | # String used as indentation unit. This is usually "    " (4 spaces) or "\t" (1
126 | # tab).
127 | indent-string='    '
128 | 
129 | # Maximum number of characters on a single line.
130 | max-line-length=100
131 | 
132 | # Maximum number of lines in a module.
133 | max-module-lines=1000
134 | 
135 | # Allow the body of a class to be on the same line as the declaration if body
136 | # contains single statement.
137 | single-line-class-stmt=no
138 | 
139 | # Allow the body of an if to be on the same line as the test if there is no
140 | # else.
141 | single-line-if-stmt=no
142 | 
143 | 
144 | [SIMILARITIES]
145 | 
146 | # Ignore comments when computing similarities.
147 | ignore-comments=yes
148 | 
149 | # Ignore docstrings when computing similarities.
150 | ignore-docstrings=yes
151 | 
152 | # Ignore imports when computing similarities.
153 | ignore-imports=no
154 | 
155 | # Minimum lines number of a similarity.
156 | min-similarity-lines=4
157 | 
158 | 
159 | [LOGGING]
160 | 
161 | # The type of string formatting that logging methods do. `old` means using %
162 | # formatting, `new` is for `{}` formatting.
163 | logging-format-style=old
164 | 
165 | # Logging modules to check that the string format arguments are in logging
166 | # function parameter format.
167 | logging-modules=logging
168 | 
169 | 
170 | [BASIC]
171 | 
172 | # Naming style matching correct argument names.
173 | argument-naming-style=snake_case
174 | 
175 | # Regular expression matching correct argument names. Overrides argument-
176 | # naming-style.
177 | #argument-rgx=
178 | 
179 | # Naming style matching correct attribute names.
180 | attr-naming-style=snake_case
181 | 
182 | # Regular expression matching correct attribute names. Overrides attr-naming-
183 | # style.
184 | #attr-rgx=
185 | 
186 | # Bad variable names which should always be refused, separated by a comma.
187 | bad-names=foo,
188 |           bar,
189 |           baz,
190 |           toto,
191 |           tutu,
192 |           tata
193 | 
194 | # Bad variable names regexes, separated by a comma. If names match any regex,
195 | # they will always be refused
196 | bad-names-rgxs=
197 | 
198 | # Naming style matching correct class attribute names.
199 | class-attribute-naming-style=any
200 | 
201 | # Regular expression matching correct class attribute names. Overrides class-
202 | # attribute-naming-style.
203 | #class-attribute-rgx=
204 | 
205 | # Naming style matching correct class constant names.
206 | class-const-naming-style=UPPER_CASE
207 | 
208 | # Regular expression matching correct class constant names. Overrides class-
209 | # const-naming-style.
210 | #class-const-rgx=
211 | 
212 | # Naming style matching correct class names.
213 | class-naming-style=PascalCase
214 | 
215 | # Regular expression matching correct class names. Overrides class-naming-
216 | # style.
217 | #class-rgx=
218 | 
219 | # Naming style matching correct constant names.
220 | const-naming-style=UPPER_CASE
221 | 
222 | # Regular expression matching correct constant names. Overrides const-naming-
223 | # style.
224 | #const-rgx=
225 | 
226 | # Minimum line length for functions/classes that require docstrings, shorter
227 | # ones are exempt.
228 | docstring-min-length=-1
229 | 
230 | # Naming style matching correct function names.
231 | function-naming-style=snake_case
232 | 
233 | # Regular expression matching correct function names. Overrides function-
234 | # naming-style.
235 | #function-rgx=
236 | 
237 | # Good variable names which should always be accepted, separated by a comma.
238 | good-names=i,
239 |            j,
240 |            k,
241 |            ex,
242 |            Run,
243 |            _
244 | 
245 | # Good variable names regexes, separated by a comma. If names match any regex,
246 | # they will always be accepted
247 | good-names-rgxs=
248 | 
249 | # Include a hint for the correct naming format with invalid-name.
250 | include-naming-hint=no
251 | 
252 | # Naming style matching correct inline iteration names.
253 | inlinevar-naming-style=any
254 | 
255 | # Regular expression matching correct inline iteration names. Overrides
256 | # inlinevar-naming-style.
257 | #inlinevar-rgx=
258 | 
259 | # Naming style matching correct method names.
260 | method-naming-style=snake_case
261 | 
262 | # Regular expression matching correct method names. Overrides method-naming-
263 | # style.
264 | #method-rgx=
265 | 
266 | # Naming style matching correct module names.
267 | module-naming-style=snake_case
268 | 
269 | # Regular expression matching correct module names. Overrides module-naming-
270 | # style.
271 | #module-rgx=
272 | 
273 | # Colon-delimited sets of names that determine each other's naming style when
274 | # the name regexes allow several styles.
275 | name-group=
276 | 
277 | # Regular expression which should only match function or class names that do
278 | # not require a docstring.
279 | no-docstring-rgx=^_
280 | 
281 | # List of decorators that produce properties, such as abc.abstractproperty. Add
282 | # to this list to register other decorators that produce valid properties.
283 | # These decorators are taken in consideration only for invalid-name.
284 | property-classes=abc.abstractproperty
285 | 
286 | # Naming style matching correct variable names.
287 | variable-naming-style=snake_case
288 | 
289 | # Regular expression matching correct variable names. Overrides variable-
290 | # naming-style.
291 | #variable-rgx=
292 | 
293 | 
294 | [STRING]
295 | 
296 | # This flag controls whether inconsistent-quotes generates a warning when the
297 | # character used as a quote delimiter is used inconsistently within a module.
298 | check-quote-consistency=no
299 | 
300 | # This flag controls whether the implicit-str-concat should generate a warning
301 | # on implicit string concatenation in sequences defined over several lines.
302 | check-str-concat-over-line-jumps=no
303 | 
304 | 
305 | [TYPECHECK]
306 | 
307 | # List of decorators that produce context managers, such as
308 | # contextlib.contextmanager. Add to this list to register other decorators that
309 | # produce valid context managers.
310 | contextmanager-decorators=contextlib.contextmanager
311 | 
312 | # List of members which are set dynamically and missed by pylint inference
313 | # system, and so shouldn't trigger E1101 when accessed. Python regular
314 | # expressions are accepted.
315 | generated-members=
316 | 
317 | # Tells whether missing members accessed in mixin class should be ignored. A
318 | # mixin class is detected if its name ends with "mixin" (case insensitive).
319 | ignore-mixin-members=yes
320 | 
321 | # Tells whether to warn about missing members when the owner of the attribute
322 | # is inferred to be None.
323 | ignore-none=yes
324 | 
325 | # This flag controls whether pylint should warn about no-member and similar
326 | # checks whenever an opaque object is returned when inferring. The inference
327 | # can return multiple potential results while evaluating a Python object, but
328 | # some branches might not be evaluated, which results in partial inference. In
329 | # that case, it might be useful to still emit no-member and other checks for
330 | # the rest of the inferred objects.
331 | ignore-on-opaque-inference=yes
332 | 
333 | # List of class names for which member attributes should not be checked (useful
334 | # for classes with dynamically set attributes). This supports the use of
335 | # qualified names.
336 | ignored-classes=optparse.Values,thread._local,_thread._local
337 | 
338 | # List of module names for which member attributes should not be checked
339 | # (useful for modules/projects where namespaces are manipulated during runtime
340 | # and thus existing member attributes cannot be deduced by static analysis). It
341 | # supports qualified module names, as well as Unix pattern matching.
342 | ignored-modules=
343 | 
344 | # Show a hint with possible names when a member name was not found. The aspect
345 | # of finding the hint is based on edit distance.
346 | missing-member-hint=yes
347 | 
348 | # The minimum edit distance a name should have in order to be considered a
349 | # similar match for a missing member name.
350 | missing-member-hint-distance=1
351 | 
352 | # The total number of similar names that should be taken in consideration when
353 | # showing a hint for a missing member.
354 | missing-member-max-choices=1
355 | 
356 | # List of decorators that change the signature of a decorated function.
357 | signature-mutators=
358 | 
359 | 
360 | [MISCELLANEOUS]
361 | 
362 | # List of note tags to take in consideration, separated by a comma.
363 | notes=FIXME,
364 |       XXX,
365 |       TODO
366 | 
367 | # Regular expression of note tags to take in consideration.
368 | #notes-rgx=
369 | 
370 | 
371 | [VARIABLES]
372 | 
373 | # List of additional names supposed to be defined in builtins. Remember that
374 | # you should avoid defining new builtins when possible.
375 | additional-builtins=
376 | 
377 | # Tells whether unused global variables should be treated as a violation.
378 | allow-global-unused-variables=yes
379 | 
380 | # List of names allowed to shadow builtins
381 | allowed-redefined-builtins=
382 | 
383 | # List of strings which can identify a callback function by name. A callback
384 | # name must start or end with one of those strings.
385 | callbacks=cb_,
386 |           _cb
387 | 
388 | # A regular expression matching the name of dummy variables (i.e. expected to
389 | # not be used).
390 | dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_
391 | 
392 | # Argument names that match this expression will be ignored. Default to name
393 | # with leading underscore.
394 | ignored-argument-names=_.*|^ignored_|^unused_
395 | 
396 | # Tells whether we should check for unused import in __init__ files.
397 | init-import=no
398 | 
399 | # List of qualified module names which can have objects that can redefine
400 | # builtins.
401 | redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io
402 | 
403 | 
404 | [SPELLING]
405 | 
406 | # Limits count of emitted suggestions for spelling mistakes.
407 | max-spelling-suggestions=4
408 | 
409 | # Spelling dictionary name. Available dictionaries: none. To make it work,
410 | # install the 'python-enchant' package.
411 | spelling-dict=
412 | 
413 | # List of comma separated words that should not be checked.
414 | spelling-ignore-words=
415 | 
416 | # A path to a file that contains the private dictionary; one word per line.
417 | spelling-private-dict-file=
418 | 
419 | # Tells whether to store unknown words to the private dictionary (see the
420 | # --spelling-private-dict-file option) instead of raising a message.
421 | spelling-store-unknown-words=no
422 | 
423 | 
424 | [CLASSES]
425 | 
426 | # Warn about protected attribute access inside special methods
427 | check-protected-access-in-special-methods=no
428 | 
429 | # List of method names used to declare (i.e. assign) instance attributes.
430 | defining-attr-methods=__init__,
431 |                       __new__,
432 |                       setUp,
433 |                       __post_init__
434 | 
435 | # List of member names, which should be excluded from the protected access
436 | # warning.
437 | exclude-protected=_asdict,
438 |                   _fields,
439 |                   _replace,
440 |                   _source,
441 |                   _make
442 | 
443 | # List of valid names for the first argument in a class method.
444 | valid-classmethod-first-arg=cls
445 | 
446 | # List of valid names for the first argument in a metaclass class method.
447 | valid-metaclass-classmethod-first-arg=cls
448 | 
449 | 
450 | [IMPORTS]
451 | 
452 | # List of modules that can be imported at any level, not just the top level
453 | # one.
454 | allow-any-import-level=
455 | 
456 | # Allow wildcard imports from modules that define __all__.
457 | allow-wildcard-with-all=no
458 | 
459 | # Analyse import fallback blocks. This can be used to support both Python 2 and
460 | # 3 compatible code, which means that the block might have code that exists
461 | # only in one or another interpreter, leading to false positives when analysed.
462 | analyse-fallback-blocks=no
463 | 
464 | # Deprecated modules which should not be used, separated by a comma.
465 | deprecated-modules=optparse,tkinter.tix
466 | 
467 | # Output a graph (.gv or any supported image format) of external dependencies
468 | # to the given file (report RP0402 must not be disabled).
469 | ext-import-graph=
470 | 
471 | # Output a graph (.gv or any supported image format) of all (i.e. internal and
472 | # external) dependencies to the given file (report RP0402 must not be
473 | # disabled).
474 | import-graph=
475 | 
476 | # Output a graph (.gv or any supported image format) of internal dependencies
477 | # to the given file (report RP0402 must not be disabled).
478 | int-import-graph=
479 | 
480 | # Force import order to recognize a module as part of the standard
481 | # compatibility libraries.
482 | known-standard-library=
483 | 
484 | # Force import order to recognize a module as part of a third party library.
485 | known-third-party=enchant
486 | 
487 | # Couples of modules and preferred modules, separated by a comma.
488 | preferred-modules=
489 | 
490 | 
491 | [DESIGN]
492 | 
493 | # Maximum number of arguments for function / method.
494 | max-args=5
495 | 
496 | # Maximum number of attributes for a class (see R0902).
497 | max-attributes=7
498 | 
499 | # Maximum number of boolean expressions in an if statement (see R0916).
500 | max-bool-expr=5
501 | 
502 | # Maximum number of branches for function / method body.
503 | max-branches=12
504 | 
505 | # Maximum number of locals for function / method body.
506 | max-locals=15
507 | 
508 | # Maximum number of parents for a class (see R0901).
509 | max-parents=7
510 | 
511 | # Maximum number of public methods for a class (see R0904).
512 | max-public-methods=20
513 | 
514 | # Maximum number of return / yield for function / method body.
515 | max-returns=6
516 | 
517 | # Maximum number of statements in function / method body.
518 | max-statements=50
519 | 
520 | # Minimum number of public methods for a class (see R0903).
521 | min-public-methods=2
522 | 
523 | 
524 | [EXCEPTIONS]
525 | 
526 | # Exceptions that will emit a warning when being caught. Defaults to
527 | # "BaseException, Exception".
528 | overgeneral-exceptions=BaseException,
529 |                        Exception
530 | 


--------------------------------------------------------------------------------