├── .flake8
├── .github
│   └── workflows
│       ├── python-package.yml
│       └── python-publish.yml
├── .gitignore
├── .pre-commit-config.yaml
├── CITATION.cff
├── LICENSE
├── README.md
├── docs
│   ├── Makefile
│   ├── api.rst
│   ├── conf.py
│   ├── index.rst
│   ├── make.bat
│   ├── prefix.csv
│   └── spec.rst
├── pyproject.toml
├── src
│   └── mavehgvs
│       ├── __init__.py
│       ├── exceptions.py
│       ├── patterns
│       │   ├── __init__.py
│       │   ├── combined.py
│       │   ├── dna.py
│       │   ├── position.py
│       │   ├── protein.py
│       │   ├── rna.py
│       │   └── util.py
│       ├── position.py
│       ├── py.typed
│       ├── util.py
│       └── variant.py
└── tests
    ├── __init__.py
    ├── test_patterns
    │   ├── __init__.py
    │   ├── test_dna.py
    │   ├── test_protein.py
    │   ├── test_rna.py
    │   └── test_util.py
    ├── test_position.py
    ├── test_util.py
    └── test_variant.py
/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | extend-ignore = E203
3 | max-line-length = 88
4 | max-complexity = 10
5 |
--------------------------------------------------------------------------------
/.github/workflows/python-package.yml:
--------------------------------------------------------------------------------
1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions
2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
3 |
4 | name: Python package
5 |
6 | on:
7 | push:
8 | branches: [ "main" ]
9 | pull_request:
10 | branches: [ "main" ]
11 |
12 | jobs:
13 | build:
14 |
15 | runs-on: ubuntu-22.04
16 | strategy:
17 | fail-fast: false
18 | matrix:
19 | python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12", "3.13"]
20 |
21 | steps:
22 | - uses: actions/checkout@v3
23 | - name: Set up Python ${{ matrix.python-version }}
24 | uses: actions/setup-python@v3
25 | with:
26 | python-version: ${{ matrix.python-version }}
27 | - name: Install dependencies
28 | run: |
29 | python -m pip install --upgrade pip
30 | python -m pip install flake8 pytest
31 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
32 | - name: Install package
33 | run: |
34 | python -m pip install .
35 | - name: Lint with flake8
36 | run: |
37 | # stop the build if there are Python syntax errors or undefined names
38 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
39 | # exit-zero treats all errors as warnings so the full report is shown without failing the build
40 | flake8 . --count --exit-zero --statistics
41 | - name: Test with pytest
42 | run: |
43 | pytest
44 |
--------------------------------------------------------------------------------
/.github/workflows/python-publish.yml:
--------------------------------------------------------------------------------
1 | # This workflow will upload a Python Package using Twine when a release is created
2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries
3 |
4 | # This workflow uses actions that are not certified by GitHub.
5 | # They are provided by a third-party and are governed by
6 | # separate terms of service, privacy policy, and support
7 | # documentation.
8 |
9 | name: Upload Python Package
10 |
11 | on:
12 | release:
13 | types: [published]
14 |
15 | permissions:
16 | contents: read
17 |
18 | jobs:
19 | deploy:
20 |
21 | runs-on: ubuntu-latest
22 |
23 | steps:
24 | - uses: actions/checkout@v3
25 | - name: Set up Python
26 | uses: actions/setup-python@v3
27 | with:
28 | python-version: '3.x'
29 | - name: Install dependencies
30 | run: |
31 | python -m pip install --upgrade pip
32 | pip install hatch
33 | - name: Build package
34 | run: hatch build
35 | - name: Publish package
36 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
37 | with:
38 | user: __token__
39 | password: ${{ secrets.PYPI_MAVEHGVS }}
40 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Idea
2 | .idea/
3 |
4 | # Byte-compiled / optimized / DLL files
5 | __pycache__/
6 | *.py[cod]
7 | *$py.class
8 |
9 | # C extensions
10 | *.so
11 |
12 | # Distribution / packaging
13 | .Python
14 | build/
15 | develop-eggs/
16 | dist/
17 | downloads/
18 | eggs/
19 | .eggs/
20 | lib/
21 | lib64/
22 | parts/
23 | sdist/
24 | var/
25 | wheels/
26 | *.egg-info/
27 | .installed.cfg
28 | *.egg
29 | MANIFEST
30 |
31 | # PyInstaller
32 | # Usually these files are written by a python script from a template
33 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
34 | *.manifest
35 | *.spec
36 |
37 | # Installer logs
38 | pip-log.txt
39 | pip-delete-this-directory.txt
40 |
41 | # Unit test / coverage reports
42 | htmlcov/
43 | .tox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | .hypothesis/
51 | .pytest_cache/
52 |
53 | # Translations
54 | *.mo
55 | *.pot
56 |
57 | # Django stuff:
58 | *.log
59 | local_settings.py
60 | db.sqlite3
61 |
62 | # Flask stuff:
63 | instance/
64 | .webassets-cache
65 |
66 | # Scrapy stuff:
67 | .scrapy
68 |
69 | # Sphinx documentation
70 | docs/_build/
71 |
72 | # PyBuilder
73 | target/
74 |
75 | # Jupyter Notebook
76 | .ipynb_checkpoints
77 |
78 | # pyenv
79 | .python-version
80 |
81 | # celery beat schedule file
82 | celerybeat-schedule
83 |
84 | # SageMath parsed files
85 | *.sage.py
86 |
87 | # Environments
88 | .env
89 | .venv
90 | env/
91 | venv/
92 | ENV/
93 | env.bak/
94 | venv.bak/
95 |
96 | # Spyder project settings
97 | .spyderproject
98 | .spyproject
99 |
100 | # Rope project settings
101 | .ropeproject
102 |
103 | # mkdocs documentation
104 | /site
105 |
106 | # mypy
107 | .mypy_cache/
108 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 | - repo: https://github.com/psf/black
3 | rev: 23.1.0
4 | hooks:
5 | - id: black
6 | language_version: python3.11
7 | - repo: https://github.com/pycqa/flake8
8 | rev: 5.0.4
9 | hooks:
10 | - id: flake8
11 |
--------------------------------------------------------------------------------
/CITATION.cff:
--------------------------------------------------------------------------------
1 | cff-version: 1.2.0
2 | message: "If you use this software, please cite it as below."
3 | authors:
4 | - family-names: "Rubin"
5 | given-names: "Alan F"
6 | orcid: "https://orcid.org/0000-0003-1474-605X"
7 | title: "mavehgvs"
8 | version: 0.4.0
9 | doi: 10.5281/zenodo.5148054
10 | date-released: 2021-07-30
11 | url: "https://github.com/VariantEffect/mavehgvs"
12 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | BSD 3-Clause License
2 |
3 | Copyright (c) 2018-2023, Alan F Rubin and Daniel Esposito
4 | All rights reserved.
5 |
6 | Redistribution and use in source and binary forms, with or without
7 | modification, are permitted provided that the following conditions are met:
8 |
9 | 1. Redistributions of source code must retain the above copyright notice, this
10 | list of conditions and the following disclaimer.
11 |
12 | 2. Redistributions in binary form must reproduce the above copyright notice,
13 | this list of conditions and the following disclaimer in the documentation
14 | and/or other materials provided with the distribution.
15 |
16 | 3. Neither the name of the copyright holder nor the names of its
17 | contributors may be used to endorse or promote products derived from
18 | this software without specific prior written permission.
19 |
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [](https://travis-ci.com/VariantEffect/mavehgvs)
2 | [](https://coveralls.io/github/VariantEffect/mavehgvs?branch=main)
3 | [](https://github.com/psf/black)
4 |
5 | # mavehgvs
6 | mavehgvs is the Python reference implementation of the MAVE-HGVS variant representation standard,
7 | a strict subset of [HGVS](http://varnomen.hgvs.org/), which is used primarily for clinical genomics.
8 |
9 | ## The MAVE-HGVS Standard
10 | MAVE-HGVS is a strict subset of the [HGVS Sequence Variant Nomenclature](https://varnomen.hgvs.org/), version 20.05.
11 | HGVS nomenclature is comprehensive and very expressive, but as a consequence it includes a lot of syntax that is not
12 | needed to represent variants from Multiplexed Assays of Variant Effect (MAVE) data and that makes the variant strings
13 | more challenging to parse.
14 |
15 | While packages exist for parsing HGVS (most notably the
16 | [biocommons hgvs package](https://github.com/biocommons/hgvs/)), they are intended for use in human genetics and
17 | rely on sequence databases and a reference sequence (called the "target sequence" in MAVE-HGVS), which are not always
18 | available or relevant for multiplexed assays.
19 |
20 | MAVE-HGVS is an attempt to define an easy-to-parse subset of the HGVS nomenclature that captures those variants that
21 | occur in MAVE datasets, while excluding many variant types that are unlikely to be found. Importantly, the
22 | mavehgvs implementation does not rely on external sequence databases or identifiers.
23 |
24 | ## Supported Variants
25 | MAVE-HGVS supports DNA, RNA, and protein variants.
26 | MAVE-HGVS supports a subset of HGVS variants including:
27 |
28 | * substitutions
29 | * deletions
30 | * duplications
31 | * insertions
32 | * frame shifts
33 |
34 | Many HGVS variants are unsupported including:
35 |
36 | * inversions
37 | * conversions
38 | * extensions
39 | * changes in methylation state
40 | * RNA fusion transcripts
41 | * mosaicism
42 | * chimerism
43 | * variants with uncertain consequence
44 | * variants in trans or unknown phase
45 | * complex variants (e.g. translocations)
46 |
47 | For further details, including example variants, see the specification in the package documentation.
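
As a quick illustration of the package API, the sketch below checks a few example variant strings. It assumes that the `Variant` constructor accepts a single MAVE-HGVS string and raises `MaveHgvsParseError` on invalid input; see the API documentation for the exact interface.

```python
from mavehgvs import Variant, MaveHgvsParseError

# Minimal usage sketch (constructor arguments are assumed; see the API docs).
for variant_string in ["p.Glu27Trp", "c.122-6T>A", "p.Glu23Xaa"]:
    try:
        Variant(variant_string)  # assumed to raise MaveHgvsParseError if invalid
        print(f"{variant_string} is valid MAVE-HGVS")
    except MaveHgvsParseError as error:
        print(f"{variant_string} is invalid MAVE-HGVS: {error}")
```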
48 |
49 | # Installation
50 | Install mavehgvs from pip using:
51 |
52 | ```bash
53 | pip3 install mavehgvs
54 | ```
55 |
56 | To set up the package for development purposes, include the optional dependencies and
57 | install pre-commit:
58 |
59 |     pip3 install mavehgvs[dev]
60 |     pre-commit install
61 |
62 | # Feedback
63 | To report a problem or request a new feature with either the mavehgvs package or the MAVE-HGVS standard,
64 | please use the GitHub issue tracker.
65 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = .
9 | BUILDDIR = _build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 |
--------------------------------------------------------------------------------
/docs/api.rst:
--------------------------------------------------------------------------------
1 | .. _api-docs:
2 |
3 | mavehgvs API documentation
4 | ==========================
5 |
6 | Variant objects
7 | ---------------
8 |
9 | Each variant can be parsed into a variant object, which populates and exposes named
10 | fields for each piece of the variant string.
11 |
12 | .. automodule:: mavehgvs.position
13 | :members:
14 | :private-members:
15 | :special-members:
16 |
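As a minimal sketch based on the attributes and methods defined in :py:mod:`mavehgvs.position`, position strings can
be parsed directly into :py:class:`~mavehgvs.position.VariantPosition` objects:

.. code-block:: python

    from mavehgvs.position import VariantPosition

    # parse a 3' UTR position, an intronic position, and a protein position
    utr_pos = VariantPosition("*22")
    intron_pos = VariantPosition("122-6")
    protein_pos = VariantPosition("Cys78")

    utr_pos.is_utr()            # True
    intron_pos.is_intronic()    # True
    protein_pos.is_protein()    # True

    # positions are ordered and can be tested for adjacency
    VariantPosition("5") < VariantPosition("6")              # True
    VariantPosition("5").is_adjacent(VariantPosition("6"))   # True
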
17 | .. automodule:: mavehgvs.variant
18 | :members:
19 | :private-members:
20 | :special-members:
21 |
22 | .. automodule:: mavehgvs.exceptions
23 | :members:
24 |
25 | Utility functions for handling variants
26 | ---------------------------------------
27 |
28 | .. automodule:: mavehgvs.util
29 | :members:
30 |
31 | Utility functions for regular expression patterns
32 | -------------------------------------------------
33 |
34 | .. automodule:: mavehgvs.patterns.util
35 | :members:
36 |
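The sketch below shows the intended behaviour of these helpers on two hypothetical toy patterns (the group names
here are illustrative only; the real patterns live in the :py:mod:`mavehgvs.patterns` modules):

.. code-block:: python

    from mavehgvs.patterns.util import combine_patterns, remove_named_groups

    # hypothetical toy patterns used only for illustration
    number = r"(?P<number>(?P<digits>[0-9]+))"
    word = r"(?P<word>(?P<letters>[a-z]+))"

    combine_patterns([number, word], None)
    # '(?:(?P<number>(?P<number_digits>[0-9]+))|(?P<word>(?P<word_letters>[a-z]+)))'

    remove_named_groups(number)
    # '(?:(?:[0-9]+))'
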
37 | DNA pattern strings
38 | -------------------
39 |
40 | .. automodule:: mavehgvs.patterns.dna
41 | :members:
42 |
43 | RNA pattern strings
44 | -------------------
45 |
46 | .. automodule:: mavehgvs.patterns.rna
47 | :members:
48 |
49 | Protein pattern strings
50 | -----------------------
51 |
52 | .. automodule:: mavehgvs.patterns.protein
53 | :members:
54 |
--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
1 | # Configuration file for the Sphinx documentation builder.
2 | #
3 | # This file only contains a selection of the most common options. For a full
4 | # list see the documentation:
5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
6 |
7 | # -- Path setup --------------------------------------------------------------
8 |
9 | # If extensions (or modules to document with autodoc) are in another directory,
10 | # add these directories to sys.path here. If the directory is relative to the
11 | # documentation root, use os.path.abspath to make it absolute, like shown here.
12 | #
13 | import os
14 | import sys
15 |
16 | sys.path.insert(0, os.path.abspath("../src"))
17 |
18 | from mavehgvs import __version__ # noqa: E402
19 |
20 | # -- Project information -----------------------------------------------------
21 |
22 | project = "MAVE-HGVS"
23 | copyright = "2018-2023, Alan F Rubin and Daniel Esposito"
24 | author = "Alan F Rubin and Daniel Esposito"
25 |
26 | # The full version, including alpha/beta/rc tags
27 | release = __version__
28 |
29 |
30 | # -- General configuration ---------------------------------------------------
31 |
32 | # Add any Sphinx extension module names here, as strings. They can be
33 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
34 | # ones.
35 | extensions = [
36 | "sphinx.ext.autodoc",
37 | "sphinx.ext.napoleon",
38 | "sphinx.ext.intersphinx",
39 | "sphinx.ext.autosectionlabel",
40 | ]
41 |
42 | # Add any paths that contain templates here, relative to this directory.
43 | templates_path = ["_templates"]
44 |
45 | # List of patterns, relative to source directory, that match files and
46 | # directories to ignore when looking for source files.
47 | # This pattern also affects html_static_path and html_extra_path.
48 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
49 |
50 |
51 | # -- Options for HTML output -------------------------------------------------
52 |
53 | # The theme to use for HTML and HTML Help pages. See the documentation for
54 | # a list of builtin themes.
55 | #
56 | html_theme = "nature"
57 |
58 | # Add any paths that contain custom static files (such as style sheets) here,
59 | # relative to this directory. They are copied after the builtin static files,
60 | # so a file named "default.css" will overwrite the builtin "default.css".
61 | html_static_path = ["_static"]
62 |
63 |
64 | # -- Extension configuration -------------------------------------------------
65 | intersphinx_mapping = {"python": ("https://docs.python.org/3", None)}
66 |
--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
1 | MAVE-HGVS documentation
2 | =======================
3 |
4 | MAVE-HGVS is a strict subset of the `HGVS sequence variant nomenclature <https://varnomen.hgvs.org/>`_
5 | used by `MaveDB <https://www.mavedb.org/>`_ and related tools to represent protein and DNA variants in
6 | Multiplexed Assays of Variant Effect (MAVE) datasets.
7 |
8 | This version of MAVE-HGVS is based on HGVS version 20.05.
9 |
10 | When citing, please refer to:
11 |
12 | #. Esposito, D., Weile J., *et al.* MaveDB: an open-source platform to distribute and interpret data from multiplexed assays of variant effect. *Genome Biol* **20**, 223 (2019). https://doi.org/10.1186/s13059-019-1845-6
13 | #. den Dunnen, J. T. *et al.* HGVS Recommendations for the Description of Sequence Variants: 2016 Update. *Hum Mutat* **37**, 564–569 (2016). https://doi.org/10.1002/humu.22981
14 |
15 | .. toctree::
16 | :maxdepth: 2
17 | :caption: Contents:
18 |
19 | spec
20 | api
21 |
22 | Indices and tables
23 | ==================
24 |
25 | * :ref:`genindex`
26 | * :ref:`modindex`
27 | * :ref:`search`
28 |
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 |
13 | if "%1" == "" goto help
14 |
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | echo.
18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | echo.installed, then set the SPHINXBUILD environment variable to point
20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | echo.may add the Sphinx directory to PATH.
22 | echo.
23 | echo.If you don't have Sphinx installed, grab it from
24 | echo.http://sphinx-doc.org/
25 | exit /b 1
26 | )
27 |
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 |
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 |
34 | :end
35 | popd
36 |
--------------------------------------------------------------------------------
/docs/prefix.csv:
--------------------------------------------------------------------------------
1 | "c", "coding DNA sequence"
2 | "g", "linear genomic DNA sequence"
3 | "m", "mitochondrial genomic DNA sequence"
4 | "n", "non-coding DNA sequence"
5 | "o", "circular genomic DNA sequence"
6 | "p", "protein sequence"
7 | "r", "RNA transcript sequence"
8 |
--------------------------------------------------------------------------------
/docs/spec.rst:
--------------------------------------------------------------------------------
1 | .. _spec-docs:
2 |
3 | MAVE-HGVS specification
4 | =======================
5 |
6 | MAVE-HGVS is a strict subset of the `HGVS Sequence Variant Nomenclature <https://varnomen.hgvs.org/>`_, version 20.05.
7 | HGVS nomenclature is comprehensive and very expressive, but as a consequence it includes a lot of syntax that is not
8 | needed to represent variants from Multiplexed Assays of Variant Effect (MAVE) data and that makes the variant strings
9 | more challenging to parse.
10 |
11 | While packages exist for parsing HGVS (most notably the
12 | `biocommons hgvs package <https://github.com/biocommons/hgvs/>`_), they are intended for use in human genetics and
13 | rely on sequence databases and a reference sequence (called the "target sequence" in MAVE-HGVS), which are not always
14 | available or relevant for multiplexed assays.
15 |
16 | MAVE-HGVS is an attempt to define an easy-to-parse subset of the HGVS nomenclature that captures those variants that
17 | occur in MAVE datasets, while excluding many variant types that are unlikely to be found. Importantly, the
18 | :ref:`corresponding implementation <api-docs>` of MAVE-HGVS does not rely on external sequence databases or identifiers.
19 |
20 | Key differences between HGVS and MAVE-HGVS
21 | ------------------------------------------
22 |
23 | Standard HGVS strings have the format :code:`reference:variant` (e.g. :code:`NM_001130145.3:c.832C>T`).
24 | MAVE-HGVS strings typically include the variant portion only and the reference (target) portion is inferred from the
25 | MAVE design.
26 |
27 | Target identifiers in MAVE-HGVS are optional, and would typically be used in cases where a mix of MAVE datasets is
28 | being analyzed jointly or for experimental designs that contain multiple target sequences.
29 | Target identifiers in MAVE-HGVS can contain letters, numbers, or the underscore.
30 |
31 | MAVE-HGVS does not distinguish between variants that have been observed experimentally and the predicted consequence of
32 | observed variants.
33 | Therefore, variants that contain :code:`()` to denote predicted consequences are considered invalid with one exception
34 | (see `Substitution`_ below).
35 |
36 | MAVE-HGVS supports position numbering that is relative to a transcript (e.g. :code:`c.85+12G>A` or :code:`c.*22del`).
37 | These positions are referred to here as using the extended position notation.
38 | Variants using the extended position notation should appear alongside variants with simple (integer only) position
39 | numbers relative to the target sequence, expressed using the appropriate genomic prefix.
40 |
41 | Like HGVS, MAVE-HGVS supports alleles (called multi-variants in MAVE-HGVS) that describe multiple variants in a single
42 | variant string.
43 | Multi-variants are represented as a semicolon-separated list of valid MAVE-HGVS variants.
44 |
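For example, the multi-variant :code:`c.[1C>T;9_11del]` combines a substitution and a deletion in a single coding
variant string. A minimal sketch of checking such a string against the pattern strings provided by the package
(assuming the patterns in :py:mod:`mavehgvs.patterns.dna` compile as written) is:

.. code-block:: python

    import re

    from mavehgvs.patterns.dna import dna_multi_variant

    # a substitution and a deletion combined into one coding multi-variant
    match = re.fullmatch(dna_multi_variant, "c.[1C>T;9_11del]")
    assert match is not None
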
45 | MAVE-HGVS supports a subset of HGVS variants including:
46 |
47 | * substitutions
48 | * frame shifts
49 | * deletions
50 | * duplications
51 | * insertions
52 |
53 | Many HGVS variants are unsupported including:
54 |
55 | * inversions
56 | * extensions
57 | * changes in methylation state
58 | * RNA fusion transcripts
59 | * mosaicism
60 | * chimerism
61 | * variants with uncertain consequence
62 | * variants in trans or unknown phase
63 | * complex variants (e.g. translocations)
64 |
65 | Sequence prefixes and sequence types
66 | ------------------------------------
67 |
68 | Similarly to HGVS, a MAVE-HGVS variant begins with a single prefix character that defines the sequence type.
69 | Supported sequence types are the same as for HGVS, and are listed in the following table:
70 |
71 | .. csv-table::
72 | :file: ../docs/prefix.csv
73 | :header: "Prefix", "Description"
74 | :widths: 5, 20
75 |
76 | Typically MAVE variants are expressed relative to a coding, non-coding, or protein sequence.
77 |
78 | A notable exception is when the target sequence for the MAVE consists of both coding and non-coding sequences,
79 | such as when a full-length gene with introns is mutagenized and splice variants are assayed via saturation genome
80 | editing or other methods.
81 | In this case, it is appropriate to use one of the genomic sequence prefixes to describe changes using the contiguous
82 | region containing all mutagenized sequences as the target sequence.
83 |
84 | RNA variants are intended to be used when assaying the functional consequences to an RNA molecule,
85 | such as a tRNA or ribozyme.
86 | Variants that are measured at the DNA level should generally not use the RNA syntax.
87 |
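As a brief illustration of how the prefix determines the sequence type, the sketch below (again assuming the pattern
strings in :py:mod:`mavehgvs.patterns` compile as written) checks the same substitution written against a DNA target
and an RNA target:

.. code-block:: python

    import re

    from mavehgvs.patterns.dna import dna_single_variant
    from mavehgvs.patterns.rna import rna_single_variant

    # DNA prefixes use uppercase bases; the RNA prefix uses lowercase bases
    assert re.fullmatch(dna_single_variant, "c.48C>A") is not None
    assert re.fullmatch(rna_single_variant, "r.48c>a") is not None
    assert re.fullmatch(dna_single_variant, "r.48c>a") is None
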
88 | Equality
89 | --------
90 |
91 | MAVE-HGVS allows variants to describe equality to the target in a variety of ways.
92 |
93 | Variants describing identity to the full target sequence (e.g. :code:`c.=`) are valid and are the intended way to
94 | specify identity to the target (wild-type) sequence.
95 | This replaces the `Enrich2 `_ :code:`_wt` variant syntax.
96 |
97 | Variants that describe identity to the reference (target) at a single position (e.g. :code:`c.44=`)
98 | or range of positions (e.g. :code:`c.1_3=`) are valid for coding and genomic sequences.
99 | These should only be used for special cases, such as in MITE-seq datasets where the scores and counts are
100 | reported separately for each wild-type codon.
101 |
102 | The target-identity variants :code:`c.=` and :code:`p.=` are only valid on their own and are considered invalid as
103 | part of multi-variants.
104 | The variants that describe nucleotide identity to part of the reference are also invalid as part of multi-variants.
105 |
106 | Variants that describe identity to the target at a single amino acid position (e.g. :code:`p.Cys22=`) are valid and
107 | are the preferred way to describe specific synonymous variants.
108 |
109 | The variant :code:`p.(=)` is used when summarizing the population of variants that are synonymous at the protein level
110 | but not target identical at the DNA level.
111 | This replaces the `Enrich2 `_ :code:`_sy` variant syntax.
112 |
113 | .. warning:: Many variants currently in MaveDB use only '=' as part of multi-variants and are therefore invalid
114 | MAVE-HGVS.
115 | Additionally, some MaveDB datasets have a one-to-one relationship between nucleotide and protein multi-variants
116 | resulting in duplicate protein variants in the multi-variant.
117 | This should also be considered invalid.
118 |
119 | Examples of valid equality variants:
120 |
121 | * c.=
122 | * c.22=
123 | * c.1_3=
124 | * g.123=
125 | * p.Cys22=
126 | * p.(=)
127 |
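A short sketch of how these equality forms are matched by the package patterns (assuming the pattern strings compile
as written):

.. code-block:: python

    import re

    from mavehgvs.patterns.dna import dna_single_variant
    from mavehgvs.patterns.protein import pro_single_variant

    # target identity, positional identity, and the synonymous protein form
    assert re.fullmatch(dna_single_variant, "c.=") is not None
    assert re.fullmatch(dna_single_variant, "c.1_3=") is not None
    assert re.fullmatch(pro_single_variant, "p.Cys22=") is not None
    assert re.fullmatch(pro_single_variant, "p.(=)") is not None
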
128 | Substitution
129 | ------------
130 |
131 | .. note:: TODO: add some noncoding ('n.' variants) to the examples.
132 |
133 | MAVE-HGVS supports substitutions of a single nucleotide or amino acid.
134 |
135 | MAVE-HGVS does not support extension variants, which extend an amino acid sequence to the N- or C- terminal end
136 | (e.g. :code:`p.Met1ext-4` for gain of an upstream start or :code:`p.Ter345Lysext5` for a new downstream termination
137 | codon).
138 | Variants that remove a termination codon should be written as standard substitution variants.
139 | Variants that result in an N-terminal extension are currently undefined,
140 | as they have not been observed in the MAVE literature at the time of writing.
141 |
142 | Substitutions of more than one base at a time are covered under `Deletion-Insertion`_.
143 |
144 | Examples of valid substitutions:
145 |
146 | * g.48C>A
147 | * c.122-6T>A
148 | * c.*33G>C
149 | * p.Glu27Trp
150 | * p.Ter345Lys
151 | * r.22g>u
152 | * r.33+12a>c
153 |
154 | Examples of valid HGVS substitutions that are invalid in MAVE-HGVS:
155 |
156 | * g.48C>W
157 | * c.122=/T>A
158 | * p.(Glu27Trp)
159 | * p.*345Lys
160 | * p.Glu23Xaa
161 | * r.spl
162 |
163 | Frame Shift
164 | -----------
165 |
166 | MAVE-HGVS supports a simplified syntax to describe frame shifts in protein variants.
167 | Multi-variants that include multiple frame shifts or a second variant after a frame shift are considered invalid.
168 |
169 | Because frame shift (and the related extension) variants are uncommon in MAVE datasets, MAVE-HGVS provides this minimal support.
170 | Extension variants (removal of a termination codon) should be expressed as a frame shift at the termination codon.
171 |
172 | Examples of valid frame shift variants:
173 |
174 | * p.Glu27fs
175 | * p.Asp125fs
176 | * p.Ter385fs
177 |
178 | Examples of valid HGVS frame shift variants that are invalid in MAVE-HGVS:
179 |
180 | * p.Arg12LysfsTer18
181 | * p.Arg12Lysfs*18
182 | * p.Glu27fs*?
183 | * p.(Glu27fs)
184 |
185 | Deletion
186 | --------
187 |
188 | MAVE-HGVS supports deletions of specified nucleotides or amino acids.
189 |
190 | Deletions of an unknown number of bases or amino acids are not supported.
191 | For example, deletions where the breakpoint is not known or where the deletion extends past the end of the target
192 | cannot be represented with uncertainty.
193 | To represent a deletion of a sequence including the start or end of the target, specify the deletion exactly as if it
194 | extended to the first or last position.
195 |
196 | Examples of valid deletions:
197 |
198 | * g.44del
199 | * c.78+5_78+10del
200 | * c.1_95del
201 | * p.Gly18del
202 | * p.Gln7_Asn19del
203 | * r.34_36del
204 |
205 | Examples of valid HGVS deletions that are invalid in MAVE-HGVS:
206 |
207 | * c.(78+1_79-1)_(124+1_125-1)del
208 | * g.(?_85)_(124\_?)del
209 | * c.122=/del
210 | * p.(Gly18del)
211 | * r.=/9_12del
212 | * r.(155_185)del
213 |
214 | Duplication
215 | -----------
216 |
217 | MAVE-HGVS supports duplications of one or more nucleotides or amino acids.
218 | The syntax is the same as HGVS.
219 |
220 | Examples of valid duplications:
221 |
222 | * g.22_24dup
223 | * c.77dup
224 | * c.101+1_101+7dup
225 | * p.Pro12_Gly18dup
226 | * p.Cys5dup
227 | * r.12dup
228 |
229 | Examples of valid HGVS duplications that are invalid in MAVE-HGVS:
230 |
231 | * c.(78+1_79-1)_(124+1_125-1)dup
232 | * g.(?_85)_(124\_?)dup
233 | * c.122_125=//dup
234 | * p.(Cys5dup)
235 |
236 | Insertion
237 | ---------
238 |
239 | MAVE-HGVS supports insertions of a specified nucleotide or amino acid sequence.
240 |
241 | Insertions of a number of unspecified bases or amino acids or insertions using ambiguity characters (e.g. N or Xaa)
242 | are not supported.
243 |
244 | Insertions must be specified by listing the complete inserted sequence.
245 | Referring to the sequence that is inserted based on its position in the target sequence is not considered valid for
246 | MAVE-HGVS.
247 |
248 | To describe an insertion at the end of the target sequence, use a :ref:`Deletion-Insertion` variant that deletes
249 | the last base or amino acid in the target and inserts the deleted symbol plus the insertion.
250 |
251 | Examples of valid insertions:
252 |
253 | * g.234_235insT
254 | * c.84_85insCTG
255 | * c.99+6_99+7insA
256 | * p.His7_Gln8insSer
257 | * p.Ala12_Pro13insGlyProCys
258 | * r.22_23insauc
259 |
260 | Examples of valid HGVS insertions that are invalid in MAVE-HGVS:
261 |
262 | * c.84_85ins100_125
263 | * g.234_235ins(10)
264 | * g.234_235ins(?)
265 | * c.(122_125)insG
266 | * p.(His7_Gln8insSer)
267 | * p.(His7_Gln8insX)
268 | * p.(Ala12_Pro13ins(2))
269 | * r.(27_30)insu
270 | * r.74_74insnnn
271 |
272 | Deletion-Insertion
273 | ------------------
274 |
275 | MAVE-HGVS supports deletion-insertions of a specified nucleotide or amino acid sequence.
276 |
277 | Deletion-insertions of a number of unspecified bases or amino acids or insertions using ambiguity characters
278 | (e.g. N or Xaa) are not supported. This includes deletion-insertions with uncertain breakpoints.
279 |
280 | Examples of valid deletion-insertions:
281 |
282 | * g.22delinsAACG
283 | * c.83_85delinsT
284 | * c.43-6_595+12delinsCTT
285 | * p.Ile71_Cys80delinsSer
286 | * p.His44delinsValProGlyGlu
287 | * r.92delinsgac
288 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["hatchling"]
3 | build-backend = "hatchling.build"
4 |
5 | [project]
6 | name = "mavehgvs"
7 | dynamic = ["version"]
8 | description = "Regular expression-based validation of HGVS-style variant strings for Multiplexed Assays of Variant Effect."
9 | readme = "README.md"
10 | license = "BSD-3-Clause"
11 | requires-python = ">=3.6"
12 | authors = [
13 | { name = "Alan F Rubin", email = "alan.rubin@wehi.edu.au" },
14 | ]
15 | classifiers = [
16 | "Development Status :: 3 - Alpha",
17 | "Intended Audience :: Science/Research",
18 | "License :: OSI Approved :: BSD License",
19 | "Operating System :: OS Independent",
20 | "Programming Language :: Python :: 3",
21 | "Topic :: Scientific/Engineering :: Bio-Informatics",
22 | ]
23 | dependencies = [
24 | "fqfa>=1.2.3",
25 | ]
26 |
27 | [project.urls]
28 | repository = "https://github.com/VariantEffect/mavehgvs"
29 | documentation = "https://www.mavedb.org/docs/mavehgvs"
30 |
31 | [project.optional-dependencies]
32 | dev = [
33 | "black",
34 | "flake8",
35 | "pre-commit",
36 | "pytest",
37 | ]
38 |
39 | [tool.hatch.version]
40 | path = "src/mavehgvs/__init__.py"
41 |
42 | [tool.hatch.build.targets.wheel]
43 | packages = ["src/mavehgvs"]
44 |
45 | [tool.hatch.build.targets.sdist]
46 | exclude = [
47 | "docs/",
48 | ".github/",
49 | ]
50 |
51 | [tool.setuptools.package-data]
52 | "mavehgvs" = ["py.typed"]
53 |
--------------------------------------------------------------------------------
/src/mavehgvs/__init__.py:
--------------------------------------------------------------------------------
1 | from mavehgvs.exceptions import MaveHgvsParseError
2 | from mavehgvs.position import VariantPosition
3 | from mavehgvs.variant import Variant
4 | from mavehgvs.util import parse_variant_strings
5 |
6 | __version__ = "0.7.0"
7 |
8 | __all__ = [
9 | "__version__",
10 | "Variant",
11 | "VariantPosition",
12 | "MaveHgvsParseError",
13 | "parse_variant_strings",
14 | ]
15 |
--------------------------------------------------------------------------------
/src/mavehgvs/exceptions.py:
--------------------------------------------------------------------------------
1 | __all__ = ["MaveHgvsParseError"]
2 |
3 |
4 | class MaveHgvsParseError(Exception):
5 | """Exception to use when a MAVE-HGVS string is not valid."""
6 |
7 | pass
8 |
--------------------------------------------------------------------------------
/src/mavehgvs/patterns/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VariantEffect/mavehgvs/69476dde5391022e7c0eca32ecd1734e371436eb/src/mavehgvs/patterns/__init__.py
--------------------------------------------------------------------------------
/src/mavehgvs/patterns/combined.py:
--------------------------------------------------------------------------------
1 | from mavehgvs.patterns.dna import dna_single_variant as dsv, dna_multi_variant as dmv
2 | from mavehgvs.patterns.rna import rna_single_variant as rsv, rna_multi_variant as rmv
3 | from mavehgvs.patterns.protein import (
4 | pro_single_variant as psv,
5 | pro_multi_variant as pmv,
6 | )
7 |
8 | any_variant = (
9 | r"(?:(?P[a-zA-Z0-9_.-]+):)?"
10 | + r"(?P"
11 | + rf"(?P{r'|'.join([dsv, rsv, psv])})|"
12 | + rf"(?P{r'|'.join([dmv, rmv, pmv])})"
13 | + r")"
14 | )
15 |
--------------------------------------------------------------------------------
/src/mavehgvs/patterns/dna.py:
--------------------------------------------------------------------------------
1 | from fqfa.constants import DNA_BASES
2 | from mavehgvs.patterns.util import combine_patterns, remove_named_groups
3 | from mavehgvs.patterns.position import pos, pos_intron, pos_intron_utr
4 |
5 | dna_nt: str = rf"[{''.join(DNA_BASES)}]"
6 | """str: Pattern matching any uppercase DNA base.
7 |
8 | This does not include IUPAC ambiguity characters.
9 | """
10 |
11 | dna_equal_c: str = (
12 | rf"(?P(?:(?:(?P{pos_intron_utr})_(?P{pos_intron_utr}))|"
13 | + rf"(?P{pos_intron_utr}))?(?P=))"
14 | )
15 | """str: Pattern matching DNA equality with numeric, intronic, or UTR positions.
16 | """
17 |
18 | dna_sub_c: str = (
19 | rf"(?P(?P{pos_intron_utr})(?P[{dna_nt})>(?P{dna_nt}))"
20 | )
21 | """str: Pattern matching a DNA substitution with numeric, intronic, or UTR positions.
22 | """
23 |
24 | dna_del_c: str = (
25 | rf"(?P(?:(?:(?P{pos_intron_utr})_(?P{pos_intron_utr}))|"
26 | + rf"(?P{pos_intron_utr}))del)"
27 | )
28 | """str: Pattern matching a DNA deletion with numeric, intronic, or UTR positions.
29 | """
30 |
31 | dna_dup_c: str = (
32 | rf"(?P(?:(?:(?P{pos_intron_utr})_"
33 | + rf"(?P{pos_intron_utr}))|(?P{pos_intron_utr}))dup)"
34 | )
35 | """str: Pattern matching a DNA duplication with numeric, intronic, or UTR positions.
36 | """
37 |
38 | dna_ins_c: str = (
39 | rf"(?P(?P{pos_intron_utr})_"
40 | + rf"(?P{pos_intron_utr})ins(?P{dna_nt}+))"
41 | )
42 | """str: Pattern matching a DNA insertion with numeric, intronic, or UTR positions.
43 | """
44 |
45 | dna_delins_c: str = (
46 | rf"(?P(?:(?:(?P{pos_intron_utr})_"
47 | + rf"(?P{pos_intron_utr}))|(?P{pos_intron_utr}))"
48 | + rf"delins(?P{dna_nt}+))"
49 | )
50 | """str: Pattern matching a DNA deletion-insertion with numeric, intronic, or UTR
51 | positions.
52 | """
53 |
54 | dna_equal_n: str = r"(?P<dna_equal_n>(?P<equal>=))"
55 | """str: Pattern matching DNA equality with no position support.
56 | """
57 |
58 | dna_sub_n: str = dna_sub_c.replace(pos_intron_utr, pos_intron).replace(
59 | "(?P", "(?P"
60 | )
61 | """str: Pattern matching a DNA substitution with numeric or intron positions for
62 | non-coding variants.
63 | """
64 |
65 | dna_del_n: str = dna_del_c.replace(pos_intron_utr, pos_intron).replace(
66 | "(?P", "(?P"
67 | )
68 | """str: Pattern matching a DNA deletion with numeric or intron positions for non-coding
69 | variants.
70 | """
71 |
72 | dna_dup_n: str = dna_dup_c.replace(pos_intron_utr, pos_intron).replace(
73 | "(?P", "(?P"
74 | )
75 | """str: Pattern matching a DNA duplication with numeric or intron positions for
76 | non-coding variants.
77 | """
78 |
79 | dna_ins_n: str = dna_ins_c.replace(pos_intron_utr, pos_intron).replace(
80 | "(?P", "(?P"
81 | )
82 | """str: Pattern matching a DNA insertion with numeric or intron positions for non-coding
83 | variants.
84 | """
85 |
86 | dna_delins_n: str = dna_delins_c.replace(pos_intron_utr, pos_intron).replace(
87 | "(?P", "(?P"
88 | )
89 | """str: Pattern matching a DNA deletion-insertion with numeric or intron positions for
90 | non-coding variants.
91 | """
92 |
93 | dna_equal_gmo: str = dna_equal_c.replace(pos_intron_utr, pos).replace(
94 | "(?P", "(?P"
95 | )
96 | """str: Pattern matching a DNA substitution with only numeric positions for
97 | genomic-style variants.
98 | """
99 |
100 | dna_sub_gmo: str = dna_sub_c.replace(pos_intron_utr, pos).replace(
101 | "(?P", "(?P"
102 | )
103 | """str: Pattern matching a DNA substitution with only numeric positions for
104 | genomic-style variants.
105 | """
106 |
107 | dna_del_gmo: str = dna_del_c.replace(pos_intron_utr, pos).replace(
108 | "(?P", "(?P"
109 | )
110 | """str: Pattern matching a DNA deletion with only numeric positions for genomic-style
111 | variants.
112 | """
113 |
114 | dna_dup_gmo: str = dna_dup_c.replace(pos_intron_utr, pos).replace(
115 | "(?P", "(?P"
116 | )
117 | """str: Pattern matching a DNA duplication with only numeric positions for genomic-style
118 | variants.
119 | """
120 |
121 | dna_ins_gmo: str = dna_ins_c.replace(pos_intron_utr, pos).replace(
122 | "(?P", "(?P"
123 | )
124 | """str: Pattern matching a DNA insertion with only numeric positions for genomic-style
125 | variants.
126 | """
127 |
128 | dna_delins_gmo: str = dna_delins_c.replace(pos_intron_utr, pos).replace(
129 | "(?P", "(?P"
130 | )
131 | """str: Pattern matching a DNA deletion-insertion with only numeric positions for
132 | genomic-style variants.
133 | """
134 |
135 | dna_variant_c: str = combine_patterns(
136 | [dna_equal_c, dna_sub_c, dna_del_c, dna_dup_c, dna_ins_c, dna_delins_c], None
137 | )
138 | """str: Pattern matching any of the coding DNA variants.
139 | """
140 |
141 | dna_variant_n: str = combine_patterns(
142 | [dna_equal_n, dna_sub_n, dna_del_n, dna_dup_n, dna_ins_n, dna_delins_n], None
143 | )
144 | """str: Pattern matching any of the non-coding DNA variants.
145 | """
146 |
147 | dna_variant_gmo: str = combine_patterns(
148 | [dna_equal_gmo, dna_sub_gmo, dna_del_gmo, dna_dup_gmo, dna_ins_gmo, dna_delins_gmo],
149 | None,
150 | )
151 | """str: Pattern matching any of the genomic-style DNA variants.
152 | """
153 |
154 | dna_single_variant: str = (
155 | rf"(?Pc\.{dna_variant_c})|"
156 | + rf"(?Pn\.{dna_variant_n})|"
157 | + rf"(?P[gmo]\.{dna_variant_gmo})"
158 | )
159 | """str: Pattern matching any complete single DNA variant, including the prefix
160 | character.
161 | """
162 |
163 | dna_multi_variant: str = (
164 | r"(?Pc\."
165 | + rf"\[{remove_named_groups(dna_variant_c)}"
166 | + rf"(?:;{remove_named_groups(dna_variant_c)}){{1,}}\])|"
167 | + r"(?Pn\."
168 | + rf"\[{remove_named_groups(dna_variant_n)}"
169 | + rf"(?:;{remove_named_groups(dna_variant_n)}){{1,}}\])|"
170 | + r"(?P[gmo]\."
171 | + rf"\[{remove_named_groups(dna_variant_gmo)}"
172 | + rf"(?:;{remove_named_groups(dna_variant_gmo)}){{1,}}\])"
173 | )
174 | """str: Pattern matching any complete DNA multi-variant, including the prefix character.
175 |
176 | Named capture groups have been removed from the variant patterns because of
177 | non-uniqueness.
178 | Another application of the single-variant regular expressions is needed to recover the
179 | named groups from each individual variant in the multi-variant.
180 | """
181 |
--------------------------------------------------------------------------------
/src/mavehgvs/patterns/position.py:
--------------------------------------------------------------------------------
1 | pos: str = r"[1-9][0-9]*"
2 | """str: Pattern matching a positive integer not starting with 0.
3 |
4 | This pattern is used for sequence positions, as position 0 does not exist.
5 | """
6 |
7 | pos_intron: str = rf"{pos}(?:[+-]{pos})?"
8 | """str: Pattern matching a position with optional intron component.
9 |
10 | This pattern is used for sequence positions in an RNA or noncoding sequence.
11 | """
12 |
13 | pos_intron_utr: str = rf"[*-]?{pos}(?:[+-]{pos})?"
14 | """str: Pattern matching a position with optional intron and UTR components.
15 |
16 | This pattern is used for sequence positions in a coding sequence.
17 | """
18 |
--------------------------------------------------------------------------------
/src/mavehgvs/patterns/protein.py:
--------------------------------------------------------------------------------
1 | from fqfa.constants import AA_CODES
2 | from mavehgvs.patterns.util import combine_patterns, remove_named_groups
3 | from mavehgvs.patterns.position import pos
4 |
5 | amino_acid: str = rf"(?:{'|'.join(AA_CODES.values())})"
6 | """str: Pattern matching any amino acid or Ter.
7 |
8 | This does not include ambiguous amino acids such as Glx and Xaa.
9 | """
10 |
11 | aa_pos: str = rf"(?:{amino_acid}{pos})"
12 | """str: Pattern matching an amino acid code followed by a position.
13 | """
14 |
15 | pro_equal: str = (
16 | rf"(?P(?:(?P{aa_pos})?(?P=))|(?P\(=\)))"
17 | )
18 | """str: Pattern matching protein equality or synonymous variant.
19 | """
20 |
21 | pro_sub: str = rf"(?P(?P{aa_pos})(?P{amino_acid}))"
22 | """str: Pattern matching a protein substitution.
23 | """
24 |
25 | pro_fs: str = rf"(?P<pro_fs>(?P<position>{aa_pos})fs)"
26 | """str: Pattern matching a protein frame shift.
27 | """
28 |
29 | pro_del: str = (
30 | rf"(?P(?:(?P{aa_pos})_(?P{aa_pos})del)|"
31 | + rf"(?:(?P{aa_pos})del))"
32 | )
33 | """str: Pattern matching a protein deletion.
34 | """
35 |
36 | pro_dup: str = (
37 | rf"(?P(?:(?P{aa_pos})_(?P{aa_pos})dup)|"
38 | + rf"(?:(?P{aa_pos})dup))"
39 | )
40 | """str: Pattern matching a protein duplication.
41 | """
42 |
43 | pro_ins: str = (
44 | rf"(?P(?P{aa_pos})_(?P{aa_pos})ins(?P{amino_acid}+))"
45 | )
46 | """str: Pattern matching a protein insertion.
47 | """
48 |
49 | pro_delins: str = (
50 | rf"(?P(?:(?:(?P{aa_pos})_(?P{aa_pos}))|"
51 | + rf"(?P{aa_pos}))delins(?P{amino_acid}+))"
52 | )
53 | """str: Pattern matching a protein deletion-insertion.
54 | """
55 |
56 | pro_variant: str = combine_patterns(
57 | [pro_equal, pro_sub, pro_fs, pro_del, pro_dup, pro_ins, pro_delins], None
58 | )
59 | """str: Pattern matching any single protein variant event.
60 | """
61 |
62 | pro_single_variant: str = rf"(?P<pro>p\.{pro_variant})"
63 | """str: Pattern matching any complete protein variant, including the prefix character.
64 | """
65 |
66 | pro_multi_variant: str = (
67 | rf"(?Pp\.\[{remove_named_groups(pro_variant)}"
68 | + rf"(?:;{remove_named_groups(pro_variant)}){{1,}}\])"
69 | )
70 |
71 | """str: Pattern matching any complete protein multi-variant, including the prefix
72 | character.
73 |
74 | Named capture groups have been removed from the variant patterns because of
75 | non-uniqueness.
76 | Another application of the single-variant regular expressions is needed to recover the
77 | named groups from each individual variant in the multi-variant.
78 | """
79 |
--------------------------------------------------------------------------------
/src/mavehgvs/patterns/rna.py:
--------------------------------------------------------------------------------
1 | from fqfa.constants import RNA_BASES
2 | from mavehgvs.patterns.util import combine_patterns, remove_named_groups
3 | from mavehgvs.patterns.position import pos_intron
4 |
5 | rna_nt: str = rf"[{''.join(RNA_BASES).lower()}]"
6 | """str: Pattern matching any lowercase RNA base.
7 |
8 | This does not include IUPAC ambiguity characters.
9 | """
10 |
11 | rna_equal: str = (
12 | rf"(?P(?:(?:(?P{pos_intron})_"
13 | + rf"(?P{pos_intron}))|(?P{pos_intron}))?(?P=))"
14 | )
15 | """str: Pattern matching RNA equality with numeric or relative-to-transcript positions.
16 | """
17 |
18 | rna_sub: str = (
19 | rf"(?P(?P{pos_intron})(?P][{rna_nt})>(?P{rna_nt}))"
20 | )
21 | """str: Pattern matching a RNA substitution with numeric or relative-to-transcript
22 | positions.
23 | """
24 |
25 | rna_del: str = (
26 | rf"(?P(?:(?:(?P{pos_intron})_(?P{pos_intron}))|"
27 | + rf"(?P{pos_intron}))del)"
28 | )
29 | """str: Pattern matching a RNA deletion with numeric or relative-to-transcript
30 | positions.
31 | """
32 |
33 | rna_dup: str = (
34 | rf"(?P(?:(?:(?P{pos_intron})_(?P{pos_intron}))|"
35 | + rf"(?P{pos_intron}))dup)"
36 | )
37 | """str: Pattern matching a RNA duplication with numeric or relative-to-transcript
38 | positions.
39 | """
40 |
41 | rna_ins: str = (
42 | rf"(?P(?P{pos_intron})_(?P{pos_intron})ins(?P{rna_nt}+))"
43 | )
44 | """str: Pattern matching a RNA insertion with numeric or relative-to-transcript
45 | positions.
46 | """
47 |
48 | rna_delins: str = (
49 | rf"(?P(?:(?:(?P{pos_intron})_(?P{pos_intron}))|"
50 | + rf"(?P{pos_intron}))delins(?P{rna_nt}+))"
51 | )
52 | """str: Pattern matching a RNA deletion-insertion with numeric or relative-to-transcript
53 | positions.
54 | """
55 |
56 | rna_variant: str = combine_patterns(
57 | [rna_equal, rna_sub, rna_del, rna_dup, rna_ins, rna_delins], None
58 | )
59 | """str: Pattern matching any single RNA variant event.
60 | """
61 |
62 | rna_single_variant: str = rf"(?P<rna>r\.{rna_variant})"
63 | """str: Pattern matching any complete RNA variant, including the prefix character.
64 | """
65 |
66 | rna_multi_variant: str = (
67 | rf"(?Pr\.\[{remove_named_groups(rna_variant)}"
68 | + rf"(?:;{remove_named_groups(rna_variant)}){{1,}}\])"
69 | )
70 | """str: Pattern matching any complete RNA multi-variant, including the prefix character.
71 |
72 | Named capture groups have been removed from the variant patterns because of
73 | non-uniqueness.
74 | Another application of the single-variant regular expressions is needed to recover the
75 | named groups from each individual variant in the multi-variant.
76 | """
77 |
--------------------------------------------------------------------------------
/src/mavehgvs/patterns/util.py:
--------------------------------------------------------------------------------
1 | """Utility functions for working with mavehgvs regex pattern strings.
2 | """
3 |
4 | import re
5 | from typing import Sequence, Optional
6 |
7 |
8 | def combine_patterns(patterns: Sequence[str], groupname: Optional[str] = None) -> str:
9 | """Combine multiple pattern strings into a single pattern string.
10 |
11 | Because multiple identical group names are not allowed in a pattern, this function
12 | renames the nested named match groups so that they are prefixed with the first match
13 | group name in the pattern. For example,
14 | ``(?P<substitution>(?P<position>[1-9][0-9]*)...`` becomes
15 | ``(?P<substitution>(?P<substitution_position>[1-9][0-9]*)...``.
16 |
17 | The function assumes that all input patterns are enclosed in parentheses.
18 |
19 | Parameters
20 | ----------
21 | patterns : Sequence[str]
22 | Sequence of pattern strings to combine.
23 |
24 | groupname : Optional[str]
25 | Name for the capture group surrounding the resulting pattern. If this is None, a
26 | non-capturing group will be used instead.
27 |
28 | Returns
29 | -------
30 | str
31 | Pattern string that matches any of the input patterns. Match groups are renamed
32 | as described above to attempt to ensure uniqueness across the combined pattern.
33 |
34 | """
35 | tag_re = re.compile(r"\(\?P<(\w+)>")
36 | stripped_patterns = list()
37 | for p in patterns:
38 | tags = list(tag_re.finditer(p))
39 | prefix = f"{tags[0].group(1)}_"
40 | new_p = p
41 | for t in tags[:0:-1]:
42 | start, end = t.span(1)
43 | new_p = "".join((new_p[:start], prefix, new_p[start:]))
44 | stripped_patterns.append(new_p)
45 | if groupname is None:
46 | combined = rf"(?:{r'|'.join(stripped_patterns)})"
47 | else:
48 | combined = rf"(?P<{groupname}>{r'|'.join(stripped_patterns)})"
49 |
50 | return combined
51 |
52 |
53 | def remove_named_groups(pattern: str, noncapturing: bool = True) -> str:
54 | """Function that replaces named match groups in a regular expression pattern.
55 |
56 | Named groups are replaced with either regular parentheses or non-capturing
57 | parentheses.
58 |
59 | Parameters
60 | ----------
61 | pattern : str
62 | The pattern string to strip match groups from.
63 |
64 | noncapturing : bool
65 | If True, the named grouping parentheses are replaced by non-capturing
66 | parentheses.
67 | If False, regular parentheses are used.
68 |
69 | Returns
70 | -------
71 | str
72 | The pattern string without named match groups.
73 |
74 | """
75 | if noncapturing:
76 | new_parens = "(?:"
77 | else:
78 | new_parens = "("
79 |
80 | return re.sub(r"\(\?P<\w+>", new_parens, pattern)
81 |
--------------------------------------------------------------------------------
/src/mavehgvs/position.py:
--------------------------------------------------------------------------------
1 | import re
2 | from functools import total_ordering
3 |
4 | from mavehgvs.exceptions import MaveHgvsParseError
5 | from mavehgvs.patterns.position import pos
6 | from mavehgvs.patterns.protein import amino_acid
7 |
8 | __all__ = ["VariantPosition"]
9 |
10 | pos_with_groups: str = (
11 | rf"(?P{amino_acid})?(?P[*-]?{pos})"
12 | + rf"(?P[+-]{pos})?"
13 | )
14 | """str: Pattern matching a position with match groups for parsing into a
15 | :py:class:`VariantPosition`.
16 | """
17 |
18 |
19 | @total_ordering
20 | class VariantPosition:
21 | """Class for storing a variant position.
22 |
23 | The class includes special fields for variants using the extended position syntax.
24 | Attributes
25 | ----------
26 | position : Optional[int]
27 | The position as an integer.
28 | Negative positions are only expected for 5' UTR positions.
29 | amino_acid : Optional[str]
30 | The amino acid at this position for protein variants.
31 | intronic_position : Optional[int]
32 | The number of bases into the intron for intronic positions.
33 | None for non-intronic positions.
34 |
35 | Nucleotides in the 5' half of the intron have positive ``intronic_position`` and
36 | their position is that of the last base of the 5' exon.
37 | Nucleotides in the 3' half of the intron have negative ``intronic_position`` and
38 | their position is that of the first base of the 3' exon.
39 | utr : Optional[bool]
40 | True if the position is in the UTR. None for all other positions.
41 |
42 | """
43 |
44 | fullmatch = re.compile(pos_with_groups, flags=re.ASCII).fullmatch
45 | """Callable[[str, int, int], Optional[Match[str]]]: fullmatch callable for parsing
46 | positions
47 |
48 | Returns an :py:obj:`re.Match` object if the full string matches one of the position
49 | groups in :py:data:`pos_with_groups`.
50 | """
51 |
52 | def __init__(self, pos_str: str) -> None:
53 | """Parse a position string into a VariantPosition object.
54 |
55 | Parameters
56 | ----------
57 | pos_str : str
58 | The string to convert to a VariantPosition object.
59 |
60 | """
61 | try:
62 | gdict = VariantPosition.fullmatch(pos_str).groupdict()
63 | except AttributeError:
64 | raise MaveHgvsParseError(f"invalid variant position string '{pos_str}'")
65 |
66 | self.position = None
67 | self.amino_acid = None
68 | self.intronic_position = None
69 | self.utr = None
70 |
71 | if gdict["position"].startswith("*"): # 3' UTR position
72 | self.utr = True
73 | self.position = int(gdict["position"][1:])
74 | else:
75 | if gdict["position"].startswith("-"): # 5' UTR position
76 | self.utr = True
77 | self.position = int(gdict["position"])
78 |
79 | if gdict["position_aa"] is not None:
80 | self.amino_acid = gdict["position_aa"]
81 |
82 | if gdict["position_intron"] is not None:
83 | self.intronic_position = int(gdict["position_intron"])
84 |
85 | if self.amino_acid is not None and (
86 | self.intronic_position is not None or self.utr is not None
87 | ):
88 | raise MaveHgvsParseError("invalid variant")
89 |
90 | def __repr__(self) -> str:
91 | """The object representation is equivalent to the input string.
92 |
93 | Returns
94 | -------
95 | str
96 | The object representation.
97 |
98 | """
99 | if self.utr and self.position > 0:
100 | p = f"*{self.position}"
101 | else:
102 | p = f"{self.position}"
103 |
104 | if self.intronic_position is not None:
105 | if self.intronic_position > 0:
106 | return f"{p}+{self.intronic_position}"
107 | else:
108 | return f"{p}{self.intronic_position}"
109 | elif self.amino_acid is not None:
110 | return f"{self.amino_acid}{p}"
111 | else:
112 | return p
113 |
114 | def __lt__(self, other: "VariantPosition") -> bool:
115 | """Less than comparison operator.
116 |
117 | Other comparison operators will be filled in using
118 | :py:func:`functools.total_ordering`.
119 |
120 | Parameters
121 | ----------
122 | other : VariantPosition
123 | The other VariantPosition to compare to.
124 |
125 | Returns
126 | -------
127 | bool
128 | True if this position evaluates as strictly less than the other position;
129 | else False.
130 |
131 | """
132 | if self.utr == other.utr:
133 | if self.position == other.position:
134 | if (
135 | self.intronic_position == other.intronic_position
136 | ): # pragma: no cover
137 | # this case is covered by __eq__
138 | return False
139 | elif self.intronic_position is None:
140 | return other.intronic_position > 0
141 | elif other.intronic_position is None:
142 | return self.intronic_position < 0
143 | else:
144 | return self.intronic_position < other.intronic_position
145 | else:
146 | return self.position < other.position
147 | else: # 5' < non-UTR < 3'
148 | if self.utr:
149 | if self.position < 0: # self is in 5' UTR
150 | return True
151 | else: # self is in 3' UTR
152 | return False
153 | else:
154 | if other.position < 0: # other is in 5' UTR
155 | return False
156 | else: # other is in 3' UTR
157 | return True
158 |
159 | def __eq__(self, other: "VariantPosition") -> bool:
160 | """Equality comparison operator.
161 |
162 | Note that the amino acid portion of a protein position is not used in this
163 | comparison.
164 |
165 | Other comparison operators will be filled in using
166 | :py:func:`functools.total_ordering`.
167 |
168 | Parameters
169 | ----------
170 | other : VariantPosition
171 | The other VariantPosition to compare to.
172 |
173 | Returns
174 | -------
175 | bool
176 | True if this position is the same as the other position; else False.
177 |
178 | """
179 | return (self.position, self.intronic_position, self.utr) == (
180 | other.position,
181 | other.intronic_position,
182 | other.utr,
183 | )
184 |
185 | def __ne__(self, other: "VariantPosition") -> bool:
186 | """Not equal comparison operator.
187 |
188 | Note that the amino acid portion of a protein position is not used in this
189 | comparison.
190 |
191 | Other comparison operators will be filled in using
192 | :py:func:`functools.total_ordering`.
193 |
194 | Parameters
195 | ----------
196 | other : VariantPosition
197 | The other VariantPosition to compare to.
198 |
199 | Returns
200 | -------
201 | bool
202 | True if this position is not the same as the other position; else False.
203 |
204 | """
205 | return (self.position, self.intronic_position, self.utr) != (
206 | other.position,
207 | other.intronic_position,
208 | other.utr,
209 | )
210 |
211 | def is_utr(self) -> bool:
212 | """Return whether this is a UTR position.
213 |
214 | Returns
215 | -------
216 | bool
217 | True if the object describes a position in the UTR; else False.
218 |
219 | """
220 | return self.utr is not None
221 |
222 | def is_intronic(self) -> bool:
223 | """Return whether this is an intronic position.
224 |
225 | Returns
226 | -------
227 | bool
228 | True if the object describes a position in an intron; else False.
229 |
230 | """
231 | return self.intronic_position is not None
232 |
233 | def is_protein(self) -> bool:
234 |         """Return whether this is a protein position.
235 |
236 | Returns
237 | -------
238 | bool
239 | True if the object describes a position with an amino acid component; else
240 | False.
241 | """
242 | return self.amino_acid is not None
243 |
244 | def is_extended(self) -> bool:
245 | """Return whether this position was described using the extended syntax.
246 |
247 | Returns
248 | -------
249 | bool
250 | True if the position was described using the extended syntax; else False.
251 |
252 | """
253 | return self.utr is not None or self.intronic_position is not None
254 |
255 | # string annotation in the type hint below is required for Python 3.6 compatibility
256 | def is_adjacent(self, other: "VariantPosition") -> bool:
257 | """Return whether this variant and another are immediately adjacent in sequence
258 | space.
259 |
260 | The following special cases are not handled correctly:
261 |
262 | * The special case involving the last variant in a transcript sequence and the
263 | first base in the 3' UTR will be evaluated as not adjacent, as the object does
264 | not have sequence length information.
265 | * The special case involving the two middle bases in an intron where the
266 | numbering switches from positive with respect to the 5' end of the intron to
267 | negative with respect to the 3' end of the intron will be evaluated as not
268 | adjacent, as the object does not have intron length information.
269 |         * The special case where an intron lies between the last base of the 5' UTR
270 |           and the first base of the coding sequence is ignored, as it is not believed
271 |           to be biologically relevant.
272 |
273 | Parameters
274 | ----------
275 | other : VariantPosition
276 | The object to calculate adjacency to.
277 |
278 | Returns
279 | -------
280 | bool
281 | True if the positions describe adjacent bases in sequence space; else False.
282 |
283 | """
284 | if self.utr == other.utr:
285 | if self.intronic_position is None and other.intronic_position is None:
286 | return abs(self.position - other.position) == 1
287 | elif (
288 | self.position == other.position
289 | ): # intronic positions can only be adjacent if relative to the same base
290 | if (
291 | self.intronic_position is not None
292 | and other.intronic_position is not None
293 | ):
294 | return abs(self.intronic_position - other.intronic_position) == 1
295 | else:
296 | # special case for first/last base of intron and
297 | # corresponding first/last base of exon
298 | return (
299 | self.intronic_position == -1
300 | or self.intronic_position == 1
301 | or other.intronic_position == -1
302 | or other.intronic_position == 1
303 | )
304 | else:
305 | return False
306 | else: # special case for last base of 5' utr and first base of non-UTR sequence
307 | return (self.position == -1 and other.position == 1) or (
308 | other.position == -1 and self.position == 1
309 | )
310 |
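# A minimal usage sketch of the comparison and adjacency helpers above, assuming the
# VariantPosition constructor defined earlier in this module accepts MAVE-HGVS
# position strings such as "12", "-27", "*24", "122-6", and "122+6".
if __name__ == "__main__":
    # 5' UTR positions sort before coding positions, which sort before 3' UTR positions
    assert VariantPosition("-27") < VariantPosition("12") < VariantPosition("*24")
    # intronic offsets sort relative to the exonic base they are anchored to
    assert VariantPosition("122-6") < VariantPosition("122") < VariantPosition("122+6")
    # adjacency covers consecutive exonic bases and the first/last base of an intron
    assert VariantPosition("5").is_adjacent(VariantPosition("6"))
    assert VariantPosition("122-1").is_adjacent(VariantPosition("122"))
    assert not VariantPosition("5").is_adjacent(VariantPosition("7"))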
--------------------------------------------------------------------------------
/src/mavehgvs/py.typed:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VariantEffect/mavehgvs/69476dde5391022e7c0eca32ecd1734e371436eb/src/mavehgvs/py.typed
--------------------------------------------------------------------------------
/src/mavehgvs/util.py:
--------------------------------------------------------------------------------
1 | from typing import List, Tuple, Optional, Iterable
2 |
3 | from mavehgvs.variant import Variant
4 | from mavehgvs.exceptions import MaveHgvsParseError
5 |
6 | __all__ = ["parse_variant_strings"]
7 |
8 |
9 | def parse_variant_strings(
10 | variants: Iterable[str],
11 | targetseq: Optional[str] = None,
12 | expected_prefix: Optional[str] = None,
13 | ) -> Tuple[List[Optional[Variant]], List[Optional[str]]]:
14 | """Parse a list of MAVE-HGVS strings into Variant objects or error messages.
15 |
16 | Parameters
17 | ----------
18 | variants : Iterable[str]
19 | Iterable of MAVE-HGVS strings to parse.
20 |
21 | targetseq : Optional[str]
22 | If provided, all variants will be validated for agreement with this sequence.
23 | See the documentation for :py:class:`Variant` for further details.
24 |
25 | expected_prefix : Optional[str]
26 | If provided, all variants will be expected to have the same single-letter
27 | prefix.
28 | Variants that do not have this prefix will be treated as invalid.
29 |
30 | Returns
31 | -------
32 | Tuple[List[Optional[Variant]], List[Optional[str]]]
33 | Returns a pair of lists containing variants or error messages.
34 |
35 | Both lists have the same length as the input list.
36 | The first list contains Variant objects if the string was successfully parsed;
37 | else None.
38 | The second list contains None if the string was successfully parsed; else the
39 | error message.
40 |
41 | """
42 | if expected_prefix is not None and expected_prefix not in list("cgmnopr"):
43 | raise ValueError("invalid expected prefix")
44 |
45 | valid = list()
46 | invalid = list()
47 |
48 | for s in variants:
49 | try:
50 | v = Variant(s, targetseq=targetseq)
51 | except MaveHgvsParseError as error:
52 | valid.append(None)
53 | invalid.append(str(error))
54 | else:
55 | if expected_prefix is not None and v.prefix != expected_prefix:
56 | valid.append(None)
57 | invalid.append("unexpected variant prefix")
58 | else:
59 | valid.append(v)
60 | invalid.append(None)
61 |
62 | return valid, invalid
63 |
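# A minimal usage sketch of parse_variant_strings, assuming "c.1A>T" and "p.Glu27Trp"
# are valid MAVE-HGVS strings and "not-a-variant" is not.
if __name__ == "__main__":
    parsed, errors = parse_variant_strings(
        ["c.1A>T", "not-a-variant", "p.Glu27Trp"], expected_prefix="c"
    )
    assert parsed[0] is not None and errors[0] is None
    assert parsed[1] is None and errors[1] is not None
    assert parsed[2] is None and errors[2] == "unexpected variant prefix"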
--------------------------------------------------------------------------------
/src/mavehgvs/variant.py:
--------------------------------------------------------------------------------
1 | import re
2 | import itertools
3 | from typing import Optional, Union, List, Tuple, Mapping, Any, Sequence, Dict, Generator
4 |
5 | from fqfa.constants import AA_CODES
6 |
7 | from mavehgvs.position import VariantPosition
8 | from mavehgvs.patterns.combined import any_variant
9 | from mavehgvs.exceptions import MaveHgvsParseError
10 |
11 | __all__ = ["Variant"]
12 |
13 | AA_3_TO_1 = {value: key for key, value in AA_CODES.items()}
14 | """Dict[str, str]: for converting three-letter amino acid codes to single-letter codes.
15 | """
16 |
17 |
18 | class Variant:
19 | fullmatch = re.compile(any_variant, flags=re.ASCII).fullmatch
20 | """Callable[[str, int, int], Optional[Match[str]]]: fullmatch callable for parsing a
21 | single MAVE-HGVS variant
22 |
23 | Returns an :py:obj:`re.Match` object if the full string defines a valid MAVE-HGVS
24 | variant.
25 | Match groups in the result can be used to extract components of the variant.
26 | """
27 |
28 | VTYPES = (
29 | "equal", # equality
30 | "sub", # substitution
31 | "fs", # frame shift
32 | "del", # deletion
33 | "dup", # duplication
34 | "ins", # insertion
35 | "delins", # deletion-insertion
36 | )
37 | """Tuple[str]: variant type tags used in MAVE-HGVS patterns and variant type names.
38 | """
39 |
40 | def __init__( # noqa: max-complexity: 37
41 | self,
42 | s: Union[str, Mapping[str, Any], Sequence[Mapping[str, Any]]],
43 | targetseq: Optional[str] = None,
44 | relaxed_ordering: bool = False,
45 | ):
46 | """Convert a MAVE-HGVS variant string into a corresponding object with named
47 | fields.
48 |
49 | Parameters
50 | ----------
51 | s : Union[str, Mapping[str, Any], Sequence[Mapping[str, Any]]]
52 | MAVE-HGVS variant string to convert into an object, dictionary type object
53 | containing key-value pairs corresponding to a MAVE-HGVS object, or
54 | list/tuple of dictionary type objects for a variant with multiple events.
55 |
56 | targetseq : Optional[str]
57 | If provided, the variant will be validated for agreement with this sequence.
58 | Target sequence validation is not supported for variants using the extended
59 | position syntax.
60 |
61 | This must be an amino acid sequence for protein variants or a nucleotide
62 | sequence for coding/noncoding/genomic variants.
63 | DNA and amino acid sequences should be in uppercase, RNA in lowercase.
64 |
65 | relaxed_ordering : bool
66 | If True, variants that do not observe the 3-prime rule for variant position
67 | ordering are allowed.
68 | The object representation will observe the 3-prime rule, so it may differ
69 | from the input string in this case.
70 |
71 | """
72 | if isinstance(s, str): # variant string to parse
73 | variant_string = s
74 | elif isinstance(s, Mapping): # dictionary-style single variant
75 | variant_string = self._variant_dictionary_to_string(s, include_prefix=True)
76 | elif isinstance(s, Sequence): # dictionary-style multi-variant
77 | if not all(isinstance(v, Mapping) for v in s):
78 | raise ValueError("multi-variant iterable must contain Mapping objects")
79 | try:
80 | all_prefixes = [v["prefix"] for v in s]
81 | except KeyError:
82 | raise MaveHgvsParseError("variant dictionary missing required keys")
83 | if len(set(all_prefixes)) != 1:
84 | raise MaveHgvsParseError(
85 | "cannot combine variants with different prefixes"
86 | )
87 | multivariants = ";".join(
88 | self._variant_dictionary_to_string(v, include_prefix=False) for v in s
89 | )
90 | variant_string = f"{s[0]['prefix']}.[{multivariants}]"
91 | else:
92 | raise ValueError("can only create Variants from string or Mapping objects")
93 |
94 | variant_match = self.fullmatch(variant_string)
95 | if variant_match is None:
96 | raise MaveHgvsParseError("failed regular expression validation")
97 | else:
98 | match_dict = variant_match.groupdict()
99 |
100 | # set target id if present
101 | if match_dict["target_id"] is not None:
102 | self._target_id = match_dict["target_id"]
103 | else:
104 | self._target_id = None
105 |
106 | # set prefix and determine if this is a multi-variant
107 | if match_dict["single_variant"] is not None:
108 | self.variant_count = 1
109 | self._prefix = match_dict["single_variant"][0]
110 | elif match_dict["multi_variant"] is not None:
111 | self.variant_count = len(variant_string.split(";"))
112 | self._prefix = match_dict["multi_variant"][0]
113 | else: # pragma: no cover
114 | raise ValueError("invalid match type")
115 |
116 | if self.variant_count == 1:
117 | (
118 | self._variant_types,
119 | self._positions,
120 | self._sequences,
121 | ) = self._process_string_variant(
122 | match_dict, relaxed_ordering=relaxed_ordering
123 | )
124 | elif self.variant_count > 1:
125 | self._variant_types = list()
126 | self._positions = list()
127 | self._sequences = list()
128 |
129 | # format each individual variant event as a single variant and parse it
130 | for variant_substring in match_dict["multi_variant"][3:-1].split(";"):
131 | groupdict = self.fullmatch(
132 | f"{self._prefix}.{variant_substring}"
133 | ).groupdict()
134 | vt, p, s = self._process_string_variant(
135 | groupdict, relaxed_ordering=relaxed_ordering
136 | )
137 | if vt == "equal":
138 | raise MaveHgvsParseError(
139 | "multi-variants cannot contain target-identical variants"
140 | )
141 |
142 | self._variant_types.append(vt)
143 | self._positions.append(p)
144 | self._sequences.append(s)
145 |
146 | # ensure that multiple variants aren't defined for the same positions
147 | for vp1, vp2 in itertools.combinations(self._positions, 2):
148 | if isinstance(vp1, VariantPosition) and isinstance(
149 | vp2, VariantPosition
150 | ): # both single position
151 | if vp1 == vp2:
152 | raise MaveHgvsParseError(
153 | "multi-variant has multiple changes at same position"
154 | )
155 | elif isinstance(vp1, VariantPosition) and isinstance(vp2, Tuple):
156 | if vp2[0] <= vp1 <= vp2[1]:
157 | raise MaveHgvsParseError(
158 | "multi-variant has overlapping changes"
159 | )
160 | elif isinstance(vp1, Tuple) and isinstance(vp2, VariantPosition):
161 | if vp1[0] <= vp2 <= vp1[1]:
162 | raise MaveHgvsParseError(
163 | "multi-variant has overlapping changes"
164 | )
165 | elif isinstance(vp1, Tuple) and isinstance(vp2, Tuple):
166 | if (
167 | vp1[0] <= vp2[0] <= vp1[1]
168 | or vp1[0] <= vp2[1] <= vp1[1]
169 | or vp2[0] <= vp1[0] <= vp2[1]
170 | or vp2[0] <= vp1[1] <= vp2[1]
171 | ):
172 | raise MaveHgvsParseError(
173 | "multi-variant has overlapping changes"
174 | )
175 | else: # pragma: no cover
176 | raise ValueError("invalid position type")
177 |
178 | # re-order variants and validate
179 | def sort_key(x):
180 | if isinstance(x[1], VariantPosition):
181 | return x[1]
182 | elif isinstance(x[1], Tuple):
183 | return x[1][0]
184 | else: # pragma: no cover
185 | raise ValueError("invalid position type")
186 |
187 | variant_list = list(self.variant_tuples())
188 | ordered_list = sorted(variant_list, key=sort_key)
189 | if variant_list != ordered_list:
190 | if relaxed_ordering:
191 | self._variant_types = [x[0] for x in ordered_list]
192 | self._positions = [x[1] for x in ordered_list]
193 | self._sequences = [x[2] for x in ordered_list]
194 | else:
195 | raise MaveHgvsParseError("multi-variants not in sorted order")
196 |
197 | # make sure there is at most one frame shift
198 | if sum(x == "fs" for x in self._variant_types) > 1:
199 | raise MaveHgvsParseError("maximum of one frame shift is permitted")
200 |
201 | # make sure the frame shift is last if present
202 | if any(x == "fs" for x in self._variant_types):
203 | if self._variant_types[-1] != "fs":
204 | raise MaveHgvsParseError(
205 | "no variants are permitted to follow a frame shift"
206 | )
207 |
208 | else: # pragma: no cover
209 | raise ValueError("invalid variant count")
210 |
211 | if targetseq is not None:
212 | for vtype, pos, seq in self.variant_tuples():
213 | if self._prefix != "p" and vtype == "sub":
214 | self._target_validate(pos, seq[0], targetseq)
215 | elif (
216 | pos is None and vtype == "equal"
217 | ): # special case for full-length target identical variants
218 | pass
219 | else:
220 | self._target_validate(pos, None, targetseq)
221 |
222 | def variant_tuples(
223 | self,
224 | ) -> Generator[
225 | Tuple[
226 | str,
227 | Optional[Union[VariantPosition, Tuple[VariantPosition, VariantPosition]]],
228 | Optional[Union[str, Tuple[str, str]]],
229 | ],
230 | None,
231 | None,
232 | ]:
233 | """Generator that yields tuples containing the variant components.
234 |
235 | Yields
236 | ------
237 | Tuple
238 | Tuple of the variant type, position(s), and sequence(s) for each element in
239 | the variant.
240 |
241 | """
242 | if self.is_multi_variant():
243 | for vtype, pos, seq in zip(
244 | self._variant_types, self._positions, self._sequences
245 | ):
246 | yield vtype, pos, seq
247 | else:
248 | yield self._variant_types, self._positions, self._sequences
249 |
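    # For illustration (assuming these strings are valid MAVE-HGVS), a multi-variant
    # such as Variant("c.[1A>T;5_7del]") yields
    #     ("sub", VariantPosition("1"), ("A", "T"))
    #     ("del", (VariantPosition("5"), VariantPosition("7")), None)
    # while a single variant yields exactly one such tuple.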
250 | def _process_string_variant( # noqa: max-complexity: 23
251 | self, match_dict: Dict[str, str], relaxed_ordering: bool
252 | ) -> Tuple[
253 | str,
254 | Optional[Union[VariantPosition, Tuple[VariantPosition, VariantPosition]]],
255 | Optional[Union[str, Tuple[str, str]]],
256 | ]:
257 | """Process the match dictionary from a single variant into its components.
258 |
259 | Parameters
260 | ----------
261 | match_dict : Dict[str, str]
262 | Match dictionary from the MAVE-HGVS regular expression.
263 | relaxed_ordering : bool
264 | If True, variants that do not observe the 3-prime rule for variant position
265 | ordering are allowed.
266 |
267 | Returns
268 | -------
269 | Tuple[str, Optional[Union[VariantPosition, Tuple[VariantPosition, \
270 | VariantPosition]]], Optional[Union[str, Tuple[str, str]]]]
271 | Returns a 3-tuple containing the variant type, optional position (or
272 | start/end positions), and optional before/after substitution sequences or
273 | inserted sequence.
274 |
275 | """
276 | variant_type = None
277 | positions = None
278 | sequences = None
279 |
280 | # determine which named groups to check
281 | if self._prefix == "p":
282 | pattern_group_tuples = [(f"pro_{t}", t) for t in self.VTYPES]
283 | elif self._prefix == "r":
284 | pattern_group_tuples = [(f"rna_{t}", t) for t in self.VTYPES if t != "fs"]
285 | elif self._prefix in tuple("cn"):
286 | pattern_group_tuples = [
287 | (f"dna_{t}_{self._prefix}", t) for t in self.VTYPES if t != "fs"
288 | ]
289 | elif self._prefix in tuple("gmo"):
290 | pattern_group_tuples = [
291 | (f"dna_{t}_gmo", t) for t in self.VTYPES if t != "fs"
292 | ]
293 | else: # pragma: no cover
294 | raise ValueError("unexpected prefix")
295 |
296 | # set the variant type
297 | vtype_set = False
298 | pattern_group = None
299 | for pg, vtype in pattern_group_tuples:
300 | if match_dict[pg] is not None:
301 | if vtype_set: # pragma: no cover
302 | raise ValueError(f"ambiguous match: '{pg}' and '{pattern_group}'")
303 | variant_type = vtype
304 | pattern_group = pg
305 | vtype_set = True
306 |
307 | # set the position and sequence
308 | if variant_type == "sub":
309 | positions = VariantPosition(match_dict[f"{pattern_group}_position"])
310 | if self._prefix == "p":
311 | sequences = (positions.amino_acid, match_dict[f"{pattern_group}_new"])
312 | elif self._prefix in tuple("gmocnr"):
313 | sequences = (
314 | match_dict[f"{pattern_group}_ref"],
315 | match_dict[f"{pattern_group}_new"],
316 | )
317 | else: # pragma: no cover
318 | raise ValueError("unexpected prefix")
319 | elif variant_type in ("equal", "fs", "del", "dup", "ins", "delins"):
320 | # set position
321 | if (
322 | match_dict.get(f"{pattern_group}_position") is not None
323 | ): # use get() since ins pattern doesn't have pos
324 | positions = VariantPosition(match_dict[f"{pattern_group}_position"])
325 | elif (
326 | match_dict.get(f"{pattern_group}_start") is not None
327 | and match_dict.get(f"{pattern_group}_end") is not None
328 | ):
329 | positions = (
330 | VariantPosition(match_dict[f"{pattern_group}_start"]),
331 | VariantPosition(match_dict[f"{pattern_group}_end"]),
332 | )
333 | # extra validation on positions
334 | if positions[0] >= positions[1]:
335 | if relaxed_ordering:
336 | positions = (positions[1], positions[0])
337 | else:
338 | raise MaveHgvsParseError(
339 | "start position must be before end position"
340 | )
341 | if variant_type == "ins":
342 | if not positions[0].is_adjacent(positions[1]):
343 | raise MaveHgvsParseError("insertion positions must be adjacent")
344 | else: # pragma: no cover
345 | if variant_type != "equal":
346 | raise MaveHgvsParseError("variant position not found")
347 |
348 | # set sequence if needed
349 | if variant_type in ("ins", "delins"):
350 | sequences = match_dict[f"{pattern_group}_seq"]
351 | elif variant_type == "equal":
352 | if (
353 | match_dict[f"{pattern_group}_equal"] is not None
354 | ): # special case for target identity
355 | sequences = match_dict[f"{pattern_group}_equal"]
356 | elif match_dict["pro_equal_equal_sy"] is not None:
357 | sequences = match_dict["pro_equal_equal_sy"]
358 |
359 | return variant_type, positions, sequences
360 |
361 | # TODO: API documentation for the dictionary objects
362 | @staticmethod
363 | def _variant_dictionary_to_string( # noqa: max-complexity: 25
364 | vdict: Mapping[str, Any], include_prefix: bool
365 | ) -> str:
366 | """Convert a match dictionary from a single variant into a string for further
367 | validation.
368 |
369 |         This method performs minimal validation of the input values and instead
370 |         converts them into a variant string that is validated using the
371 |         regular-expression-based validators.
372 |
373 | Parameters
374 | ----------
375 | vdict : Mapping[str, Any]
376 | Key-value pairs describing a single variant.
377 | include_prefix: bool
378 | If True, the variant prefix and '.' will be included in the string; else it
379 | is omitted (for use with multi-variants).
380 |
381 | Returns
382 | -------
383 | str
384 | A string representing this variant.
385 |
386 | Raises
387 | ------
388 | MaveHgvsParseError
389 | If the dictionary does not have a valid set of keys.
390 |
391 | """
392 | try:
393 | variant_type = vdict["variant_type"]
394 | prefix = vdict["prefix"]
395 | except KeyError:
396 | raise MaveHgvsParseError("variant dictionary missing required keys")
397 |
398 | if variant_type == "equal":
399 | expected_keys = ["variant_type", "prefix"]
400 | if prefix == "p":
401 | expected_keys.extend(["position", "target"])
402 | else:
403 | expected_keys.extend(["start_position", "end_position"])
404 | if sorted(vdict.keys()) != sorted(expected_keys):
405 | raise MaveHgvsParseError("variant dictionary contains invalid keys")
406 | if prefix == "p":
407 | variant_string = f"{vdict['target']}{vdict['position']}="
408 | elif vdict["start_position"] == vdict["end_position"]:
409 | variant_string = f"{vdict['start_position']}="
410 | else:
411 | variant_string = f"{vdict['start_position']}_{vdict['end_position']}="
412 | elif variant_type == "sub":
413 | if sorted(vdict.keys()) != sorted(
414 | ["variant_type", "prefix", "position", "target", "variant"]
415 | ):
416 | raise MaveHgvsParseError("variant dictionary contains invalid keys")
417 | if prefix == "p":
418 | variant_string = (
419 | f"{vdict['target']}{vdict['position']}{vdict['variant']}"
420 | )
421 | else:
422 | variant_string = (
423 | f"{vdict['position']}{vdict['target']}>{vdict['variant']}"
424 | )
425 | elif variant_type == "fs":
426 | if sorted(vdict.keys()) != sorted(
427 | ["variant_type", "prefix", "position", "target"]
428 | ):
429 | raise MaveHgvsParseError("variant dictionary contains invalid keys")
430 | if prefix == "p":
431 | variant_string = f"{vdict['target']}{vdict['position']}fs"
432 | else:
433 | raise MaveHgvsParseError(
434 | "frame shifts are only supported for protein variants"
435 | )
436 | elif variant_type in ("del", "dup"):
437 | expected_keys = ["variant_type", "prefix", "start_position", "end_position"]
438 | if prefix == "p":
439 | expected_keys.extend(["start_target", "end_target"])
440 | if sorted(vdict.keys()) != sorted(expected_keys):
441 | raise MaveHgvsParseError("variant dictionary contains invalid keys")
442 | if prefix == "p":
443 | start = f"{vdict['start_target']}{vdict['start_position']}"
444 | end = f"{vdict['end_target']}{vdict['end_position']}"
445 | else:
446 | start = vdict["start_position"]
447 | end = vdict["end_position"]
448 | if start == end:
449 | variant_string = f"{start}{variant_type}"
450 | else:
451 | variant_string = f"{start}_{end}{variant_type}"
452 | elif variant_type in ("ins", "delins"):
453 | expected_keys = [
454 | "variant_type",
455 | "prefix",
456 | "start_position",
457 | "end_position",
458 | "variant",
459 | ]
460 | if prefix == "p":
461 | expected_keys.extend(["start_target", "end_target"])
462 | if sorted(vdict.keys()) != sorted(expected_keys):
463 | raise MaveHgvsParseError("variant dictionary contains invalid keys")
464 | if prefix == "p":
465 | start = f"{vdict['start_target']}{vdict['start_position']}"
466 | end = f"{vdict['end_target']}{vdict['end_position']}"
467 | else:
468 | start = vdict["start_position"]
469 | end = vdict["end_position"]
470 | if start == end and variant_type == "delins":
471 | variant_string = f"{start}{variant_type}{vdict['variant']}"
472 | else:
473 | variant_string = f"{start}_{end}{variant_type}{vdict['variant']}"
474 | else:
475 | raise MaveHgvsParseError("invalid variant type")
476 |
477 | if include_prefix:
478 | return f"{vdict['prefix']}.{variant_string}"
479 | else:
480 | return variant_string
481 |
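    # For illustration, the key sets checked above correspond to dictionaries such as
    # the following (hypothetical values):
    #     {"variant_type": "sub", "prefix": "c",
    #      "position": 48, "target": "C", "variant": "A"}     -> "c.48C>A"
    #     {"variant_type": "sub", "prefix": "p",
    #      "position": 27, "target": "Glu", "variant": "Trp"} -> "p.Glu27Trp"
    #     {"variant_type": "del", "prefix": "g",
    #      "start_position": 44, "end_position": 44}          -> "g.44del"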
482 | def _format_component_variants(self) -> List[str]: # noqa: max-complexity: 14
483 | """Format each of the component variants of this variant into a variant string.
484 |
485 |         The result is a list of strings, each describing a single variant event
486 |         without the prefix. If this variant is a single variant, the list will
487 |         contain one element describing that variant. For multi-variants, the list
488 |         will contain one element for each component of the variant.
489 |
490 | Returns
491 | -------
492 | List[str]
493 | List of formatted component variants.
494 |
495 | """
496 |
497 | def format_variant(
498 | vtype: str,
499 | pos: Union[VariantPosition, Tuple[VariantPosition, VariantPosition]],
500 | seq: Optional[Union[str, Tuple[str, str]]],
501 | ) -> str:
502 | """Helper function for building variant strings.
503 |
504 | Parameters
505 | ----------
506 | vtype : str
507 |                 The variant type, as described by :py:obj:`Variant.VTYPES`.
508 | pos : Union[VariantPosition, Tuple[VariantPosition, VariantPosition]]
509 | The position or pair of positions describing the variant.
510 | seq : Optional[Union[str, Tuple[str, str]]]
511 | The sequence or pair of sequences describing the variant.
512 |                 Only used for substitutions, insertions, and deletion-insertions.
513 |
514 | Returns
515 | -------
516 | str
517 | A string representing this variant element.
518 |
519 | """
520 | if vtype == "sub":
521 | if self._prefix == "p": # protein variant
522 | return f"{pos}{seq[1]}"
523 | else: # nucleotide variant
524 | return f"{pos}{seq[0]}>{seq[1]}"
525 | elif vtype == "fs":
526 | return f"{pos}fs"
527 | elif vtype in ("del", "dup"):
528 | if isinstance(pos, tuple):
529 | return f"{pos[0]}_{pos[1]}{vtype}"
530 | else:
531 | return f"{pos}{vtype}"
532 | elif vtype in ("ins", "delins"):
533 | if isinstance(pos, tuple):
534 | return f"{pos[0]}_{pos[1]}{vtype}{seq}"
535 | else:
536 | return f"{pos}{vtype}{seq}"
537 | elif vtype == "equal":
538 | if pos is None:
539 | return f"{seq}"
540 | elif isinstance(pos, tuple):
541 | return f"{pos[0]}_{pos[1]}{seq}"
542 | else:
543 | return f"{pos}{seq}"
544 | else: # pragma: no cover
545 | raise ValueError("invalid variant type")
546 |
547 | return [format_variant(*t) for t in self.variant_tuples()]
548 |
549 | def __eq__(self, other: "Variant") -> bool:
550 | """Equality comparison operator.
551 |
552 | Parameters
553 | ----------
554 | other : Variant
555 | The other Variant to compare to.
556 |
557 | Returns
558 | -------
559 | bool
560 |             True if this variant is the same as the other variant; else False.
561 |
562 | """
563 | return (
564 | self._target_id,
565 | self.variant_count,
566 | self._prefix,
567 | self._variant_types,
568 | self._positions,
569 | self._sequences,
570 | ) == (
571 | other._target_id,
572 | other.variant_count,
573 | other._prefix,
574 | other._variant_types,
575 | other._positions,
576 | other._sequences,
577 | )
578 |
579 | def __repr__(self) -> str:
580 | """The object representation is equivalent to the input string.
581 |
582 | Returns
583 | -------
584 | str
585 | The object representation.
586 |
587 | """
588 |
589 | elements = self._format_component_variants()
590 |
591 | if self._target_id is not None:
592 | prefix = f"{self._target_id}:{self._prefix}"
593 | else:
594 | prefix = f"{self._prefix}"
595 |
596 | if self.is_multi_variant():
597 | return f"{prefix}.[{';'.join(elements)}]"
598 | else:
599 | return f"{prefix}.{elements[0]}"
600 |
601 | @staticmethod
602 | def _target_validate(
603 | pos: Union[VariantPosition, Tuple[VariantPosition, VariantPosition]],
604 | ref: Optional[str],
605 | target: str,
606 | ) -> None:
607 | """Determine whether the target portion of a variant matches the target
608 | sequence.
609 |
610 | Note that variants using extended syntax cannot be validated with this method.
611 | If an extended syntax variant is encountered, it will be interpreted as
612 | valid/matching.
613 |
614 | Parameters
615 | ----------
616 | pos : Union[VariantPosition, Tuple[VariantPosition, VariantPosition]]
617 | Single variant position or start/end tuple for an indel.
618 | ref : Optional[str]
619 | Reference base to validate for nucleotide substitutions.
620 | This should be None for amino acid substitutions, since the reference is
621 | included in the VariantPosition.
622 | target : str
623 | Target sequence. This must be an amino acid sequence for protein variants or
624 | a nucleotide sequence for coding/noncoding/genomic variants.
625 | RNA sequences should be in lowercase, DNA sequences should be in uppercase.
626 |
627 | Returns
628 | -------
629 | None
630 |
631 | Raises
632 | ------
633 | MaveHgvsParseError
634 | If the reference base or amino acid does not match the target at the given
635 | position
636 | MaveHgvsParseError
637 | If the position is outside the bounds of the target.
638 |
639 | """
640 | if not isinstance(pos, tuple):
641 | pos = (pos,)
642 |
643 | if any(p.is_extended() for p in pos):
644 | return
645 | elif any(p.position > len(target) for p in pos):
646 | raise MaveHgvsParseError("variant coordinate out of bounds")
647 | else:
648 | if ref is not None and len(pos) == 1: # nucleotide substitution
649 | if target[pos[0].position - 1] != ref:
650 | raise MaveHgvsParseError("variant reference does not match target")
651 | elif pos[0].amino_acid is not None: # protein variant
652 | for p in pos:
653 | if target[p.position - 1] != AA_3_TO_1[p.amino_acid]:
654 | raise MaveHgvsParseError(
655 | "variant reference does not match target"
656 | )
657 | else:
658 | return
659 |
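    # For illustration, target validation is exercised through the constructor:
    # assuming these substitution strings are valid, Variant("c.1A>T", targetseq="TTT")
    # raises MaveHgvsParseError because the target has "T" (not "A") at position 1,
    # while Variant("c.1T>A", targetseq="TTT") validates.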
660 | def is_target_identical(self) -> bool:
661 | """Return whether the variant describes the "wild-type" sequence or is the
662 | special synonymous variant.
663 |
664 | This is the variant described with only the equals sign (e.g. ``c.=``)
665 | or the uncertain equals protein variant (e.g. ``p.(=)``).
666 |
667 |         Coding or genomic variants that specify an identical region (e.g. ``c.1_3=``) are
668 | also considered target identical.
669 |
670 | Synonymous protein variants (e.g. ``p.Leu12=``) are not considered target
671 | identical.
672 |
673 | Returns
674 | -------
675 | bool
676 | True if this variant describes the wild-type or target sequence; else False.
677 |
678 | """
679 | if self._variant_types == "equal":
680 | if self._prefix == "p":
681 | return self._positions is None
682 | else:
683 | return True
684 | else:
685 | return False
686 |
687 | def is_synonymous(self) -> bool:
688 | """Return whether the variant describes a synonymous protein variant or is the
689 | special synonymous variant.
690 |
691 | Returns
692 | -------
693 | bool
694 | True if this variant describes a synonymous protein variant; else False.
695 |
696 | """
697 | return self._variant_types == "equal" and self._prefix == "p"
698 |
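    # For illustration (these example variants appear in the docstrings above):
    #     Variant("c.=").is_target_identical()       -> True
    #     Variant("c.1_3=").is_target_identical()    -> True
    #     Variant("p.Leu12=").is_target_identical()  -> False (synonymous only)
    #     Variant("p.Leu12=").is_synonymous()        -> True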
699 | def is_multi_variant(self) -> bool:
700 | """Return whether the variant is a multi-variant.
701 |
702 | A multi-variant is a single variant describing multiple events enclosed in '[]'.
703 | Multi-variants are referred to as alleles in the HGVS standard.
704 |
705 | Returns
706 | -------
707 | bool
708 | True if the variant is a multi-variant; else False.
709 |
710 | """
711 | return self.variant_count > 1
712 |
713 | @property
714 | def prefix(self) -> str:
715 | """The single-letter prefix for this variant.
716 |
717 | Returns
718 | -------
719 | str
720 | Single-letter prefix corresponding to the sequence type.
721 |
722 | See the following table for sequence type prefixes and their meanings:
723 |
724 | .. csv-table::
725 | :file: ../docs/prefix.csv
726 | :header: "Prefix", "Description"
727 | :widths: 5, 20
728 |
729 | """
730 | return self._prefix
731 |
732 | @property
733 | def variant_type(self) -> Union[str, List[str]]:
734 | """The type for this variant.
735 |
736 | Valid variant types are:
737 |
738 | * ``'sub'`` for substitutions
739 | * ``'del'`` for deletions
740 | * ``'dup'`` for duplications
741 | * ``'ins'`` for insertions
742 | * ``'delins'`` for deletion-insertions
743 |
744 | Returns
745 | -------
746 | Union[str, List[str]]
747 | String containing the variant type. Returns a list of strings for a
748 | multi-variant.
749 |
750 | """
751 | return self._variant_types
752 |
753 | def uses_extended_positions(self) -> bool:
754 | """Return whether the variant uses the extended position notation to describe
755 | intronic or UTR positions.
756 |
757 | Examples of variants using the extended position notation include:
758 |
759 | * c.122-6T>A
760 | * r.*33a>c
761 | * c.43-6_595+12delinsCTT
762 |
763 | This should always be false for variants with a genomic or protein prefix, as
764 | variants with these prefixes cannot use positions relative to a transcript under
765 | the MAVE-HGVS specification.
766 |
767 | Returns
768 | -------
769 | bool
770 | True if the variant (or any of the individual variants for a multi-variant)
771 | uses the extended position notation.
772 |
773 | """
774 | if self.is_multi_variant():
775 | all_positions = list()
776 | for p in self.positions:
777 | if isinstance(p, tuple):
778 | all_positions.extend(p)
779 | else:
780 | all_positions.append(p)
781 | return any(p.is_extended() for p in all_positions)
782 | else:
783 | if self._positions is None: # special case for target identity
784 | return False
785 | elif isinstance(self.positions, tuple):
786 | return any(p.is_extended() for p in self.positions)
787 | else:
788 | return self.positions.is_extended()
789 |
790 | @property
791 | def positions(
792 | self,
793 | ) -> Optional[
794 | Union[
795 | VariantPosition,
796 | Tuple[VariantPosition, VariantPosition],
797 | List[Union[VariantPosition, Tuple[VariantPosition, VariantPosition]]],
798 | ]
799 | ]:
800 | """The variant position as a single position or tuple containing start and end
801 | positions.
802 |
803 | Each position is an instance of :py:class:`mavehgvs.position.VariantPosition`.
804 |
805 | Returns
806 | -------
807 | Union[VariantPosition, Tuple[VariantPosition, VariantPosition], \
808 | List[Union[VariantPosition, Tuple[VariantPosition, VariantPosition]]]]
809 | Variant position or tuple of start/end positions.
810 | Returns a list of positions or start/end tuples for a multi-variant.
811 |
812 | """
813 | return self._positions
814 |
815 | @property
816 | def sequence(
817 | self,
818 | ) -> Optional[
819 | Union[str, Tuple[str, str], List[Optional[Union[str, Tuple[str, str]]]]]
820 | ]:
821 | """The sequence portion of the variant.
822 |
823 | This can be a tuple of target and new bases for a substitution, a single
824 | sequence for insertions or deletion-insertions, or the "=" character for
825 | variants that are identical to the target sequence.
826 |
827 | Returns
828 | -------
829 |         Union[str, Tuple[str, str], List[Optional[Union[str, Tuple[str, str]]]]]
830 | Tuple of ref/new bases for substitutions, string containing inserted
831 | sequence, or the "=" character.
832 | Returns None if the variant does not have a sequence component (deletion or
833 | duplication).
834 | Returns a list for a multi-variant, which may contain None values for
835 | deletions or duplications.
836 |
837 | """
838 | return self._sequences
839 |
840 | @property
841 | def target_id(self) -> Optional[str]:
842 | """The target identifier for the variant (if applicable).
843 |
844 | The target identifier precedes the prefix and is followed by a ``:``.
845 | For example in ``NM_001130145.3:c.832C>T`` the target identifier is
846 | "NM_001130145.3".
847 |
848 | Returns
849 | -------
850 | Optional[str]
851 | The target identifier, or None if it is not set.
852 |
853 | """
854 | return self._target_id
855 |
856 | def components(self) -> Tuple[str, ...]:
857 | """The component substrings of a variant.
858 |
859 | Returns
860 | -------
861 | Tuple[str, ...]
862 |             Tuple of component substrings for this variant.
863 |
864 | """
865 | if self.target_id is not None:
866 | prefix = f"{self.target_id}:{self.prefix}"
867 | else:
868 | prefix = f"{self.prefix}"
869 |
870 | return tuple(
871 | [f"{prefix}.{component}" for component in self._format_component_variants()]
872 | )
873 |
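# A minimal usage sketch of the Variant class above, assuming "c.48C>A" and
# "p.[Glu27Trp;Gly345Lys]" are valid MAVE-HGVS strings; the target sequence below is a
# hypothetical 48-base sequence whose final base is "C".
if __name__ == "__main__":
    v = Variant("c.48C>A", targetseq="AAA" * 15 + "CCC")
    assert repr(v) == "c.48C>A"
    assert v.prefix == "c" and v.variant_type == "sub"

    mv = Variant("p.[Glu27Trp;Gly345Lys]")
    assert mv.is_multi_variant()
    assert mv.components() == ("p.Glu27Trp", "p.Gly345Lys")

    # dictionary form (see _variant_dictionary_to_string above)
    d = {"variant_type": "del", "prefix": "g",
         "start_position": 44, "end_position": 44}
    assert repr(Variant(d)) == "g.44del"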
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VariantEffect/mavehgvs/69476dde5391022e7c0eca32ecd1734e371436eb/tests/__init__.py
--------------------------------------------------------------------------------
/tests/test_patterns/__init__.py:
--------------------------------------------------------------------------------
1 | import itertools
2 | from typing import Iterable, Iterator, Tuple
3 |
4 |
5 | def build_multi_variants(
6 | valid_strings: Iterable[str],
7 | invalid_strings: Iterable[str],
8 | min_length: int = 2,
9 | max_length: int = 3,
10 | ) -> Tuple[Iterator, Iterator]:
11 | """Build iterators of valid and invalid multi-variant strings to test.
12 |
13 | Parameters
14 | ----------
15 | valid_strings : Iterable[str]
16 | Iterable containing all the valid single-variant strings.
17 | invalid_strings : Iterable[str]
18 | Iterable containing all the invalid single-variant strings.
19 | min_length : int
20 | Minimum length of multi-variants that will be generated.
21 | max_length : int
22 | Maximum length of multi-variants that will be generated.
23 | Note that increasing this value may massively increase test runtime.
24 |
25 | Returns
26 | -------
27 | Tuple[Iterator, Iterator]
28 | Returns iterators containing semicolon-separated multi-variant strings.
29 |
30 | The first iterator contains multi-variants from only valid_strings and the
31 | second iterator contains multi-variants that include at least one variant from
32 | invalid_strings.
33 | """
34 | # create an iterable of permutations for each length and store them in lists
35 | valid_multivariants = list()
36 | invalid_multivariants = list()
37 |
38 | for i in range(min_length, max_length + 1):
39 | valid_multivariants.append(
40 | ";".join(x) for x in itertools.permutations(valid_strings, i)
41 | )
42 | invalid_multivariants.append(
43 | ";".join(x)
44 | for x in itertools.permutations(
45 | itertools.chain(valid_strings, invalid_strings), i
46 | )
47 | if any(y in x for y in invalid_strings)
48 | )
49 |
50 | # combine the lists into single iterators and return
51 | return itertools.chain.from_iterable(
52 | valid_multivariants
53 | ), itertools.chain.from_iterable(invalid_multivariants)
54 |
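# A minimal sketch of the helper above, using hypothetical variant strings.
if __name__ == "__main__":
    ok, bad = build_multi_variants(["1A>T", "2del"], ["bogus"], max_length=2)
    assert sorted(ok) == ["1A>T;2del", "2del;1A>T"]
    assert all("bogus" in s for s in bad)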
--------------------------------------------------------------------------------
/tests/test_patterns/test_dna.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import re
3 | from mavehgvs.patterns.dna import (
4 | dna_equal_c,
5 | dna_equal_n,
6 | dna_equal_gmo,
7 | dna_sub_c,
8 | dna_sub_n,
9 | dna_sub_gmo,
10 | dna_del_c,
11 | dna_del_n,
12 | dna_del_gmo,
13 | dna_dup_c,
14 | dna_dup_n,
15 | dna_dup_gmo,
16 | dna_ins_c,
17 | dna_ins_n,
18 | dna_ins_gmo,
19 | dna_delins_c,
20 | dna_delins_n,
21 | dna_delins_gmo,
22 | dna_variant_c,
23 | dna_variant_n,
24 | dna_variant_gmo,
25 | dna_single_variant,
26 | dna_multi_variant,
27 | )
28 | from . import build_multi_variants
29 |
30 |
31 | class TestDnaEqualC(unittest.TestCase):
32 | @classmethod
33 | def setUpClass(cls):
34 | cls.pattern = re.compile(dna_equal_c, flags=re.ASCII)
35 |
36 | cls.valid_strings = [
37 | "=",
38 | "18=",
39 | "10_14=",
40 | "122-6=",
41 | "*24=",
42 | "19+22=",
43 | "19+22_88=",
44 | "-27+3=",
45 | ]
46 |
47 | cls.invalid_strings = ["=22", "(=)", "18(=)"]
48 |
49 | def test_valid_strings(self):
50 | for s in self.valid_strings:
51 | with self.subTest(s=s):
52 | self.assertIsNotNone(
53 | self.pattern.fullmatch(s), msg=f'failed to match "{s}"'
54 | )
55 |
56 | def test_invalid_strings(self):
57 | for s in self.invalid_strings:
58 | with self.subTest(s=s):
59 | self.assertIsNone(
60 | self.pattern.fullmatch(s), msg=f'incorrectly matched "{s}"'
61 | )
62 |
63 |
64 | class TestDnaEqualN(unittest.TestCase):
65 | @classmethod
66 | def setUpClass(cls):
67 | cls.pattern = re.compile(dna_equal_n, flags=re.ASCII)
68 |
69 | cls.valid_strings = ["="]
70 |
71 | cls.invalid_strings = [
72 | "=22",
73 | "(=)",
74 | "18(=)",
75 | "-27+3=",
76 | "*24=",
77 | "18=",
78 | "10_14=",
79 | "122-6=",
80 | "19+22=",
81 | "19+22_88=",
82 | ]
83 |
84 | def test_valid_strings(self):
85 | for s in self.valid_strings:
86 | with self.subTest(s=s):
87 | self.assertIsNotNone(
88 | self.pattern.fullmatch(s), msg=f'failed to match "{s}"'
89 | )
90 |
91 | def test_invalid_strings(self):
92 | for s in self.invalid_strings:
93 | with self.subTest(s=s):
94 | self.assertIsNone(
95 | self.pattern.fullmatch(s), msg=f'incorrectly matched "{s}"'
96 | )
97 |
98 |
99 | class TestDnaEqualGMO(unittest.TestCase):
100 | @classmethod
101 | def setUpClass(cls):
102 | cls.pattern = re.compile(dna_equal_gmo, flags=re.ASCII)
103 |
104 | cls.valid_strings = ["=", "18=", "10_14="]
105 |
106 | cls.invalid_strings = [
107 | "=22",
108 | "(=)",
109 | "18(=)",
110 | "122-6=",
111 | "*24=",
112 | "19+22=",
113 | "19+22_88=",
114 | "-27+3=",
115 | ]
116 |
117 | def test_valid_strings(self):
118 | for s in self.valid_strings:
119 | with self.subTest(s=s):
120 | self.assertIsNotNone(
121 | self.pattern.fullmatch(s), msg=f'failed to match "{s}"'
122 | )
123 |
124 | def test_invalid_strings(self):
125 | for s in self.invalid_strings:
126 | with self.subTest(s=s):
127 | self.assertIsNone(
128 | self.pattern.fullmatch(s), msg=f'incorrectly matched "{s}"'
129 | )
130 |
131 |
132 | class TestDnaSubC(unittest.TestCase):
133 | @classmethod
134 | def setUpClass(cls):
135 | cls.pattern = re.compile(dna_sub_c, flags=re.ASCII)
136 |
137 | cls.valid_strings = ["48C>A", "122-6T>A", "*24G>C", "19+22A>G", "-27+3T>C"]
138 |
139 | cls.invalid_strings = ["22g>u", "48C>W", "122=/T>A"]
140 |
141 | def test_valid_strings(self):
142 | for s in self.valid_strings:
143 | with self.subTest(s=s):
144 | self.assertIsNotNone(
145 | self.pattern.fullmatch(s), msg=f'failed to match "{s}"'
146 | )
147 |
148 | def test_invalid_strings(self):
149 | for s in self.invalid_strings:
150 | with self.subTest(s=s):
151 | self.assertIsNone(
152 | self.pattern.fullmatch(s), msg=f'incorrectly matched "{s}"'
153 | )
154 |
155 |
156 | class TestDnaSubN(unittest.TestCase):
157 | @classmethod
158 | def setUpClass(cls):
159 | cls.pattern = re.compile(dna_sub_n, flags=re.ASCII)
160 |
161 | cls.valid_strings = ["48C>A", "122-6T>A", "19+22A>G"]
162 |
163 | cls.invalid_strings = ["22g>u", "48C>W", "122=/T>A", "*24G>C", "-27+3T>C"]
164 |
165 | def test_valid_strings(self):
166 | for s in self.valid_strings:
167 | with self.subTest(s=s):
168 | self.assertIsNotNone(
169 | self.pattern.fullmatch(s), msg=f'failed to match "{s}"'
170 | )
171 |
172 | def test_invalid_strings(self):
173 | for s in self.invalid_strings:
174 | with self.subTest(s=s):
175 | self.assertIsNone(
176 | self.pattern.fullmatch(s), msg=f'incorrectly matched "{s}"'
177 | )
178 |
179 |
180 | class TestDnaSubGmo(unittest.TestCase):
181 | @classmethod
182 | def setUpClass(cls):
183 | cls.pattern = re.compile(dna_sub_gmo, flags=re.ASCII)
184 |
185 | cls.valid_strings = ["48C>A"]
186 |
187 | cls.invalid_strings = ["122-6T>A", "22g>u", "48C>W", "22=", "122=/T>A", "0C>T"]
188 |
189 | def test_valid_strings(self):
190 | for s in self.valid_strings:
191 | with self.subTest(s=s):
192 | self.assertIsNotNone(
193 | self.pattern.fullmatch(s), msg=f'failed to match "{s}"'
194 | )
195 |
196 | def test_invalid_strings(self):
197 | for s in self.invalid_strings:
198 | with self.subTest(s=s):
199 | self.assertIsNone(
200 | self.pattern.fullmatch(s), msg=f'incorrectly matched "{s}"'
201 | )
202 |
203 |
204 | class TestDnaDelC(unittest.TestCase):
205 | @classmethod
206 | def setUpClass(cls):
207 | cls.pattern = re.compile(dna_del_c, flags=re.ASCII)
208 |
209 | cls.valid_strings = [
210 | "44del",
211 | "1_95del",
212 | "78+5_78+10del",
213 | "-25+1_-25+3del",
214 | "*17del",
215 | ]
216 |
217 | cls.invalid_strings = [
218 | "(78+1_79-1)_(124+1_125-1)del",
219 | "(?_85)_(124_?)del",
220 | "122=/del",
221 | ]
222 |
223 | def test_valid_strings(self):
224 | for s in self.valid_strings:
225 | with self.subTest(s=s):
226 | self.assertIsNotNone(
227 | self.pattern.fullmatch(s), msg=f'failed to match "{s}"'
228 | )
229 |
230 | def test_invalid_strings(self):
231 | for s in self.invalid_strings:
232 | with self.subTest(s=s):
233 | self.assertIsNone(
234 | self.pattern.fullmatch(s), msg=f'incorrectly matched "{s}"'
235 | )
236 |
237 |
238 | class TestDnaDelN(unittest.TestCase):
239 | @classmethod
240 | def setUpClass(cls):
241 | cls.pattern = re.compile(dna_del_n, flags=re.ASCII)
242 |
243 | cls.valid_strings = ["44del", "1_95del", "78+5_78+10del"]
244 |
245 | cls.invalid_strings = [
246 | "(78+1_79-1)_(124+1_125-1)del",
247 | "(?_85)_(124_?)del",
248 | "122=/del",
249 | "-25+1_-25+3del",
250 | "*17del",
251 | ]
252 |
253 | def test_valid_strings(self):
254 | for s in self.valid_strings:
255 | with self.subTest(s=s):
256 | self.assertIsNotNone(
257 | self.pattern.fullmatch(s), msg=f'failed to match "{s}"'
258 | )
259 |
260 | def test_invalid_strings(self):
261 | for s in self.invalid_strings:
262 | with self.subTest(s=s):
263 | self.assertIsNone(
264 | self.pattern.fullmatch(s), msg=f'incorrectly matched "{s}"'
265 | )
266 |
267 |
268 | class TestDnaDelGmo(unittest.TestCase):
269 | @classmethod
270 | def setUpClass(cls):
271 | cls.pattern = re.compile(dna_del_gmo, flags=re.ASCII)
272 |
273 | cls.valid_strings = ["44del", "1_95del"]
274 |
275 | cls.invalid_strings = [
276 | "78+5_78+10del",
277 | "-25+1_-25+3del",
278 | "*17del",
279 | "(78+1_79-1)_(124+1_125-1)del",
280 | "(?_85)_(124_?)del",
281 | "122=/del",
282 | ]
283 |
284 | def test_valid_strings(self):
285 | for s in self.valid_strings:
286 | with self.subTest(s=s):
287 | self.assertIsNotNone(
288 | self.pattern.fullmatch(s), msg=f'failed to match "{s}"'
289 | )
290 |
291 | def test_invalid_strings(self):
292 | for s in self.invalid_strings:
293 | with self.subTest(s=s):
294 | self.assertIsNone(
295 | self.pattern.fullmatch(s), msg=f'incorrectly matched "{s}"'
296 | )
297 |
298 |
299 | class TestDnaDupC(unittest.TestCase):
300 | @classmethod
301 | def setUpClass(cls):
302 | cls.pattern = re.compile(dna_dup_c, flags=re.ASCII)
303 |
304 | cls.valid_strings = [
305 | "22_24dup",
306 | "77dup",
307 | "101+1_101+7dup",
308 | "-25+1_-25+3dup",
309 | "*17dup",
310 | ]
311 |
312 | cls.invalid_strings = [
313 | "(78+1_79-1)_(124+1_125-1)dup",
314 | "(?_85)_(124_?)dup",
315 | "122_125=//dup",
316 | ]
317 |
318 | def test_valid_strings(self):
319 | for s in self.valid_strings:
320 | with self.subTest(s=s):
321 | self.assertIsNotNone(
322 | self.pattern.fullmatch(s), msg=f'failed to match "{s}"'
323 | )
324 |
325 | def test_invalid_strings(self):
326 | for s in self.invalid_strings:
327 | with self.subTest(s=s):
328 | self.assertIsNone(
329 | self.pattern.fullmatch(s), msg=f'incorrectly matched "{s}"'
330 | )
331 |
332 |
333 | class TestDnaDupN(unittest.TestCase):
334 | @classmethod
335 | def setUpClass(cls):
336 | cls.pattern = re.compile(dna_dup_n, flags=re.ASCII)
337 |
338 | cls.valid_strings = ["22_24dup", "77dup", "101+1_101+7dup"]
339 |
340 | cls.invalid_strings = [
341 | "(78+1_79-1)_(124+1_125-1)dup",
342 | "(?_85)_(124_?)dup",
343 | "122_125=//dup",
344 | "-25+1_-25+3dup",
345 | "*17dup",
346 | ]
347 |
348 | def test_valid_strings(self):
349 | for s in self.valid_strings:
350 | with self.subTest(s=s):
351 | self.assertIsNotNone(
352 | self.pattern.fullmatch(s), msg=f'failed to match "{s}"'
353 | )
354 |
355 | def test_invalid_strings(self):
356 | for s in self.invalid_strings:
357 | with self.subTest(s=s):
358 | self.assertIsNone(
359 | self.pattern.fullmatch(s), msg=f'incorrectly matched "{s}"'
360 | )
361 |
362 |
363 | class TestDnaDupGmo(unittest.TestCase):
364 | @classmethod
365 | def setUpClass(cls):
366 | cls.pattern = re.compile(dna_dup_gmo, flags=re.ASCII)
367 |
368 | cls.valid_strings = ["22_24dup", "77dup"]
369 |
370 | cls.invalid_strings = [
371 | "(78+1_79-1)_(124+1_125-1)dup",
372 | "(?_85)_(124_?)dup",
373 | "122_125=//dup",
374 | "101+1_101+7dup",
375 | "-25+1_-25+3dup",
376 | "*17dup",
377 | ]
378 |
379 | def test_valid_strings(self):
380 | for s in self.valid_strings:
381 | with self.subTest(s=s):
382 | self.assertIsNotNone(
383 | self.pattern.fullmatch(s), msg=f'failed to match "{s}"'
384 | )
385 |
386 | def test_invalid_strings(self):
387 | for s in self.invalid_strings:
388 | with self.subTest(s=s):
389 | self.assertIsNone(
390 | self.pattern.fullmatch(s), msg=f'incorrectly matched "{s}"'
391 | )
392 |
393 |
394 | class TestDnaInsC(unittest.TestCase):
395 | @classmethod
396 | def setUpClass(cls):
397 | cls.pattern = re.compile(dna_ins_c, flags=re.ASCII)
398 |
399 | cls.valid_strings = [
400 | "234_235insT",
401 | "84_85insCTG",
402 | "*84_*85insCTG",
403 | "99+6_99+7insA",
404 | "124+100_124-100insTTG",
405 | "124+101_124-100insTTG",
406 | ]
407 |
408 | cls.invalid_strings = ["84_85ins100_125", "234_235ins(10)", "234_235ins(?)"]
409 |
410 | def test_valid_strings(self):
411 | for s in self.valid_strings:
412 | with self.subTest(s=s):
413 | self.assertIsNotNone(
414 | self.pattern.fullmatch(s), msg=f'failed to match "{s}"'
415 | )
416 |
417 | def test_invalid_strings(self):
418 | for s in self.invalid_strings:
419 | with self.subTest(s=s):
420 | self.assertIsNone(
421 | self.pattern.fullmatch(s), msg=f'incorrectly matched "{s}"'
422 | )
423 |
424 |
425 | class TestDnaInsN(unittest.TestCase):
426 | @classmethod
427 | def setUpClass(cls):
428 | cls.pattern = re.compile(dna_ins_n, flags=re.ASCII)
429 |
430 | cls.valid_strings = [
431 | "234_235insT",
432 | "84_85insCTG",
433 | "99+6_99+7insA",
434 | "124+100_124-100insTTG",
435 | "124+101_124-100insTTG",
436 | ]
437 |
438 | cls.invalid_strings = [
439 | "84_85ins100_125",
440 | "234_235ins(10)",
441 | "234_235ins(?)",
442 | "*84_*85insCTG",
443 | ]
444 |
445 | def test_valid_strings(self):
446 | for s in self.valid_strings:
447 | with self.subTest(s=s):
448 | self.assertIsNotNone(
449 | self.pattern.fullmatch(s), msg=f'failed to match "{s}"'
450 | )
451 |
452 | def test_invalid_strings(self):
453 | for s in self.invalid_strings:
454 | with self.subTest(s=s):
455 | self.assertIsNone(
456 | self.pattern.fullmatch(s), msg=f'incorrectly matched "{s}"'
457 | )
458 |
459 |
460 | class TestDnaInsGmo(unittest.TestCase):
461 | @classmethod
462 | def setUpClass(cls):
463 | cls.pattern = re.compile(dna_ins_gmo, flags=re.ASCII)
464 |
465 | cls.valid_strings = ["234_235insT", "84_85insCTG"]
466 |
467 | cls.invalid_strings = [
468 | "99+6_99+7insA",
469 | "84_85ins100_125",
470 | "234_235ins(10)",
471 | "234_235ins(?)",
472 | ]
473 |
474 | def test_valid_strings(self):
475 | for s in self.valid_strings:
476 | with self.subTest(s=s):
477 | self.assertIsNotNone(
478 | self.pattern.fullmatch(s), msg=f'failed to match "{s}"'
479 | )
480 |
481 | def test_invalid_strings(self):
482 | for s in self.invalid_strings:
483 | with self.subTest(s=s):
484 | self.assertIsNone(
485 | self.pattern.fullmatch(s), msg=f'incorrectly matched "{s}"'
486 | )
487 |
488 |
489 | class TestDnaDelinsC(unittest.TestCase):
490 | @classmethod
491 | def setUpClass(cls):
492 | cls.pattern = re.compile(dna_delins_c, flags=re.ASCII)
493 |
494 | cls.valid_strings = [
495 | "22delinsAACG",
496 | "83_85delinsT",
497 | "43-6_595+12delinsCTT",
498 | "*788delinsA",
499 | ]
500 |
501 | cls.invalid_strings = ["84_85delinsAAN", "234delinsW"]
502 |
503 | def test_valid_strings(self):
504 | for s in self.valid_strings:
505 | with self.subTest(s=s):
506 | self.assertIsNotNone(
507 | self.pattern.fullmatch(s), msg=f'failed to match "{s}"'
508 | )
509 |
510 | def test_invalid_strings(self):
511 | for s in self.invalid_strings:
512 | with self.subTest(s=s):
513 | self.assertIsNone(
514 | self.pattern.fullmatch(s), msg=f'incorrectly matched "{s}"'
515 | )
516 |
517 |
518 | class TestDnaDelinsN(unittest.TestCase):
519 | @classmethod
520 | def setUpClass(cls):
521 | cls.pattern = re.compile(dna_delins_n, flags=re.ASCII)
522 |
523 | cls.valid_strings = ["22delinsAACG", "83_85delinsT", "43-6_595+12delinsCTT"]
524 |
525 |         cls.invalid_strings = ["84_85delinsAAN", "234delinsW", "*788delinsA"]
526 |
527 | def test_valid_strings(self):
528 | for s in self.valid_strings:
529 | with self.subTest(s=s):
530 | self.assertIsNotNone(
531 | self.pattern.fullmatch(s), msg=f'failed to match "{s}"'
532 | )
533 |
534 | def test_invalid_strings(self):
535 | for s in self.invalid_strings:
536 | with self.subTest(s=s):
537 | self.assertIsNone(
538 | self.pattern.fullmatch(s), msg=f'incorrectly matched "{s}"'
539 | )
540 |
541 |
542 | class TestDnaDelinsGmo(unittest.TestCase):
543 | @classmethod
544 | def setUpClass(cls):
545 | cls.pattern = re.compile(dna_delins_gmo, flags=re.ASCII)
546 |
547 | cls.valid_strings = ["22delinsAACG", "83_85delinsT"]
548 |
549 | cls.invalid_strings = [
550 | "43-6_595+12delinsCTT",
551 | "*788delinsA",
552 | "84_85delinsAAN",
553 | "234delinsW",
554 | ]
555 |
556 | def test_valid_strings(self):
557 | for s in self.valid_strings:
558 | with self.subTest(s=s):
559 | self.assertIsNotNone(
560 | self.pattern.fullmatch(s), msg=f'failed to match "{s}"'
561 | )
562 |
563 | def test_invalid_strings(self):
564 | for s in self.invalid_strings:
565 | with self.subTest(s=s):
566 | self.assertIsNone(
567 | self.pattern.fullmatch(s), msg=f'incorrectly matched "{s}"'
568 | )
569 |
570 |
571 | class TestDnaVariantC(unittest.TestCase):
572 | @classmethod
573 | def setUpClass(cls):
574 | cls.pattern = re.compile(dna_variant_c, flags=re.ASCII)
575 |
576 | cls.valid_strings = [
577 | "48C>A",
578 | "=",
579 | "22=",
580 | "4_6=",
581 | "122-6T>A",
582 | "*24G>C",
583 | "19+22A>G",
584 | "-27+3T>C",
585 | "44del",
586 | "1_95del",
587 | "78+5_78+10del",
588 | "-25+1_-25+3del",
589 | "*17del",
590 | "22_24dup",
591 | "77dup",
592 | "101+1_101+7dup",
593 | "-25+1_-25+3dup",
594 | "*17dup",
595 | "234_235insT",
596 | "84_85insCTG",
597 | "99+6_99+7insA",
598 | "22delinsAACG",
599 | "83_85delinsT",
600 | "43-6_595+12delinsCTT",
601 | "*788delinsA",
602 | ]
603 |
604 | cls.invalid_strings = [
605 | "22g>u",
606 | "48C>W",
607 | "122=/T>A",
608 | "(78+1_79-1)_(124+1_125-1)del",
609 | "(?_85)_(124_?)del",
610 | "122=/del",
611 | "(78+1_79-1)_(124+1_125-1)dup",
612 | "(?_85)_(124_?)dup",
613 | "122_125=//dup",
614 | "84_85ins100_125",
615 | "234_235ins(10)",
616 | "234_235ins(?)",
617 | "84_85delinsAAN",
618 | "234delinsW",
619 | ]
620 |
621 | def test_valid_strings(self):
622 | for s in self.valid_strings:
623 | with self.subTest(s=s):
624 | self.assertIsNotNone(
625 | self.pattern.fullmatch(s), msg=f'failed to match "{s}"'
626 | )
627 |
628 | def test_invalid_strings(self):
629 | for s in self.invalid_strings:
630 | with self.subTest(s=s):
631 | self.assertIsNone(
632 | self.pattern.fullmatch(s), msg=f'incorrectly matched "{s}"'
633 | )
634 |
635 |
636 | class TestDnaVariantN(unittest.TestCase):
637 | @classmethod
638 | def setUpClass(cls):
639 | cls.pattern = re.compile(dna_variant_n, flags=re.ASCII)
640 |
641 | cls.valid_strings = [
642 | "48C>A",
643 | "=",
644 | "122-6T>A",
645 | "19+22A>G",
646 | "44del",
647 | "1_95del",
648 | "78+5_78+10del",
649 | "22_24dup",
650 | "77dup",
651 | "101+1_101+7dup",
652 | "234_235insT",
653 | "84_85insCTG",
654 | "99+6_99+7insA",
655 | "22delinsAACG",
656 | "83_85delinsT",
657 | "43-6_595+12delinsCTT",
658 | ]
659 |
660 | cls.invalid_strings = [
661 | "22=",
662 | "1_3=",
663 | "22g>u",
664 | "48C>W",
665 | "122=/T>A",
666 | "(78+1_79-1)_(124+1_125-1)del",
667 | "(?_85)_(124_?)del",
668 | "122=/del",
669 | "(78+1_79-1)_(124+1_125-1)dup",
670 | "(?_85)_(124_?)dup",
671 | "122_125=//dup",
672 | "84_85ins100_125",
673 | "234_235ins(10)",
674 | "234_235ins(?)",
675 | "84_85delinsAAN",
676 | "234delinsW",
677 | "*24G>C",
678 | "-27+3T>C",
679 | "-25+1_-25+3del",
680 | "*17del",
681 | "-25+1_-25+3dup",
682 | "*17dup",
683 | "*788delinsA",
684 | ]
685 |
686 | def test_valid_strings(self):
687 | for s in self.valid_strings:
688 | with self.subTest(s=s):
689 | self.assertIsNotNone(
690 | self.pattern.fullmatch(s), msg=f'failed to match "{s}"'
691 | )
692 |
693 | def test_invalid_strings(self):
694 | for s in self.invalid_strings:
695 | with self.subTest(s=s):
696 | self.assertIsNone(
697 | self.pattern.fullmatch(s), msg=f'incorrectly matched "{s}"'
698 | )
699 |
700 |
701 | class TestDnaVariantGmo(unittest.TestCase):
702 | @classmethod
703 | def setUpClass(cls):
704 | cls.pattern = re.compile(dna_variant_gmo, flags=re.ASCII)
705 |
706 | cls.valid_strings = [
707 | "48C>A",
708 | "=",
709 | "22=",
710 | "1_3=",
711 | "44del",
712 | "1_95del",
713 | "22_24dup",
714 | "77dup",
715 | "234_235insT",
716 | "84_85insCTG",
717 | "22delinsAACG",
718 | "83_85delinsT",
719 | ]
720 |
721 | cls.invalid_strings = [
722 | "43-6_595+12delinsCTT",
723 | "*788delinsA",
724 | "99+6_99+7insA",
725 | "101+1_101+7dup",
726 | "-25+1_-25+3dup",
727 | "*17dup",
728 | "78+5_78+10del",
729 | "-25+1_-25+3del",
730 | "*17del",
731 | "*24G>C",
732 | "19+22A>G",
733 | "122-6T>A",
734 | "-27+3T>C",
735 | "22g>u",
736 | "48C>W",
737 | "122=/T>A",
738 | "(78+1_79-1)_(124+1_125-1)del",
739 | "(?_85)_(124_?)del",
740 | "122=/del",
741 | "(78+1_79-1)_(124+1_125-1)dup",
742 | "(?_85)_(124_?)dup",
743 | "122_125=//dup",
744 | "84_85ins100_125",
745 | "234_235ins(10)",
746 | "234_235ins(?)",
747 | "84_85delinsAAN",
748 | "234delinsW",
749 | ]
750 |
751 | def test_valid_strings(self):
752 | for s in self.valid_strings:
753 | with self.subTest(s=s):
754 | self.assertIsNotNone(
755 | self.pattern.fullmatch(s), msg=f'failed to match "{s}"'
756 | )
757 |
758 | def test_invalid_strings(self):
759 | for s in self.invalid_strings:
760 | with self.subTest(s=s):
761 | self.assertIsNone(
762 | self.pattern.fullmatch(s), msg=f'incorrectly matched "{s}"'
763 | )
764 |
765 |
766 | class TestDnaSingleVariant(unittest.TestCase):
767 | @classmethod
768 | def setUpClass(cls):
769 | cls.pattern = re.compile(dna_single_variant, flags=re.ASCII)
770 |
771 | cls.valid_strings = [
772 | "48C>A",
773 | "=",
774 | "44del",
775 | "1_95del",
776 | "22_24dup",
777 | "77dup",
778 | "234_235insT",
779 | "84_85insCTG",
780 | "22delinsAACG",
781 | "83_85delinsT",
782 | ]
783 |
784 | cls.valid_strings_c_only = [
785 | "*788delinsA",
786 | "-25+1_-25+3dup",
787 | "*17dup",
788 | "-25+1_-25+3del",
789 | "*17del",
790 | "*24G>C",
791 | "-27+3T>C",
792 | ]
793 |
794 | cls.valid_strings_cn_only = [
795 | "43-6_595+12delinsCTT",
796 | "99+6_99+7insA",
797 | "101+1_101+7dup",
798 | "78+5_78+10del",
799 | "19+22A>G",
800 | "122-6T>A",
801 | ]
802 |
803 | cls.valid_strings_cgmo_only = ["22=", "4_6="]
804 |
805 | cls.invalid_strings = [
806 | "22g>u",
807 | "48C>W",
808 | "122=/T>A",
809 | "(78+1_79-1)_(124+1_125-1)del",
810 | "(?_85)_(124_?)del",
811 | "122=/del",
812 | "(78+1_79-1)_(124+1_125-1)dup",
813 | "(?_85)_(124_?)dup",
814 | "122_125=//dup",
815 | "84_85ins100_125",
816 | "234_235ins(10)",
817 | "234_235ins(?)",
818 | "84_85delinsAAN",
819 | "234delinsW",
820 | ]
821 |
822 | def test_valid_strings(self):
823 | for p in "cngmo":
824 | for s in self.valid_strings:
825 | with self.subTest(s=s, p=p):
826 | v = f"{p}.{s}"
827 | self.assertIsNotNone(
828 | self.pattern.fullmatch(v), msg=f'failed to match "{v}"'
829 | )
830 | for p in "cgmo":
831 | for s in self.valid_strings_cgmo_only:
832 | with self.subTest(s=s, p=p):
833 | v = f"{p}.{s}"
834 | self.assertIsNotNone(
835 | self.pattern.fullmatch(v), msg=f'failed to match "{v}"'
836 | )
837 | for p in "cn":
838 | for s in self.valid_strings_cn_only:
839 | with self.subTest(s=s, p=p):
840 | v = f"{p}.{s}"
841 | self.assertIsNotNone(
842 | self.pattern.fullmatch(v), msg=f'failed to match "{v}"'
843 | )
844 | for p in "c":
845 | for s in self.valid_strings_c_only:
846 | with self.subTest(s=s, p=p):
847 | v = f"{p}.{s}"
848 | self.assertIsNotNone(
849 | self.pattern.fullmatch(v), msg=f'failed to match "{v}"'
850 | )
851 |
852 | def test_invalid_strings(self):
853 | for p in "cngmo":
854 | for s in self.invalid_strings:
855 | with self.subTest(s=s, p=p):
856 | v = f"{p}.{s}"
857 | self.assertIsNone(
858 | self.pattern.fullmatch(v), msg=f'incorrectly matched "{v}"'
859 | )
860 | for p in "ngmo":
861 | for s in self.valid_strings_c_only:
862 | with self.subTest(s=s, p=p):
863 | v = f"{p}.{s}"
864 | self.assertIsNone(
865 | self.pattern.fullmatch(v), msg=f'incorrectly matched "{v}"'
866 | )
867 | for p in "gmo":
868 | for s in self.valid_strings_cn_only:
869 | with self.subTest(s=s, p=p):
870 | v = f"{p}.{s}"
871 | self.assertIsNone(
872 | self.pattern.fullmatch(v), msg=f'incorrectly matched "{v}"'
873 | )
874 | for p in "n":
875 | for s in self.valid_strings_cgmo_only:
876 | with self.subTest(s=s, p=p):
877 | v = f"{p}.{s}"
878 | self.assertIsNone(
879 | self.pattern.fullmatch(v), msg=f'incorrectly matched "{v}"'
880 | )
881 |
882 |
883 | class TestDnaMultiVariant(unittest.TestCase):
884 | @classmethod
885 | def setUpClass(cls):
886 | cls.pattern = re.compile(dna_multi_variant, flags=re.ASCII)
887 |
888 | single_valid_strings = [
889 | "48C>A",
890 | "=",
891 | "44del",
892 | "1_95del",
893 | "22_24dup",
894 | "77dup",
895 | "234_235insT",
896 | "84_85insCTG",
897 | "22delinsAACG",
898 | "83_85delinsT",
899 | ]
900 |
901 | single_valid_strings_c_only = [
902 | "*788delinsA",
903 | "-25+1_-25+3dup",
904 | "*17dup",
905 | "-25+1_-25+3del",
906 | "*17del",
907 | "*24G>C",
908 | "-27+3T>C",
909 | ]
910 |
911 | single_valid_strings_cn_only = [
912 | "43-6_595+12delinsCTT",
913 | "99+6_99+7insA",
914 | "101+1_101+7dup",
915 | "78+5_78+10del",
916 | "19+22A>G",
917 | "122-6T>A",
918 | ]
919 |
920 | single_valid_strings_cgmo_only = ["22=", "4_6="]
921 |
922 | single_invalid_strings = [
923 | "22g>u",
924 | "48C>W",
925 | "122=/T>A",
926 | "(78+1_79-1)_(124+1_125-1)del",
927 | "(?_85)_(124_?)del",
928 | "122=/del",
929 | "(78+1_79-1)_(124+1_125-1)dup",
930 | "(?_85)_(124_?)dup",
931 | "122_125=//dup",
932 | "84_85ins100_125",
933 | "234_235ins(10)",
934 | "234_235ins(?)",
935 | "84_85delinsAAN",
936 | "234delinsW",
937 | ]
938 |
939 | cls.valid_strings, cls.invalid_strings = build_multi_variants(
940 | single_valid_strings, single_invalid_strings
941 | )
942 | cls.valid_strings_c_only, cls.invalid_strings_ngmo = build_multi_variants(
943 | single_valid_strings_c_only, single_valid_strings_c_only
944 | )
945 | cls.valid_strings_cn_only, cls.invalid_strings_gmo = build_multi_variants(
946 | single_valid_strings_cn_only, single_valid_strings_cn_only
947 | )
948 | cls.valid_strings_cgmo_only, cls.invalid_strings_n = build_multi_variants(
949 | single_valid_strings_cgmo_only, single_valid_strings_cgmo_only
950 | )
951 |
952 | def test_valid_strings(self):
953 | for p in "cngmo":
954 | for s in self.valid_strings:
955 | with self.subTest(s=s, p=p):
956 | v = f"{p}.[{s}]"
957 | self.assertIsNotNone(
958 | self.pattern.fullmatch(v), msg=f'failed to match "{v}"'
959 | )
960 | for p in "cgmo":
961 | for s in self.valid_strings_cgmo_only:
962 | with self.subTest(s=s, p=p):
963 | v = f"{p}.[{s}]"
964 | self.assertIsNotNone(
965 | self.pattern.fullmatch(v), msg=f'failed to match "{v}"'
966 | )
967 | for p in "cn":
968 | for s in self.valid_strings_cn_only:
969 | with self.subTest(s=s, p=p):
970 | v = f"{p}.[{s}]"
971 | self.assertIsNotNone(
972 | self.pattern.fullmatch(v), msg=f'failed to match "{v}"'
973 | )
974 | for p in "c":
975 | for s in self.valid_strings_c_only:
976 | with self.subTest(s=s, p=p):
977 | v = f"{p}.[{s}]"
978 | self.assertIsNotNone(
979 | self.pattern.fullmatch(v), msg=f'failed to match "{v}"'
980 | )
981 |
982 | def test_invalid_strings(self):
983 | for p in "cngmo":
984 | for s in self.invalid_strings:
985 | with self.subTest(s=s, p=p):
986 | v = f"{p}.[{s}]"
987 | self.assertIsNone(
988 | self.pattern.fullmatch(v), msg=f'incorrectly matched "{v}"'
989 | )
990 | for p in "ngmo":
991 | for s in self.invalid_strings_ngmo:
992 | with self.subTest(s=s, p=p):
993 | v = f"{p}.[{s}]"
994 | self.assertIsNone(
995 | self.pattern.fullmatch(v), msg=f'incorrectly matched "{v}"'
996 | )
997 | for p in "gmo":
998 | for s in self.invalid_strings_gmo:
999 | with self.subTest(s=s, p=p):
1000 | v = f"{p}.[{s}]"
1001 | self.assertIsNone(
1002 | self.pattern.fullmatch(v), msg=f'incorrectly matched "{v}"'
1003 | )
1004 | for p in "n":
1005 | for s in self.invalid_strings_n:
1006 | with self.subTest(s=s, p=p):
1007 | v = f"{p}.[{s}]"
1008 | self.assertIsNone(
1009 | self.pattern.fullmatch(v), msg=f'incorrectly matched "{v}"'
1010 | )
1011 |
1012 |
1013 | if __name__ == "__main__":
1014 | unittest.main()
1015 |
--------------------------------------------------------------------------------
/tests/test_patterns/test_protein.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import re
3 | from mavehgvs.patterns.protein import (
4 | pro_equal,
5 | pro_sub,
6 | pro_fs,
7 | pro_del,
8 | pro_dup,
9 | pro_ins,
10 | pro_delins,
11 | pro_variant,
12 | pro_single_variant,
13 | pro_multi_variant,
14 | )
15 | from . import build_multi_variants
16 |
17 |
18 | class TestProteinEqual(unittest.TestCase):
19 | @classmethod
20 | def setUpClass(cls):
21 | cls.pattern = re.compile(pro_equal, flags=re.ASCII)
22 |
23 | cls.valid_strings = [
24 | "=",
25 | "(=)",
26 | "Cys22=",
27 | ]
28 |
29 | cls.invalid_strings = ["=22", "Arg18(=)", "Cys-22", "=="]
30 |
31 | def test_valid_strings(self):
32 | for s in self.valid_strings:
33 | with self.subTest(s=s):
34 | self.assertIsNotNone(
35 | self.pattern.fullmatch(s), msg=f'failed to match "{s}"'
36 | )
37 |
38 | def test_invalid_strings(self):
39 | for s in self.invalid_strings:
40 | with self.subTest(s=s):
41 | self.assertIsNone(
42 | self.pattern.fullmatch(s), msg=f'incorrectly matched "{s}"'
43 | )
44 |
45 |
46 | class TestProteinSub(unittest.TestCase):
47 | @classmethod
48 | def setUpClass(cls):
49 | cls.pattern = re.compile(pro_sub, flags=re.ASCII)
50 |
51 | cls.valid_strings = ["Glu27Trp", "Ter345Lys"]
52 |
53 | cls.invalid_strings = [
54 | "22A>T",
55 | "Xaa12Arg",
56 | "Arg21Xaa",
57 | "Pro17*",
58 | "*345Lys",
59 | "(Glu27Trp)",
60 | ]
61 |
62 | def test_valid_strings(self):
63 | for s in self.valid_strings:
64 | with self.subTest(s=s):
65 | self.assertIsNotNone(
66 | self.pattern.fullmatch(s), msg=f'failed to match "{s}"'
67 | )
68 |
69 | def test_invalid_strings(self):
70 | for s in self.invalid_strings:
71 | with self.subTest(s=s):
72 | self.assertIsNone(
73 | self.pattern.fullmatch(s), msg=f'incorrectly matched "{s}"'
74 | )
75 |
76 |
77 | class TestProteinFs(unittest.TestCase):
78 | @classmethod
79 | def setUpClass(cls):
80 | cls.pattern = re.compile(pro_fs, flags=re.ASCII)
81 |
82 | cls.valid_strings = ["Glu27fs"]
83 |
84 | cls.invalid_strings = [
85 | "=fs",
86 | "Arg12LysfsTer18",
87 | "Arg12Lysfs*18",
88 | "Glu27fs*?",
89 | "(Glu27fs)",
90 | ]
91 |
92 | def test_valid_strings(self):
93 | for s in self.valid_strings:
94 | with self.subTest(s=s):
95 | self.assertIsNotNone(
96 | self.pattern.fullmatch(s), msg=f'failed to match "{s}"'
97 | )
98 |
99 | def test_invalid_strings(self):
100 | for s in self.invalid_strings:
101 | with self.subTest(s=s):
102 | self.assertIsNone(
103 | self.pattern.fullmatch(s), msg=f'incorrectly matched "{s}"'
104 | )
105 |
106 |
107 | class TestProteinDel(unittest.TestCase):
108 | @classmethod
109 | def setUpClass(cls):
110 | cls.pattern = re.compile(pro_del, flags=re.ASCII)
111 |
112 | cls.valid_strings = [
113 | "Gly18del",
114 | "Gln7_Asn19del",
115 | ]
116 |
117 | cls.invalid_strings = ["=del", "18del", "122_128del", "(Gly18del)"]
118 |
119 | def test_valid_strings(self):
120 | for s in self.valid_strings:
121 | with self.subTest(s=s):
122 | self.assertIsNotNone(
123 | self.pattern.fullmatch(s), msg=f'failed to match "{s}"'
124 | )
125 |
126 | def test_invalid_strings(self):
127 | for s in self.invalid_strings:
128 | with self.subTest(s=s):
129 | self.assertIsNone(
130 | self.pattern.fullmatch(s), msg=f'incorrectly matched "{s}"'
131 | )
132 |
133 |
134 | class TestProteinDup(unittest.TestCase):
135 | @classmethod
136 | def setUpClass(cls):
137 | cls.pattern = re.compile(pro_dup, flags=re.ASCII)
138 |
139 | cls.valid_strings = [
140 | "Cys5dup",
141 | "Pro12_Gly18dup",
142 | ]
143 |
144 | cls.invalid_strings = ["=dup", "18dup", "122_128dup", "(Cys5dup)"]
145 |
146 | def test_valid_strings(self):
147 | for s in self.valid_strings:
148 | with self.subTest(s=s):
149 | self.assertIsNotNone(
150 | self.pattern.fullmatch(s), msg=f'failed to match "{s}"'
151 | )
152 |
153 | def test_invalid_strings(self):
154 | for s in self.invalid_strings:
155 | with self.subTest(s=s):
156 | self.assertIsNone(
157 | self.pattern.fullmatch(s), msg=f'incorrectly matched "{s}"'
158 | )
159 |
160 |
161 | class TestProteinIns(unittest.TestCase):
162 | @classmethod
163 | def setUpClass(cls):
164 | cls.pattern = re.compile(pro_ins, flags=re.ASCII)
165 |
166 | cls.valid_strings = [
167 | "His7_Gln8insSer",
168 | "Ala12_Pro13insGlyProCys",
169 | ]
170 |
171 | cls.invalid_strings = [
172 | "(His7_Gln8insSer)",
173 | "(His7_Gln8insX)",
174 | "(Ala12_Pro13ins(2))",
175 | "His7_Gln8ins?",
176 | "His7_Gln8insXaa",
177 | ]
178 |
179 | def test_valid_strings(self):
180 | for s in self.valid_strings:
181 | with self.subTest(s=s):
182 | self.assertIsNotNone(
183 | self.pattern.fullmatch(s), msg=f'failed to match "{s}"'
184 | )
185 |
186 | def test_invalid_strings(self):
187 | for s in self.invalid_strings:
188 | with self.subTest(s=s):
189 | self.assertIsNone(
190 | self.pattern.fullmatch(s), msg=f'incorrectly matched "{s}"'
191 | )
192 |
193 |
194 | class TestProteinDelins(unittest.TestCase):
195 | @classmethod
196 | def setUpClass(cls):
197 | cls.pattern = re.compile(pro_delins, flags=re.ASCII)
198 |
199 | cls.valid_strings = [
200 | "Ile71_Cys80delinsSer",
201 | "His44delinsValProGlyGlu",
202 | ]
203 |
204 | cls.invalid_strings = ["(Ile71_Cys80delinsSer)", "Ile71_Cys80delinsXaa"]
205 |
206 | def test_valid_strings(self):
207 | for s in self.valid_strings:
208 | with self.subTest(s=s):
209 | self.assertIsNotNone(
210 | self.pattern.fullmatch(s), msg=f'failed to match "{s}"'
211 | )
212 |
213 | def test_invalid_strings(self):
214 | for s in self.invalid_strings:
215 | with self.subTest(s=s):
216 | self.assertIsNone(
217 | self.pattern.fullmatch(s), msg=f'incorrectly matched "{s}"'
218 | )
219 |
220 |
221 | class TestProteinVariant(unittest.TestCase):
222 | @classmethod
223 | def setUpClass(cls):
224 | cls.pattern = re.compile(pro_variant, flags=re.ASCII)
225 |
226 | cls.valid_strings = [
227 | "=",
228 | "(=)",
229 | "Cys22=",
230 | "Glu27Trp",
231 | "Ter345Lys",
232 | "Glu27fs",
233 | "Gly18del",
234 | "Gln7_Asn19del",
235 | "Cys5dup",
236 | "Pro12_Gly18dup",
237 | "His7_Gln8insSer",
238 | "Ala12_Pro13insGlyProCys",
239 | "Ile71_Cys80delinsSer",
240 | "His44delinsValProGlyGlu",
241 | ]
242 |
243 | cls.invalid_strings = [
244 | "=22",
245 | "Arg18(=)",
246 | "Cys-22",
247 | "==",
248 | "22A>T",
249 | "Xaa12Arg",
250 | "Arg21Xaa",
251 | "Pro17*",
252 | "*345Lys",
253 | "(Glu27Trp)",
254 | "=fs",
255 | "Arg12LysfsTer18",
256 | "Arg12Lysfs*18",
257 | "Glu27fs*?",
258 | "(Glu27fs)",
259 | "=del",
260 | "18del",
261 | "122_128del",
262 | "(Gly18del)",
263 | "=dup",
264 | "18dup",
265 | "122_128dup",
266 | "(Cys5dup)",
267 | "(His7_Gln8insSer)",
268 | "(His7_Gln8insX)",
269 | "(Ala12_Pro13ins(2))",
270 | "His7_Gln8ins?",
271 | "His7_Gln8insXaa",
272 | "(Ile71_Cys80delinsSer)",
273 | "Ile71_Cys80delinsXaa",
274 | ]
275 |
276 | def test_valid_strings(self):
277 | for s in self.valid_strings:
278 | with self.subTest(s=s):
279 | self.assertIsNotNone(
280 | self.pattern.fullmatch(s), msg=f'failed to match "{s}"'
281 | )
282 |
283 | def test_invalid_strings(self):
284 | for s in self.invalid_strings:
285 | with self.subTest(s=s):
286 | self.assertIsNone(
287 | self.pattern.fullmatch(s), msg=f'incorrectly matched "{s}"'
288 | )
289 |
290 |
291 | class TestProteinSingleVariant(unittest.TestCase):
292 | @classmethod
293 | def setUpClass(cls):
294 | cls.pattern = re.compile(pro_single_variant, flags=re.ASCII)
295 |
296 | cls.valid_strings = [
297 | "=",
298 | "(=)",
299 | "Cys22=",
300 | "Glu27Trp",
301 | "Ter345Lys",
302 | "Glu27fs",
303 | "Gly18del",
304 | "Gln7_Asn19del",
305 | "Cys5dup",
306 | "Pro12_Gly18dup",
307 | "His7_Gln8insSer",
308 | "Ala12_Pro13insGlyProCys",
309 | "Ile71_Cys80delinsSer",
310 | "His44delinsValProGlyGlu",
311 | ]
312 |
313 | cls.invalid_strings = [
314 | "=22",
315 | "Arg18(=)",
316 | "Cys-22",
317 | "==",
318 | "22A>T",
319 | "Xaa12Arg",
320 | "Arg21Xaa",
321 | "Pro17*",
322 | "*345Lys",
323 | "(Glu27Trp)",
324 | "=fs",
325 | "Arg12LysfsTer18",
326 | "Arg12Lysfs*18",
327 | "Glu27fs*?",
328 | "(Glu27fs)",
329 | "=del",
330 | "18del",
331 | "122_128del",
332 | "(Gly18del)",
333 | "=dup",
334 | "18dup",
335 | "122_128dup",
336 | "(Cys5dup)",
337 | "(His7_Gln8insSer)",
338 | "(His7_Gln8insX)",
339 | "(Ala12_Pro13ins(2))",
340 | "His7_Gln8ins?",
341 | "His7_Gln8insXaa",
342 | "(Ile71_Cys80delinsSer)",
343 | "Ile71_Cys80delinsXaa",
344 | ]
345 |
346 | def test_valid_strings(self):
347 | for s in self.valid_strings:
348 | with self.subTest(s=s):
349 | v = f"p.{s}"
350 | self.assertIsNotNone(
351 | self.pattern.fullmatch(v), msg=f'failed to match "{v}"'
352 | )
353 |
354 | def test_invalid_strings(self):
355 | for s in self.invalid_strings:
356 | with self.subTest(s=s):
357 | v = f"p.{s}"
358 | self.assertIsNone(
359 | self.pattern.fullmatch(v), msg=f'incorrectly matched "{v}"'
360 | )
361 |
362 |
363 | class TestProteinMultiVariant(unittest.TestCase):
364 | @classmethod
365 | def setUpClass(cls):
366 | cls.pattern = re.compile(pro_multi_variant, flags=re.ASCII)
367 |
368 | single_valid_strings = [
369 | "=",
370 | "(=)",
371 | "Cys22=",
372 | "Glu27Trp",
373 | "Ter345Lys",
374 | "Glu27fs",
375 | "Gly18del",
376 | "Gln7_Asn19del",
377 | "Cys5dup",
378 | "Pro12_Gly18dup",
379 | "His7_Gln8insSer",
380 | "Ala12_Pro13insGlyProCys",
381 | "Ile71_Cys80delinsSer",
382 | "His44delinsValProGlyGlu",
383 | ]
384 |
385 | single_invalid_strings = [
386 | "=22",
387 | "Arg18(=)",
388 | "Cys-22",
389 | "==",
390 | "22A>T",
391 | "Xaa12Arg",
392 | "Arg21Xaa",
393 | "Pro17*",
394 | "*345Lys",
395 | "(Glu27Trp)",
396 | "=fs",
397 | "Arg12LysfsTer18",
398 | "Arg12Lysfs*18",
399 | "Glu27fs*?",
400 | "(Glu27fs)",
401 | "=del",
402 | "18del",
403 | "122_128del",
404 | "(Gly18del)",
405 | "=dup",
406 | "18dup",
407 | "122_128dup",
408 | "(Cys5dup)",
409 | "(His7_Gln8insSer)",
410 | "(His7_Gln8insX)",
411 | "(Ala12_Pro13ins(2))",
412 | "His7_Gln8ins?",
413 | "His7_Gln8insXaa",
414 | "(Ile71_Cys80delinsSer)",
415 | "Ile71_Cys80delinsXaa",
416 | ]
417 |
418 | cls.valid_strings, cls.invalid_strings = build_multi_variants(
419 | single_valid_strings, single_invalid_strings
420 | )
421 |
422 | def test_valid_strings(self):
423 | for s in self.valid_strings:
424 | with self.subTest(s=s):
425 | v = f"p.[{s}]"
426 | self.assertIsNotNone(
427 | self.pattern.fullmatch(v), msg=f'failed to match "{v}"'
428 | )
429 |
430 | def test_invalid_strings(self):
431 | for s in self.invalid_strings:
432 | with self.subTest(s=s):
433 | v = f"p.[{s}]"
434 | self.assertIsNone(
435 | self.pattern.fullmatch(v), msg=f'incorrectly matched "{v}"'
436 | )
437 |
438 |
439 | if __name__ == "__main__":
440 | unittest.main()
441 |
--------------------------------------------------------------------------------
/tests/test_patterns/test_rna.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import re
3 | from mavehgvs.patterns.rna import (
4 | rna_equal,
5 | rna_sub,
6 | rna_del,
7 | rna_dup,
8 | rna_ins,
9 | rna_delins,
10 | rna_variant,
11 | rna_single_variant,
12 | rna_multi_variant,
13 | )
14 | from . import build_multi_variants
15 |
16 |
17 | class TestRnaEqual(unittest.TestCase):
18 | @classmethod
19 | def setUpClass(cls):
20 | cls.pattern = re.compile(rna_equal, flags=re.ASCII)
21 |
22 | cls.valid_strings = [
23 | "=",
24 | ]
25 |
26 | cls.invalid_strings = ["=22", "(=)", "=="]
27 |
28 | def test_valid_strings(self):
29 | for s in self.valid_strings:
30 | with self.subTest(s=s):
31 | self.assertIsNotNone(
32 | self.pattern.fullmatch(s), msg=f'failed to match "{s}"'
33 | )
34 |
35 | def test_invalid_strings(self):
36 | for s in self.invalid_strings:
37 | with self.subTest(s=s):
38 | self.assertIsNone(
39 | self.pattern.fullmatch(s), msg=f'incorrectly matched "{s}"'
40 | )
41 |
42 |
43 | class TestRnaSub(unittest.TestCase):
44 | @classmethod
45 | def setUpClass(cls):
46 | cls.pattern = re.compile(rna_sub, flags=re.ASCII)
47 |
48 | cls.valid_strings = ["22g>u", "33+12a>c"]
49 |
50 | cls.invalid_strings = [
51 | "spl",
52 | "33+12A>G",
53 | "22g>t",
54 | ]
55 |
56 | def test_valid_strings(self):
57 | for s in self.valid_strings:
58 | with self.subTest(s=s):
59 | self.assertIsNotNone(
60 | self.pattern.fullmatch(s), msg=f'failed to match "{s}"'
61 | )
62 |
63 | def test_invalid_strings(self):
64 | for s in self.invalid_strings:
65 | with self.subTest(s=s):
66 | self.assertIsNone(
67 | self.pattern.fullmatch(s), msg=f'incorrectly matched "{s}"'
68 | )
69 |
70 |
71 | class TestRnaDel(unittest.TestCase):
72 | @classmethod
73 | def setUpClass(cls):
74 | cls.pattern = re.compile(rna_del, flags=re.ASCII)
75 |
76 | cls.valid_strings = ["34_36del", "17del", "27_27+12del", "101+1_101+7del"]
77 |
78 | cls.invalid_strings = ["=del", "=/9_12del", "(155_185)del", "34_36"]
79 |
80 | def test_valid_strings(self):
81 | for s in self.valid_strings:
82 | with self.subTest(s=s):
83 | self.assertIsNotNone(
84 | self.pattern.fullmatch(s), msg=f'failed to match "{s}"'
85 | )
86 |
87 | def test_invalid_strings(self):
88 | for s in self.invalid_strings:
89 | with self.subTest(s=s):
90 | self.assertIsNone(
91 | self.pattern.fullmatch(s), msg=f'incorrectly matched "{s}"'
92 | )
93 |
94 |
95 | class TestRnaDup(unittest.TestCase):
96 | @classmethod
97 | def setUpClass(cls):
98 | cls.pattern = re.compile(rna_dup, flags=re.ASCII)
99 |
100 | cls.valid_strings = ["12dup", "2_24dup", "101+1_101+7dup", "12-24_12-12dup"]
101 |
102 | cls.invalid_strings = ["=dup", "(78+1_79-1)_(124+1_125-1)dup"]
103 |
104 | def test_valid_strings(self):
105 | for s in self.valid_strings:
106 | with self.subTest(s=s):
107 | self.assertIsNotNone(
108 | self.pattern.fullmatch(s), msg=f'failed to match "{s}"'
109 | )
110 |
111 | def test_invalid_strings(self):
112 | for s in self.invalid_strings:
113 | with self.subTest(s=s):
114 | self.assertIsNone(
115 | self.pattern.fullmatch(s), msg=f'incorrectly matched "{s}"'
116 | )
117 |
118 |
119 | class TestRnaIns(unittest.TestCase):
120 | @classmethod
121 | def setUpClass(cls):
122 | cls.pattern = re.compile(rna_ins, flags=re.ASCII)
123 |
124 | cls.valid_strings = [
125 | "22_23insauc",
126 | "17_18insa",
127 | ]
128 |
129 | cls.invalid_strings = [
130 | "(27_30)insu",
131 | "74_74insnnn",
132 | ]
133 |
134 | def test_valid_strings(self):
135 | for s in self.valid_strings:
136 | with self.subTest(s=s):
137 | self.assertIsNotNone(
138 | self.pattern.fullmatch(s), msg=f'failed to match "{s}"'
139 | )
140 |
141 | def test_invalid_strings(self):
142 | for s in self.invalid_strings:
143 | with self.subTest(s=s):
144 | self.assertIsNone(
145 | self.pattern.fullmatch(s), msg=f'incorrectly matched "{s}"'
146 | )
147 |
148 |
149 | class TestRnaDelins(unittest.TestCase):
150 | @classmethod
151 | def setUpClass(cls):
152 | cls.pattern = re.compile(rna_delins, flags=re.ASCII)
153 |
154 | cls.valid_strings = ["92delinsgac", "12_17delinsc"]
155 |
156 | cls.invalid_strings = ["234_235ins(10)", "(122_125)insg"]
157 |
158 | def test_valid_strings(self):
159 | for s in self.valid_strings:
160 | with self.subTest(s=s):
161 | self.assertIsNotNone(
162 | self.pattern.fullmatch(s), msg=f'failed to match "{s}"'
163 | )
164 |
165 | def test_invalid_strings(self):
166 | for s in self.invalid_strings:
167 | with self.subTest(s=s):
168 | self.assertIsNone(
169 | self.pattern.fullmatch(s), msg=f'incorrectly matched "{s}"'
170 | )
171 |
172 |
173 | class TestRnaVariant(unittest.TestCase):
174 | @classmethod
175 | def setUpClass(cls):
176 | cls.pattern = re.compile(rna_variant, flags=re.ASCII)
177 |
178 | cls.valid_strings = [
179 | "=",
180 | "22g>u",
181 | "33+12a>c",
182 | "34_36del",
183 | "17del",
184 | "12dup",
185 | "2_24dup",
186 | "101+1_101+7dup",
187 | "22_23insauc",
188 | "17_18insa",
189 | "92delinsgac",
190 | "12_17delinsc",
191 | ]
192 |
193 | cls.invalid_strings = [
194 | "=22",
195 | "(=)",
196 | "==",
197 | "spl",
198 | "33+12A>G",
199 | "22g>t",
200 | "=del",
201 | "=/9_12del",
202 | "(155_185)del",
203 | "=dup",
204 | "(78+1_79-1)_(124+1_125-1)dup",
205 | "(27_30)insu",
206 | "74_74insnnn",
207 | "234_235ins(10)",
208 | "(122_125)insg",
209 | ]
210 |
211 | def test_valid_strings(self):
212 | for s in self.valid_strings:
213 | with self.subTest(s=s):
214 | self.assertIsNotNone(
215 | self.pattern.fullmatch(s), msg=f'failed to match "{s}"'
216 | )
217 |
218 | def test_invalid_strings(self):
219 | for s in self.invalid_strings:
220 | with self.subTest(s=s):
221 | self.assertIsNone(
222 | self.pattern.fullmatch(s), msg=f'incorrectly matched "{s}"'
223 | )
224 |
225 |
226 | class TestRnaSingleVariant(unittest.TestCase):
227 | @classmethod
228 | def setUpClass(cls):
229 | cls.pattern = re.compile(rna_single_variant, flags=re.ASCII)
230 |
231 | cls.valid_strings = [
232 | "=",
233 | "22g>u",
234 | "33+12a>c",
235 | "34_36del",
236 | "17del",
237 | "12dup",
238 | "2_24dup",
239 | "101+1_101+7dup",
240 | "22_23insauc",
241 | "17_18insa",
242 | "92delinsgac",
243 | "12_17delinsc",
244 | ]
245 |
246 | cls.invalid_strings = [
247 | "=22",
248 | "(=)",
249 | "==",
250 | "spl",
251 | "33+12A>G",
252 | "22g>t",
253 | "=del",
254 | "=/9_12del",
255 | "(155_185)del",
256 | "=dup",
257 | "(78+1_79-1)_(124+1_125-1)dup",
258 | "(27_30)insu",
259 | "74_74insnnn",
260 | "234_235ins(10)",
261 | "(122_125)insg",
262 | ]
263 |
264 | def test_valid_strings(self):
265 | for s in self.valid_strings:
266 | with self.subTest(s=s):
267 | v = f"r.{s}"
268 | self.assertIsNotNone(
269 | self.pattern.fullmatch(v), msg=f'failed to match "{v}"'
270 | )
271 |
272 | def test_invalid_strings(self):
273 | for s in self.invalid_strings:
274 | with self.subTest(s=s):
275 | v = f"r.{s}"
276 | self.assertIsNone(
277 | self.pattern.fullmatch(v), msg=f'incorrectly matched "{v}"'
278 | )
279 |
280 |
281 | class TestRnaMultiVariant(unittest.TestCase):
282 | @classmethod
283 | def setUpClass(cls):
284 | cls.pattern = re.compile(rna_multi_variant, flags=re.ASCII)
285 |
286 | single_valid_strings = [
287 | "=",
288 | "22g>u",
289 | "33+12a>c",
290 | "34_36del",
291 | "17del",
292 | "12dup",
293 | "2_24dup",
294 | "101+1_101+7dup",
295 | "22_23insauc",
296 | "17_18insa",
297 | "92delinsgac",
298 | "12_17delinsc",
299 | ]
300 |
301 | single_invalid_strings = [
302 | "=22",
303 | "(=)",
304 | "==",
305 | "spl",
306 | "33+12A>G",
307 | "22g>t",
308 | "=del",
309 | "=/9_12del",
310 | "(155_185)del",
311 | "=dup",
312 | "(78+1_79-1)_(124+1_125-1)dup",
313 | "(27_30)insu",
314 | "74_74insnnn",
315 | "234_235ins(10)",
316 | "(122_125)insg",
317 | ]
318 | cls.valid_strings, cls.invalid_strings = build_multi_variants(
319 | single_valid_strings, single_invalid_strings
320 | )
321 |
322 | def test_valid_strings(self):
323 | for s in self.valid_strings:
324 | with self.subTest(s=s):
325 | v = f"r.[{s}]"
326 | self.assertIsNotNone(
327 | self.pattern.fullmatch(v), msg=f'failed to match "{v}"'
328 | )
329 |
330 | def test_invalid_strings(self):
331 | for s in self.invalid_strings:
332 | with self.subTest(s=s):
333 | v = f"r.[{s}]"
334 | self.assertIsNone(
335 | self.pattern.fullmatch(v), msg=f'incorrectly matched "{v}"'
336 | )
337 |
338 |
339 | if __name__ == "__main__":
340 | unittest.main()
341 |
--------------------------------------------------------------------------------
/tests/test_patterns/test_util.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from mavehgvs.patterns.util import combine_patterns, remove_named_groups
4 |
5 |
6 | class TestCombinePatterns(unittest.TestCase):
7 | def test_without_groupname(self):
8 | pattern_tuples = [
9 | (
10 |                 ("(?P<a>(?P<b>[1-9]))", "(?P<c>(?P<d>[1-9]))"),
11 |                 "(?:(?P<a>(?P<b>[1-9]))|(?P<c>(?P<d>[1-9])))",
12 | )
13 | ]
14 |
15 | for p1, p2 in pattern_tuples:
16 | with self.subTest(p1=p1, p2=p2):
17 | self.assertEqual(combine_patterns(p1), p2)
18 |
19 | def test_with_groupname(self):
20 | pattern_tuples = [
21 | (
22 |                 ("(?P<a>(?P<b>[1-9]))", "(?P<c>(?P<d>[1-9]))"),
23 |                 "test",
24 |                 "(?P<test>(?P<a>(?P<b>[1-9]))|(?P<c>(?P<d>[1-9])))",
25 | )
26 | ]
27 |
28 | for p1, g, p2 in pattern_tuples:
29 | with self.subTest(p1=p1, g=g, p2=p2):
30 | self.assertEqual(combine_patterns(p1, groupname=g), p2)
31 |
32 |
33 | class TestRemoveNamedGroups(unittest.TestCase):
34 | def test_noncapturing(self):
35 |         pattern_tuples = [("(?P<a>(?P<b>[1-9]))", "(?:(?:[1-9]))")]
36 |
37 | for p1, p2 in pattern_tuples:
38 | with self.subTest(p1=p1, p2=p2):
39 | self.assertEqual(remove_named_groups(p1, noncapturing=True), p2)
40 |
41 | def test_capturing(self):
42 |         pattern_tuples = [("(?P<a>(?P<b>[1-9]))", "(([1-9]))")]
43 |
44 | for p1, p2 in pattern_tuples:
45 | with self.subTest(p1=p1, p2=p2):
46 | self.assertEqual(remove_named_groups(p1, noncapturing=False), p2)
47 |
48 |
49 | if __name__ == "__main__":
50 | unittest.main()
51 |
--------------------------------------------------------------------------------
/tests/test_position.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import itertools
3 | import random
4 | from mavehgvs.position import VariantPosition
5 | from mavehgvs.exceptions import MaveHgvsParseError
6 |
7 |
8 | class TestObjectCreation(unittest.TestCase):
9 | def test_position_only(self) -> None:
10 | v = VariantPosition("8")
11 | self.assertTupleEqual(
12 | (v.position, v.amino_acid, v.intronic_position, v.utr),
13 | (8, None, None, None),
14 | )
15 | self.assertFalse(v.is_utr())
16 | self.assertFalse(v.is_intronic())
17 | self.assertFalse(v.is_protein())
18 | self.assertFalse(v.is_extended())
19 |
20 | v = VariantPosition("92380")
21 | self.assertTupleEqual(
22 | (v.position, v.amino_acid, v.intronic_position, v.utr),
23 | (92380, None, None, None),
24 | )
25 | self.assertFalse(v.is_utr())
26 | self.assertFalse(v.is_intronic())
27 | self.assertFalse(v.is_protein())
28 | self.assertFalse(v.is_extended())
29 |
30 | def test_amino_acid(self) -> None:
31 | v = VariantPosition("Gly8")
32 | self.assertTupleEqual(
33 | (v.position, v.amino_acid, v.intronic_position, v.utr),
34 | (8, "Gly", None, None),
35 | )
36 | self.assertFalse(v.is_utr())
37 | self.assertFalse(v.is_intronic())
38 | self.assertTrue(v.is_protein())
39 | self.assertFalse(v.is_extended())
40 |
41 | v = VariantPosition("Cys92380")
42 | self.assertTupleEqual(
43 | (v.position, v.amino_acid, v.intronic_position, v.utr),
44 | (92380, "Cys", None, None),
45 | )
46 | self.assertFalse(v.is_utr())
47 | self.assertFalse(v.is_intronic())
48 | self.assertTrue(v.is_protein())
49 | self.assertFalse(v.is_extended())
50 |
51 | def test_invalid_strings(self) -> None:
52 | position_strings = (
53 | "08",
54 | "+12",
55 | "*-99",
56 | "A",
57 | "TCGA",
58 | "g",
59 | "*",
60 | "-",
61 | "+",
62 | "**6",
63 | "800 + 12",
64 | "-12*5",
65 | "Glu-12",
66 | "*5Trp",
67 | "Xyz12",
68 | "ALA12",
69 | )
70 | for s in position_strings:
71 | with self.subTest(s=s):
72 | with self.assertRaises(MaveHgvsParseError):
73 | VariantPosition(s)
74 |
75 | def test_utr(self) -> None:
76 | v = VariantPosition("*8")
77 | self.assertTupleEqual(
78 | (v.position, v.amino_acid, v.intronic_position, v.utr),
79 | (8, None, None, True),
80 | )
81 | self.assertTrue(v.is_utr())
82 | self.assertFalse(v.is_intronic())
83 | self.assertFalse(v.is_protein())
84 | self.assertTrue(v.is_extended())
85 |
86 | v = VariantPosition("-80")
87 | self.assertTupleEqual(
88 | (v.position, v.amino_acid, v.intronic_position, v.utr),
89 | (-80, None, None, True),
90 | )
91 | self.assertTrue(v.is_utr())
92 | self.assertFalse(v.is_intronic())
93 | self.assertFalse(v.is_protein())
94 | self.assertTrue(v.is_extended())
95 |
96 | def test_intron(self) -> None:
97 | v = VariantPosition("122-6")
98 | self.assertTupleEqual(
99 | (v.position, v.amino_acid, v.intronic_position, v.utr),
100 | (122, None, -6, None),
101 | )
102 | self.assertFalse(v.is_utr())
103 | self.assertTrue(v.is_intronic())
104 | self.assertFalse(v.is_protein())
105 | self.assertTrue(v.is_extended())
106 |
107 | v = VariantPosition("78+10")
108 | self.assertTupleEqual(
109 | (v.position, v.amino_acid, v.intronic_position, v.utr), (78, None, 10, None)
110 | )
111 | self.assertFalse(v.is_utr())
112 | self.assertTrue(v.is_intronic())
113 | self.assertFalse(v.is_protein())
114 | self.assertTrue(v.is_extended())
115 |
116 | def test_utr_intron(self) -> None:
117 | v = VariantPosition("*89+67")
118 | self.assertTupleEqual(
119 | (v.position, v.amino_acid, v.intronic_position, v.utr), (89, None, 67, True)
120 | )
121 | self.assertTrue(v.is_utr())
122 | self.assertTrue(v.is_intronic())
123 | self.assertFalse(v.is_protein())
124 | self.assertTrue(v.is_extended())
125 |
126 | v = VariantPosition("-127+6")
127 | self.assertTupleEqual(
128 | (v.position, v.amino_acid, v.intronic_position, v.utr),
129 | (-127, None, 6, True),
130 | )
131 | self.assertTrue(v.is_utr())
132 | self.assertTrue(v.is_intronic())
133 | self.assertFalse(v.is_protein())
134 | self.assertTrue(v.is_extended())
135 |
136 | v = VariantPosition("*73-105")
137 | self.assertTupleEqual(
138 | (v.position, v.amino_acid, v.intronic_position, v.utr),
139 | (73, None, -105, True),
140 | )
141 | self.assertTrue(v.is_utr())
142 | self.assertTrue(v.is_intronic())
143 | self.assertFalse(v.is_protein())
144 | self.assertTrue(v.is_extended())
145 |
146 | v = VariantPosition("-45-1")
147 | self.assertTupleEqual(
148 | (v.position, v.amino_acid, v.intronic_position, v.utr),
149 | (-45, None, -1, True),
150 | )
151 | self.assertTrue(v.is_utr())
152 | self.assertTrue(v.is_intronic())
153 | self.assertFalse(v.is_protein())
154 | self.assertTrue(v.is_extended())
155 |
156 |
157 | class TestObjectRepresentation(unittest.TestCase):
158 | def test_repr(self) -> None:
159 | position_strings = (
160 | "8",
161 | "92380",
162 | "*8",
163 | "-80",
164 | "122-6",
165 | "78+10",
166 | "*89+67",
167 | "-127+6",
168 | "*73-105",
169 | "-45-1",
170 | "Cys234",
171 | "Ala9",
172 | )
173 | for s in position_strings:
174 | with self.subTest(s=s):
175 | v = VariantPosition(s)
176 | self.assertEqual(s, repr(v))
177 |
178 |
179 | # TODO: add amino acid variants
180 | class TestComparisons(unittest.TestCase):
181 | def setUp(self) -> None:
182 | sorted_position_strings = (
183 | "-45-1",
184 | "-12",
185 | "8",
186 | "99",
187 | "99+88",
188 | "99+122",
189 | "100-12",
190 | "100",
191 | "101",
192 | "202-12",
193 | "202-1",
194 | "202",
195 | "*1",
196 | "*73-105",
197 | )
198 |
199 | self.sorted_variants = [VariantPosition(p) for p in sorted_position_strings]
200 |
201 | # pairwise itertools recipe
202 | a, b = itertools.tee(self.sorted_variants)
203 | next(b, None)
204 | self.sorted_variant_pairs = zip(a, b)
205 |
206 | def test_eq(self) -> None:
207 | for v in self.sorted_variants:
208 | with self.subTest(v=v):
209 | self.assertEqual(v, v)
210 |
211 | def test_ne(self) -> None:
212 | for v1, v2 in self.sorted_variant_pairs:
213 | with self.subTest(v1=v1, v2=v2):
214 | self.assertNotEqual(v1, v2)
215 |
216 | def test_lt(self) -> None:
217 | for v1, v2 in self.sorted_variant_pairs:
218 | with self.subTest(v1=v1, v2=v2):
219 | self.assertLess(v1, v2)
220 |
221 | def test_sorting(self) -> None:
222 | for _ in range(10):
223 | with self.subTest():
224 | shuffled_variants = self.sorted_variants.copy()
225 | while shuffled_variants == self.sorted_variants:
226 | random.shuffle(shuffled_variants)
227 | self.assertListEqual(self.sorted_variants, sorted(shuffled_variants))
228 |
229 |
230 | # TODO: add amino acid variants
231 | class TestAdjacency(unittest.TestCase):
232 | def test_adjacent_pairs(self) -> None:
233 | adjacent_pairs = (
234 | ("-45-2", "-45-1"),
235 | ("-45-1", "-45"),
236 | ("-12", "-13"),
237 | ("-1", "1"),
238 | ("8", "9"),
239 | ("202-1", "202"),
240 | ("99", "99+1"),
241 | ("99+88", "99+89"),
242 | ("100-12", "100-11"),
243 | ("100", "101"),
244 | ("*1", "*2"),
245 | ("*73-1", "*73"),
246 | )
247 | for s1, s2 in adjacent_pairs:
248 | v1 = VariantPosition(s1)
249 | v2 = VariantPosition(s2)
250 | with self.subTest(v1=v1, v2=v2):
251 | self.assertTrue(v1.is_adjacent(v2))
252 | with self.subTest(v1=v1, v2=v2):
253 | self.assertTrue(v2.is_adjacent(v1))
254 |
255 | def test_not_adjacent_to_self(self) -> None:
256 | position_strings = (
257 | "-45-1",
258 | "-12",
259 | "8",
260 | "99",
261 | "99+88",
262 | "99+122",
263 | "100-12",
264 | "100",
265 | "103",
266 | "202-12",
267 | "202-1",
268 | "205",
269 | "*1",
270 | "*12",
271 | "*73-105",
272 | )
273 | variants = [VariantPosition(s) for s in position_strings]
274 | for v in variants:
275 | with self.subTest(v=v):
276 | self.assertFalse(v.is_adjacent(v))
277 |
278 | def test_non_adjacent_pairs(self) -> None:
279 | position_strings = (
280 | "-45-1",
281 | "-12",
282 | "8",
283 | "99",
284 | "99+88",
285 | "99+122",
286 | "100-12",
287 | "103",
288 | "202-12",
289 | "202-1",
290 | "205",
291 | "*1",
292 | "*12",
293 | "*73-105",
294 | )
295 | variants = [VariantPosition(s) for s in position_strings]
296 |
297 | for v1, v2 in itertools.permutations(variants, 2):
298 | with self.subTest(v1=v1, v2=v2):
299 | self.assertFalse(v1.is_adjacent(v2))
300 |
301 |
302 | if __name__ == "__main__":
303 | unittest.main()
304 |
--------------------------------------------------------------------------------
/tests/test_util.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from mavehgvs.util import parse_variant_strings
4 | from mavehgvs.variant import Variant
5 |
6 |
7 | class TestParseVariantStrings(unittest.TestCase):
8 | def test_sets_error_strings_for_invalid_items(self) -> None:
9 | invalid_variant_strings = [
10 | "g.Glu27Trp",
11 | "p.27Glu>Trp",
12 | "p.122-6T>A",
13 | "G>A",
14 | "22G>A",
15 | "G.44del",
16 | "a.78+5_78+10del",
17 | "77dup",
18 | "n.Pro12_Gly18dup",
19 | "g.22_23insauc",
20 | "g.25_24del",
21 | "g.25_24ins",
22 | "r.43-6_595+12delinsctt",
23 | "x.=",
24 | "c.(=)",
25 | ]
26 |
27 | for s in invalid_variant_strings:
28 | with self.subTest(s=s):
29 | valid, invalid = parse_variant_strings([s])
30 | self.assertIsNone(valid[0])
31 | self.assertIsInstance(invalid[0], str)
32 |
33 | def test_sets_variant_for_valid_items(self) -> None:
34 | valid_variant_strings = [
35 | "p.Glu27Trp",
36 | "c.122-6T>A",
37 | "g.44del",
38 | "c.78+5_78+10del",
39 | "c.77dup",
40 | "p.Pro12_Gly18dup",
41 | "p.Ala12_Pro13insGlyProCys",
42 | "r.22_23insauc",
43 | "c.43-6_595+12delinsCTT",
44 | "p.Ile71_Cys80delinsSer",
45 | "p.=",
46 | "c.=",
47 | "p.(=)",
48 | ]
49 |
50 | for s in valid_variant_strings:
51 | with self.subTest(s=s):
52 | valid, invalid = parse_variant_strings([s])
53 | self.assertIsInstance(valid[0], Variant)
54 | self.assertIsNone(invalid[0])
55 |
56 | def test_validates_with_targetseq(self) -> None:
57 | targetseq = "ACGT"
58 | valid_variant_strings = ["c.1A>T", "c.3G>C", "c.[1A>T;3G>C]"]
59 | invalid_variant_strings = ["c.1C>T", "c.3T>C", "c.[1A>T;3T>C]", "c.5A>G"]
60 |
61 | for s in valid_variant_strings:
62 | with self.subTest(s=s, targetseq=targetseq):
63 | valid, invalid = parse_variant_strings([s], targetseq=targetseq)
64 | self.assertIsInstance(valid[0], Variant)
65 | self.assertIsNone(invalid[0])
66 |
67 | for s in invalid_variant_strings:
68 | with self.subTest(s=s, targetseq=targetseq):
69 | valid, invalid = parse_variant_strings([s], targetseq=targetseq)
70 | self.assertIsNone(valid[0])
71 | self.assertIsInstance(invalid[0], str)
72 |
73 | def test_validates_expected_prefix(self) -> None:
74 | valid_variant_strings = ["p.Glu27Trp", "c.122-6T>A", "r.22_23insauc"]
75 |
76 | for s in valid_variant_strings:
77 | p = s[0]
78 | with self.subTest(s=s, p=p):
79 | valid, invalid = parse_variant_strings([s], expected_prefix=p)
80 | self.assertIsInstance(valid[0], Variant)
81 | self.assertIsNone(invalid[0])
82 |
83 | for s in valid_variant_strings:
84 | p = "g"
85 | with self.subTest(s=s, p=p):
86 | valid, invalid = parse_variant_strings([s], expected_prefix=p)
87 | self.assertIsNone(valid[0])
88 | self.assertIsInstance(invalid[0], str)
89 |
90 | def test_valid_expected_prefixes_only(self) -> None:
91 | valid_prefixes = list("cgmnopr")
92 | invalid_prefixes = list("CGMNOPRx.4ab?")
93 | variant = "p.Glu27Trp"
94 |
95 | for p in valid_prefixes:
96 | with self.subTest(p=p):
97 | parse_variant_strings([variant], expected_prefix=p)
98 |
99 | for p in invalid_prefixes:
100 | with self.subTest(p=p):
101 | with self.assertRaises(ValueError):
102 | parse_variant_strings([variant], expected_prefix=p)
103 |
--------------------------------------------------------------------------------
/tests/test_variant.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from mavehgvs.exceptions import MaveHgvsParseError
4 | from mavehgvs.variant import Variant
5 | from mavehgvs.position import VariantPosition
6 |
7 |
8 | class TestCreateSingleVariantFromString(unittest.TestCase):
9 | def test_invalid_raises_error(self) -> None:
10 | valid_variant_strings = [
11 | "p.Glu27Trp",
12 | "c.122-6T>A",
13 | "g.44del",
14 | "c.78+5_78+10del",
15 | "c.77dup",
16 | "p.Pro12_Gly18dup",
17 | "p.Ala12_Pro13insGlyProCys",
18 | "r.22_23insauc",
19 | "c.43-6_595+12delinsCTT",
20 | "p.Ile71_Cys80delinsSer",
21 | "p.=",
22 | "c.=",
23 | "p.(=)",
24 | "c.1_3=",
25 | "c.12=",
26 | "g.88_99=",
27 | "c.43-6_595+12=",
28 | "p.Glu27fs",
29 | "NM_001301.4:c.122-6T>A",
30 | ]
31 |
32 | invalid_variant_strings = [
33 | "g.Glu27Trp",
34 | "p.27Glu>Trp",
35 | "p.122-6T>A",
36 | "G>A",
37 | "22G>A",
38 | "G.44del",
39 | "a.78+5_78+10del",
40 | "77dup",
41 | "n.Pro12_Gly18dup",
42 | "p.Pro12_Gly18insGlyProAla",
43 | "g.22_23insauc",
44 | "g.25_24del",
45 | "g.25_24ins",
46 | "r.22_24insauc",
47 | "r.43-6_595+12delinsctt",
48 | "x.=",
49 | "c.(=)",
50 | "p.(Gly24=)",
51 | "p.Gly24(=)",
52 | "p.Arg12LysfsTer18",
53 | "p.Glu27fs*?",
54 | "NM_001301.4::c.122-6T>A",
55 | ]
56 |
57 | for s in valid_variant_strings:
58 | with self.subTest(s=s):
59 | Variant(s) # should pass
60 |
61 | for s in invalid_variant_strings:
62 | with self.subTest(s=s):
63 | with self.assertRaises(MaveHgvsParseError):
64 | Variant(s)
65 |
66 | def test_sub(self) -> None:
67 | variant_strings = [
68 | "p.Glu27Trp",
69 | "p.Ter345Lys",
70 | "p.Cys22=",
71 | "g.48C>A",
72 | "c.122-6T>A",
73 | "c.*33G>C",
74 | "r.22g>u",
75 | "r.33+12a>c",
76 | "p.=",
77 | "p.(=)",
78 | "n.=",
79 | "c.1_3=",
80 | "c.12=",
81 | "g.88_99=",
82 | "c.43-6_595+12=",
83 | ]
84 |
85 | for s in variant_strings:
86 | with self.subTest(s=s):
87 | v = Variant(s)
88 | self.assertEqual(s, str(v))
89 |
90 | def test_fs(self) -> None:
91 | variant_strings = ["p.Glu27fs"]
92 |
93 | for s in variant_strings:
94 | with self.subTest(s=s):
95 | v = Variant(s)
96 | self.assertEqual(s, str(v))
97 |
98 | def test_del(self) -> None:
99 | variant_strings = [
100 | "g.44del",
101 | "c.78+5_78+10del",
102 | "c.1_95del",
103 | "p.Gly18del",
104 | "p.Gln7_Asn19del",
105 | "r.34_36del",
106 | ]
107 |
108 | for s in variant_strings:
109 | with self.subTest(s=s):
110 | v = Variant(s)
111 | self.assertEqual(s, str(v))
112 |
113 | def test_dup(self) -> None:
114 | variant_strings = [
115 | "g.22_24dup",
116 | "c.77dup",
117 | "c.101+1_101+7dup",
118 | "p.Pro12_Gly18dup",
119 | "p.Cys5dup",
120 | "r.12dup",
121 | ]
122 |
123 | for s in variant_strings:
124 | with self.subTest(s=s):
125 | v = Variant(s)
126 | self.assertEqual(s, str(v))
127 |
128 | def test_ins(self) -> None:
129 | variant_strings = [
130 | "g.234_235insT",
131 | "c.84_85insCTG",
132 | "c.99+6_99+7insA",
133 | "p.His7_Gln8insSer",
134 | "p.Ala12_Pro13insGlyProCys",
135 | "r.22_23insauc",
136 | ]
137 |
138 | for s in variant_strings:
139 | with self.subTest(s=s):
140 | v = Variant(s)
141 | self.assertEqual(s, str(v))
142 |
143 | def test_delins(self) -> None:
144 | variant_strings = [
145 | "g.22delinsAACG",
146 | "c.83_85delinsT",
147 | "c.43-6_595+12delinsCTT",
148 | "p.Ile71_Cys80delinsSer",
149 | "p.His44delinsValProGlyGlu",
150 | "r.92delinsgac",
151 | ]
152 |
153 | for s in variant_strings:
154 | with self.subTest(s=s):
155 | v = Variant(s)
156 | self.assertEqual(s, str(v))
157 |
158 | def test_target_identical(self) -> None:
159 | identical_variant_strings = [
160 | *[f"{prefix}.=" for prefix in tuple("gmocnr")],
161 | "p.(=)",
162 | "c.1_3=",
163 | ]
164 |
165 | non_identical_variant_strings = [
166 | "p.Ter345Lys",
167 | "p.Cys22=",
168 | "g.48C>A",
169 | "c.122-6T>A",
170 | "g.22delinsAACG",
171 | "c.83_85delinsT",
172 | ]
173 |
174 | for s in identical_variant_strings:
175 | with self.subTest(s=s):
176 | v = Variant(s)
177 | self.assertTrue(v.is_target_identical())
178 |
179 | for s in non_identical_variant_strings:
180 | with self.subTest(s=s):
181 | v = Variant(s)
182 | self.assertFalse(v.is_target_identical())
183 |
184 | def test_synonymous(self) -> None:
185 | synonymous_variant_strings = ["p.Gly24=", "p.=", "p.(=)"]
186 |
187 | nonsynonymous_variant_strings = ["p.Ter345Lys", "c.=", "g.48C>A"]
188 |
189 | for s in synonymous_variant_strings:
190 | with self.subTest(s=s):
191 | v = Variant(s)
192 | self.assertTrue(v.is_synonymous())
193 |
194 | for s in nonsynonymous_variant_strings:
195 | with self.subTest(s=s):
196 | v = Variant(s)
197 | self.assertFalse(v.is_synonymous())
198 |
199 | def test_relaxed_ordering(self):
200 | variant_tuples = [
201 | ("c.78+10_78+5del", "c.78+5_78+10del"),
202 | ("c.80_77dup", "c.77_80dup"),
203 | ("p.Gly18_Pro12dup", "p.Pro12_Gly18dup"),
204 | ("p.Pro13_Ala12insGlyProCys", "p.Ala12_Pro13insGlyProCys"),
205 | ("r.23_22insauc", "r.22_23insauc"),
206 | ("c.595+12_43-6delinsCTT", "c.43-6_595+12delinsCTT"),
207 | ("p.Cys80_Ile71delinsSer", "p.Ile71_Cys80delinsSer"),
208 | ("c.3_1=", "c.1_3="),
209 | ("g.99_88=", "g.88_99="),
210 | ("c.595+12_43-6=", "c.43-6_595+12="),
211 | ]
212 |
213 | for v, s in variant_tuples:
214 | with self.subTest(v=v, s=s):
215 | self.assertEqual(str(Variant(v, relaxed_ordering=True)), s)
216 |
217 |
218 | class TestCreateMultiVariantFromString(unittest.TestCase):
219 | def test_creation(self):
220 | variant_strings = [
221 | "p.[Glu27Trp;Ter345Lys]",
222 | "p.[Glu27Trp;Lys212fs]",
223 | "p.[Gly18del;Glu27Trp;Ter345Lys]",
224 | "p.[Gln7_Asn19del;Glu27Trp;Ter345Lys]",
225 | "c.[1_35del;78+5_78+10del;122T>A]",
226 | "NM_002002.3:c.[1_35del;78+5_78+10del;122T>A]",
227 | ]
228 |
229 | invalid_variant_strings = [
230 | "p.[Glu27Trp;=;Ter345Lys]",
231 | "p.[(=);Gly18del;Glu27Trp;Ter345Lys]",
232 | "c.[12T>A;=;78+5_78+10del]",
233 | "c.[1_3=;12T>A;78+5_78+10del]",
234 | "p.[Glu27fs;Arg48Lys]",
235 | "p.[Glu27fs;Arg48fs]",
236 | "NM_002002.3::c.[1_35del;78+5_78+10del;122T>A]",
237 | "NM_002002.3:c.1_35del;78+5_78+10del;122T>A",
238 | ]
239 |
240 | for s in variant_strings:
241 | with self.subTest(s=s):
242 | v = Variant(s)
243 | self.assertEqual(s, str(v))
244 |
245 | for s in invalid_variant_strings:
246 | with self.subTest(s=s):
247 | with self.assertRaises(MaveHgvsParseError):
248 | Variant(s)
249 |
250 | def test_ordering(self):
251 | variant_string_tuples = [
252 | ("p.[Gly345Lys;Glu27Trp]", "p.[Glu27Trp;Gly345Lys]"),
253 | ("p.[Glu27Trp;Gly18del;Ter345Lys]", "p.[Gly18del;Glu27Trp;Ter345Lys]"),
254 | ("c.[122T>A;1_35del;78+5_78+10del]", "c.[1_35del;78+5_78+10del;122T>A]"),
255 | ]
256 |
257 | for s, _ in variant_string_tuples:
258 | with self.subTest(s=s):
259 | with self.assertRaises(MaveHgvsParseError):
260 | Variant(s, relaxed_ordering=False)
261 |
262 | for s, s_ordered in variant_string_tuples:
263 | with self.subTest(s=s):
264 | # Should pass creation
265 | Variant(s, relaxed_ordering=True)
266 |
267 | for s, s_ordered in variant_string_tuples:
268 | with self.subTest(s=s):
269 | v = Variant(s, relaxed_ordering=True)
270 | self.assertEqual(s_ordered, str(v))
271 |
272 | def test_overlaps(self):
273 | invalid_variant_strings = [
274 | "p.[Glu27Trp;Glu27Trp]",
275 | "p.[Glu27Trp;Glu27Tyr]",
276 | "p.[Pro27Trp;Glu27Tyr]",
277 | "p.[Gly18del;Gly18Tyr]",
278 | "p.[Gln7_Asn19del;Glu13Trp]",
279 | "p.[Glu13Trp;Gln7_Asn19del]",
280 | "p.[Gln7_Asn19del;Glu13Trp;Ter345Lys]",
281 | "c.[1_95del;78+5_78+10del;122T>A]",
282 | "c.[1_95del;22T>A]",
283 | "n.[22G>A;22G>T]",
284 | ]
285 |
286 | for s in invalid_variant_strings:
287 | with self.subTest(s=s):
288 | with self.assertRaises(MaveHgvsParseError):
289 | Variant(s)
290 |
291 |
292 | class TestCreateSingleVariantFromValues(unittest.TestCase):
293 | def test_equal(self):
294 | valid_dict_tuples = [
295 | (
296 | {
297 | "variant_type": "equal",
298 | "prefix": "p",
299 | "position": "27",
300 | "target": "Glu",
301 | },
302 | "p.Glu27=",
303 | ),
304 | (
305 | {
306 | "variant_type": "equal",
307 | "prefix": "c",
308 | "start_position": "12",
309 | "end_position": "12",
310 | },
311 | "c.12=",
312 | ),
313 | (
314 | {
315 | "variant_type": "equal",
316 | "prefix": "c",
317 | "start_position": "1",
318 | "end_position": "3",
319 | },
320 | "c.1_3=",
321 | ),
322 | ]
323 |
324 | for d, s in valid_dict_tuples:
325 | with self.subTest(d=d, s=s):
326 | self.assertEqual(Variant(s), Variant(d))
327 |
328 | def test_sub(self):
329 | valid_dict_tuples = [
330 | (
331 | {
332 | "variant_type": "sub",
333 | "prefix": "p",
334 | "position": 27,
335 | "target": "Glu",
336 | "variant": "Trp",
337 | },
338 | "p.Glu27Trp",
339 | ),
340 | (
341 | {
342 | "variant_type": "sub",
343 | "prefix": "c",
344 | "position": "122-6",
345 | "target": "T",
346 | "variant": "A",
347 | },
348 | "c.122-6T>A",
349 | ),
350 | ]
351 |
352 | for d, s in valid_dict_tuples:
353 | with self.subTest(d=d, s=s):
354 | self.assertEqual(Variant(s), Variant(d))
355 |
356 | def test_fs(self):
357 | valid_dict_tuples = [
358 | (
359 | {
360 | "variant_type": "fs",
361 | "prefix": "p",
362 | "position": 27,
363 | "target": "Glu",
364 | },
365 | "p.Glu27fs",
366 | ),
367 | ]
368 |
369 | for d, s in valid_dict_tuples:
370 | with self.subTest(d=d, s=s):
371 | self.assertEqual(Variant(s), Variant(d))
372 |
373 | def test_ins(self):
374 | valid_dict_tuples = [
375 | (
376 | {
377 | "variant_type": "ins",
378 | "prefix": "p",
379 | "start_position": 12,
380 | "start_target": "Ala",
381 | "end_position": 13,
382 | "end_target": "Pro",
383 | "variant": "GlyProCys",
384 | },
385 | "p.Ala12_Pro13insGlyProCys",
386 | ),
387 | (
388 | {
389 | "variant_type": "ins",
390 | "prefix": "r",
391 | "start_position": 22,
392 | "end_position": 23,
393 | "variant": "auc",
394 | },
395 | "r.22_23insauc",
396 | ),
397 | ]
398 |
399 | for d, s in valid_dict_tuples:
400 | with self.subTest(d=d, s=s):
401 | self.assertEqual(Variant(s), Variant(d))
402 |
403 | def test_del(self):
404 | valid_dict_tuples = [
405 | (
406 | {
407 | "variant_type": "del",
408 | "prefix": "g",
409 | "start_position": 44,
410 | "end_position": 44,
411 | },
412 | "g.44del",
413 | ),
414 | (
415 | {
416 | "variant_type": "del",
417 | "prefix": "c",
418 | "start_position": "78+5",
419 | "end_position": "78+10",
420 | },
421 | "c.78+5_78+10del",
422 | ),
423 | (
424 | {
425 | "variant_type": "del",
426 | "prefix": "p",
427 | "start_position": 33,
428 | "start_target": "Arg",
429 | "end_position": 33,
430 | "end_target": "Arg",
431 | },
432 | "p.Arg33del",
433 | ),
434 | ]
435 |
436 | for d, s in valid_dict_tuples:
437 | with self.subTest(d=d, s=s):
438 | self.assertEqual(Variant(s), Variant(d))
439 |
440 | def test_dup(self):
441 | valid_dict_tuples = [
442 | (
443 | {
444 | "variant_type": "dup",
445 | "prefix": "c",
446 | "start_position": 77,
447 | "end_position": 77,
448 | },
449 | "c.77dup",
450 | ),
451 | (
452 | {
453 | "variant_type": "dup",
454 | "prefix": "p",
455 | "start_position": 12,
456 | "start_target": "Pro",
457 | "end_position": 18,
458 | "end_target": "Gly",
459 | },
460 | "p.Pro12_Gly18dup",
461 | ),
462 | ]
463 |
464 | for d, s in valid_dict_tuples:
465 | with self.subTest(d=d, s=s):
466 | self.assertEqual(Variant(s), Variant(d))
467 |
468 | def test_delins(self):
469 | valid_dict_tuples = [
470 | (
471 | {
472 | "variant_type": "delins",
473 | "prefix": "c",
474 | "start_position": "43-6",
475 | "end_position": "595+12",
476 | "variant": "CTT",
477 | },
478 | "c.43-6_595+12delinsCTT",
479 | ),
480 | (
481 | {
482 | "variant_type": "delins",
483 | "prefix": "c",
484 | "start_position": "45",
485 | "end_position": "45",
486 | "variant": "AGA",
487 | },
488 | "c.45delinsAGA",
489 | ),
490 | (
491 | {
492 | "variant_type": "delins",
493 | "prefix": "p",
494 | "start_position": 71,
495 | "start_target": "Ile",
496 | "end_position": 80,
497 | "end_target": "Cys",
498 | "variant": "Ser",
499 | },
500 | "p.Ile71_Cys80delinsSer",
501 | ),
502 | (
503 | {
504 | "variant_type": "delins",
505 | "prefix": "p",
506 | "start_position": 50,
507 | "start_target": "Arg",
508 | "end_position": 50,
509 | "end_target": "Arg",
510 | "variant": "AlaGly",
511 | },
512 | "p.Arg50delinsAlaGly",
513 | ),
514 | ]
515 |
516 | invalid_dicts = [
517 | {
518 | "variant_type": "delins",
519 | "prefix": "p",
520 | "start_position": 50,
521 | "start_target": "Arg",
522 | "end_position": 50,
523 | "end_target": "Cys",
524 | "variant": "AlaGly",
525 | },
526 | ]
527 |
528 | for d, s in valid_dict_tuples:
529 | with self.subTest(d=d, s=s):
530 | self.assertEqual(Variant(s), Variant(d))
531 |
532 | for d in invalid_dicts:
533 | with self.subTest(d=d):
534 | with self.assertRaises(MaveHgvsParseError):
535 | Variant(d)
536 |
537 | def test_extra_keys(self):
538 | invalid_dicts = [
539 | {
540 | "variant_type": "sub",
541 | "prefix": "p",
542 | "position": 27,
543 | "target": "Glu",
544 | "variant": "Trp",
545 | "bonus": "data",
546 | },
547 | {
548 | "variant_type": "sub",
549 | "prefix": "c",
550 | "position": "122-6",
551 | "start_target": "T",
552 | "target": "T",
553 | "variant": "A",
554 | },
555 | {
556 | "variant_type": "delins",
557 | "prefix": "p",
558 | "start_target": "Ile",
559 | "end_position": 80,
560 | "end_target": "Cys",
561 | "variant": "Ser",
562 | "position": "Ala",
563 | },
564 | ]
565 |
566 | for d in invalid_dicts:
567 | with self.subTest(d=d):
568 | with self.assertRaises(MaveHgvsParseError):
569 | Variant(d)
570 |
571 | def test_missing_keys(self):
572 | invalid_dicts = [
573 | {"prefix": "p", "position": 27, "target": "Glu", "variant": "Trp"},
574 | {"variant_type": "sub", "position": "122-6", "target": "T", "variant": "A"},
575 | {
576 | "variant_type": "delins",
577 | "prefix": "p",
578 | "start_target": "Ile",
579 | "end_position": 80,
580 | "end_target": "Cys",
581 | "variant": "Ser",
582 | },
583 | ]
584 |
585 | for d in invalid_dicts:
586 | with self.subTest(d=d):
587 | with self.assertRaises(MaveHgvsParseError):
588 | Variant(d)
589 |
590 | def test_invalid_keys(self):
591 | invalid_dicts = [
592 | {
593 | "variant_type": "equal",
594 | "prefix": "p",
595 | "start_position": "27",
596 | "end_position": "27",
597 | "target": "Glu",
598 | },
599 | {"variant_type": "dup", "prefix": "c", "position": 77},
600 | {
601 | "variant_type": "test",
602 | "prefix": "c",
603 | "start_position": 77,
604 | "end_position": 77,
605 | },
606 | {
607 | "variant_type": "fs",
608 | "prefix": "c",
609 | "position": "12",
610 | "target": "T",
611 | },
612 | ]
613 |
614 | for d in invalid_dicts:
615 | with self.subTest(d=d):
616 | with self.assertRaises(MaveHgvsParseError):
617 | Variant(d)
618 |
619 | def test_invalid_type(self):
620 | invalid_values = [1234, None, 5.55, ("p", "Ile", 80, "Cys")]
621 |
622 | for v in invalid_values:
623 | with self.subTest(v=v):
624 | with self.assertRaises(ValueError):
625 | Variant(v)
626 |
627 |
628 | class TestCreateMultiVariantFromValues(unittest.TestCase):
629 | def test_create_multivariant(self):
630 | valid_dict_tuples = [
631 | (
632 | [
633 | {
634 | "variant_type": "sub",
635 | "prefix": "p",
636 | "position": 27,
637 | "target": "Glu",
638 | "variant": "Trp",
639 | },
640 | {
641 | "variant_type": "delins",
642 | "prefix": "p",
643 | "start_position": 71,
644 | "start_target": "Ile",
645 | "end_position": 80,
646 | "end_target": "Cys",
647 | "variant": "Ser",
648 | },
649 | ],
650 | "p.[Glu27Trp;Ile71_Cys80delinsSer]",
651 | ),
652 | (
653 | [
654 | {
655 | "variant_type": "dup",
656 | "prefix": "c",
657 | "start_position": 77,
658 | "end_position": 77,
659 | },
660 | {
661 | "variant_type": "sub",
662 | "prefix": "c",
663 | "position": "122-6",
664 | "target": "T",
665 | "variant": "A",
666 | },
667 | ],
668 | "c.[77dup;122-6T>A]",
669 | ),
670 | ]
671 |
672 | invalid_dicts = [
673 | [
674 | {
675 | "variant_type": "sub",
676 | "position": 27,
677 | "target": "Glu",
678 | "variant": "Trp",
679 | },
680 | {
681 | "variant_type": "delins",
682 | "prefix": "p",
683 | "start_position": 71,
684 | "start_target": "Ile",
685 | "end_position": 80,
686 | "end_target": "Cys",
687 | "variant": "Ser",
688 | },
689 | ],
690 | [
691 | {
692 | "variant_type": "sub",
693 | "prefix": "p",
694 | "position": 27,
695 | "target": "Glu",
696 | "variant": "Trp",
697 | },
698 | {
699 | "variant_type": "sub",
700 | "prefix": "c",
701 | "position": "122-6",
702 | "target": "T",
703 | "variant": "A",
704 | },
705 | ],
706 | ]
707 |
708 | for d, s in valid_dict_tuples:
709 | with self.subTest(d=d, s=s):
710 | self.assertEqual(Variant(s), Variant(d))
711 |
712 | for d in invalid_dicts:
713 | with self.subTest(d=d):
714 | with self.assertRaises(MaveHgvsParseError):
715 | Variant(d)
716 |
717 |
718 | class TestTargetSequenceValidation(unittest.TestCase):
719 | def test_valid_dna_equal(self):
720 | variant_tuples = [("ACGT", "c.1_2="), ("ACGT", "c.4="), ("ACGT", "c.=")]
721 |
722 | for target, s in variant_tuples:
723 | with self.subTest(target=target, s=s):
724 | v = Variant(s, targetseq=target)
725 | self.assertEqual(s, str(v))
726 |
727 | def test_invalid_dna_equal(self):
728 | variant_tuples = [("ACGT", "c.4_5="), ("ACGT", "c.10=")]
729 |
730 | for target, s in variant_tuples:
731 | with self.subTest(target=target, s=s):
732 | with self.assertRaises(MaveHgvsParseError):
733 | Variant(s, targetseq=target)
734 |
735 | def test_matching_dna_substitution(self):
736 | variant_tuples = [
737 | ("ACGT", "c.1A>T"),
738 | ("ACGT", "c.3G>C"),
739 | ("ACGT", "c.[1A>T;3G>C]"),
740 | ]
741 |
742 | for target, s in variant_tuples:
743 | with self.subTest(target=target, s=s):
744 | v = Variant(s, targetseq=target)
745 | self.assertEqual(s, str(v))
746 |
747 | def test_nonmatching_dna_substitution(self):
748 | variant_tuples = [
749 | ("ACGT", "c.1C>T"),
750 | ("ACGT", "c.3T>C"),
751 | ("ACGT", "c.[1A>T;3T>C]"),
752 | ("ACGT", "c.5A>G"),
753 | ]
754 |
755 | for target, s in variant_tuples:
756 | with self.subTest(target=target, s=s):
757 | with self.assertRaises(MaveHgvsParseError):
758 | Variant(s, targetseq=target)
759 |
760 | def test_valid_dna_del(self):
761 | variant_tuples = [("ACGT", "c.1_3del"), ("ACGT", "c.4del")]
762 |
763 | for target, s in variant_tuples:
764 | with self.subTest(target=target, s=s):
765 | v = Variant(s, targetseq=target)
766 | self.assertEqual(s, str(v))
767 |
768 | def test_invalid_dna_del(self):
769 | variant_tuples = [
770 | ("ACGT", "c.1_5del"),
771 | ("ACGT", "c.6_8del"),
772 | ("ACGT", "c.7del"),
773 | ]
774 |
775 | for target, s in variant_tuples:
776 | with self.subTest(target=target, s=s):
777 | with self.assertRaises(MaveHgvsParseError):
778 | Variant(s, targetseq=target)
779 |
780 | def test_valid_dna_dup(self):
781 | variant_tuples = [("ACGT", "c.1_3dup"), ("ACGT", "c.4dup")]
782 |
783 | for target, s in variant_tuples:
784 | with self.subTest(target=target, s=s):
785 | v = Variant(s, targetseq=target)
786 | self.assertEqual(s, str(v))
787 |
788 | def test_invalid_dna_dup(self):
789 | variant_tuples = [
790 | ("ACGT", "c.1_5dup"),
791 | ("ACGT", "c.6_8dup"),
792 | ("ACGT", "c.7dup"),
793 | ]
794 |
795 | for target, s in variant_tuples:
796 | with self.subTest(target=target, s=s):
797 | with self.assertRaises(MaveHgvsParseError):
798 | Variant(s, targetseq=target)
799 |
800 | def test_valid_dna_ins(self):
801 | variant_tuples = [("ACGT", "c.1_2insAAA"), ("ACGT", "c.3_4insT")]
802 |
803 | for target, s in variant_tuples:
804 | with self.subTest(target=target, s=s):
805 | v = Variant(s, targetseq=target)
806 | self.assertEqual(s, str(v))
807 |
808 | def test_invalid_dna_ins(self):
809 | variant_tuples = [("ACGT", "c.4_5insA"), ("ACGT", "c.10_11insTCG")]
810 |
811 | for target, s in variant_tuples:
812 | with self.subTest(target=target, s=s):
813 | with self.assertRaises(MaveHgvsParseError):
814 | Variant(s, targetseq=target)
815 |
816 | def test_valid_dna_delins(self):
817 | variant_tuples = [("ACGT", "c.1_2delinsA"), ("ACGT", "c.4delinsTAAGC")]
818 |
819 | for target, s in variant_tuples:
820 | with self.subTest(target=target, s=s):
821 | v = Variant(s, targetseq=target)
822 | self.assertEqual(s, str(v))
823 |
824 | def test_invalid_dna_delins(self):
825 |         variant_tuples = [("ACGT", "c.4_5delinsA"), ("ACGT", "c.10_11delinsTCG")]
826 |
827 | for target, s in variant_tuples:
828 | with self.subTest(target=target, s=s):
829 | with self.assertRaises(MaveHgvsParseError):
830 | Variant(s, targetseq=target)
831 |
832 | def test_valid_protein_equal(self):
833 | variant_tuples = [("RCQY", "p.Arg1="), ("RCQY", "p.Tyr4="), ("RCQY", "p.=")]
834 |
835 | for target, s in variant_tuples:
836 | with self.subTest(target=target, s=s):
837 | v = Variant(s, targetseq=target)
838 | self.assertEqual(s, str(v))
839 |
840 | def test_invalid_protein_equal(self):
841 | variant_tuples = [("RCQY", "p.Trp5=")]
842 |
843 | for target, s in variant_tuples:
844 | with self.subTest(target=target, s=s):
845 | with self.assertRaises(MaveHgvsParseError):
846 | Variant(s, targetseq=target)
847 |
848 | def test_matching_protein_substitution(self):
849 | variant_tuples = [
850 | ("RCQY", "p.Arg1Ala"),
851 | ("RCQY", "p.Gln3Trp"),
852 | ("RCQY", "p.[Arg1Ala;Gln3Trp]"),
853 | ]
854 |
855 | for target, s in variant_tuples:
856 | with self.subTest(target=target, s=s):
857 | v = Variant(s, targetseq=target)
858 | self.assertEqual(s, str(v))
859 |
860 | def test_nonmatching_protein_substitution(self):
861 | variant_tuples = [
862 | ("RCQY", "p.Cys1Ala"),
863 | ("RCQY", "p.Ala3Trp"),
864 | ("RCQY", "p.[Arg1Ala;Cys3Trp]"),
865 | ("RCQY", "p.Asp5Glu"),
866 | ]
867 |
868 | for target, s in variant_tuples:
869 | with self.subTest(target=target, s=s):
870 | with self.assertRaises(MaveHgvsParseError):
871 | Variant(s, targetseq=target)
872 |
873 | def test_matching_protein_fs(self):
874 | variant_tuples = [
875 | ("RCQY", "p.Arg1fs"),
876 | ("RCQY", "p.Gln3fs"),
877 | ]
878 |
879 | for target, s in variant_tuples:
880 | with self.subTest(target=target, s=s):
881 | v = Variant(s, targetseq=target)
882 | self.assertEqual(s, str(v))
883 |
884 | def test_nonmatching_protein_fs(self):
885 | variant_tuples = [
886 | ("RCQY", "p.Cys1fs"),
887 | ("RCQY", "p.Ala3fs"),
888 | ("RCQY", "p.Asp5fs"),
889 | ]
890 |
891 | for target, s in variant_tuples:
892 | with self.subTest(target=target, s=s):
893 | with self.assertRaises(MaveHgvsParseError):
894 | Variant(s, targetseq=target)
895 |
896 | def test_matching_protein_indel(self):
897 | variant_tuples = [
898 | ("RCQY", "p.Arg1del"),
899 | ("RCQY", "p.Arg1_Gln3dup"),
900 | ]
901 |
902 | for target, s in variant_tuples:
903 | with self.subTest(target=target, s=s):
904 | v = Variant(s, targetseq=target)
905 | self.assertEqual(s, str(v))
906 |
907 | def test_nonmatching_protein_indel(self):
908 | variant_tuples = [
909 | ("RCQY", "p.Cys1del"),
910 | ("RCQY", "p.Arg1_Asp3dup"),
911 | ("RCQY", "p.Asp5del"),
912 | ]
913 |
914 | for target, s in variant_tuples:
915 | with self.subTest(target=target, s=s):
916 | with self.assertRaises(MaveHgvsParseError):
917 | Variant(s, targetseq=target)
918 |
919 | def test_skips_extended(self):
920 | variant_tuples = [
921 | ("ACGT", "c.1+3A>T"),
922 | ("ACGT", "c.*33G>C"),
923 | ("ACGT", "c.43-6_595+12delinsCTT"),
924 | ]
925 |
926 | for target, s in variant_tuples:
927 | with self.subTest(target=target, s=s):
928 | v = Variant(s, targetseq=target)
929 | self.assertEqual(s, str(v))
930 |
931 |
932 | class TestMiscMethods(unittest.TestCase):
933 | def test_is_multi_variant(self):
934 | single_variant_strings = [
935 | "p.Glu27Trp",
936 | "c.122-6T>A",
937 | "g.44del",
938 | "c.78+5_78+10del",
939 | "c.77dup",
940 | "p.Pro12_Gly18dup",
941 | "p.Ala12_Pro13insGlyProCys",
942 | "r.22_23insauc",
943 | "c.43-6_595+12delinsCTT",
944 | "p.Ile71_Cys80delinsSer",
945 | "p.=",
946 | ]
947 |
948 |         multi_variant_strings = ["c.[77dup;122-6T>A]", "r.[22g>u;35del]"]
949 |
950 | for s in single_variant_strings:
951 | with self.subTest(s=s):
952 | v = Variant(s)
953 | self.assertFalse(v.is_multi_variant())
954 |
955 | for s in multi_variant_strings:
956 | with self.subTest(s=s):
957 | v = Variant(s)
958 | self.assertTrue(v.is_multi_variant())
959 |
960 | def test_uses_extended_positions(self):
961 | non_extended_variant_strings = [
962 | "p.Glu27Trp",
963 | "g.44del",
964 | "c.77dup",
965 | "p.Pro12_Gly18dup",
966 | "p.Ala12_Pro13insGlyProCys",
967 | "r.22_23insauc",
968 | "r.22g>u",
969 | "p.Ile71_Cys80delinsSer",
970 | "p.=",
971 | "p.[Pro12_Gly18dup;Glu27Trp]",
972 | "r.[22g>u;35del]",
973 | ]
974 |
975 | extended_variant_strings = [
976 | "c.122-6T>A",
977 | "c.78+5_78+10del",
978 | "c.43-6_595+12delinsCTT",
979 | "c.*33G>C",
980 | "r.33+12a>c",
981 | "c.[12G>T;122-6T>A]",
982 | "c.[43-6_595+12delinsCTT;*33G>C]",
983 | ]
984 |
985 | for s in non_extended_variant_strings:
986 | with self.subTest(s=s):
987 | v = Variant(s)
988 | self.assertFalse(v.uses_extended_positions())
989 |
990 | for s in extended_variant_strings:
991 | with self.subTest(s=s):
992 | v = Variant(s)
993 | self.assertTrue(v.uses_extended_positions())
994 |
995 | def test_components(self):
996 | variant_strings = [
997 | ("p.[Glu27Trp;Ter345Lys]", ("p.Glu27Trp", "p.Ter345Lys")),
998 | ("p.[Glu27Trp;Lys212fs]", ("p.Glu27Trp", "p.Lys212fs")),
999 | (
1000 | "p.[Gly18del;Glu27Trp;Ter345Lys]",
1001 | ("p.Gly18del", "p.Glu27Trp", "p.Ter345Lys"),
1002 | ),
1003 | (
1004 | "p.[Gln7_Asn19del;Glu27Trp;Ter345Lys]",
1005 | ("p.Gln7_Asn19del", "p.Glu27Trp", "p.Ter345Lys"),
1006 | ),
1007 | (
1008 | "c.[1_35del;78+5_78+10del;122T>A]",
1009 | ("c.1_35del", "c.78+5_78+10del", "c.122T>A"),
1010 | ),
1011 | ("p.Glu27Trp", ("p.Glu27Trp",)),
1012 | ("NP_002002.3:p.Glu27Trp", ("NP_002002.3:p.Glu27Trp",)),
1013 | (
1014 | "NP_002002.3:p.[Glu27Trp;Lys212fs]",
1015 | ("NP_002002.3:p.Glu27Trp", "NP_002002.3:p.Lys212fs"),
1016 | ),
1017 | ]
1018 |
1019 | for s, expected_components in variant_strings:
1020 | with self.subTest(s=s):
1021 | v = Variant(s)
1022 |                 self.assertCountEqual(expected_components, v.components())
1023 |
1024 |
1025 | # TODO: multi-variant test cases
1026 | class TestMiscProperties(unittest.TestCase):
1027 | def test_prefix(self):
1028 |         variant_tuples = [(prefix, f"{prefix}.=") for prefix in tuple("gmocnrp")]
1029 |
1030 | for p, s in variant_tuples:
1031 | with self.subTest(p=p, s=s):
1032 | v = Variant(s)
1033 | self.assertEqual(p, v.prefix)
1034 |
1035 | def test_variant_type(self):
1036 | variant_tuples = [
1037 | ("sub", "p.Glu27Trp"),
1038 | ("sub", "c.122-6T>A"),
1039 | ("fs", "p.Glu27fs"),
1040 | ("del", "g.44del"),
1041 | ("del", "c.78+5_78+10del"),
1042 | ("dup", "c.77dup"),
1043 | ("dup", "p.Pro12_Gly18dup"),
1044 | ("ins", "p.Ala12_Pro13insGlyProCys"),
1045 | ("ins", "r.22_23insauc"),
1046 | ("delins", "c.43-6_595+12delinsCTT"),
1047 | ("delins", "p.Ile71_Cys80delinsSer"),
1048 | ]
1049 |
1050 | for t, s in variant_tuples:
1051 | with self.subTest(t=t, s=s):
1052 | v = Variant(s)
1053 | self.assertEqual(t, v.variant_type)
1054 |
1055 | def test_position(self):
1056 | variant_tuples = [
1057 | (VariantPosition("Glu27"), "p.Glu27Trp"),
1058 | (VariantPosition("Glu27"), "p.Glu27fs"),
1059 | (VariantPosition("122-6"), "c.122-6T>A"),
1060 | (VariantPosition("44"), "g.44del"),
1061 | ((VariantPosition("78+5"), VariantPosition("78+10")), "c.78+5_78+10del"),
1062 | (VariantPosition("77"), "c.77dup"),
1063 | ((VariantPosition("Pro12"), VariantPosition("Gly18")), "p.Pro12_Gly18dup"),
1064 | (
1065 | (VariantPosition("Ala12"), VariantPosition("Pro13")),
1066 | "p.Ala12_Pro13insGlyProCys",
1067 | ),
1068 | ((VariantPosition("22"), VariantPosition("23")), "r.22_23insauc"),
1069 | (
1070 | (VariantPosition("43-6"), VariantPosition("595+12")),
1071 | "c.43-6_595+12delinsCTT",
1072 | ),
1073 | (
1074 | (VariantPosition("Ile71"), VariantPosition("Cys80")),
1075 | "p.Ile71_Cys80delinsSer",
1076 | ),
1077 | ]
1078 |
1079 | for p, s in variant_tuples:
1080 | with self.subTest(p=p, s=s):
1081 | v = Variant(s)
1082 | if isinstance(p, list): # multi-variant
1083 | self.assertEqual(len(p), len(v.positions))
1084 | for q, vp in zip(p, v.positions):
1085 | if isinstance(q, tuple):
1086 | self.assertTupleEqual(q, vp)
1087 | else:
1088 | self.assertEqual(q, vp)
1089 |                 elif isinstance(p, tuple):  # single variant spanning a range
1090 | self.assertTupleEqual(p, v.positions)
1091 | else:
1092 | self.assertEqual(p, v.positions)
1093 |
1094 | def test_sequence(self):
1095 | variant_tuples = [
1096 | (("Glu", "Trp"), "p.Glu27Trp"),
1097 | (("T", "A"), "c.122-6T>A"),
1098 | (None, "p.Glu27fs"),
1099 | (None, "g.44del"),
1100 | (None, "c.78+5_78+10del"),
1101 | (None, "c.77dup"),
1102 | (None, "p.Pro12_Gly18dup"),
1103 | ("GlyProCys", "p.Ala12_Pro13insGlyProCys"),
1104 | ("auc", "r.22_23insauc"),
1105 | ("CTT", "c.43-6_595+12delinsCTT"),
1106 | ("Ser", "p.Ile71_Cys80delinsSer"),
1107 | ]
1108 |
1109 | for seq, s in variant_tuples:
1110 | with self.subTest(seq=seq, s=s):
1111 | v = Variant(s)
1112 | self.assertEqual(seq, v.sequence)
1113 |
1114 | def test_target_id(self):
1115 | variant_tuples = [
1116 | (None, "p.Glu27Trp"),
1117 | (None, "c.122-6T>A"),
1118 | ("GeneX", "GeneX:p.Glu27Trp"),
1119 | ("YFG1", "YFG1:c.122-6T>A"),
1120 | ("ENST00000471181.7", "ENST00000471181.7:c.122-6T>A"),
1121 | ("NM_007294.4", "NM_007294.4:c.122-6T>A"),
1122 | ("NM_007294.4", "NM_007294.4:c.[122-6T>A;153C>T]"),
1123 | ]
1124 |
1125 | for t, s in variant_tuples:
1126 | with self.subTest(t=t, s=s):
1127 | v = Variant(s)
1128 | self.assertEqual(t, v.target_id)
1129 |
1130 | for _, s in variant_tuples:
1131 | with self.subTest(s=s):
1132 | v = Variant(s)
1133 | self.assertEqual(s, str(v))
1134 |
1135 |
1136 | if __name__ == "__main__":
1137 | unittest.main()
1138 |
--------------------------------------------------------------------------------