├── .flake8 ├── .github └── workflows │ ├── python-package.yml │ └── python-publish.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CITATION.cff ├── LICENSE ├── README.md ├── docs ├── Makefile ├── api.rst ├── conf.py ├── index.rst ├── make.bat ├── prefix.csv └── spec.rst ├── pyproject.toml ├── src └── mavehgvs │ ├── __init__.py │ ├── exceptions.py │ ├── patterns │ ├── __init__.py │ ├── combined.py │ ├── dna.py │ ├── position.py │ ├── protein.py │ ├── rna.py │ └── util.py │ ├── position.py │ ├── py.typed │ ├── util.py │ └── variant.py └── tests ├── __init__.py ├── test_patterns ├── __init__.py ├── test_dna.py ├── test_protein.py ├── test_rna.py └── test_util.py ├── test_position.py ├── test_util.py └── test_variant.py /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | extend-ignore = E203 3 | max-line-length = 88 4 | max-complexity = 10 5 | -------------------------------------------------------------------------------- /.github/workflows/python-package.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python 3 | 4 | name: Python package 5 | 6 | on: 7 | push: 8 | branches: [ "main" ] 9 | pull_request: 10 | branches: [ "main" ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-22.04 16 | strategy: 17 | fail-fast: false 18 | matrix: 19 | python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12", "3.13"] 20 | 21 | steps: 22 | - uses: actions/checkout@v3 23 | - name: Set up Python ${{ matrix.python-version }} 24 | uses: actions/setup-python@v3 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | - name: Install dependencies 28 | run: | 29 | python -m pip install --upgrade pip 30 | python -m pip install flake8 pytest 31 | if [ -f 
requirements.txt ]; then pip install -r requirements.txt; fi 32 | - name: Install package 33 | run: | 34 | python -m pip install . 35 | - name: Lint with flake8 36 | run: | 37 | # stop the build if there are Python syntax errors or undefined names 38 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 39 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 40 | flake8 . --count --exit-zero --statistics 41 | - name: Test with pytest 42 | run: | 43 | pytest 44 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 
8 | 9 | name: Upload Python Package 10 | 11 | on: 12 | release: 13 | types: [published] 14 | 15 | permissions: 16 | contents: read 17 | 18 | jobs: 19 | deploy: 20 | 21 | runs-on: ubuntu-latest 22 | 23 | steps: 24 | - uses: actions/checkout@v3 25 | - name: Set up Python 26 | uses: actions/setup-python@v3 27 | with: 28 | python-version: '3.x' 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install --upgrade pip 32 | pip install hatch 33 | - name: Build package 34 | run: hatch build 35 | - name: Publish package 36 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 37 | with: 38 | user: __token__ 39 | password: ${{ secrets.PYPI_MAVEHGVS }} 40 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Idea 2 | .idea/ 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | db.sqlite3 61 | 62 | # Flask stuff: 63 | instance/ 64 | .webassets-cache 65 | 66 | # Scrapy stuff: 67 | .scrapy 68 | 69 | # Sphinx documentation 70 | docs/_build/ 71 | 72 | # PyBuilder 73 | target/ 74 | 75 | # Jupyter Notebook 76 | .ipynb_checkpoints 77 | 78 | # pyenv 79 | .python-version 80 | 81 | # celery beat schedule file 82 | celerybeat-schedule 83 | 84 | # SageMath parsed files 85 | *.sage.py 86 | 87 | # Environments 88 | .env 89 | .venv 90 | env/ 91 | venv/ 92 | ENV/ 93 | env.bak/ 94 | venv.bak/ 95 | 96 | # Spyder project settings 97 | .spyderproject 98 | .spyproject 99 | 100 | # Rope project settings 101 | .ropeproject 102 | 103 | # mkdocs documentation 104 | /site 105 | 106 | # mypy 107 | .mypy_cache/ 108 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/psf/black 3 | rev: 23.1.0 4 | hooks: 5 | - id: black 6 | language_version: python3.11 7 | - repo: https://github.com/pycqa/flake8 8 | rev: 5.0.4 9 | hooks: 10 | - id: flake8 11 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | message: "If you use this software, please cite it as below." 
3 | authors: 4 | - family-names: "Rubin" 5 | given-names: "Alan F" 6 | orcid: "https://orcid.org/0000-0003-1474-605X" 7 | title: "mavehgvs" 8 | version: 0.4.0 9 | doi: 10.5281/zenodo.5148054 10 | date-released: 2021-07-30 11 | url: "https://github.com/VariantEffect/mavehgvs" 12 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2018-2023, Alan F Rubin and Daniel Esposito 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Build Status](https://travis-ci.com/VariantEffect/mavehgvs.svg?branch=main)](https://travis-ci.com/VariantEffect/mavehgvs) 2 | [![Coverage Status](https://coveralls.io/repos/github/VariantEffect/mavehgvs/badge.svg?branch=main)](https://coveralls.io/github/VariantEffect/mavehgvs?branch=main) 3 | [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) 4 | 5 | # mavehgvs 6 | mavehgvs is the Python reference implementation of the MAVE-HGVS variant representation standard, 7 | a strict subset of [HGVS](http://varnomen.hgvs.org/), used primarily for clinical genomics. 8 | 9 | ## The MAVE-HGVS Standard 10 | MAVE-HGVS is a strict subset of the [HGVS Sequence Variant Nomenclature](https://varnomen.hgvs.org/), version 20.05. 11 | HGVS nomenclature is comprehensive and very expressive and consequently includes a lot of syntax that is not needed to 12 | represent variants from Multiplexed Assay of Variant Effect (MAVE) data and makes the variant strings more challenging 13 | to parse. 
14 | 15 | While packages exist for parsing HGVS (most notably the 16 | [biocommons hgvs package](https://github.com/biocommons/hgvs/)), they are intended for use in human genetics and 17 | rely on sequence databases and reference sequence (called "target sequence" for MAVE-HGVS), which are not always 18 | available for or relevant for multiplexed assays. 19 | 20 | MAVE-HGVS is an attempt to define an easy-to-parse subset of the HGVS nomenclature that captures those variants that 21 | occur in MAVE datasets, while excluding many variant types that are unlikely to be found. Importantly, the 22 | mavehgvs implementation does not rely on external sequence databases or identifiers. 23 | 24 | ## Supported Variants 25 | MAVE-HGVS supports DNA, RNA, and protein variants. 26 | MAVE-HGVS supports a subset of HGVS variants including: 27 | 28 | * substitutions 29 | * deletions 30 | * duplications 31 | * insertions 32 | * frame shifts 33 | 34 | Many HGVS variants are unsupported including: 35 | 36 | * inversions 37 | * conversions 38 | * extensions 39 | * changes in methylation state 40 | * RNA fusion transcripts 41 | * mosaicism 42 | * chimerism 43 | * variants with uncertain consequence 44 | * variants in trans or unknown phase 45 | * complex variants (e.g. translocations) 46 | 47 | For further details, including example variants, see the specification in the package documentation. 48 | 49 | # Installation 50 | Install mavehgvs from pip using: 51 | 52 | ```bash 53 | pip3 install mavehgvs 54 | ``` 55 | 56 | To set up the package for development purposes, include the optional dependencies and 57 | install pre-commit: 58 | 59 | pip3 install mavehgvs[dev] 60 | pre-commit install 61 | 62 | # Feedback 63 | To report a problem or request a new feature with either the mavehgvs package or the MAVE-HGVS standard, 64 | please use the GitHub issue tracker. 
65 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/api.rst: -------------------------------------------------------------------------------- 1 | .. _api-docs: 2 | 3 | mavehgvs API documentation 4 | ========================== 5 | 6 | Variant objects 7 | --------------- 8 | 9 | Each variant can be parsed into a variant object, which populates and exposes named 10 | fields for each piece of the variant string. 11 | 12 | .. automodule:: mavehgvs.position 13 | :members: 14 | :private-members: 15 | :special-members: 16 | 17 | .. automodule:: mavehgvs.variant 18 | :members: 19 | :private-members: 20 | :special-members: 21 | 22 | .. automodule:: mavehgvs.exceptions 23 | :members: 24 | 25 | Utility functions for handling variants 26 | --------------------------------------- 27 | 28 | .. automodule:: mavehgvs.util 29 | :members: 30 | 31 | Utility functions for regular expression patterns 32 | ------------------------------------------------- 33 | 34 | .. 
automodule:: mavehgvs.patterns.util 35 | :members: 36 | 37 | DNA pattern strings 38 | ------------------- 39 | 40 | .. automodule:: mavehgvs.patterns.dna 41 | :members: 42 | 43 | RNA pattern strings 44 | ------------------- 45 | 46 | .. automodule:: mavehgvs.patterns.rna 47 | :members: 48 | 49 | Protein pattern strings 50 | ----------------------- 51 | 52 | .. automodule:: mavehgvs.patterns.protein 53 | :members: 54 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | import os 14 | import sys 15 | 16 | sys.path.insert(0, os.path.abspath("../src")) 17 | 18 | from mavehgvs import __version__ # noqa: E402 19 | 20 | # -- Project information ----------------------------------------------------- 21 | 22 | project = "MAVE-HGVS" 23 | copyright = "2018-2023, Alan F Rubin and Daniel Esposito" 24 | author = "Alan F Rubin and Daniel Esposito" 25 | 26 | # The full version, including alpha/beta/rc tags 27 | release = __version__ 28 | 29 | 30 | # -- General configuration --------------------------------------------------- 31 | 32 | # Add any Sphinx extension module names here, as strings. They can be 33 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 34 | # ones. 
35 | extensions = [ 36 | "sphinx.ext.autodoc", 37 | "sphinx.ext.napoleon", 38 | "sphinx.ext.intersphinx", 39 | "sphinx.ext.autosectionlabel", 40 | ] 41 | 42 | # Add any paths that contain templates here, relative to this directory. 43 | templates_path = ["_templates"] 44 | 45 | # List of patterns, relative to source directory, that match files and 46 | # directories to ignore when looking for source files. 47 | # This pattern also affects html_static_path and html_extra_path. 48 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 49 | 50 | 51 | # -- Options for HTML output ------------------------------------------------- 52 | 53 | # The theme to use for HTML and HTML Help pages. See the documentation for 54 | # a list of builtin themes. 55 | # 56 | html_theme = "nature" 57 | 58 | # Add any paths that contain custom static files (such as style sheets) here, 59 | # relative to this directory. They are copied after the builtin static files, 60 | # so a file named "default.css" will overwrite the builtin "default.css". 61 | html_static_path = ["_static"] 62 | 63 | 64 | # -- Extension configuration ------------------------------------------------- 65 | intersphinx_mapping = {"python": ("https://docs.python.org/3", None)} 66 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | MAVE-HGVS documentation 2 | ======================= 3 | 4 | MAVE-HGVS is a strict subset of the `HGVS sequence variant nomenclature `_ 5 | used by `MaveDB `_ and related tools to represent protein and DNA variants in 6 | Multiplexed Assays of Variant Effect (MAVE) datasets. 7 | 8 | This version of MAVE-HGVS is based on HGVS version 20.05. 9 | 10 | When citing, please refer to: 11 | 12 | #. Esposito, D., Weile J., *et al.* MaveDB: an open-source platform to distribute and interpret data from multiplexed assays of variant effect. *Genome Biol* **20**, 223 (2019). 
https://doi.org/10.1186/s13059-019-1845-6 13 | #. den Dunnen, J. T. *et al.* HGVS Recommendations for the Description of Sequence Variants: 2016 Update. *Hum Mutat* **37**, 564–569 (2016). https://doi.org/10.1002/humu.22981 14 | 15 | .. toctree:: 16 | :maxdepth: 2 17 | :caption: Contents: 18 | 19 | spec 20 | api 21 | 22 | Indices and tables 23 | ================== 24 | 25 | * :ref:`genindex` 26 | * :ref:`modindex` 27 | * :ref:`search` 28 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 
23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/prefix.csv: -------------------------------------------------------------------------------- 1 | "c", "coding DNA sequence" 2 | "g", "linear genomic DNA sequence" 3 | "m", "mitochondrial genomic DNA sequence" 4 | "n", "non-coding DNA sequence" 5 | "o", "circular genomic DNA sequence" 6 | "p", "protein sequence" 7 | "r", "RNA transcript sequence" 8 | -------------------------------------------------------------------------------- /docs/spec.rst: -------------------------------------------------------------------------------- 1 | .. _spec-docs: 2 | 3 | MAVE-HGVS specification 4 | ======================= 5 | 6 | MAVE-HGVS is a strict subset of the `HGVS Sequence Variant Nomenclature `_, version 20.05. 7 | HGVS nomenclature is comprehensive and very expressive and consequently includes a lot of syntax that is not needed to 8 | represent variants from Multiplexed Assay of Variant Effect (MAVE) data and makes the variant strings more challenging 9 | to parse. 10 | 11 | While packages exist for parsing HGVS (most notably the 12 | `biocommons hgvs package `_), they are intended for use in human genetics and 13 | rely on sequence databases and reference sequence (called "target sequence" for MAVE-HGVS), which are not always 14 | available for or relevant for multiplexed assays. 15 | 16 | MAVE-HGVS is an attempt to define an easy-to-parse subset of the HGVS nomenclature that captures those variants that 17 | occur in MAVE datasets, while excluding many variant types that are unlikely to be found. 
Importantly, the 18 | :ref:`corresponding implementation ` of MAVE-HGVS does not rely on external sequence databases or identifiers. 19 | 20 | Key differences between HGVS and MAVE-HGVS 21 | ------------------------------------------ 22 | 23 | Standard HGVS strings have the format :code:`reference:variant` (e.g. :code:`NM_001130145.3:c.832C>T`). 24 | MAVE-HGVS strings typically include the variant portion only and the reference (target) portion is inferred from the 25 | MAVE design. 26 | 27 | Target identifiers in MAVE-HGVS are optional, and would typically be used in cases where a mix of MAVE datasets are 28 | being analyzed jointly or for experimental designs that contain multiple target sequences. 29 | Target identifiers in MAVE-HGVS can contain any word characters, numbers, or the underscore. 30 | 31 | MAVE-HGVS does not distinguish between variants that have been observed experimentally and the predicted consequence of 32 | observed variants. 33 | Therefore, variants that contain :code:`()` to denote predicted consequences are considered invalid with one exception 34 | (see `Substitution`_ below). 35 | 36 | MAVE-HGVS supports position numberings that are relative to a transcript (e.g. :code:`c.85+12G>A` or :code:`n.*22del`). 37 | These positions are referred to here as using the extended position notation. 38 | Variants using the extended position notation should appear alongside variants with simple (integer only) position 39 | numbers relative to the target sequence, expressed using the appropriate genomic prefix. 40 | 41 | Like HGVS, MAVE-HGVS supports alleles (called multi-variants in MAVE-HGVS) that describe multiple variants in a single 42 | variant string. 43 | Multi-variants are represented as a semicolon-separated list of valid MAVE-HGVS variants. 
44 | 45 | MAVE-HGVS supports a subset of HGVS variants including: 46 | 47 | * substitutions 48 | * frame shifts 49 | * deletions 50 | * duplications 51 | * insertions 52 | 53 | Many HGVS variants are unsupported including: 54 | 55 | * inversions 56 | * extensions 57 | * changes in methylation state 58 | * RNA fusion transcripts 59 | * mosaicism 60 | * chimerism 61 | * variants with uncertain consequence 62 | * variants in trans or unknown phase 63 | * complex variants (e.g. translocations) 64 | 65 | Sequence prefixes and sequence types 66 | ------------------------------------ 67 | 68 | Similarly to HGVS, a MAVE-HGVS variant begins with a single prefix character that defines the sequence type. 69 | Supported sequence types are the same as for HGVS, and are listed in the following table: 70 | 71 | .. csv-table:: 72 | :file: ../docs/prefix.csv 73 | :header: "Prefix", "Description" 74 | :widths: 5, 20 75 | 76 | Typically MAVE variants are expressed relative to a coding, non-coding, or protein sequence. 77 | 78 | A notable exception is when the target sequence for the MAVE consists of both coding and non-coding sequences, 79 | such as when a full-length gene with introns is mutagenized and splice variants are assayed via saturation genome 80 | editing or other methods. 81 | In this case, it is appropriate to use one of the genomic sequence prefixes to describe changes using the contiguous 82 | region containing all mutagenized sequences as the target sequence. 83 | 84 | RNA variants are intended to be used when assaying the functional consequences to an RNA molecule, 85 | such as a tRNA or ribozyme. 86 | Variants that are measured at the DNA level should generally not use the RNA syntax. 87 | 88 | Equality 89 | -------- 90 | 91 | MAVE-HGVS allows variants to describe equality to the target in a variety of ways. 92 | 93 | Variants describing identity to the full target sequence (e.g. 
:code:`c.=`) are valid and are the intended way to 94 | specify identity to the target (wild-type) sequence. 95 | This replaces the `Enrich2 `_ :code:`_wt` variant syntax. 96 | 97 | Variants that describe identity to the reference (target) at a single position (e.g. :code:`c.44=`) 98 | or range of positions (e.g. :code:`c.1_3=`) are valid for coding and genomic sequences. 99 | These should only be used for special cases, such as in MITE-seq datasets where the scores and counts are 100 | reported separately for each wild-type codon. 101 | 102 | The target-identity variants :code:`c.=` and :code:`p.=` are only valid on their own and are considered invalid as 103 | part of multi-variants. 104 | The variants that describe nucleotide identity to part of the reference are also invalid as part of multi-variants. 105 | 106 | Variants that describe identity to the target at a single amino acid position (e.g. :code:`p.Cys22=`) are valid and 107 | are the preferred way to describe specific synonymous variants. 108 | 109 | The variant :code:`p.(=)` is used when summarizing the population of variants that are synonymous at the protein level 110 | but not target identical at the DNA level. 111 | This replaces the `Enrich2 `_ :code:`_sy` variant syntax. 112 | 113 | .. warning:: Many variants currently in MaveDB use only '=' as part of multi-variants and are therefore invalid 114 | MAVE-HGVS. 115 | Additionally, some MaveDB datasets have a one-to-one relationship between nucleotide and protein multi-variants 116 | resulting in duplicate protein variants in the multi-variant. 117 | This should also be considered invalid. 118 | 119 | Examples of valid equality variants: 120 | 121 | * c.= 122 | * c.22= 123 | * c.1_3= 124 | * g.123= 125 | * p.Cys22= 126 | * p.(=) 127 | 128 | Substitution 129 | ------------ 130 | 131 | .. note:: TODO: add some noncoding ('n.' variants) to the examples. 132 | 133 | MAVE-HGVS supports substitutions of a single nucleotide or amino acid. 
134 | 135 | MAVE-HGVS does not support extension variants, which extend an amino acid sequence to the N- or C- terminal end 136 | (e.g. :code:`p.Met1ext-4` for gain of an upstream start or :code:`p.Ter345Lysext5` for a new downstream termination 137 | codon). 138 | Variants that remove a termination codon should be written as standard substitution variants. 139 | Variants that result in an N-terminal extension are currently undefined, 140 | but have not been observed in the MAVE literature at the time of writing. 141 | 142 | Substitutions of more than one base at a time are covered under `Deletion-Insertion`_. 143 | 144 | Examples of valid substitutions: 145 | 146 | * g.48C>A 147 | * c.122-6T>A 148 | * c.*33G>C 149 | * p.Glu27Trp 150 | * p.Ter345Lys 151 | * r.22g>u 152 | * r.33+12a>c 153 | 154 | Examples of valid HGVS substitutions that are invalid in MAVE-HGVS: 155 | 156 | * g.48C>W 157 | * c.122=/T>A 158 | * p.(Glu27Trp) 159 | * p.*345Lys 160 | * p.Glu23Xaa 161 | * r.spl 162 | 163 | Frame Shift 164 | ----------- 165 | 166 | MAVE-HGVS supports a simplified syntax to describe frame shifts in protein variants. 167 | Multi-variants that include multiple frame shifts or a second variant after a frame shift are considered invalid. 168 | 169 | Because frame shift (and the related extension) variants are uncommon in MAVE datasets, MAVE-HGVS provides this minimal support. 170 | Extension variants (removal of a termination codon) should be expressed as a frame shift at the termination codon. 171 | 172 | Examples of valid frame shift variants: 173 | 174 | * p.Glu27fs 175 | * p.Asp125fs 176 | * p.Ter385fs 177 | 178 | Examples of valid HGVS frame shift variants that are invalid in MAVE-HGVS: 179 | 180 | * p.Arg12LysfsTer18 181 | * p.Arg12Lysfs*18 182 | * p.Glu27fs*? 183 | * p.(Glu27fs) 184 | 185 | Deletion 186 | -------- 187 | 188 | MAVE-HGVS supports deletions of specified nucleotides or amino acids. 
189 | 190 | Deletions of an unknown number of bases or amino acids are not supported. 191 | For example, deletions where the breakpoint is not known or where the deletion extends past the end of the target 192 | cannot be represented with uncertainty. 193 | To represent a deletion of a sequence including the start or end of the target, specify the deletion exactly as if it 194 | extended to the first or last position. 195 | 196 | Examples of valid deletions: 197 | 198 | * g.44del 199 | * c.78+5_78+10del 200 | * c.1_95del 201 | * p.Gly18del 202 | * p.Gln7_Asn19del 203 | * r.34_36del 204 | 205 | Examples of valid HGVS deletions that are invalid in MAVE-HGVS: 206 | 207 | * c.(78+1_79-1)_(124+1_125-1)del 208 | * g.(?_85)_(124\_?)del 209 | * c.122=/del 210 | * p.(Gly18del) 211 | * r.=/9_12del 212 | * r.(155_185)del 213 | 214 | Duplication 215 | ----------- 216 | 217 | MAVE-HGVS supports duplications of one or more nucleotides or amino acids. 218 | The syntax is the same as HGVS. 219 | 220 | Examples of valid duplications: 221 | 222 | * g.22_24dup 223 | * c.77dup 224 | * c.101+1_101+7dup 225 | * p.Pro12_Gly18dup 226 | * p.Cys5dup 227 | * r.12dup 228 | 229 | Examples of valid HGVS duplications that are invalid in MAVE-HGVS: 230 | 231 | * c.(78+1_79-1)_(124+1_125-1)dup 232 | * g.(?_85)_(124\_?)dup 233 | * c.122_125=//dup 234 | * p.(Cys5dup) 235 | 236 | Insertion 237 | --------- 238 | 239 | MAVE-HGVS supports insertions of a specified nucleotide or amino acid sequence. 240 | 241 | Insertions of a number of unspecified bases or amino acids or insertions using ambiguity characters (e.g. N or Xaa) 242 | are not supported. 243 | 244 | Insertions must be specified by listing the complete inserted sequence. 245 | Referring to the sequence that is inserted based on its position in the target sequence is not considered valid for 246 | MAVE-HGVS. 
247 | 248 | To describe an insertion at the end of the target sequence, use a :ref:`Deletion-Insertion` variant that deletes 249 | the last base or amino acid in the target and inserts the deleted symbol plus the insertion. 250 | 251 | Examples of valid insertions: 252 | 253 | * g.234_235insT 254 | * c.84_85insCTG 255 | * c.99+6_99+7insA 256 | * p.His7_Gln8insSer 257 | * p.Ala12_Pro13insGlyProCys 258 | * r.22_23insauc 259 | 260 | Examples of valid HGVS insertions that are invalid in MAVE-HGVS: 261 | 262 | * c.84_85ins100_125 263 | * g.234_235ins(10) 264 | * g.234_235ins(?) 265 | * c.(122_125)insG 266 | * p.(His7_Gln8insSer) 267 | * p.(His7_Gln8insX) 268 | * p.(Ala12_Pro13ins(2)) 269 | * r.(27_30)insu 270 | * r.74_74insnnn 271 | 272 | Deletion-Insertion 273 | ------------------ 274 | 275 | MAVE-HGVS supports deletion-insertions of a specified nucleotide or amino acid sequence. 276 | 277 | Deletion-insertions of a number of unspecified bases or amino acids or insertions using ambiguity characters 278 | (e.g. N or Xaa) are not supported. This includes deletion-insertions with uncertain breakpoints. 279 | 280 | Examples of valid deletion-insertions: 281 | 282 | * g.22delinsAACG 283 | * c.83_85delinsT 284 | * c.43-6_595+12delinsCTT 285 | * p.Ile71_Cys80delinsSer 286 | * p.His44delinsValProGlyGlu 287 | * r.92delinsgac 288 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["hatchling"] 3 | build-backend = "hatchling.build" 4 | 5 | [project] 6 | name = "mavehgvs" 7 | dynamic = ["version"] 8 | description = "Regular expression-based validation of HGVS-style variant strings for Multiplexed Assays of Variant Effect." 
9 | readme = "README.md" 10 | license = "BSD-3-Clause" 11 | requires-python = ">=3.6" 12 | authors = [ 13 | { name = "Alan F Rubin", email = "alan.rubin@wehi.edu.au" }, 14 | ] 15 | classifiers = [ 16 | "Development Status :: 3 - Alpha", 17 | "Intended Audience :: Science/Research", 18 | "License :: OSI Approved :: BSD License", 19 | "Operating System :: OS Independent", 20 | "Programming Language :: Python :: 3", 21 | "Topic :: Scientific/Engineering :: Bio-Informatics", 22 | ] 23 | dependencies = [ 24 | "fqfa>=1.2.3", 25 | ] 26 | 27 | [project.urls] 28 | repository = "https://github.com/VariantEffect/mavehgvs" 29 | documentation = "https://www.mavedb.org/docs/mavehgvs" 30 | 31 | [project.optional-dependencies] 32 | dev = [ 33 | "black", 34 | "flake8", 35 | "pre-commit", 36 | "pytest", 37 | ] 38 | 39 | [tool.hatch.version] 40 | path = "src/mavehgvs/__init__.py" 41 | 42 | [tool.hatch.build.targets.wheel] 43 | packages = ["src/mavehgvs"] 44 | 45 | [tool.hatch.build.targets.sdist] 46 | exclude = [ 47 | "docs/", 48 | ".github/", 49 | ] 50 | 51 | [tool.setuptools.package-data] 52 | "mavehgvs" = ["py.typed"] 53 | -------------------------------------------------------------------------------- /src/mavehgvs/__init__.py: -------------------------------------------------------------------------------- 1 | from mavehgvs.exceptions import MaveHgvsParseError 2 | from mavehgvs.position import VariantPosition 3 | from mavehgvs.variant import Variant 4 | from mavehgvs.util import parse_variant_strings 5 | 6 | __version__ = "0.7.0" 7 | 8 | __all__ = [ 9 | "__version__", 10 | "Variant", 11 | "VariantPosition", 12 | "MaveHgvsParseError", 13 | "parse_variant_strings", 14 | ] 15 | -------------------------------------------------------------------------------- /src/mavehgvs/exceptions.py: -------------------------------------------------------------------------------- 1 | __all__ = ["MaveHgvsParseError"] 2 | 3 | 4 | class MaveHgvsParseError(Exception): 5 | """Exception to use when a 
MAVE-HGVS string is not valid.""" 6 | 7 | pass 8 | -------------------------------------------------------------------------------- /src/mavehgvs/patterns/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/VariantEffect/mavehgvs/69476dde5391022e7c0eca32ecd1734e371436eb/src/mavehgvs/patterns/__init__.py -------------------------------------------------------------------------------- /src/mavehgvs/patterns/combined.py: -------------------------------------------------------------------------------- 1 | from mavehgvs.patterns.dna import dna_single_variant as dsv, dna_multi_variant as dmv 2 | from mavehgvs.patterns.rna import rna_single_variant as rsv, rna_multi_variant as rmv 3 | from mavehgvs.patterns.protein import ( 4 | pro_single_variant as psv, 5 | pro_multi_variant as pmv, 6 | ) 7 | 8 | any_variant = ( 9 | r"(?:(?P[a-zA-Z0-9_.-]+):)?" 10 | + r"(?P" 11 | + rf"(?P{r'|'.join([dsv, rsv, psv])})|" 12 | + rf"(?P{r'|'.join([dmv, rmv, pmv])})" 13 | + r")" 14 | ) 15 | -------------------------------------------------------------------------------- /src/mavehgvs/patterns/dna.py: -------------------------------------------------------------------------------- 1 | from fqfa.constants import DNA_BASES 2 | from mavehgvs.patterns.util import combine_patterns, remove_named_groups 3 | from mavehgvs.patterns.position import pos, pos_intron, pos_intron_utr 4 | 5 | dna_nt: str = rf"[{''.join(DNA_BASES)}]" 6 | """str: Pattern matching any uppercase DNA base. 7 | 8 | This does not include IUPAC ambiguity characters. 9 | """ 10 | 11 | dna_equal_c: str = ( 12 | rf"(?P(?:(?:(?P{pos_intron_utr})_(?P{pos_intron_utr}))|" 13 | + rf"(?P{pos_intron_utr}))?(?P=))" 14 | ) 15 | """str: Pattern matching DNA equality with numeric, intronic, or UTR positions. 
# NOTE(review): throughout this module the named-group identifiers (the
# "<name>" part of every "(?P<name>...)" opener) appear to have been stripped
# by markup mangling, leaving invalid "(?P" openers and empty .replace()
# arguments — restore the group names from the upstream source; TODO confirm.

dna_sub_c: str = (
    rf"(?P(?P{pos_intron_utr})(?P{dna_nt})>(?P{dna_nt}))"
)
"""str: Pattern matching a DNA substitution with numeric, intronic, or UTR positions.
"""

dna_del_c: str = (
    rf"(?P(?:(?:(?P{pos_intron_utr})_(?P{pos_intron_utr}))|"
    + rf"(?P{pos_intron_utr}))del)"
)
"""str: Pattern matching a DNA deletion with numeric, intronic, or UTR positions.
"""

dna_dup_c: str = (
    rf"(?P(?:(?:(?P{pos_intron_utr})_"
    + rf"(?P{pos_intron_utr}))|(?P{pos_intron_utr}))dup)"
)
"""str: Pattern matching a DNA duplication with numeric, intronic, or UTR positions.
"""

dna_ins_c: str = (
    rf"(?P(?P{pos_intron_utr})_"
    + rf"(?P{pos_intron_utr})ins(?P{dna_nt}+))"
)
"""str: Pattern matching a DNA insertion with numeric, intronic, or UTR positions.
"""

dna_delins_c: str = (
    rf"(?P(?:(?:(?P{pos_intron_utr})_"
    + rf"(?P{pos_intron_utr}))|(?P{pos_intron_utr}))"
    + rf"delins(?P{dna_nt}+))"
)
"""str: Pattern matching a DNA deletion-insertion with numeric, intronic, or UTR
positions.
"""

dna_equal_n: str = r"(?P(?P=))"
"""str: Pattern matching DNA equality with no position support.
"""

# The "_n" variants below are derived from the "_c" variants by swapping the
# coding position pattern (with UTR support) for the noncoding one and renaming
# the outermost capture group.
dna_sub_n: str = dna_sub_c.replace(pos_intron_utr, pos_intron).replace(
    "(?P", "(?P"
)
"""str: Pattern matching a DNA substitution with numeric or intron positions for
non-coding variants.
"""

dna_del_n: str = dna_del_c.replace(pos_intron_utr, pos_intron).replace(
    "(?P", "(?P"
)
"""str: Pattern matching a DNA deletion with numeric or intron positions for non-coding
variants.
"""

dna_dup_n: str = dna_dup_c.replace(pos_intron_utr, pos_intron).replace(
    "(?P", "(?P"
)
"""str: Pattern matching a DNA duplication with numeric or intron positions for
non-coding variants.
"""

dna_ins_n: str = dna_ins_c.replace(pos_intron_utr, pos_intron).replace(
    "(?P", "(?P"
)
"""str: Pattern matching a DNA insertion with numeric or intron positions for non-coding
variants.
"""

dna_delins_n: str = dna_delins_c.replace(pos_intron_utr, pos_intron).replace(
    "(?P", "(?P"
)
"""str: Pattern matching a DNA deletion-insertion with numeric or intron positions for
non-coding variants.
"""

# The "_gmo" variants are derived from the "_c" variants by restricting to
# plain numeric positions (no intron or UTR offsets) for genomic-style prefixes.
dna_equal_gmo: str = dna_equal_c.replace(pos_intron_utr, pos).replace(
    "(?P", "(?P"
)
"""str: Pattern matching DNA equality with only numeric positions for
genomic-style variants.
"""

dna_sub_gmo: str = dna_sub_c.replace(pos_intron_utr, pos).replace(
    "(?P", "(?P"
)
"""str: Pattern matching a DNA substitution with only numeric positions for
genomic-style variants.
"""

dna_del_gmo: str = dna_del_c.replace(pos_intron_utr, pos).replace(
    "(?P", "(?P"
)
"""str: Pattern matching a DNA deletion with only numeric positions for genomic-style
variants.
"""

dna_dup_gmo: str = dna_dup_c.replace(pos_intron_utr, pos).replace(
    "(?P", "(?P"
)
"""str: Pattern matching a DNA duplication with only numeric positions for genomic-style
variants.
"""

dna_ins_gmo: str = dna_ins_c.replace(pos_intron_utr, pos).replace(
    "(?P", "(?P"
)
"""str: Pattern matching a DNA insertion with only numeric positions for genomic-style
variants.
"""

dna_delins_gmo: str = dna_delins_c.replace(pos_intron_utr, pos).replace(
    "(?P", "(?P"
)
"""str: Pattern matching a DNA deletion-insertion with only numeric positions for
genomic-style variants.
"""

dna_variant_c: str = combine_patterns(
    [dna_equal_c, dna_sub_c, dna_del_c, dna_dup_c, dna_ins_c, dna_delins_c], None
)
"""str: Pattern matching any of the coding DNA variants.
"""

dna_variant_n: str = combine_patterns(
    [dna_equal_n, dna_sub_n, dna_del_n, dna_dup_n, dna_ins_n, dna_delins_n], None
)
"""str: Pattern matching any of the non-coding DNA variants.
"""

dna_variant_gmo: str = combine_patterns(
    [dna_equal_gmo, dna_sub_gmo, dna_del_gmo, dna_dup_gmo, dna_ins_gmo, dna_delins_gmo],
    None,
)
"""str: Pattern matching any of the genomic-style DNA variants.
"""

dna_single_variant: str = (
    rf"(?Pc\.{dna_variant_c})|"
    + rf"(?Pn\.{dna_variant_n})|"
    + rf"(?P[gmo]\.{dna_variant_gmo})"
)
"""str: Pattern matching any complete single DNA variant, including the prefix
character.
"""

dna_multi_variant: str = (
    r"(?Pc\."
    + rf"\[{remove_named_groups(dna_variant_c)}"
    + rf"(?:;{remove_named_groups(dna_variant_c)}){{1,}}\])|"
    + r"(?Pn\."
    + rf"\[{remove_named_groups(dna_variant_n)}"
    + rf"(?:;{remove_named_groups(dna_variant_n)}){{1,}}\])|"
    + r"(?P[gmo]\."
    + rf"\[{remove_named_groups(dna_variant_gmo)}"
    + rf"(?:;{remove_named_groups(dna_variant_gmo)}){{1,}}\])"
)
"""str: Pattern matching any complete DNA multi-variant, including the prefix character.

Named capture groups have been removed from the variant patterns because of
non-uniqueness.
Another application of the single-variant regular expressions is needed to recover the
named groups from each individual variant in the multi-variant.
"""
pos: str = r"[1-9][0-9]*"
"""str: Pattern matching a positive integer not starting with 0.

This pattern is used for sequence positions, as position 0 does not exist.
"""

pos_intron: str = rf"{pos}(?:[+-]{pos})?"
"""str: Pattern matching a position with optional intron component.

This pattern is used for sequence positions in an RNA or noncoding sequence.
"""

pos_intron_utr: str = rf"[*-]?{pos}(?:[+-]{pos})?"
"""str: Pattern matching a position with optional intron and UTR components.

This pattern is used for sequence positions in a coding sequence.
"""


from fqfa.constants import AA_CODES
from mavehgvs.patterns.util import combine_patterns, remove_named_groups
from mavehgvs.patterns.position import pos

# NOTE(review): the named-group identifiers (the "<name>" part of every
# "(?P<name>...)" opener) appear to have been stripped from the patterns below
# by markup mangling — restore them from the upstream source; TODO confirm.

amino_acid: str = rf"(?:{'|'.join(AA_CODES.values())})"
"""str: Pattern matching any amino acid or Ter.

This does not include ambiguous amino acids such as Glx and Xaa.
"""

aa_pos: str = rf"(?:{amino_acid}{pos})"
"""str: Pattern matching an amino acid code followed by a position.
"""

pro_equal: str = (
    rf"(?P(?:(?P{aa_pos})?(?P=))|(?P\(=\)))"
)
"""str: Pattern matching protein equality or synonymous variant.
"""

pro_sub: str = rf"(?P(?P{aa_pos})(?P{amino_acid}))"
"""str: Pattern matching a protein substitution.
"""

pro_fs: str = rf"(?P(?P{aa_pos})fs)"
"""str: Pattern matching a protein frameshift.
"""

pro_del: str = (
    rf"(?P(?:(?P{aa_pos})_(?P{aa_pos})del)|"
    + rf"(?:(?P{aa_pos})del))"
)
"""str: Pattern matching a protein deletion.
"""

pro_dup: str = (
    rf"(?P(?:(?P{aa_pos})_(?P{aa_pos})dup)|"
    + rf"(?:(?P{aa_pos})dup))"
)
"""str: Pattern matching a protein duplication.
"""

pro_ins: str = (
    rf"(?P(?P{aa_pos})_(?P{aa_pos})ins(?P{amino_acid}+))"
)
"""str: Pattern matching a protein insertion.
"""

pro_delins: str = (
    rf"(?P(?:(?:(?P{aa_pos})_(?P{aa_pos}))|"
    + rf"(?P{aa_pos}))delins(?P{amino_acid}+))"
)
"""str: Pattern matching a protein deletion-insertion.
"""

pro_variant: str = combine_patterns(
    [pro_equal, pro_sub, pro_fs, pro_del, pro_dup, pro_ins, pro_delins], None
)
"""str: Pattern matching any single protein variant event.
"""

pro_single_variant: str = rf"(?Pp\.{pro_variant})"
"""str: Pattern matching any complete protein variant, including the prefix character.
"""

pro_multi_variant: str = (
    rf"(?Pp\.\[{remove_named_groups(pro_variant)}"
    + rf"(?:;{remove_named_groups(pro_variant)}){{1,}}\])"
)
"""str: Pattern matching any complete protein multi-variant, including the prefix
character.

Named capture groups have been removed from the variant patterns because of
non-uniqueness.
Another application of the single-variant regular expressions is needed to recover the
named groups from each individual variant in the multi-variant.
"""


from fqfa.constants import RNA_BASES
from mavehgvs.patterns.util import combine_patterns, remove_named_groups
from mavehgvs.patterns.position import pos_intron

rna_nt: str = rf"[{''.join(RNA_BASES).lower()}]"
"""str: Pattern matching any lowercase RNA base.

This does not include IUPAC ambiguity characters.
"""

rna_equal: str = (
    rf"(?P(?:(?:(?P{pos_intron})_"
    + rf"(?P{pos_intron}))|(?P{pos_intron}))?(?P=))"
)
"""str: Pattern matching RNA equality with numeric or relative-to-transcript positions.
"""

rna_sub: str = (
    rf"(?P(?P{pos_intron})(?P{rna_nt})>(?P{rna_nt}))"
)
"""str: Pattern matching a RNA substitution with numeric or relative-to-transcript
positions.
"""

rna_del: str = (
    rf"(?P(?:(?:(?P{pos_intron})_(?P{pos_intron}))|"
    + rf"(?P{pos_intron}))del)"
)
"""str: Pattern matching a RNA deletion with numeric or relative-to-transcript
positions.
"""

rna_dup: str = (
    rf"(?P(?:(?:(?P{pos_intron})_(?P{pos_intron}))|"
    + rf"(?P{pos_intron}))dup)"
)
"""str: Pattern matching a RNA duplication with numeric or relative-to-transcript
positions.
"""

rna_ins: str = (
    rf"(?P(?P{pos_intron})_(?P{pos_intron})ins(?P{rna_nt}+))"
)
"""str: Pattern matching a RNA insertion with numeric or relative-to-transcript
positions.
"""

rna_delins: str = (
    rf"(?P(?:(?:(?P{pos_intron})_(?P{pos_intron}))|"
    + rf"(?P{pos_intron}))delins(?P{rna_nt}+))"
)
"""str: Pattern matching a RNA deletion-insertion with numeric or relative-to-transcript
positions.
"""
"""Helpers for assembling and transforming mavehgvs regular expression strings.
"""

import re
from typing import Optional, Sequence

# Matches the opener of a named capture group and captures the group's name.
_GROUP_NAME_RE = re.compile(r"\(\?P<(\w+)>")


def _prefix_group_names(pattern: str) -> str:
    """Prefix every named group after the first with the first group's name.

    For example, the second group in ``(?P<sub>(?P<pos>...))`` is renamed to
    ``sub_pos``.
    """
    matches = list(_GROUP_NAME_RE.finditer(pattern))
    prefix = f"{matches[0].group(1)}_"
    pieces = []
    cursor = 0
    for match in matches[1:]:
        name_start = match.start(1)
        pieces.append(pattern[cursor:name_start])
        pieces.append(prefix)
        cursor = name_start
    pieces.append(pattern[cursor:])
    return "".join(pieces)


def combine_patterns(patterns: Sequence[str], groupname: Optional[str] = None) -> str:
    """Join several pattern strings into a single alternation pattern.

    A regular expression cannot contain two groups with the same name, so every
    named group after the first in each input pattern is renamed with the first
    group's name as a prefix.
    For example, ``(?P<sub>(?P<pos>[1-9][0-9]*)...`` becomes
    ``(?P<sub>(?P<sub_pos>[1-9][0-9]*)...``.

    Each input pattern is assumed to be fully enclosed in parentheses.

    Parameters
    ----------
    patterns : Sequence[str]
        Sequence of pattern strings to combine.

    groupname : Optional[str]
        Name for the capture group surrounding the resulting pattern.
        When None, a non-capturing group is used instead.

    Returns
    -------
    str
        Pattern string that matches any of the input patterns, with named
        groups renamed as described above to attempt to ensure uniqueness
        across the combined pattern.

    """
    renamed = [_prefix_group_names(p) for p in patterns]
    alternation = "|".join(renamed)
    if groupname is None:
        return f"(?:{alternation})"
    return f"(?P<{groupname}>{alternation})"


def remove_named_groups(pattern: str, noncapturing: bool = True) -> str:
    """Replace each named group opener in a pattern with plain parentheses.

    Parameters
    ----------
    pattern : str
        The pattern string to strip named groups from.

    noncapturing : bool
        When True, named group openers become non-capturing ``(?:``;
        when False, they become regular capturing ``(``.

    Returns
    -------
    str
        The pattern string without named match groups.

    """
    return re.sub(r"\(\?P<\w+>", "(?:" if noncapturing else "(", pattern)
    fullmatch = re.compile(pos_with_groups, flags=re.ASCII).fullmatch
    """Callable[[str, int, int], Optional[Match[str]]]: fullmatch callable for parsing
    positions

    Returns an :py:obj:`re.Match` object if the full string matches one of the position
    groups in :py:data:`pos_with_groups`.
    """

    def __init__(self, pos_str: str) -> None:
        """Parse a position string into a VariantPosition object.

        Parameters
        ----------
        pos_str : str
            The string to convert to a VariantPosition object.

        Raises
        ------
        MaveHgvsParseError
            If the string is not a valid MAVE-HGVS position, or if it combines
            an amino acid with an intronic or UTR component.

        """
        try:
            # fullmatch returns None on failure, so .groupdict() raises
            # AttributeError for any string that is not a valid position
            gdict = VariantPosition.fullmatch(pos_str).groupdict()
        except AttributeError:
            raise MaveHgvsParseError(f"invalid variant position string '{pos_str}'")

        self.position = None
        self.amino_acid = None
        self.intronic_position = None
        self.utr = None

        if gdict["position"].startswith("*"):  # 3' UTR position
            self.utr = True
            self.position = int(gdict["position"][1:])
        else:
            if gdict["position"].startswith("-"):  # 5' UTR position
                self.utr = True
            self.position = int(gdict["position"])

        if gdict["position_aa"] is not None:
            self.amino_acid = gdict["position_aa"]

        if gdict["position_intron"] is not None:
            self.intronic_position = int(gdict["position_intron"])

        # protein positions cannot carry intronic or UTR components
        if self.amino_acid is not None and (
            self.intronic_position is not None or self.utr is not None
        ):
            raise MaveHgvsParseError("invalid variant")

    def __repr__(self) -> str:
        """The object representation is equivalent to the input string.

        Returns
        -------
        str
            The object representation.

        """
        # 3' UTR positions are prefixed with '*'; 5' UTR positions are already
        # negative so they render with their '-' sign
        if self.utr and self.position > 0:
            p = f"*{self.position}"
        else:
            p = f"{self.position}"

        if self.intronic_position is not None:
            if self.intronic_position > 0:
                return f"{p}+{self.intronic_position}"
            else:
                return f"{p}{self.intronic_position}"
        elif self.amino_acid is not None:
            return f"{self.amino_acid}{p}"
        else:
            return p

    def __lt__(self, other: "VariantPosition") -> bool:
        """Less than comparison operator.

        Other comparison operators will be filled in using
        :py:func:`functools.total_ordering`.

        Parameters
        ----------
        other : VariantPosition
            The other VariantPosition to compare to.

        Returns
        -------
        bool
            True if this position evaluates as strictly less than the other position;
            else False.

        """
        if self.utr == other.utr:
            if self.position == other.position:
                if (
                    self.intronic_position == other.intronic_position
                ):  # pragma: no cover
                    # this case is covered by __eq__
                    return False
                elif self.intronic_position is None:
                    # exonic base vs intronic offset anchored at the same base:
                    # the exonic base precedes positive (5') intronic offsets
                    return other.intronic_position > 0
                elif other.intronic_position is None:
                    return self.intronic_position < 0
                else:
                    return self.intronic_position < other.intronic_position
            else:
                return self.position < other.position
        else:  # 5' < non-UTR < 3'
            if self.utr:
                if self.position < 0:  # self is in 5' UTR
                    return True
                else:  # self is in 3' UTR
                    return False
            else:
                if other.position < 0:  # other is in 5' UTR
                    return False
                else:  # other is in 3' UTR
                    return True

    def __eq__(self, other: "VariantPosition") -> bool:
        """Equality comparison operator.

        Note that the amino acid portion of a protein position is not used in this
        comparison.

        Other comparison operators will be filled in using
        :py:func:`functools.total_ordering`.

        Parameters
        ----------
        other : VariantPosition
            The other VariantPosition to compare to.

        Returns
        -------
        bool
            True if this position is the same as the other position; else False.

        """
        return (self.position, self.intronic_position, self.utr) == (
            other.position,
            other.intronic_position,
            other.utr,
        )

    def __ne__(self, other: "VariantPosition") -> bool:
        """Not equal comparison operator.

        Note that the amino acid portion of a protein position is not used in this
        comparison.

        Other comparison operators will be filled in using
        :py:func:`functools.total_ordering`.

        Parameters
        ----------
        other : VariantPosition
            The other VariantPosition to compare to.

        Returns
        -------
        bool
            True if this position is not the same as the other position; else False.

        """
        return (self.position, self.intronic_position, self.utr) != (
            other.position,
            other.intronic_position,
            other.utr,
        )

    def is_utr(self) -> bool:
        """Return whether this is a UTR position.

        Returns
        -------
        bool
            True if the object describes a position in the UTR; else False.

        """
        return self.utr is not None

    def is_intronic(self) -> bool:
        """Return whether this is an intronic position.

        Returns
        -------
        bool
            True if the object describes a position in an intron; else False.

        """
        return self.intronic_position is not None

    def is_protein(self) -> bool:
        """Return whether this is a protein position.

        Returns
        -------
        bool
            True if the object describes a position with an amino acid component; else
            False.

        """
        return self.amino_acid is not None

    def is_extended(self) -> bool:
        """Return whether this position was described using the extended syntax.

        Returns
        -------
        bool
            True if the position was described using the extended syntax; else False.

        """
        return self.utr is not None or self.intronic_position is not None

    # string annotation in the type hint below is required for Python 3.6 compatibility
    def is_adjacent(self, other: "VariantPosition") -> bool:
        """Return whether this variant and another are immediately adjacent in sequence
        space.

        The following special cases are not handled correctly:

        * The special case involving the last variant in a transcript sequence and the
          first base in the 3' UTR will be evaluated as not adjacent, as the object does
          not have sequence length information.
        * The special case involving the two middle bases in an intron where the
          numbering switches from positive with respect to the 5' end of the intron to
          negative with respect to the 3' end of the intron will be evaluated as not
          adjacent, as the object does not have intron length information.
        * This ignores the special case where there is an intron between the last base
          of the 5' UTR and the first base of the coding sequence because it is not
          biologically relevant to the best of my knowledge.

        Parameters
        ----------
        other : VariantPosition
            The object to calculate adjacency to.

        Returns
        -------
        bool
            True if the positions describe adjacent bases in sequence space; else False.

        """
        if self.utr == other.utr:
            if self.intronic_position is None and other.intronic_position is None:
                return abs(self.position - other.position) == 1
            elif (
                self.position == other.position
            ):  # intronic positions can only be adjacent if relative to the same base
                if (
                    self.intronic_position is not None
                    and other.intronic_position is not None
                ):
                    return abs(self.intronic_position - other.intronic_position) == 1
                else:
                    # special case for first/last base of intron and
                    # corresponding first/last base of exon
                    return (
                        self.intronic_position == -1
                        or self.intronic_position == 1
                        or other.intronic_position == -1
                        or other.intronic_position == 1
                    )
            else:
                return False
        else:  # special case for last base of 5' utr and first base of non-UTR sequence
            return (self.position == -1 and other.position == 1) or (
                other.position == -1 and self.position == 1
            )
def parse_variant_strings(
    variants: Iterable[str],
    targetseq: Optional[str] = None,
    expected_prefix: Optional[str] = None,
) -> Tuple[List[Optional[Variant]], List[Optional[str]]]:
    """Parse an iterable of MAVE-HGVS strings into Variant objects or error
    messages.

    Parameters
    ----------
    variants : Iterable[str]
        Iterable of MAVE-HGVS strings to parse.

    targetseq : Optional[str]
        If provided, all variants will be validated for agreement with this sequence.
        See the documentation for :py:class:`Variant` for further details.

    expected_prefix : Optional[str]
        If provided, all variants will be expected to have the same single-letter
        prefix.
        Variants that do not have this prefix will be treated as invalid.

    Returns
    -------
    Tuple[List[Optional[Variant]], List[Optional[str]]]
        Returns a pair of parallel lists, each the same length as the input.
        The first list contains a Variant object for each string that was
        successfully parsed and None otherwise.
        The second list contains None for each successfully parsed string and
        the error message otherwise.

    Raises
    ------
    ValueError
        If ``expected_prefix`` is not one of the recognized prefix letters.

    """
    if expected_prefix is not None and expected_prefix not in tuple("cgmnopr"):
        raise ValueError("invalid expected prefix")

    parsed: List[Optional[Variant]] = []
    errors: List[Optional[str]] = []

    for variant_string in variants:
        try:
            variant = Variant(variant_string, targetseq=targetseq)
        except MaveHgvsParseError as error:
            parsed.append(None)
            errors.append(str(error))
            continue
        if expected_prefix is not None and variant.prefix != expected_prefix:
            parsed.append(None)
            errors.append("unexpected variant prefix")
        else:
            parsed.append(variant)
            errors.append(None)

    return parsed, errors
mavehgvs.patterns.combined import any_variant
from mavehgvs.exceptions import MaveHgvsParseError

__all__ = ["Variant"]

AA_3_TO_1 = {value: key for key, value in AA_CODES.items()}
"""Dict[str, str]: for converting three-letter amino acid codes to single-letter codes.
"""


class Variant:
    fullmatch = re.compile(any_variant, flags=re.ASCII).fullmatch
    """Callable[[str, int, int], Optional[Match[str]]]: fullmatch callable for parsing a
    single MAVE-HGVS variant

    Returns an :py:obj:`re.Match` object if the full string defines a valid MAVE-HGVS
    variant.
    Match groups in the result can be used to extract components of the variant.
    """

    VTYPES = (
        "equal",  # equality
        "sub",  # substitution
        "fs",  # frame shift
        "del",  # deletion
        "dup",  # duplication
        "ins",  # insertion
        "delins",  # deletion-insertion
    )
    """Tuple[str]: variant type tags used in MAVE-HGVS patterns and variant type names.
    """

    def __init__(  # noqa: max-complexity: 37
        self,
        s: Union[str, Mapping[str, Any], Sequence[Mapping[str, Any]]],
        targetseq: Optional[str] = None,
        relaxed_ordering: bool = False,
    ):
        """Convert a MAVE-HGVS variant string into a corresponding object with named
        fields.

        Parameters
        ----------
        s : Union[str, Mapping[str, Any], Sequence[Mapping[str, Any]]]
            MAVE-HGVS variant string to convert into an object, dictionary type object
            containing key-value pairs corresponding to a MAVE-HGVS object, or
            list/tuple of dictionary type objects for a variant with multiple events.

        targetseq : Optional[str]
            If provided, the variant will be validated for agreement with this sequence.
            Target sequence validation is not supported for variants using the extended
            position syntax.

            This must be an amino acid sequence for protein variants or a nucleotide
            sequence for coding/noncoding/genomic variants.
            DNA and amino acid sequences should be in uppercase, RNA in lowercase.

        relaxed_ordering : bool
            If True, variants that do not observe the 3-prime rule for variant position
            ordering are allowed.
            The object representation will observe the 3-prime rule, so it may differ
            from the input string in this case.

        Raises
        ------
        MaveHgvsParseError
            If the string (or the string built from the dictionary input) is not a
            valid MAVE-HGVS variant, or fails semantic/target-sequence validation.
        ValueError
            If ``s`` is not a string, Mapping, or sequence of Mappings.

        """
        # NOTE: the str check must precede the Sequence check, since str is itself
        # a Sequence. Dictionary inputs are converted to a variant string first and
        # then validated through the same regular expression path as string inputs.
        if isinstance(s, str):  # variant string to parse
            variant_string = s
        elif isinstance(s, Mapping):  # dictionary-style single variant
            variant_string = self._variant_dictionary_to_string(s, include_prefix=True)
        elif isinstance(s, Sequence):  # dictionary-style multi-variant
            if not all(isinstance(v, Mapping) for v in s):
                raise ValueError("multi-variant iterable must contain Mapping objects")
            try:
                all_prefixes = [v["prefix"] for v in s]
            except KeyError:
                raise MaveHgvsParseError("variant dictionary missing required keys")
            if len(set(all_prefixes)) != 1:
                raise MaveHgvsParseError(
                    "cannot combine variants with different prefixes"
                )
            multivariants = ";".join(
                self._variant_dictionary_to_string(v, include_prefix=False) for v in s
            )
            variant_string = f"{s[0]['prefix']}.[{multivariants}]"
        else:
            raise ValueError("can only create Variants from string or Mapping objects")

        variant_match = self.fullmatch(variant_string)
        if variant_match is None:
            raise MaveHgvsParseError("failed regular expression validation")
        else:
            match_dict = variant_match.groupdict()

            # set target id if present
            if match_dict["target_id"] is not None:
                self._target_id = match_dict["target_id"]
            else:
                self._target_id = None

            # set prefix and determine if this is a multi-variant
            # the prefix is the first character of the matched group (e.g. 'c', 'p')
            if match_dict["single_variant"] is not None:
                self.variant_count = 1
                self._prefix = match_dict["single_variant"][0]
            elif match_dict["multi_variant"] is not None:
                self.variant_count = len(variant_string.split(";"))
                self._prefix = match_dict["multi_variant"][0]
            else:  # pragma: no cover
                raise ValueError("invalid match type")

            if self.variant_count == 1:
                (
                    self._variant_types,
                    self._positions,
                    self._sequences,
                ) = self._process_string_variant(
                    match_dict, relaxed_ordering=relaxed_ordering
                )
            elif self.variant_count > 1:
                self._variant_types = list()
                self._positions = list()
                self._sequences = list()

                # format each individual variant event as a single variant and parse it
                # [3:-1] strips the leading "X.[" (prefix, dot, bracket) and the
                # trailing "]" from the multi-variant string
                for variant_substring in match_dict["multi_variant"][3:-1].split(";"):
                    groupdict = self.fullmatch(
                        f"{self._prefix}.{variant_substring}"
                    ).groupdict()
                    vt, p, s = self._process_string_variant(
                        groupdict, relaxed_ordering=relaxed_ordering
                    )
                    if vt == "equal":
                        raise MaveHgvsParseError(
                            "multi-variants cannot contain target-identical variants"
                        )

                    self._variant_types.append(vt)
                    self._positions.append(p)
                    self._sequences.append(s)

                # ensure that multiple variants aren't defined for the same positions
                # each element is either a single VariantPosition or a
                # (start, end) tuple, so all four pairings are checked
                for vp1, vp2 in itertools.combinations(self._positions, 2):
                    if isinstance(vp1, VariantPosition) and isinstance(
                        vp2, VariantPosition
                    ):  # both single position
                        if vp1 == vp2:
                            raise MaveHgvsParseError(
                                "multi-variant has multiple changes at same position"
                            )
                    elif isinstance(vp1, VariantPosition) and isinstance(vp2, Tuple):
                        if vp2[0] <= vp1 <= vp2[1]:
                            raise MaveHgvsParseError(
                                "multi-variant has overlapping changes"
                            )
                    elif isinstance(vp1, Tuple) and isinstance(vp2, VariantPosition):
                        if vp1[0] <= vp2 <= vp1[1]:
                            raise MaveHgvsParseError(
                                "multi-variant has overlapping changes"
                            )
                    elif isinstance(vp1, Tuple) and isinstance(vp2, Tuple):
                        if (
                            vp1[0] <= vp2[0] <= vp1[1]
                            or vp1[0] <= vp2[1] <= vp1[1]
                            or vp2[0] <= vp1[0] <= vp2[1]
                            or vp2[0] <= vp1[1] <= vp2[1]
                        ):
                            raise MaveHgvsParseError(
                                "multi-variant has overlapping changes"
                            )
                    else:  # pragma: no cover
                        raise ValueError("invalid position type")

                # re-order variants and validate
                def sort_key(x):
                    # sort by the single position, or the start of a position pair
                    if isinstance(x[1], VariantPosition):
                        return x[1]
                    elif isinstance(x[1], Tuple):
                        return x[1][0]
                    else:  # pragma: no cover
                        raise ValueError("invalid position type")

                variant_list = list(self.variant_tuples())
                ordered_list = sorted(variant_list, key=sort_key)
                if variant_list != ordered_list:
                    if relaxed_ordering:
                        self._variant_types = [x[0] for x in ordered_list]
                        self._positions = [x[1] for x in ordered_list]
                        self._sequences = [x[2] for x in ordered_list]
                    else:
                        raise MaveHgvsParseError("multi-variants not in sorted order")

                # make sure there is at most one frame shift
                if sum(x == "fs" for x in self._variant_types) > 1:
                    raise MaveHgvsParseError("maximum of one frame shift is permitted")

                # make sure the frame shift is last if present
                if any(x == "fs" for x in self._variant_types):
                    if self._variant_types[-1] != "fs":
                        raise MaveHgvsParseError(
                            "no variants are permitted to follow a frame shift"
                        )

            else:  # pragma: no cover
                raise ValueError("invalid variant count")

        if targetseq is not None:
            for vtype, pos, seq in self.variant_tuples():
                if self._prefix != "p" and vtype == "sub":
                    # nucleotide substitutions carry the reference base in seq[0]
                    self._target_validate(pos, seq[0], targetseq)
                elif (
                    pos is None and vtype == "equal"
                ):  # special case for full-length target identical variants
                    pass
                else:
                    self._target_validate(pos, None, targetseq)

    def variant_tuples(
        self,
    ) -> Generator[
        Tuple[
            str,
            Optional[Union[VariantPosition, Tuple[VariantPosition, VariantPosition]]],
            Optional[Union[str, Tuple[str, str]]],
        ],
        None,
        None,
    ]:
        """Generator that yields tuples containing the variant components.

        Yields
        ------
        Tuple
            Tuple of the variant type, position(s), and sequence(s) for each element in
            the variant.

        """
        if self.is_multi_variant():
            for vtype, pos, seq in zip(
                self._variant_types, self._positions, self._sequences
            ):
                yield vtype, pos, seq
        else:
            # single variant: the attributes are scalars, so yield them as one tuple
            yield self._variant_types, self._positions, self._sequences

    def _process_string_variant(  # noqa: max-complexity: 23
        self, match_dict: Dict[str, str], relaxed_ordering: bool
    ) -> Tuple[
        str,
        Optional[Union[VariantPosition, Tuple[VariantPosition, VariantPosition]]],
        Optional[Union[str, Tuple[str, str]]],
    ]:
        """Process the match dictionary from a single variant into its components.

        Parameters
        ----------
        match_dict : Dict[str, str]
            Match dictionary from the MAVE-HGVS regular expression.
        relaxed_ordering : bool
            If True, variants that do not observe the 3-prime rule for variant position
            ordering are allowed.

        Returns
        -------
        Tuple[str, Optional[Union[VariantPosition, Tuple[VariantPosition, \
        VariantPosition]]], Optional[Union[str, Tuple[str, str]]]]
            Returns a 3-tuple containing the variant type, optional position (or
            start/end positions), and optional before/after substitution sequences or
            inserted sequence.

        """
        variant_type = None
        positions = None
        sequences = None

        # determine which named groups to check
        # group names encode the sequence type and variant type (e.g. "dna_sub_c")
        if self._prefix == "p":
            pattern_group_tuples = [(f"pro_{t}", t) for t in self.VTYPES]
        elif self._prefix == "r":
            pattern_group_tuples = [(f"rna_{t}", t) for t in self.VTYPES if t != "fs"]
        elif self._prefix in tuple("cn"):
            pattern_group_tuples = [
                (f"dna_{t}_{self._prefix}", t) for t in self.VTYPES if t != "fs"
            ]
        elif self._prefix in tuple("gmo"):
            pattern_group_tuples = [
                (f"dna_{t}_gmo", t) for t in self.VTYPES if t != "fs"
            ]
        else:  # pragma: no cover
            raise ValueError("unexpected prefix")

        # set the variant type
        # exactly one of the candidate pattern groups should have matched
        vtype_set = False
        pattern_group = None
        for pg, vtype in pattern_group_tuples:
            if match_dict[pg] is not None:
                if vtype_set:  # pragma: no cover
                    raise ValueError(f"ambiguous match: '{pg}' and '{pattern_group}'")
                variant_type = vtype
                pattern_group = pg
                vtype_set = True

        # set the position and sequence
        if variant_type == "sub":
            positions = VariantPosition(match_dict[f"{pattern_group}_position"])
            if self._prefix == "p":
                # protein positions embed the reference amino acid
                sequences = (positions.amino_acid, match_dict[f"{pattern_group}_new"])
            elif self._prefix in tuple("gmocnr"):
                sequences = (
                    match_dict[f"{pattern_group}_ref"],
                    match_dict[f"{pattern_group}_new"],
                )
            else:  # pragma: no cover
                raise ValueError("unexpected prefix")
        elif variant_type in ("equal", "fs", "del", "dup", "ins", "delins"):
            # set position
            if (
                match_dict.get(f"{pattern_group}_position") is not None
            ):  # use get() since ins pattern doesn't have pos
                positions = VariantPosition(match_dict[f"{pattern_group}_position"])
            elif (
                match_dict.get(f"{pattern_group}_start") is not None
                and match_dict.get(f"{pattern_group}_end") is not None
            ):
                positions = (
                    VariantPosition(match_dict[f"{pattern_group}_start"]),
                    VariantPosition(match_dict[f"{pattern_group}_end"]),
                )
                # extra validation on positions
                if positions[0] >= positions[1]:
                    if relaxed_ordering:
                        positions = (positions[1], positions[0])
                    else:
                        raise MaveHgvsParseError(
                            "start position must be before end position"
                        )
                if variant_type == "ins":
                    if not positions[0].is_adjacent(positions[1]):
                        raise MaveHgvsParseError("insertion positions must be adjacent")
            else:  # pragma: no cover
                if variant_type != "equal":
                    raise MaveHgvsParseError("variant position not found")

            # set sequence if needed
            if variant_type in ("ins", "delins"):
                sequences = match_dict[f"{pattern_group}_seq"]
            elif variant_type == "equal":
                if (
                    match_dict[f"{pattern_group}_equal"] is not None
                ):  # special case for target identity
                    sequences = match_dict[f"{pattern_group}_equal"]
                elif match_dict["pro_equal_equal_sy"] is not None:
                    sequences = match_dict["pro_equal_equal_sy"]

        return variant_type, positions, sequences

    # TODO: API documentation for the dictionary objects
    @staticmethod
    def _variant_dictionary_to_string(  # noqa: max-complexity: 25
        vdict: Mapping[str, Any], include_prefix: bool
    ) -> str:
        """Convert a match dictionary from a single variant into a string for further
        validation.

        This method performs minimal validation of the values provided in the input, and
        instead converts it into a variant string that is validated using the regular
        expression based validators.

        Parameters
        ----------
        vdict : Mapping[str, Any]
            Key-value pairs describing a single variant.
        include_prefix: bool
            If True, the variant prefix and '.' will be included in the string; else it
            is omitted (for use with multi-variants).

        Returns
        -------
        str
            A string representing this variant.

        Raises
        ------
        MaveHgvsParseError
            If the dictionary does not have a valid set of keys.

        """
        try:
            variant_type = vdict["variant_type"]
            prefix = vdict["prefix"]
        except KeyError:
            raise MaveHgvsParseError("variant dictionary missing required keys")

        # each variant type requires an exact key set, which also depends on
        # whether the variant is protein ("p") or nucleotide
        if variant_type == "equal":
            expected_keys = ["variant_type", "prefix"]
            if prefix == "p":
                expected_keys.extend(["position", "target"])
            else:
                expected_keys.extend(["start_position", "end_position"])
            if sorted(vdict.keys()) != sorted(expected_keys):
                raise MaveHgvsParseError("variant dictionary contains invalid keys")
            if prefix == "p":
                variant_string = f"{vdict['target']}{vdict['position']}="
            elif vdict["start_position"] == vdict["end_position"]:
                variant_string = f"{vdict['start_position']}="
            else:
                variant_string = f"{vdict['start_position']}_{vdict['end_position']}="
        elif variant_type == "sub":
            if sorted(vdict.keys()) != sorted(
                ["variant_type", "prefix", "position", "target", "variant"]
            ):
                raise MaveHgvsParseError("variant dictionary contains invalid keys")
            if prefix == "p":
                variant_string = (
                    f"{vdict['target']}{vdict['position']}{vdict['variant']}"
                )
            else:
                variant_string = (
                    f"{vdict['position']}{vdict['target']}>{vdict['variant']}"
                )
        elif variant_type == "fs":
            if sorted(vdict.keys()) != sorted(
                ["variant_type", "prefix", "position", "target"]
            ):
                raise MaveHgvsParseError("variant dictionary contains invalid keys")
            if prefix == "p":
                variant_string = f"{vdict['target']}{vdict['position']}fs"
            else:
                raise MaveHgvsParseError(
                    "frame shifts are only supported for protein variants"
                )
        elif variant_type in ("del", "dup"):
            expected_keys = ["variant_type", "prefix", "start_position", "end_position"]
            if prefix == "p":
                expected_keys.extend(["start_target", "end_target"])
            if sorted(vdict.keys()) != sorted(expected_keys):
                raise MaveHgvsParseError("variant dictionary contains invalid keys")
            if prefix == "p":
                start = f"{vdict['start_target']}{vdict['start_position']}"
                end = f"{vdict['end_target']}{vdict['end_position']}"
            else:
                start = vdict["start_position"]
                end = vdict["end_position"]
            if start == end:
                # single-position form (e.g. "12del" instead of "12_12del")
                variant_string = f"{start}{variant_type}"
            else:
                variant_string = f"{start}_{end}{variant_type}"
        elif variant_type in ("ins", "delins"):
            expected_keys = [
                "variant_type",
                "prefix",
                "start_position",
                "end_position",
                "variant",
            ]
            if prefix == "p":
                expected_keys.extend(["start_target", "end_target"])
            if sorted(vdict.keys()) != sorted(expected_keys):
                raise MaveHgvsParseError("variant dictionary contains invalid keys")
            if prefix == "p":
                start = f"{vdict['start_target']}{vdict['start_position']}"
                end = f"{vdict['end_target']}{vdict['end_position']}"
            else:
                start = vdict["start_position"]
                end = vdict["end_position"]
            if start == end and variant_type == "delins":
                variant_string = f"{start}{variant_type}{vdict['variant']}"
            else:
                variant_string = f"{start}_{end}{variant_type}{vdict['variant']}"
        else:
            raise MaveHgvsParseError("invalid variant type")

        if include_prefix:
            return f"{vdict['prefix']}.{variant_string}"
        else:
            return variant_string

    def _format_component_variants(self) -> List[str]:  # noqa: max-complexity: 14
        """Format each of the component variants of this variant into a variant string.

        The result is a list of strings, each representing a single variant. If this
        variant is a single variant, the list will contain a single element equivalent
        to the input string. For multi-variants, the list will contain each component
        variant of the variant.

        Returns
        -------
        List[str]
            List of formatted component variants.

        """

        def format_variant(
            vtype: str,
            pos: Union[VariantPosition, Tuple[VariantPosition, VariantPosition]],
            seq: Optional[Union[str, Tuple[str, str]]],
        ) -> str:
            """Helper function for building variant strings.

            Parameters
            ----------
            vtype : str
                The variant type, as described by :py:obj:`Variant.VTYPES`
            pos : Union[VariantPosition, Tuple[VariantPosition, VariantPosition]]
                The position or pair of positions describing the variant.
            seq : Optional[Union[str, Tuple[str, str]]]
                The sequence or pair of sequences describing the variant.
                Only used for substitions, insertions, and deletion-insertions.

            Returns
            -------
            str
                A string representing this variant element.

            """
            if vtype == "sub":
                if self._prefix == "p":  # protein variant
                    return f"{pos}{seq[1]}"
                else:  # nucleotide variant
                    return f"{pos}{seq[0]}>{seq[1]}"
            elif vtype == "fs":
                return f"{pos}fs"
            elif vtype in ("del", "dup"):
                if isinstance(pos, tuple):
                    return f"{pos[0]}_{pos[1]}{vtype}"
                else:
                    return f"{pos}{vtype}"
            elif vtype in ("ins", "delins"):
                if isinstance(pos, tuple):
                    return f"{pos[0]}_{pos[1]}{vtype}{seq}"
                else:
                    return f"{pos}{vtype}{seq}"
            elif vtype == "equal":
                if pos is None:
                    # full-length target-identical variant (e.g. "c.=")
                    return f"{seq}"
                elif isinstance(pos, tuple):
                    return f"{pos[0]}_{pos[1]}{seq}"
                else:
                    return f"{pos}{seq}"
            else:  # pragma: no cover
                raise ValueError("invalid variant type")

        return [format_variant(*t) for t in self.variant_tuples()]

    def __eq__(self, other: "Variant") -> bool:
        """Equality comparison operator.

        Parameters
        ----------
        other : Variant
            The other Variant to compare to.

        Returns
        -------
        bool
            True if this variant is the same as the other position; else False.

        """
        # compare all components that define the variant's identity
        return (
            self._target_id,
            self.variant_count,
            self._prefix,
            self._variant_types,
            self._positions,
            self._sequences,
        ) == (
            other._target_id,
            other.variant_count,
            other._prefix,
            other._variant_types,
            other._positions,
            other._sequences,
        )

    def __repr__(self) -> str:
        """The object representation is equivalent to the input string.

        Returns
        -------
        str
            The object representation.

        """

        elements = self._format_component_variants()

        if self._target_id is not None:
            prefix = f"{self._target_id}:{self._prefix}"
        else:
            prefix = f"{self._prefix}"

        if self.is_multi_variant():
            return f"{prefix}.[{';'.join(elements)}]"
        else:
            return f"{prefix}.{elements[0]}"

    @staticmethod
    def _target_validate(
        pos: Union[VariantPosition, Tuple[VariantPosition, VariantPosition]],
        ref: Optional[str],
        target: str,
    ) -> None:
        """Determine whether the target portion of a variant matches the target
        sequence.

        Note that variants using extended syntax cannot be validated with this method.
        If an extended syntax variant is encountered, it will be interpreted as
        valid/matching.

        Parameters
        ----------
        pos : Union[VariantPosition, Tuple[VariantPosition, VariantPosition]]
            Single variant position or start/end tuple for an indel.
        ref : Optional[str]
            Reference base to validate for nucleotide substitutions.
            This should be None for amino acid substitutions, since the reference is
            included in the VariantPosition.
        target : str
            Target sequence. This must be an amino acid sequence for protein variants or
            a nucleotide sequence for coding/noncoding/genomic variants.
            RNA sequences should be in lowercase, DNA sequences should be in uppercase.

        Returns
        -------
        None

        Raises
        ------
        MaveHgvsParseError
            If the reference base or amino acid does not match the target at the given
            position
        MaveHgvsParseError
            If the position is outside the bounds of the target.

        """
        # normalize to a tuple so single positions and ranges share one code path
        if not isinstance(pos, tuple):
            pos = (pos,)

        if any(p.is_extended() for p in pos):
            # extended (intronic/UTR) positions can't be checked against the target
            return
        elif any(p.position > len(target) for p in pos):
            raise MaveHgvsParseError("variant coordinate out of bounds")
        else:
            if ref is not None and len(pos) == 1:  # nucleotide substitution
                if target[pos[0].position - 1] != ref:
                    raise MaveHgvsParseError("variant reference does not match target")
            elif pos[0].amino_acid is not None:  # protein variant
                for p in pos:
                    if target[p.position - 1] != AA_3_TO_1[p.amino_acid]:
                        raise MaveHgvsParseError(
                            "variant reference does not match target"
                        )
            else:
                return

    def is_target_identical(self) -> bool:
        """Return whether the variant describes the "wild-type" sequence or is the
        special synonymous variant.

        This is the variant described with only the equals sign (e.g. ``c.=``)
        or the uncertain equals protein variant (e.g. ``p.(=)``).

        Coding or genomic variants that specify an identical region (e.g. ``c.1_3=`` are
        also considered target identical.

        Synonymous protein variants (e.g. ``p.Leu12=``) are not considered target
        identical.

        Returns
        -------
        bool
            True if this variant describes the wild-type or target sequence; else False.

        """
        if self._variant_types == "equal":
            if self._prefix == "p":
                # protein "equal" with a position is synonymous, not target-identical
                return self._positions is None
            else:
                return True
        else:
            return False

    def is_synonymous(self) -> bool:
        """Return whether the variant describes a synonymous protein variant or is the
        special synonymous variant.

        Returns
        -------
        bool
            True if this variant describes a synonymous protein variant; else False.

        """
        return self._variant_types == "equal" and self._prefix == "p"

    def is_multi_variant(self) -> bool:
        """Return whether the variant is a multi-variant.

        A multi-variant is a single variant describing multiple events enclosed in '[]'.
        Multi-variants are referred to as alleles in the HGVS standard.

        Returns
        -------
        bool
            True if the variant is a multi-variant; else False.

        """
        return self.variant_count > 1

    @property
    def prefix(self) -> str:
        """The single-letter prefix for this variant.

        Returns
        -------
        str
            Single-letter prefix corresponding to the sequence type.

            See the following table for sequence type prefixes and their meanings:

            .. csv-table::
               :file: ../docs/prefix.csv
               :header: "Prefix", "Description"
               :widths: 5, 20

        """
        return self._prefix

    @property
    def variant_type(self) -> Union[str, List[str]]:
        """The type for this variant.

        Valid variant types are:

        * ``'equal'`` for target-identical or synonymous variants
        * ``'sub'`` for substitutions
        * ``'fs'`` for frame shifts
        * ``'del'`` for deletions
        * ``'dup'`` for duplications
        * ``'ins'`` for insertions
        * ``'delins'`` for deletion-insertions

        Returns
        -------
        Union[str, List[str]]
            String containing the variant type. Returns a list of strings for a
            multi-variant.

        """
        return self._variant_types

    def uses_extended_positions(self) -> bool:
        """Return whether the variant uses the extended position notation to describe
        intronic or UTR positions.

        Examples of variants using the extended position notation include:

        * c.122-6T>A
        * r.*33a>c
        * c.43-6_595+12delinsCTT

        This should always be false for variants with a genomic or protein prefix, as
        variants with these prefixes cannot use positions relative to a transcript under
        the MAVE-HGVS specification.

        Returns
        -------
        bool
            True if the variant (or any of the individual variants for a multi-variant)
            uses the extended position notation.

        """
        if self.is_multi_variant():
            # flatten single positions and (start, end) tuples into one list
            all_positions = list()
            for p in self.positions:
                if isinstance(p, tuple):
                    all_positions.extend(p)
                else:
                    all_positions.append(p)
            return any(p.is_extended() for p in all_positions)
        else:
            if self._positions is None:  # special case for target identity
                return False
            elif isinstance(self.positions, tuple):
                return any(p.is_extended() for p in self.positions)
            else:
                return self.positions.is_extended()

    @property
    def positions(
        self,
    ) -> Optional[
        Union[
            VariantPosition,
            Tuple[VariantPosition, VariantPosition],
            List[Union[VariantPosition, Tuple[VariantPosition, VariantPosition]]],
        ]
    ]:
        """The variant position as a single position or tuple containing start and end
        positions.

        Each position is an instance of :py:class:`mavehgvs.position.VariantPosition`.

        Returns
        -------
        Union[VariantPosition, Tuple[VariantPosition, VariantPosition], \
        List[Union[VariantPosition, Tuple[VariantPosition, VariantPosition]]]]
            Variant position or tuple of start/end positions.
            Returns a list of positions or start/end tuples for a multi-variant.

        """
        return self._positions

    @property
    def sequence(
        self,
    ) -> Optional[
        Union[str, Tuple[str, str], List[Optional[Union[str, Tuple[str, str]]]]]
    ]:
        """The sequence portion of the variant.

        This can be a tuple of target and new bases for a substitution, a single
        sequence for insertions or deletion-insertions, or the "=" character for
        variants that are identical to the target sequence.

        Returns
        -------
        Union[str, Tuple[str, str], List[Optional[Union[str, Tuple[str, str]]]]]]
            Tuple of ref/new bases for substitutions, string containing inserted
            sequence, or the "=" character.
            Returns None if the variant does not have a sequence component (deletion or
            duplication).
            Returns a list for a multi-variant, which may contain None values for
            deletions or duplications.

        """
        return self._sequences

    @property
    def target_id(self) -> Optional[str]:
        """The target identifier for the variant (if applicable).

        The target identifier precedes the prefix and is followed by a ``:``.
        For example in ``NM_001130145.3:c.832C>T`` the target identifier is
        "NM_001130145.3".

        Returns
        -------
        Optional[str]
            The target identifier, or None if it is not set.

        """
        return self._target_id

    def components(self) -> Tuple[str, ...]:
        """The component substrings of a variant.

        Returns
        -------
        Tuple[str, ...]
            List of component substrings for this variant.

        """
        if self.target_id is not None:
            prefix = f"{self.target_id}:{self.prefix}"
        else:
            prefix = f"{self.prefix}"

        return tuple(
            [f"{prefix}.{component}" for component in self._format_component_variants()]
        )
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VariantEffect/mavehgvs/69476dde5391022e7c0eca32ecd1734e371436eb/tests/__init__.py
--------------------------------------------------------------------------------
/tests/test_patterns/__init__.py:
--------------------------------------------------------------------------------
import itertools
from typing import Iterable, Iterator, Tuple


def build_multi_variants(
    valid_strings: Iterable[str],
    invalid_strings: Iterable[str],
    min_length: int = 2,
    max_length: int = 3,
) -> Tuple[Iterator, Iterator]:
    """Build iterators of valid and invalid multi-variant strings to test.

    Parameters
    ----------
    valid_strings : Iterable[str]
        Iterable containing all the valid single-variant strings.
    invalid_strings : Iterable[str]
        Iterable containing all the invalid single-variant strings.
    min_length : int
        Minimum length of multi-variants that will be generated.
    max_length : int
        Maximum length of multi-variants that will be generated.
        Note that increasing this value may massively increase test runtime.

    Returns
    -------
    Tuple[Iterator, Iterator]
        Returns iterators containing semicolon-separated multi-variant strings.

        The first iterator contains multi-variants from only valid_strings and the
        second iterator contains multi-variants that include at least one variant from
        invalid_strings.
33 | """ 34 | # create an iterable of permutations for each length and store them in lists 35 | valid_multivariants = list() 36 | invalid_multivariants = list() 37 | 38 | for i in range(min_length, max_length + 1): 39 | valid_multivariants.append( 40 | ";".join(x) for x in itertools.permutations(valid_strings, i) 41 | ) 42 | invalid_multivariants.append( 43 | ";".join(x) 44 | for x in itertools.permutations( 45 | itertools.chain(valid_strings, invalid_strings), i 46 | ) 47 | if any(y in x for y in invalid_strings) 48 | ) 49 | 50 | # combine the lists into single iterators and return 51 | return itertools.chain.from_iterable( 52 | valid_multivariants 53 | ), itertools.chain.from_iterable(invalid_multivariants) 54 | -------------------------------------------------------------------------------- /tests/test_patterns/test_dna.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import re 3 | from mavehgvs.patterns.dna import ( 4 | dna_equal_c, 5 | dna_equal_n, 6 | dna_equal_gmo, 7 | dna_sub_c, 8 | dna_sub_n, 9 | dna_sub_gmo, 10 | dna_del_c, 11 | dna_del_n, 12 | dna_del_gmo, 13 | dna_dup_c, 14 | dna_dup_n, 15 | dna_dup_gmo, 16 | dna_ins_c, 17 | dna_ins_n, 18 | dna_ins_gmo, 19 | dna_delins_c, 20 | dna_delins_n, 21 | dna_delins_gmo, 22 | dna_variant_c, 23 | dna_variant_n, 24 | dna_variant_gmo, 25 | dna_single_variant, 26 | dna_multi_variant, 27 | ) 28 | from . 
import build_multi_variants 29 | 30 | 31 | class TestDnaEqualC(unittest.TestCase): 32 | @classmethod 33 | def setUpClass(cls): 34 | cls.pattern = re.compile(dna_equal_c, flags=re.ASCII) 35 | 36 | cls.valid_strings = [ 37 | "=", 38 | "18=", 39 | "10_14=", 40 | "122-6=", 41 | "*24=", 42 | "19+22=", 43 | "19+22_88=", 44 | "-27+3=", 45 | ] 46 | 47 | cls.invalid_strings = ["=22", "(=)", "18(=)"] 48 | 49 | def test_valid_strings(self): 50 | for s in self.valid_strings: 51 | with self.subTest(s=s): 52 | self.assertIsNotNone( 53 | self.pattern.fullmatch(s), msg=f'failed to match "{s}"' 54 | ) 55 | 56 | def test_invalid_strings(self): 57 | for s in self.invalid_strings: 58 | with self.subTest(s=s): 59 | self.assertIsNone( 60 | self.pattern.fullmatch(s), msg=f'incorrectly matched "{s}"' 61 | ) 62 | 63 | 64 | class TestDnaEqualN(unittest.TestCase): 65 | @classmethod 66 | def setUpClass(cls): 67 | cls.pattern = re.compile(dna_equal_n, flags=re.ASCII) 68 | 69 | cls.valid_strings = ["="] 70 | 71 | cls.invalid_strings = [ 72 | "=22", 73 | "(=)", 74 | "18(=)", 75 | "-27+3=", 76 | "*24=", 77 | "18=", 78 | "10_14=", 79 | "122-6=", 80 | "19+22=", 81 | "19+22_88=", 82 | ] 83 | 84 | def test_valid_strings(self): 85 | for s in self.valid_strings: 86 | with self.subTest(s=s): 87 | self.assertIsNotNone( 88 | self.pattern.fullmatch(s), msg=f'failed to match "{s}"' 89 | ) 90 | 91 | def test_invalid_strings(self): 92 | for s in self.invalid_strings: 93 | with self.subTest(s=s): 94 | self.assertIsNone( 95 | self.pattern.fullmatch(s), msg=f'incorrectly matched "{s}"' 96 | ) 97 | 98 | 99 | class TestDnaEqualGMO(unittest.TestCase): 100 | @classmethod 101 | def setUpClass(cls): 102 | cls.pattern = re.compile(dna_equal_gmo, flags=re.ASCII) 103 | 104 | cls.valid_strings = ["=", "18=", "10_14="] 105 | 106 | cls.invalid_strings = [ 107 | "=22", 108 | "(=)", 109 | "18(=)", 110 | "122-6=", 111 | "*24=", 112 | "19+22=", 113 | "19+22_88=", 114 | "-27+3=", 115 | ] 116 | 117 | def test_valid_strings(self): 
class TestDnaSubC(unittest.TestCase):
    """Validate the coding-sequence (c.) DNA substitution pattern."""

    @classmethod
    def setUpClass(cls):
        # Compile once for the whole class; re.ASCII restricts matching to
        # ASCII-only character classes.
        cls.pattern = re.compile(dna_sub_c, flags=re.ASCII)

        cls.valid_strings = ["48C>A", "122-6T>A", "*24G>C", "19+22A>G", "-27+3T>C"]
        cls.invalid_strings = ["22g>u", "48C>W", "122=/T>A"]

    def test_valid_strings(self):
        """Each well-formed substitution must match the full string."""
        for variant in self.valid_strings:
            with self.subTest(s=variant):
                self.assertIsNotNone(
                    self.pattern.fullmatch(variant), msg=f'failed to match "{variant}"'
                )

    def test_invalid_strings(self):
        """Each malformed substitution must be rejected outright."""
        for variant in self.invalid_strings:
            with self.subTest(s=variant):
                self.assertIsNone(
                    self.pattern.fullmatch(variant),
                    msg=f'incorrectly matched "{variant}"',
                )
class TestDnaDelC(unittest.TestCase):
    """Validate the coding-sequence (c.) DNA deletion pattern."""

    @classmethod
    def setUpClass(cls):
        cls.pattern = re.compile(dna_del_c, flags=re.ASCII)

        cls.valid_strings = [
            "44del",
            "1_95del",
            "78+5_78+10del",
            "-25+1_-25+3del",
            "*17del",
        ]
        cls.invalid_strings = [
            "(78+1_79-1)_(124+1_125-1)del",
            "(?_85)_(124_?)del",
            "122=/del",
        ]

    def test_valid_strings(self):
        """Each supported deletion form must match the full string."""
        for variant in self.valid_strings:
            with self.subTest(s=variant):
                self.assertIsNotNone(
                    self.pattern.fullmatch(variant), msg=f'failed to match "{variant}"'
                )

    def test_invalid_strings(self):
        """Unsupported deletion forms must be rejected."""
        for variant in self.invalid_strings:
            with self.subTest(s=variant):
                self.assertIsNone(
                    self.pattern.fullmatch(variant),
                    msg=f'incorrectly matched "{variant}"',
                )
test_invalid_strings(self): 261 | for s in self.invalid_strings: 262 | with self.subTest(s=s): 263 | self.assertIsNone( 264 | self.pattern.fullmatch(s), msg=f'incorrectly matched "{s}"' 265 | ) 266 | 267 | 268 | class TestDnaDelGmo(unittest.TestCase): 269 | @classmethod 270 | def setUpClass(cls): 271 | cls.pattern = re.compile(dna_del_gmo, flags=re.ASCII) 272 | 273 | cls.valid_strings = ["44del", "1_95del"] 274 | 275 | cls.invalid_strings = [ 276 | "78+5_78+10del", 277 | "-25+1_-25+3del", 278 | "*17del", 279 | "(78+1_79-1)_(124+1_125-1)del", 280 | "(?_85)_(124_?)del", 281 | "122=/del", 282 | ] 283 | 284 | def test_valid_strings(self): 285 | for s in self.valid_strings: 286 | with self.subTest(s=s): 287 | self.assertIsNotNone( 288 | self.pattern.fullmatch(s), msg=f'failed to match "{s}"' 289 | ) 290 | 291 | def test_invalid_strings(self): 292 | for s in self.invalid_strings: 293 | with self.subTest(s=s): 294 | self.assertIsNone( 295 | self.pattern.fullmatch(s), msg=f'incorrectly matched "{s}"' 296 | ) 297 | 298 | 299 | class TestDnaDupC(unittest.TestCase): 300 | @classmethod 301 | def setUpClass(cls): 302 | cls.pattern = re.compile(dna_dup_c, flags=re.ASCII) 303 | 304 | cls.valid_strings = [ 305 | "22_24dup", 306 | "77dup", 307 | "101+1_101+7dup", 308 | "-25+1_-25+3dup", 309 | "*17dup", 310 | ] 311 | 312 | cls.invalid_strings = [ 313 | "(78+1_79-1)_(124+1_125-1)dup", 314 | "(?_85)_(124_?)dup", 315 | "122_125=//dup", 316 | ] 317 | 318 | def test_valid_strings(self): 319 | for s in self.valid_strings: 320 | with self.subTest(s=s): 321 | self.assertIsNotNone( 322 | self.pattern.fullmatch(s), msg=f'failed to match "{s}"' 323 | ) 324 | 325 | def test_invalid_strings(self): 326 | for s in self.invalid_strings: 327 | with self.subTest(s=s): 328 | self.assertIsNone( 329 | self.pattern.fullmatch(s), msg=f'incorrectly matched "{s}"' 330 | ) 331 | 332 | 333 | class TestDnaDupN(unittest.TestCase): 334 | @classmethod 335 | def setUpClass(cls): 336 | cls.pattern = 
re.compile(dna_dup_n, flags=re.ASCII) 337 | 338 | cls.valid_strings = ["22_24dup", "77dup", "101+1_101+7dup"] 339 | 340 | cls.invalid_strings = [ 341 | "(78+1_79-1)_(124+1_125-1)dup", 342 | "(?_85)_(124_?)dup", 343 | "122_125=//dup", 344 | "-25+1_-25+3dup", 345 | "*17dup", 346 | ] 347 | 348 | def test_valid_strings(self): 349 | for s in self.valid_strings: 350 | with self.subTest(s=s): 351 | self.assertIsNotNone( 352 | self.pattern.fullmatch(s), msg=f'failed to match "{s}"' 353 | ) 354 | 355 | def test_invalid_strings(self): 356 | for s in self.invalid_strings: 357 | with self.subTest(s=s): 358 | self.assertIsNone( 359 | self.pattern.fullmatch(s), msg=f'incorrectly matched "{s}"' 360 | ) 361 | 362 | 363 | class TestDnaDupGmo(unittest.TestCase): 364 | @classmethod 365 | def setUpClass(cls): 366 | cls.pattern = re.compile(dna_dup_gmo, flags=re.ASCII) 367 | 368 | cls.valid_strings = ["22_24dup", "77dup"] 369 | 370 | cls.invalid_strings = [ 371 | "(78+1_79-1)_(124+1_125-1)dup", 372 | "(?_85)_(124_?)dup", 373 | "122_125=//dup", 374 | "101+1_101+7dup", 375 | "-25+1_-25+3dup", 376 | "*17dup", 377 | ] 378 | 379 | def test_valid_strings(self): 380 | for s in self.valid_strings: 381 | with self.subTest(s=s): 382 | self.assertIsNotNone( 383 | self.pattern.fullmatch(s), msg=f'failed to match "{s}"' 384 | ) 385 | 386 | def test_invalid_strings(self): 387 | for s in self.invalid_strings: 388 | with self.subTest(s=s): 389 | self.assertIsNone( 390 | self.pattern.fullmatch(s), msg=f'incorrectly matched "{s}"' 391 | ) 392 | 393 | 394 | class TestDnaInsC(unittest.TestCase): 395 | @classmethod 396 | def setUpClass(cls): 397 | cls.pattern = re.compile(dna_ins_c, flags=re.ASCII) 398 | 399 | cls.valid_strings = [ 400 | "234_235insT", 401 | "84_85insCTG", 402 | "*84_*85insCTG", 403 | "99+6_99+7insA", 404 | "124+100_124-100insTTG", 405 | "124+101_124-100insTTG", 406 | ] 407 | 408 | cls.invalid_strings = ["84_85ins100_125", "234_235ins(10)", "234_235ins(?)"] 409 | 410 | def 
test_valid_strings(self): 411 | for s in self.valid_strings: 412 | with self.subTest(s=s): 413 | self.assertIsNotNone( 414 | self.pattern.fullmatch(s), msg=f'failed to match "{s}"' 415 | ) 416 | 417 | def test_invalid_strings(self): 418 | for s in self.invalid_strings: 419 | with self.subTest(s=s): 420 | self.assertIsNone( 421 | self.pattern.fullmatch(s), msg=f'incorrectly matched "{s}"' 422 | ) 423 | 424 | 425 | class TestDnaInsN(unittest.TestCase): 426 | @classmethod 427 | def setUpClass(cls): 428 | cls.pattern = re.compile(dna_ins_n, flags=re.ASCII) 429 | 430 | cls.valid_strings = [ 431 | "234_235insT", 432 | "84_85insCTG", 433 | "99+6_99+7insA", 434 | "124+100_124-100insTTG", 435 | "124+101_124-100insTTG", 436 | ] 437 | 438 | cls.invalid_strings = [ 439 | "84_85ins100_125", 440 | "234_235ins(10)", 441 | "234_235ins(?)", 442 | "*84_*85insCTG", 443 | ] 444 | 445 | def test_valid_strings(self): 446 | for s in self.valid_strings: 447 | with self.subTest(s=s): 448 | self.assertIsNotNone( 449 | self.pattern.fullmatch(s), msg=f'failed to match "{s}"' 450 | ) 451 | 452 | def test_invalid_strings(self): 453 | for s in self.invalid_strings: 454 | with self.subTest(s=s): 455 | self.assertIsNone( 456 | self.pattern.fullmatch(s), msg=f'incorrectly matched "{s}"' 457 | ) 458 | 459 | 460 | class TestDnaInsGmo(unittest.TestCase): 461 | @classmethod 462 | def setUpClass(cls): 463 | cls.pattern = re.compile(dna_ins_gmo, flags=re.ASCII) 464 | 465 | cls.valid_strings = ["234_235insT", "84_85insCTG"] 466 | 467 | cls.invalid_strings = [ 468 | "99+6_99+7insA", 469 | "84_85ins100_125", 470 | "234_235ins(10)", 471 | "234_235ins(?)", 472 | ] 473 | 474 | def test_valid_strings(self): 475 | for s in self.valid_strings: 476 | with self.subTest(s=s): 477 | self.assertIsNotNone( 478 | self.pattern.fullmatch(s), msg=f'failed to match "{s}"' 479 | ) 480 | 481 | def test_invalid_strings(self): 482 | for s in self.invalid_strings: 483 | with self.subTest(s=s): 484 | self.assertIsNone( 485 | 
class TestDnaDelinsN(unittest.TestCase):
    """Validate the non-coding (n.) DNA deletion-insertion (delins) pattern."""

    @classmethod
    def setUpClass(cls):
        cls.pattern = re.compile(dna_delins_n, flags=re.ASCII)

        cls.valid_strings = ["22delinsAACG", "83_85delinsT", "43-6_595+12delinsCTT"]

        # Fixed: the original list read `"234delinsW" "*788delinsA"` (missing
        # comma), which Python implicitly concatenated into the single bogus
        # string "234delinsW*788delinsA" — so "*788delinsA" was never actually
        # tested for rejection.  It must be rejected for n. variants (it is
        # valid only for c., per TestDnaDelinsC / TestDnaVariantN).
        cls.invalid_strings = ["84_85delinsAAN", "234delinsW", "*788delinsA"]

    def test_valid_strings(self):
        """Each well-formed n. delins variant must match the full string."""
        for s in self.valid_strings:
            with self.subTest(s=s):
                self.assertIsNotNone(
                    self.pattern.fullmatch(s), msg=f'failed to match "{s}"'
                )

    def test_invalid_strings(self):
        """Malformed or non-n. delins variants must be rejected."""
        for s in self.invalid_strings:
            with self.subTest(s=s):
                self.assertIsNone(
                    self.pattern.fullmatch(s), msg=f'incorrectly matched "{s}"'
                )
class TestDnaVariantC(unittest.TestCase):
    """Smoke-test the combined c. single-event pattern across all event types.

    The fixtures pool examples from every individual event test class above
    (equal, substitution, del, dup, ins, delins), so this presumably exercises
    the union pattern `dna_variant_c` — confirm against patterns/dna.py.
    """

    @classmethod
    def setUpClass(cls):
        # Compile once for the class; re.ASCII restricts matching to ASCII.
        cls.pattern = re.compile(dna_variant_c, flags=re.ASCII)

        # One or more representative valid strings from each event type,
        # including UTR (*/-) and intronic (+/-) positions allowed for c.
        cls.valid_strings = [
            "48C>A",
            "=",
            "22=",
            "4_6=",
            "122-6T>A",
            "*24G>C",
            "19+22A>G",
            "-27+3T>C",
            "44del",
            "1_95del",
            "78+5_78+10del",
            "-25+1_-25+3del",
            "*17del",
            "22_24dup",
            "77dup",
            "101+1_101+7dup",
            "-25+1_-25+3dup",
            "*17dup",
            "234_235insT",
            "84_85insCTG",
            "99+6_99+7insA",
            "22delinsAACG",
            "83_85delinsT",
            "43-6_595+12delinsCTT",
            "*788delinsA",
        ]

        # Invalid strings pooled from each event type's rejection fixtures
        # (lowercase bases, ambiguity codes, mosaic "=/" forms, uncertain
        # breakpoints, and numeric/length insertions).
        cls.invalid_strings = [
            "22g>u",
            "48C>W",
            "122=/T>A",
            "(78+1_79-1)_(124+1_125-1)del",
            "(?_85)_(124_?)del",
            "122=/del",
            "(78+1_79-1)_(124+1_125-1)dup",
            "(?_85)_(124_?)dup",
            "122_125=//dup",
            "84_85ins100_125",
            "234_235ins(10)",
            "234_235ins(?)",
            "84_85delinsAAN",
            "234delinsW",
        ]

    def test_valid_strings(self):
        """Each valid variant of any event type must match the full string."""
        for s in self.valid_strings:
            with self.subTest(s=s):
                self.assertIsNotNone(
                    self.pattern.fullmatch(s), msg=f'failed to match "{s}"'
                )

    def test_invalid_strings(self):
        """Each invalid variant of any event type must be rejected."""
        for s in self.invalid_strings:
            with self.subTest(s=s):
                self.assertIsNone(
                    self.pattern.fullmatch(s), msg=f'incorrectly matched "{s}"'
                )
cls.pattern = re.compile(dna_variant_n, flags=re.ASCII) 640 | 641 | cls.valid_strings = [ 642 | "48C>A", 643 | "=", 644 | "122-6T>A", 645 | "19+22A>G", 646 | "44del", 647 | "1_95del", 648 | "78+5_78+10del", 649 | "22_24dup", 650 | "77dup", 651 | "101+1_101+7dup", 652 | "234_235insT", 653 | "84_85insCTG", 654 | "99+6_99+7insA", 655 | "22delinsAACG", 656 | "83_85delinsT", 657 | "43-6_595+12delinsCTT", 658 | ] 659 | 660 | cls.invalid_strings = [ 661 | "22=", 662 | "1_3=", 663 | "22g>u", 664 | "48C>W", 665 | "122=/T>A", 666 | "(78+1_79-1)_(124+1_125-1)del", 667 | "(?_85)_(124_?)del", 668 | "122=/del", 669 | "(78+1_79-1)_(124+1_125-1)dup", 670 | "(?_85)_(124_?)dup", 671 | "122_125=//dup", 672 | "84_85ins100_125", 673 | "234_235ins(10)", 674 | "234_235ins(?)", 675 | "84_85delinsAAN", 676 | "234delinsW", 677 | "*24G>C", 678 | "-27+3T>C", 679 | "-25+1_-25+3del", 680 | "*17del", 681 | "-25+1_-25+3dup", 682 | "*17dup", 683 | "*788delinsA", 684 | ] 685 | 686 | def test_valid_strings(self): 687 | for s in self.valid_strings: 688 | with self.subTest(s=s): 689 | self.assertIsNotNone( 690 | self.pattern.fullmatch(s), msg=f'failed to match "{s}"' 691 | ) 692 | 693 | def test_invalid_strings(self): 694 | for s in self.invalid_strings: 695 | with self.subTest(s=s): 696 | self.assertIsNone( 697 | self.pattern.fullmatch(s), msg=f'incorrectly matched "{s}"' 698 | ) 699 | 700 | 701 | class TestDnaVariantGmo(unittest.TestCase): 702 | @classmethod 703 | def setUpClass(cls): 704 | cls.pattern = re.compile(dna_variant_gmo, flags=re.ASCII) 705 | 706 | cls.valid_strings = [ 707 | "48C>A", 708 | "=", 709 | "22=", 710 | "1_3=", 711 | "44del", 712 | "1_95del", 713 | "22_24dup", 714 | "77dup", 715 | "234_235insT", 716 | "84_85insCTG", 717 | "22delinsAACG", 718 | "83_85delinsT", 719 | ] 720 | 721 | cls.invalid_strings = [ 722 | "43-6_595+12delinsCTT", 723 | "*788delinsA", 724 | "99+6_99+7insA", 725 | "101+1_101+7dup", 726 | "-25+1_-25+3dup", 727 | "*17dup", 728 | "78+5_78+10del", 729 | 
"-25+1_-25+3del", 730 | "*17del", 731 | "*24G>C", 732 | "19+22A>G", 733 | "122-6T>A", 734 | "-27+3T>C", 735 | "22g>u", 736 | "48C>W", 737 | "122=/T>A", 738 | "(78+1_79-1)_(124+1_125-1)del", 739 | "(?_85)_(124_?)del", 740 | "122=/del", 741 | "(78+1_79-1)_(124+1_125-1)dup", 742 | "(?_85)_(124_?)dup", 743 | "122_125=//dup", 744 | "84_85ins100_125", 745 | "234_235ins(10)", 746 | "234_235ins(?)", 747 | "84_85delinsAAN", 748 | "234delinsW", 749 | ] 750 | 751 | def test_valid_strings(self): 752 | for s in self.valid_strings: 753 | with self.subTest(s=s): 754 | self.assertIsNotNone( 755 | self.pattern.fullmatch(s), msg=f'failed to match "{s}"' 756 | ) 757 | 758 | def test_invalid_strings(self): 759 | for s in self.invalid_strings: 760 | with self.subTest(s=s): 761 | self.assertIsNone( 762 | self.pattern.fullmatch(s), msg=f'incorrectly matched "{s}"' 763 | ) 764 | 765 | 766 | class TestDnaSingleVariant(unittest.TestCase): 767 | @classmethod 768 | def setUpClass(cls): 769 | cls.pattern = re.compile(dna_single_variant, flags=re.ASCII) 770 | 771 | cls.valid_strings = [ 772 | "48C>A", 773 | "=", 774 | "44del", 775 | "1_95del", 776 | "22_24dup", 777 | "77dup", 778 | "234_235insT", 779 | "84_85insCTG", 780 | "22delinsAACG", 781 | "83_85delinsT", 782 | ] 783 | 784 | cls.valid_strings_c_only = [ 785 | "*788delinsA", 786 | "-25+1_-25+3dup", 787 | "*17dup", 788 | "-25+1_-25+3del", 789 | "*17del", 790 | "*24G>C", 791 | "-27+3T>C", 792 | ] 793 | 794 | cls.valid_strings_cn_only = [ 795 | "43-6_595+12delinsCTT", 796 | "99+6_99+7insA", 797 | "101+1_101+7dup", 798 | "78+5_78+10del", 799 | "19+22A>G", 800 | "122-6T>A", 801 | ] 802 | 803 | cls.valid_strings_cgmo_only = ["22=", "4_6="] 804 | 805 | cls.invalid_strings = [ 806 | "22g>u", 807 | "48C>W", 808 | "122=/T>A", 809 | "(78+1_79-1)_(124+1_125-1)del", 810 | "(?_85)_(124_?)del", 811 | "122=/del", 812 | "(78+1_79-1)_(124+1_125-1)dup", 813 | "(?_85)_(124_?)dup", 814 | "122_125=//dup", 815 | "84_85ins100_125", 816 | "234_235ins(10)", 817 | 
"234_235ins(?)", 818 | "84_85delinsAAN", 819 | "234delinsW", 820 | ] 821 | 822 | def test_valid_strings(self): 823 | for p in "cngmo": 824 | for s in self.valid_strings: 825 | with self.subTest(s=s, p=p): 826 | v = f"{p}.{s}" 827 | self.assertIsNotNone( 828 | self.pattern.fullmatch(v), msg=f'failed to match "{v}"' 829 | ) 830 | for p in "cgmo": 831 | for s in self.valid_strings_cgmo_only: 832 | with self.subTest(s=s, p=p): 833 | v = f"{p}.{s}" 834 | self.assertIsNotNone( 835 | self.pattern.fullmatch(v), msg=f'failed to match "{v}"' 836 | ) 837 | for p in "cn": 838 | for s in self.valid_strings_cn_only: 839 | with self.subTest(s=s, p=p): 840 | v = f"{p}.{s}" 841 | self.assertIsNotNone( 842 | self.pattern.fullmatch(v), msg=f'failed to match "{v}"' 843 | ) 844 | for p in "c": 845 | for s in self.valid_strings_c_only: 846 | with self.subTest(s=s, p=p): 847 | v = f"{p}.{s}" 848 | self.assertIsNotNone( 849 | self.pattern.fullmatch(v), msg=f'failed to match "{v}"' 850 | ) 851 | 852 | def test_invalid_strings(self): 853 | for p in "cngmo": 854 | for s in self.invalid_strings: 855 | with self.subTest(s=s, p=p): 856 | v = f"{p}.{s}" 857 | self.assertIsNone( 858 | self.pattern.fullmatch(v), msg=f'incorrectly matched "{v}"' 859 | ) 860 | for p in "ngmo": 861 | for s in self.valid_strings_c_only: 862 | with self.subTest(s=s, p=p): 863 | v = f"{p}.{s}" 864 | self.assertIsNone( 865 | self.pattern.fullmatch(v), msg=f'incorrectly matched "{v}"' 866 | ) 867 | for p in "gmo": 868 | for s in self.valid_strings_cn_only: 869 | with self.subTest(s=s, p=p): 870 | v = f"{p}.{s}" 871 | self.assertIsNone( 872 | self.pattern.fullmatch(v), msg=f'incorrectly matched "{v}"' 873 | ) 874 | for p in "n": 875 | for s in self.valid_strings_cgmo_only: 876 | with self.subTest(s=s, p=p): 877 | v = f"{p}.{s}" 878 | self.assertIsNone( 879 | self.pattern.fullmatch(v), msg=f'incorrectly matched "{v}"' 880 | ) 881 | 882 | 883 | class TestDnaMultiVariant(unittest.TestCase): 884 | @classmethod 885 | def 
setUpClass(cls): 886 | cls.pattern = re.compile(dna_multi_variant, flags=re.ASCII) 887 | 888 | single_valid_strings = [ 889 | "48C>A", 890 | "=", 891 | "44del", 892 | "1_95del", 893 | "22_24dup", 894 | "77dup", 895 | "234_235insT", 896 | "84_85insCTG", 897 | "22delinsAACG", 898 | "83_85delinsT", 899 | ] 900 | 901 | single_valid_strings_c_only = [ 902 | "*788delinsA", 903 | "-25+1_-25+3dup", 904 | "*17dup", 905 | "-25+1_-25+3del", 906 | "*17del", 907 | "*24G>C", 908 | "-27+3T>C", 909 | ] 910 | 911 | single_valid_strings_cn_only = [ 912 | "43-6_595+12delinsCTT", 913 | "99+6_99+7insA", 914 | "101+1_101+7dup", 915 | "78+5_78+10del", 916 | "19+22A>G", 917 | "122-6T>A", 918 | ] 919 | 920 | single_valid_strings_cgmo_only = ["22=", "4_6="] 921 | 922 | single_invalid_strings = [ 923 | "22g>u", 924 | "48C>W", 925 | "122=/T>A", 926 | "(78+1_79-1)_(124+1_125-1)del", 927 | "(?_85)_(124_?)del", 928 | "122=/del", 929 | "(78+1_79-1)_(124+1_125-1)dup", 930 | "(?_85)_(124_?)dup", 931 | "122_125=//dup", 932 | "84_85ins100_125", 933 | "234_235ins(10)", 934 | "234_235ins(?)", 935 | "84_85delinsAAN", 936 | "234delinsW", 937 | ] 938 | 939 | cls.valid_strings, cls.invalid_strings = build_multi_variants( 940 | single_valid_strings, single_invalid_strings 941 | ) 942 | cls.valid_strings_c_only, cls.invalid_strings_ngmo = build_multi_variants( 943 | single_valid_strings_c_only, single_valid_strings_c_only 944 | ) 945 | cls.valid_strings_cn_only, cls.invalid_strings_gmo = build_multi_variants( 946 | single_valid_strings_cn_only, single_valid_strings_cn_only 947 | ) 948 | cls.valid_strings_cgmo_only, cls.invalid_strings_n = build_multi_variants( 949 | single_valid_strings_cgmo_only, single_valid_strings_cgmo_only 950 | ) 951 | 952 | def test_valid_strings(self): 953 | for p in "cngmo": 954 | for s in self.valid_strings: 955 | with self.subTest(s=s, p=p): 956 | v = f"{p}.[{s}]" 957 | self.assertIsNotNone( 958 | self.pattern.fullmatch(v), msg=f'failed to match "{v}"' 959 | ) 960 | for p in 
"cgmo": 961 | for s in self.valid_strings_cgmo_only: 962 | with self.subTest(s=s, p=p): 963 | v = f"{p}.[{s}]" 964 | self.assertIsNotNone( 965 | self.pattern.fullmatch(v), msg=f'failed to match "{v}"' 966 | ) 967 | for p in "cn": 968 | for s in self.valid_strings_cn_only: 969 | with self.subTest(s=s, p=p): 970 | v = f"{p}.[{s}]" 971 | self.assertIsNotNone( 972 | self.pattern.fullmatch(v), msg=f'failed to match "{v}"' 973 | ) 974 | for p in "c": 975 | for s in self.valid_strings_c_only: 976 | with self.subTest(s=s, p=p): 977 | v = f"{p}.[{s}]" 978 | self.assertIsNotNone( 979 | self.pattern.fullmatch(v), msg=f'failed to match "{v}"' 980 | ) 981 | 982 | def test_invalid_strings(self): 983 | for p in "cngmo": 984 | for s in self.invalid_strings: 985 | with self.subTest(s=s, p=p): 986 | v = f"{p}.[{s}]" 987 | self.assertIsNone( 988 | self.pattern.fullmatch(v), msg=f'incorrectly matched "{v}"' 989 | ) 990 | for p in "ngmo": 991 | for s in self.invalid_strings_ngmo: 992 | with self.subTest(s=s, p=p): 993 | v = f"{p}.[{s}]" 994 | self.assertIsNone( 995 | self.pattern.fullmatch(v), msg=f'incorrectly matched "{v}"' 996 | ) 997 | for p in "gmo": 998 | for s in self.invalid_strings_gmo: 999 | with self.subTest(s=s, p=p): 1000 | v = f"{p}.[{s}]" 1001 | self.assertIsNone( 1002 | self.pattern.fullmatch(v), msg=f'incorrectly matched "{v}"' 1003 | ) 1004 | for p in "n": 1005 | for s in self.invalid_strings_n: 1006 | with self.subTest(s=s, p=p): 1007 | v = f"{p}.[{s}]" 1008 | self.assertIsNone( 1009 | self.pattern.fullmatch(v), msg=f'incorrectly matched "{v}"' 1010 | ) 1011 | 1012 | 1013 | if __name__ == "__main__": 1014 | unittest.main() 1015 | -------------------------------------------------------------------------------- /tests/test_patterns/test_protein.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import re 3 | from mavehgvs.patterns.protein import ( 4 | pro_equal, 5 | pro_sub, 6 | pro_fs, 7 | pro_del, 8 | pro_dup, 
class TestProteinSub(unittest.TestCase):
    """Validate the protein substitution pattern."""

    @classmethod
    def setUpClass(cls):
        cls.pattern = re.compile(pro_sub, flags=re.ASCII)

        cls.valid_strings = ["Glu27Trp", "Ter345Lys"]
        cls.invalid_strings = [
            "22A>T",
            "Xaa12Arg",
            "Arg21Xaa",
            "Pro17*",
            "*345Lys",
            "(Glu27Trp)",
        ]

    def test_valid_strings(self):
        """Each well-formed substitution must match the full string."""
        for variant in self.valid_strings:
            with self.subTest(s=variant):
                self.assertIsNotNone(
                    self.pattern.fullmatch(variant), msg=f'failed to match "{variant}"'
                )

    def test_invalid_strings(self):
        """Each malformed substitution must be rejected outright."""
        for variant in self.invalid_strings:
            with self.subTest(s=variant):
                self.assertIsNone(
                    self.pattern.fullmatch(variant),
                    msg=f'incorrectly matched "{variant}"',
                )
class TestProteinDel(unittest.TestCase):
    """Validate the protein deletion pattern."""

    @classmethod
    def setUpClass(cls):
        cls.pattern = re.compile(pro_del, flags=re.ASCII)

        cls.valid_strings = ["Gly18del", "Gln7_Asn19del"]
        cls.invalid_strings = ["=del", "18del", "122_128del", "(Gly18del)"]

    def test_valid_strings(self):
        """Each well-formed deletion must match the full string."""
        for variant in self.valid_strings:
            with self.subTest(s=variant):
                self.assertIsNotNone(
                    self.pattern.fullmatch(variant), msg=f'failed to match "{variant}"'
                )

    def test_invalid_strings(self):
        """Each malformed deletion must be rejected outright."""
        for variant in self.invalid_strings:
            with self.subTest(s=variant):
                self.assertIsNone(
                    self.pattern.fullmatch(variant),
                    msg=f'incorrectly matched "{variant}"',
                )
| cls.valid_strings = [ 167 | "His7_Gln8insSer", 168 | "Ala12_Pro13insGlyProCys", 169 | ] 170 | 171 | cls.invalid_strings = [ 172 | "(His7_Gln8insSer)", 173 | "(His7_Gln8insX)", 174 | "(Ala12_Pro13ins(2))", 175 | "His7_Gln8ins?", 176 | "His7_Gln8insXaa", 177 | ] 178 | 179 | def test_valid_strings(self): 180 | for s in self.valid_strings: 181 | with self.subTest(s=s): 182 | self.assertIsNotNone( 183 | self.pattern.fullmatch(s), msg=f'failed to match "{s}"' 184 | ) 185 | 186 | def test_invalid_strings(self): 187 | for s in self.invalid_strings: 188 | with self.subTest(s=s): 189 | self.assertIsNone( 190 | self.pattern.fullmatch(s), msg=f'incorrectly matched "{s}"' 191 | ) 192 | 193 | 194 | class TestProteinDelins(unittest.TestCase): 195 | @classmethod 196 | def setUpClass(cls): 197 | cls.pattern = re.compile(pro_delins, flags=re.ASCII) 198 | 199 | cls.valid_strings = [ 200 | "Ile71_Cys80delinsSer", 201 | "His44delinsValProGlyGlu", 202 | ] 203 | 204 | cls.invalid_strings = ["(Ile71_Cys80delinsSer)", "Ile71_Cys80delinsXaa"] 205 | 206 | def test_valid_strings(self): 207 | for s in self.valid_strings: 208 | with self.subTest(s=s): 209 | self.assertIsNotNone( 210 | self.pattern.fullmatch(s), msg=f'failed to match "{s}"' 211 | ) 212 | 213 | def test_invalid_strings(self): 214 | for s in self.invalid_strings: 215 | with self.subTest(s=s): 216 | self.assertIsNone( 217 | self.pattern.fullmatch(s), msg=f'incorrectly matched "{s}"' 218 | ) 219 | 220 | 221 | class TestProteinVariant(unittest.TestCase): 222 | @classmethod 223 | def setUpClass(cls): 224 | cls.pattern = re.compile(pro_variant, flags=re.ASCII) 225 | 226 | cls.valid_strings = [ 227 | "=", 228 | "(=)", 229 | "Cys22=", 230 | "Glu27Trp", 231 | "Ter345Lys", 232 | "Glu27fs", 233 | "Gly18del", 234 | "Gln7_Asn19del", 235 | "Cys5dup", 236 | "Pro12_Gly18dup", 237 | "His7_Gln8insSer", 238 | "Ala12_Pro13insGlyProCys", 239 | "Ile71_Cys80delinsSer", 240 | "His44delinsValProGlyGlu", 241 | ] 242 | 243 | cls.invalid_strings = [ 244 | 
"=22", 245 | "Arg18(=)", 246 | "Cys-22", 247 | "==", 248 | "22A>T", 249 | "Xaa12Arg", 250 | "Arg21Xaa", 251 | "Pro17*", 252 | "*345Lys", 253 | "(Glu27Trp)", 254 | "=fs", 255 | "Arg12LysfsTer18", 256 | "Arg12Lysfs*18", 257 | "Glu27fs*?", 258 | "(Glu27fs)", 259 | "=del", 260 | "18del", 261 | "122_128del", 262 | "(Gly18del)", 263 | "=dup", 264 | "18dup", 265 | "122_128dup", 266 | "(Cys5dup)", 267 | "(His7_Gln8insSer)", 268 | "(His7_Gln8insX)", 269 | "(Ala12_Pro13ins(2))", 270 | "His7_Gln8ins?", 271 | "His7_Gln8insXaa", 272 | "(Ile71_Cys80delinsSer)", 273 | "Ile71_Cys80delinsXaa", 274 | ] 275 | 276 | def test_valid_strings(self): 277 | for s in self.valid_strings: 278 | with self.subTest(s=s): 279 | self.assertIsNotNone( 280 | self.pattern.fullmatch(s), msg=f'failed to match "{s}"' 281 | ) 282 | 283 | def test_invalid_strings(self): 284 | for s in self.invalid_strings: 285 | with self.subTest(s=s): 286 | self.assertIsNone( 287 | self.pattern.fullmatch(s), msg=f'incorrectly matched "{s}"' 288 | ) 289 | 290 | 291 | class TestProteinSingleVariant(unittest.TestCase): 292 | @classmethod 293 | def setUpClass(cls): 294 | cls.pattern = re.compile(pro_single_variant, flags=re.ASCII) 295 | 296 | cls.valid_strings = [ 297 | "=", 298 | "(=)", 299 | "Cys22=", 300 | "Glu27Trp", 301 | "Ter345Lys", 302 | "Glu27fs", 303 | "Gly18del", 304 | "Gln7_Asn19del", 305 | "Cys5dup", 306 | "Pro12_Gly18dup", 307 | "His7_Gln8insSer", 308 | "Ala12_Pro13insGlyProCys", 309 | "Ile71_Cys80delinsSer", 310 | "His44delinsValProGlyGlu", 311 | ] 312 | 313 | cls.invalid_strings = [ 314 | "=22", 315 | "Arg18(=)", 316 | "Cys-22", 317 | "==", 318 | "22A>T", 319 | "Xaa12Arg", 320 | "Arg21Xaa", 321 | "Pro17*", 322 | "*345Lys", 323 | "(Glu27Trp)", 324 | "=fs", 325 | "Arg12LysfsTer18", 326 | "Arg12Lysfs*18", 327 | "Glu27fs*?", 328 | "(Glu27fs)", 329 | "=del", 330 | "18del", 331 | "122_128del", 332 | "(Gly18del)", 333 | "=dup", 334 | "18dup", 335 | "122_128dup", 336 | "(Cys5dup)", 337 | "(His7_Gln8insSer)", 338 | 
"(His7_Gln8insX)", 339 | "(Ala12_Pro13ins(2))", 340 | "His7_Gln8ins?", 341 | "His7_Gln8insXaa", 342 | "(Ile71_Cys80delinsSer)", 343 | "Ile71_Cys80delinsXaa", 344 | ] 345 | 346 | def test_valid_strings(self): 347 | for s in self.valid_strings: 348 | with self.subTest(s=s): 349 | v = f"p.{s}" 350 | self.assertIsNotNone( 351 | self.pattern.fullmatch(v), msg=f'failed to match "{v}"' 352 | ) 353 | 354 | def test_invalid_strings(self): 355 | for s in self.invalid_strings: 356 | with self.subTest(s=s): 357 | v = f"p.{s}" 358 | self.assertIsNone( 359 | self.pattern.fullmatch(v), msg=f'incorrectly matched "{v}"' 360 | ) 361 | 362 | 363 | class TestProteinMultiVariant(unittest.TestCase): 364 | @classmethod 365 | def setUpClass(cls): 366 | cls.pattern = re.compile(pro_multi_variant, flags=re.ASCII) 367 | 368 | single_valid_strings = [ 369 | "=", 370 | "(=)", 371 | "Cys22=", 372 | "Glu27Trp", 373 | "Ter345Lys", 374 | "Glu27fs", 375 | "Gly18del", 376 | "Gln7_Asn19del", 377 | "Cys5dup", 378 | "Pro12_Gly18dup", 379 | "His7_Gln8insSer", 380 | "Ala12_Pro13insGlyProCys", 381 | "Ile71_Cys80delinsSer", 382 | "His44delinsValProGlyGlu", 383 | ] 384 | 385 | single_invalid_strings = [ 386 | "=22", 387 | "Arg18(=)", 388 | "Cys-22", 389 | "==", 390 | "22A>T", 391 | "Xaa12Arg", 392 | "Arg21Xaa", 393 | "Pro17*", 394 | "*345Lys", 395 | "(Glu27Trp)", 396 | "=fs", 397 | "Arg12LysfsTer18", 398 | "Arg12Lysfs*18", 399 | "Glu27fs*?", 400 | "(Glu27fs)", 401 | "=del", 402 | "18del", 403 | "122_128del", 404 | "(Gly18del)", 405 | "=dup", 406 | "18dup", 407 | "122_128dup", 408 | "(Cys5dup)", 409 | "(His7_Gln8insSer)", 410 | "(His7_Gln8insX)", 411 | "(Ala12_Pro13ins(2))", 412 | "His7_Gln8ins?", 413 | "His7_Gln8insXaa", 414 | "(Ile71_Cys80delinsSer)", 415 | "Ile71_Cys80delinsXaa", 416 | ] 417 | 418 | cls.valid_strings, cls.invalid_strings = build_multi_variants( 419 | single_valid_strings, single_invalid_strings 420 | ) 421 | 422 | def test_valid_strings(self): 423 | for s in self.valid_strings: 424 | with 
class TestRnaEqual(unittest.TestCase):
    """The ``rna_equal`` pattern accepts exactly the bare ``=`` token."""

    @classmethod
    def setUpClass(cls):
        cls.pattern = re.compile(rna_equal, flags=re.ASCII)

        cls.valid_strings = [
            "=",
        ]

        cls.invalid_strings = ["=22", "(=)", "=="]

    def test_valid_strings(self):
        """The lone '=' must fullmatch."""
        for case in self.valid_strings:
            with self.subTest(s=case):
                match = self.pattern.fullmatch(case)
                self.assertIsNotNone(match, msg=f'failed to match "{case}"')

    def test_invalid_strings(self):
        """Anything beyond the bare '=' must be rejected."""
        for case in self.invalid_strings:
            with self.subTest(s=case):
                match = self.pattern.fullmatch(case)
                self.assertIsNone(match, msg=f'incorrectly matched "{case}"')
class TestRnaDel(unittest.TestCase):
    """Exercise the ``rna_del`` pattern on RNA deletion strings."""

    @classmethod
    def setUpClass(cls):
        cls.pattern = re.compile(rna_del, flags=re.ASCII)

        cls.valid_strings = ["34_36del", "17del", "27_27+12del", "101+1_101+7del"]

        cls.invalid_strings = ["=del", "=/9_12del", "(155_185)del", "34_36"]

    def test_valid_strings(self):
        """Well-formed deletions must fullmatch the pattern."""
        for case in self.valid_strings:
            with self.subTest(s=case):
                match = self.pattern.fullmatch(case)
                self.assertIsNotNone(match, msg=f'failed to match "{case}"')

    def test_invalid_strings(self):
        """Malformed deletions must be rejected."""
        for case in self.invalid_strings:
            with self.subTest(s=case):
                match = self.pattern.fullmatch(case)
                self.assertIsNone(match, msg=f'incorrectly matched "{case}"')


class TestRnaDup(unittest.TestCase):
    """Exercise the ``rna_dup`` pattern on RNA duplication strings."""

    @classmethod
    def setUpClass(cls):
        cls.pattern = re.compile(rna_dup, flags=re.ASCII)

        cls.valid_strings = ["12dup", "2_24dup", "101+1_101+7dup", "12-24_12-12dup"]

        cls.invalid_strings = ["=dup", "(78+1_79-1)_(124+1_125-1)dup"]

    def test_valid_strings(self):
        """Well-formed duplications must fullmatch the pattern."""
        for case in self.valid_strings:
            with self.subTest(s=case):
                match = self.pattern.fullmatch(case)
                self.assertIsNotNone(match, msg=f'failed to match "{case}"')

    def test_invalid_strings(self):
        """Malformed duplications must be rejected."""
        for case in self.invalid_strings:
            with self.subTest(s=case):
                match = self.pattern.fullmatch(case)
                self.assertIsNone(match, msg=f'incorrectly matched "{case}"')
"(27_30)insu", 131 | "74_74insnnn", 132 | ] 133 | 134 | def test_valid_strings(self): 135 | for s in self.valid_strings: 136 | with self.subTest(s=s): 137 | self.assertIsNotNone( 138 | self.pattern.fullmatch(s), msg=f'failed to match "{s}"' 139 | ) 140 | 141 | def test_invalid_strings(self): 142 | for s in self.invalid_strings: 143 | with self.subTest(s=s): 144 | self.assertIsNone( 145 | self.pattern.fullmatch(s), msg=f'incorrectly matched "{s}"' 146 | ) 147 | 148 | 149 | class TestRnaDelins(unittest.TestCase): 150 | @classmethod 151 | def setUpClass(cls): 152 | cls.pattern = re.compile(rna_delins, flags=re.ASCII) 153 | 154 | cls.valid_strings = ["92delinsgac", "12_17delinsc"] 155 | 156 | cls.invalid_strings = ["234_235ins(10)", "(122_125)insg"] 157 | 158 | def test_valid_strings(self): 159 | for s in self.valid_strings: 160 | with self.subTest(s=s): 161 | self.assertIsNotNone( 162 | self.pattern.fullmatch(s), msg=f'failed to match "{s}"' 163 | ) 164 | 165 | def test_invalid_strings(self): 166 | for s in self.invalid_strings: 167 | with self.subTest(s=s): 168 | self.assertIsNone( 169 | self.pattern.fullmatch(s), msg=f'incorrectly matched "{s}"' 170 | ) 171 | 172 | 173 | class TestRnaVariant(unittest.TestCase): 174 | @classmethod 175 | def setUpClass(cls): 176 | cls.pattern = re.compile(rna_variant, flags=re.ASCII) 177 | 178 | cls.valid_strings = [ 179 | "=", 180 | "22g>u", 181 | "33+12a>c", 182 | "34_36del", 183 | "17del", 184 | "12dup", 185 | "2_24dup", 186 | "101+1_101+7dup", 187 | "22_23insauc", 188 | "17_18insa", 189 | "92delinsgac", 190 | "12_17delinsc", 191 | ] 192 | 193 | cls.invalid_strings = [ 194 | "=22", 195 | "(=)", 196 | "==", 197 | "spl", 198 | "33+12A>G", 199 | "22g>t", 200 | "=del", 201 | "=/9_12del", 202 | "(155_185)del", 203 | "=dup", 204 | "(78+1_79-1)_(124+1_125-1)dup", 205 | "(27_30)insu", 206 | "74_74insnnn", 207 | "234_235ins(10)", 208 | "(122_125)insg", 209 | ] 210 | 211 | def test_valid_strings(self): 212 | for s in self.valid_strings: 213 
class TestRnaSingleVariant(unittest.TestCase):
    """Exercise ``rna_single_variant`` on complete ``r.``-prefixed variants."""

    @classmethod
    def setUpClass(cls):
        cls.pattern = re.compile(rna_single_variant, flags=re.ASCII)

        # Variant bodies; the tests prepend the "r." prefix before matching.
        cls.valid_strings = [
            "=",
            "22g>u",
            "33+12a>c",
            "34_36del",
            "17del",
            "12dup",
            "2_24dup",
            "101+1_101+7dup",
            "22_23insauc",
            "17_18insa",
            "92delinsgac",
            "12_17delinsc",
        ]

        cls.invalid_strings = [
            "=22",
            "(=)",
            "==",
            "spl",
            "33+12A>G",
            "22g>t",
            "=del",
            "=/9_12del",
            "(155_185)del",
            "=dup",
            "(78+1_79-1)_(124+1_125-1)dup",
            "(27_30)insu",
            "74_74insnnn",
            "234_235ins(10)",
            "(122_125)insg",
        ]

    def test_valid_strings(self):
        """Prefixed valid variant bodies must fullmatch the pattern."""
        for case in self.valid_strings:
            with self.subTest(s=case):
                variant = f"r.{case}"
                self.assertIsNotNone(
                    self.pattern.fullmatch(variant), msg=f'failed to match "{variant}"'
                )

    def test_invalid_strings(self):
        """Prefixed invalid variant bodies must not match at all."""
        for case in self.invalid_strings:
            with self.subTest(s=case):
                variant = f"r.{case}"
                self.assertIsNone(
                    self.pattern.fullmatch(variant),
                    msg=f'incorrectly matched "{variant}"',
                )
class TestCombinePatterns(unittest.TestCase):
    """Tests for ``combine_patterns``, which joins regex alternatives.

    NOTE(review): the pattern literals below, e.g. ``"(?P(?P[1-9]))"``, are
    not valid named-group syntax — ``(?P`` must be followed by ``<name>``.
    This looks like extraction damage where angle-bracketed group names were
    stripped as markup; confirm against the repository and restore the
    original ``(?P<name>...)`` strings before trusting these tests.
    """

    def test_without_groupname(self):
        # Each tuple pairs the input patterns with the expected combined
        # pattern: alternatives joined by "|" inside a non-capturing group.
        pattern_tuples = [
            (
                ("(?P(?P[1-9]))", "(?P(?P[1-9]))"),
                "(?:(?P(?P[1-9]))|(?P(?P[1-9])))",
            )
        ]

        for p1, p2 in pattern_tuples:
            with self.subTest(p1=p1, p2=p2):
                self.assertEqual(combine_patterns(p1), p2)

    def test_with_groupname(self):
        # With a group name, the combined alternatives are wrapped in a
        # named group instead of a non-capturing one.
        pattern_tuples = [
            (
                ("(?P(?P[1-9]))", "(?P(?P[1-9]))"),
                "test",
                "(?P(?P(?P[1-9]))|(?P(?P[1-9])))",
            )
        ]

        for p1, g, p2 in pattern_tuples:
            with self.subTest(p1=p1, g=g, p2=p2):
                self.assertEqual(combine_patterns(p1, groupname=g), p2)
class TestRemoveNamedGroups(unittest.TestCase):
    """Tests for ``remove_named_groups``, which strips ``(?P<name>`` markers.

    NOTE(review): as in TestCombinePatterns above, the input literals such as
    ``"(?P(?P[1-9]))"`` are missing their angle-bracketed group names —
    almost certainly stripped during extraction. The expected outputs
    (``(?:...)`` for noncapturing, ``(...)`` for capturing) are consistent
    with named groups being removed, but the inputs should be restored from
    the repository before relying on these tests.
    """

    def test_noncapturing(self):
        # noncapturing=True: named groups become (?:...) groups.
        pattern_tuples = [("(?P(?P[1-9]))", "(?:(?:[1-9]))")]

        for p1, p2 in pattern_tuples:
            with self.subTest(p1=p1, p2=p2):
                self.assertEqual(remove_named_groups(p1, noncapturing=True), p2)

    def test_capturing(self):
        # noncapturing=False: named groups become plain capturing (...) groups.
        pattern_tuples = [("(?P(?P[1-9]))", "(([1-9]))")]

        for p1, p2 in pattern_tuples:
            with self.subTest(p1=p1, p2=p2):
                self.assertEqual(remove_named_groups(p1, noncapturing=False), p2)
self.assertFalse(v.is_extended()) 40 | 41 | v = VariantPosition("Cys92380") 42 | self.assertTupleEqual( 43 | (v.position, v.amino_acid, v.intronic_position, v.utr), 44 | (92380, "Cys", None, None), 45 | ) 46 | self.assertFalse(v.is_utr()) 47 | self.assertFalse(v.is_intronic()) 48 | self.assertTrue(v.is_protein()) 49 | self.assertFalse(v.is_extended()) 50 | 51 | def test_invalid_strings(self) -> None: 52 | position_strings = ( 53 | "08", 54 | "+12", 55 | "*-99", 56 | "A", 57 | "TCGA", 58 | "g", 59 | "*", 60 | "-", 61 | "+", 62 | "**6", 63 | "800 + 12", 64 | "-12*5", 65 | "Glu-12", 66 | "*5Trp", 67 | "Xyz12", 68 | "ALA12", 69 | ) 70 | for s in position_strings: 71 | with self.subTest(s=s): 72 | with self.assertRaises(MaveHgvsParseError): 73 | VariantPosition(s) 74 | 75 | def test_utr(self) -> None: 76 | v = VariantPosition("*8") 77 | self.assertTupleEqual( 78 | (v.position, v.amino_acid, v.intronic_position, v.utr), 79 | (8, None, None, True), 80 | ) 81 | self.assertTrue(v.is_utr()) 82 | self.assertFalse(v.is_intronic()) 83 | self.assertFalse(v.is_protein()) 84 | self.assertTrue(v.is_extended()) 85 | 86 | v = VariantPosition("-80") 87 | self.assertTupleEqual( 88 | (v.position, v.amino_acid, v.intronic_position, v.utr), 89 | (-80, None, None, True), 90 | ) 91 | self.assertTrue(v.is_utr()) 92 | self.assertFalse(v.is_intronic()) 93 | self.assertFalse(v.is_protein()) 94 | self.assertTrue(v.is_extended()) 95 | 96 | def test_intron(self) -> None: 97 | v = VariantPosition("122-6") 98 | self.assertTupleEqual( 99 | (v.position, v.amino_acid, v.intronic_position, v.utr), 100 | (122, None, -6, None), 101 | ) 102 | self.assertFalse(v.is_utr()) 103 | self.assertTrue(v.is_intronic()) 104 | self.assertFalse(v.is_protein()) 105 | self.assertTrue(v.is_extended()) 106 | 107 | v = VariantPosition("78+10") 108 | self.assertTupleEqual( 109 | (v.position, v.amino_acid, v.intronic_position, v.utr), (78, None, 10, None) 110 | ) 111 | self.assertFalse(v.is_utr()) 112 | 
class TestObjectRepresentation(unittest.TestCase):
    """repr() of a VariantPosition must round-trip its source string."""

    def test_repr(self) -> None:
        position_strings = (
            "8",
            "92380",
            "*8",
            "-80",
            "122-6",
            "78+10",
            "*89+67",
            "-127+6",
            "*73-105",
            "-45-1",
            "Cys234",
            "Ala9",
        )
        for source in position_strings:
            with self.subTest(s=source):
                # The parsed position must render back to the exact input.
                self.assertEqual(source, repr(VariantPosition(source)))
182 | sorted_position_strings = ( 183 | "-45-1", 184 | "-12", 185 | "8", 186 | "99", 187 | "99+88", 188 | "99+122", 189 | "100-12", 190 | "100", 191 | "101", 192 | "202-12", 193 | "202-1", 194 | "202", 195 | "*1", 196 | "*73-105", 197 | ) 198 | 199 | self.sorted_variants = [VariantPosition(p) for p in sorted_position_strings] 200 | 201 | # pairwise itertools recipe 202 | a, b = itertools.tee(self.sorted_variants) 203 | next(b, None) 204 | self.sorted_variant_pairs = zip(a, b) 205 | 206 | def test_eq(self) -> None: 207 | for v in self.sorted_variants: 208 | with self.subTest(v=v): 209 | self.assertEqual(v, v) 210 | 211 | def test_ne(self) -> None: 212 | for v1, v2 in self.sorted_variant_pairs: 213 | with self.subTest(v1=v1, v2=v2): 214 | self.assertNotEqual(v1, v2) 215 | 216 | def test_lt(self) -> None: 217 | for v1, v2 in self.sorted_variant_pairs: 218 | with self.subTest(v1=v1, v2=v2): 219 | self.assertLess(v1, v2) 220 | 221 | def test_sorting(self) -> None: 222 | for _ in range(10): 223 | with self.subTest(): 224 | shuffled_variants = self.sorted_variants.copy() 225 | while shuffled_variants == self.sorted_variants: 226 | random.shuffle(shuffled_variants) 227 | self.assertListEqual(self.sorted_variants, sorted(shuffled_variants)) 228 | 229 | 230 | # TODO: add amino acid variants 231 | class TestAdjacency(unittest.TestCase): 232 | def test_adjacent_pairs(self) -> None: 233 | adjacent_pairs = ( 234 | ("-45-2", "-45-1"), 235 | ("-45-1", "-45"), 236 | ("-12", "-13"), 237 | ("-1", "1"), 238 | ("8", "9"), 239 | ("202-1", "202"), 240 | ("99", "99+1"), 241 | ("99+88", "99+89"), 242 | ("100-12", "100-11"), 243 | ("100", "101"), 244 | ("*1", "*2"), 245 | ("*73-1", "*73"), 246 | ) 247 | for s1, s2 in adjacent_pairs: 248 | v1 = VariantPosition(s1) 249 | v2 = VariantPosition(s2) 250 | with self.subTest(v1=v1, v2=v2): 251 | self.assertTrue(v1.is_adjacent(v2)) 252 | with self.subTest(v1=v1, v2=v2): 253 | self.assertTrue(v2.is_adjacent(v1)) 254 | 255 | def 
test_not_adjacent_to_self(self) -> None: 256 | position_strings = ( 257 | "-45-1", 258 | "-12", 259 | "8", 260 | "99", 261 | "99+88", 262 | "99+122", 263 | "100-12", 264 | "100", 265 | "103", 266 | "202-12", 267 | "202-1", 268 | "205", 269 | "*1", 270 | "*12", 271 | "*73-105", 272 | ) 273 | variants = [VariantPosition(s) for s in position_strings] 274 | for v in variants: 275 | with self.subTest(v=v): 276 | self.assertFalse(v.is_adjacent(v)) 277 | 278 | def test_non_adjacent_pairs(self) -> None: 279 | position_strings = ( 280 | "-45-1", 281 | "-12", 282 | "8", 283 | "99", 284 | "99+88", 285 | "99+122", 286 | "100-12", 287 | "103", 288 | "202-12", 289 | "202-1", 290 | "205", 291 | "*1", 292 | "*12", 293 | "*73-105", 294 | ) 295 | variants = [VariantPosition(s) for s in position_strings] 296 | 297 | for v1, v2 in itertools.permutations(variants, 2): 298 | with self.subTest(v1=v1, v2=v2): 299 | self.assertFalse(v1.is_adjacent(v2)) 300 | 301 | 302 | if __name__ == "__main__": 303 | unittest.main() 304 | -------------------------------------------------------------------------------- /tests/test_util.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from mavehgvs.util import parse_variant_strings 4 | from mavehgvs.variant import Variant 5 | 6 | 7 | class TestParseVariantStrings(unittest.TestCase): 8 | def test_sets_error_strings_for_invalid_items(self) -> None: 9 | invalid_variant_strings = [ 10 | "g.Glu27Trp", 11 | "p.27Glu>Trp", 12 | "p.122-6T>A", 13 | "G>A", 14 | "22G>A", 15 | "G.44del", 16 | "a.78+5_78+10del", 17 | "77dup", 18 | "n.Pro12_Gly18dup", 19 | "g.22_23insauc", 20 | "g.25_24del", 21 | "g.25_24ins", 22 | "r.43-6_595+12delinsctt", 23 | "x.=", 24 | "c.(=)", 25 | ] 26 | 27 | for s in invalid_variant_strings: 28 | with self.subTest(s=s): 29 | valid, invalid = parse_variant_strings([s]) 30 | self.assertIsNone(valid[0]) 31 | self.assertIsInstance(invalid[0], str) 32 | 33 | def 
test_sets_variant_for_valid_items(self) -> None: 34 | valid_variant_strings = [ 35 | "p.Glu27Trp", 36 | "c.122-6T>A", 37 | "g.44del", 38 | "c.78+5_78+10del", 39 | "c.77dup", 40 | "p.Pro12_Gly18dup", 41 | "p.Ala12_Pro13insGlyProCys", 42 | "r.22_23insauc", 43 | "c.43-6_595+12delinsCTT", 44 | "p.Ile71_Cys80delinsSer", 45 | "p.=", 46 | "c.=", 47 | "p.(=)", 48 | ] 49 | 50 | for s in valid_variant_strings: 51 | with self.subTest(s=s): 52 | valid, invalid = parse_variant_strings([s]) 53 | self.assertIsInstance(valid[0], Variant) 54 | self.assertIsNone(invalid[0]) 55 | 56 | def test_validates_with_targetseq(self) -> None: 57 | targetseq = "ACGT" 58 | valid_variant_strings = ["c.1A>T", "c.3G>C", "c.[1A>T;3G>C]"] 59 | invalid_variant_strings = ["c.1C>T", "c.3T>C", "c.[1A>T;3T>C]", "c.5A>G"] 60 | 61 | for s in valid_variant_strings: 62 | with self.subTest(s=s, targetseq=targetseq): 63 | valid, invalid = parse_variant_strings([s], targetseq=targetseq) 64 | self.assertIsInstance(valid[0], Variant) 65 | self.assertIsNone(invalid[0]) 66 | 67 | for s in invalid_variant_strings: 68 | with self.subTest(s=s, targetseq=targetseq): 69 | valid, invalid = parse_variant_strings([s], targetseq=targetseq) 70 | self.assertIsNone(valid[0]) 71 | self.assertIsInstance(invalid[0], str) 72 | 73 | def test_validates_expected_prefix(self) -> None: 74 | valid_variant_strings = ["p.Glu27Trp", "c.122-6T>A", "r.22_23insauc"] 75 | 76 | for s in valid_variant_strings: 77 | p = s[0] 78 | with self.subTest(s=s, p=p): 79 | valid, invalid = parse_variant_strings([s], expected_prefix=p) 80 | self.assertIsInstance(valid[0], Variant) 81 | self.assertIsNone(invalid[0]) 82 | 83 | for s in valid_variant_strings: 84 | p = "g" 85 | with self.subTest(s=s, p=p): 86 | valid, invalid = parse_variant_strings([s], expected_prefix=p) 87 | self.assertIsNone(valid[0]) 88 | self.assertIsInstance(invalid[0], str) 89 | 90 | def test_valid_expected_prefixes_only(self) -> None: 91 | valid_prefixes = list("cgmnopr") 92 | 
invalid_prefixes = list("CGMNOPRx.4ab?") 93 | variant = "p.Glu27Trp" 94 | 95 | for p in valid_prefixes: 96 | with self.subTest(p=p): 97 | parse_variant_strings([variant], expected_prefix=p) 98 | 99 | for p in invalid_prefixes: 100 | with self.subTest(p=p): 101 | with self.assertRaises(ValueError): 102 | parse_variant_strings([variant], expected_prefix=p) 103 | -------------------------------------------------------------------------------- /tests/test_variant.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from mavehgvs.exceptions import MaveHgvsParseError 4 | from mavehgvs.variant import Variant 5 | from mavehgvs.position import VariantPosition 6 | 7 | 8 | class TestCreateSingleVariantFromString(unittest.TestCase): 9 | def test_invalid_raises_error(self) -> None: 10 | valid_variant_strings = [ 11 | "p.Glu27Trp", 12 | "c.122-6T>A", 13 | "g.44del", 14 | "c.78+5_78+10del", 15 | "c.77dup", 16 | "p.Pro12_Gly18dup", 17 | "p.Ala12_Pro13insGlyProCys", 18 | "r.22_23insauc", 19 | "c.43-6_595+12delinsCTT", 20 | "p.Ile71_Cys80delinsSer", 21 | "p.=", 22 | "c.=", 23 | "p.(=)", 24 | "c.1_3=", 25 | "c.12=", 26 | "g.88_99=", 27 | "c.43-6_595+12=", 28 | "p.Glu27fs", 29 | "NM_001301.4:c.122-6T>A", 30 | ] 31 | 32 | invalid_variant_strings = [ 33 | "g.Glu27Trp", 34 | "p.27Glu>Trp", 35 | "p.122-6T>A", 36 | "G>A", 37 | "22G>A", 38 | "G.44del", 39 | "a.78+5_78+10del", 40 | "77dup", 41 | "n.Pro12_Gly18dup", 42 | "p.Pro12_Gly18insGlyProAla", 43 | "g.22_23insauc", 44 | "g.25_24del", 45 | "g.25_24ins", 46 | "r.22_24insauc", 47 | "r.43-6_595+12delinsctt", 48 | "x.=", 49 | "c.(=)", 50 | "p.(Gly24=)", 51 | "p.Gly24(=)", 52 | "p.Arg12LysfsTer18", 53 | "p.Glu27fs*?", 54 | "NM_001301.4::c.122-6T>A", 55 | ] 56 | 57 | for s in valid_variant_strings: 58 | with self.subTest(s=s): 59 | Variant(s) # should pass 60 | 61 | for s in invalid_variant_strings: 62 | with self.subTest(s=s): 63 | with self.assertRaises(MaveHgvsParseError): 64 | 
Variant(s) 65 | 66 | def test_sub(self) -> None: 67 | variant_strings = [ 68 | "p.Glu27Trp", 69 | "p.Ter345Lys", 70 | "p.Cys22=", 71 | "g.48C>A", 72 | "c.122-6T>A", 73 | "c.*33G>C", 74 | "r.22g>u", 75 | "r.33+12a>c", 76 | "p.=", 77 | "p.(=)", 78 | "n.=", 79 | "c.1_3=", 80 | "c.12=", 81 | "g.88_99=", 82 | "c.43-6_595+12=", 83 | ] 84 | 85 | for s in variant_strings: 86 | with self.subTest(s=s): 87 | v = Variant(s) 88 | self.assertEqual(s, str(v)) 89 | 90 | def test_fs(self) -> None: 91 | variant_strings = ["p.Glu27fs"] 92 | 93 | for s in variant_strings: 94 | with self.subTest(s=s): 95 | v = Variant(s) 96 | self.assertEqual(s, str(v)) 97 | 98 | def test_del(self) -> None: 99 | variant_strings = [ 100 | "g.44del", 101 | "c.78+5_78+10del", 102 | "c.1_95del", 103 | "p.Gly18del", 104 | "p.Gln7_Asn19del", 105 | "r.34_36del", 106 | ] 107 | 108 | for s in variant_strings: 109 | with self.subTest(s=s): 110 | v = Variant(s) 111 | self.assertEqual(s, str(v)) 112 | 113 | def test_dup(self) -> None: 114 | variant_strings = [ 115 | "g.22_24dup", 116 | "c.77dup", 117 | "c.101+1_101+7dup", 118 | "p.Pro12_Gly18dup", 119 | "p.Cys5dup", 120 | "r.12dup", 121 | ] 122 | 123 | for s in variant_strings: 124 | with self.subTest(s=s): 125 | v = Variant(s) 126 | self.assertEqual(s, str(v)) 127 | 128 | def test_ins(self) -> None: 129 | variant_strings = [ 130 | "g.234_235insT", 131 | "c.84_85insCTG", 132 | "c.99+6_99+7insA", 133 | "p.His7_Gln8insSer", 134 | "p.Ala12_Pro13insGlyProCys", 135 | "r.22_23insauc", 136 | ] 137 | 138 | for s in variant_strings: 139 | with self.subTest(s=s): 140 | v = Variant(s) 141 | self.assertEqual(s, str(v)) 142 | 143 | def test_delins(self) -> None: 144 | variant_strings = [ 145 | "g.22delinsAACG", 146 | "c.83_85delinsT", 147 | "c.43-6_595+12delinsCTT", 148 | "p.Ile71_Cys80delinsSer", 149 | "p.His44delinsValProGlyGlu", 150 | "r.92delinsgac", 151 | ] 152 | 153 | for s in variant_strings: 154 | with self.subTest(s=s): 155 | v = Variant(s) 156 | self.assertEqual(s, 
str(v)) 157 | 158 | def test_target_identical(self) -> None: 159 | identical_variant_strings = [ 160 | *[f"{prefix}.=" for prefix in tuple("gmocnr")], 161 | "p.(=)", 162 | "c.1_3=", 163 | ] 164 | 165 | non_identical_variant_strings = [ 166 | "p.Ter345Lys", 167 | "p.Cys22=", 168 | "g.48C>A", 169 | "c.122-6T>A", 170 | "g.22delinsAACG", 171 | "c.83_85delinsT", 172 | ] 173 | 174 | for s in identical_variant_strings: 175 | with self.subTest(s=s): 176 | v = Variant(s) 177 | self.assertTrue(v.is_target_identical()) 178 | 179 | for s in non_identical_variant_strings: 180 | with self.subTest(s=s): 181 | v = Variant(s) 182 | self.assertFalse(v.is_target_identical()) 183 | 184 | def test_synonymous(self) -> None: 185 | synonymous_variant_strings = ["p.Gly24=", "p.=", "p.(=)"] 186 | 187 | nonsynonymous_variant_strings = ["p.Ter345Lys", "c.=", "g.48C>A"] 188 | 189 | for s in synonymous_variant_strings: 190 | with self.subTest(s=s): 191 | v = Variant(s) 192 | self.assertTrue(v.is_synonymous()) 193 | 194 | for s in nonsynonymous_variant_strings: 195 | with self.subTest(s=s): 196 | v = Variant(s) 197 | self.assertFalse(v.is_synonymous()) 198 | 199 | def test_relaxed_ordering(self): 200 | variant_tuples = [ 201 | ("c.78+10_78+5del", "c.78+5_78+10del"), 202 | ("c.80_77dup", "c.77_80dup"), 203 | ("p.Gly18_Pro12dup", "p.Pro12_Gly18dup"), 204 | ("p.Pro13_Ala12insGlyProCys", "p.Ala12_Pro13insGlyProCys"), 205 | ("r.23_22insauc", "r.22_23insauc"), 206 | ("c.595+12_43-6delinsCTT", "c.43-6_595+12delinsCTT"), 207 | ("p.Cys80_Ile71delinsSer", "p.Ile71_Cys80delinsSer"), 208 | ("c.3_1=", "c.1_3="), 209 | ("g.99_88=", "g.88_99="), 210 | ("c.595+12_43-6=", "c.43-6_595+12="), 211 | ] 212 | 213 | for v, s in variant_tuples: 214 | with self.subTest(v=v, s=s): 215 | self.assertEqual(str(Variant(v, relaxed_ordering=True)), s) 216 | 217 | 218 | class TestCreateMultiVariantFromString(unittest.TestCase): 219 | def test_creation(self): 220 | variant_strings = [ 221 | "p.[Glu27Trp;Ter345Lys]", 222 | 
"p.[Glu27Trp;Lys212fs]", 223 | "p.[Gly18del;Glu27Trp;Ter345Lys]", 224 | "p.[Gln7_Asn19del;Glu27Trp;Ter345Lys]", 225 | "c.[1_35del;78+5_78+10del;122T>A]", 226 | "NM_002002.3:c.[1_35del;78+5_78+10del;122T>A]", 227 | ] 228 | 229 | invalid_variant_strings = [ 230 | "p.[Glu27Trp;=;Ter345Lys]", 231 | "p.[(=);Gly18del;Glu27Trp;Ter345Lys]", 232 | "c.[12T>A;=;78+5_78+10del]", 233 | "c.[1_3=;12T>A;78+5_78+10del]", 234 | "p.[Glu27fs;Arg48Lys]", 235 | "p.[Glu27fs;Arg48fs]", 236 | "NM_002002.3::c.[1_35del;78+5_78+10del;122T>A]", 237 | "NM_002002.3:c.1_35del;78+5_78+10del;122T>A", 238 | ] 239 | 240 | for s in variant_strings: 241 | with self.subTest(s=s): 242 | v = Variant(s) 243 | self.assertEqual(s, str(v)) 244 | 245 | for s in invalid_variant_strings: 246 | with self.subTest(s=s): 247 | with self.assertRaises(MaveHgvsParseError): 248 | Variant(s) 249 | 250 | def test_ordering(self): 251 | variant_string_tuples = [ 252 | ("p.[Gly345Lys;Glu27Trp]", "p.[Glu27Trp;Gly345Lys]"), 253 | ("p.[Glu27Trp;Gly18del;Ter345Lys]", "p.[Gly18del;Glu27Trp;Ter345Lys]"), 254 | ("c.[122T>A;1_35del;78+5_78+10del]", "c.[1_35del;78+5_78+10del;122T>A]"), 255 | ] 256 | 257 | for s, _ in variant_string_tuples: 258 | with self.subTest(s=s): 259 | with self.assertRaises(MaveHgvsParseError): 260 | Variant(s, relaxed_ordering=False) 261 | 262 | for s, s_ordered in variant_string_tuples: 263 | with self.subTest(s=s): 264 | # Should pass creation 265 | Variant(s, relaxed_ordering=True) 266 | 267 | for s, s_ordered in variant_string_tuples: 268 | with self.subTest(s=s): 269 | v = Variant(s, relaxed_ordering=True) 270 | self.assertEqual(s_ordered, str(v)) 271 | 272 | def test_overlaps(self): 273 | invalid_variant_strings = [ 274 | "p.[Glu27Trp;Glu27Trp]", 275 | "p.[Glu27Trp;Glu27Tyr]", 276 | "p.[Pro27Trp;Glu27Tyr]", 277 | "p.[Gly18del;Gly18Tyr]", 278 | "p.[Gln7_Asn19del;Glu13Trp]", 279 | "p.[Glu13Trp;Gln7_Asn19del]", 280 | "p.[Gln7_Asn19del;Glu13Trp;Ter345Lys]", 281 | "c.[1_95del;78+5_78+10del;122T>A]", 282 | 
"c.[1_95del;22T>A]", 283 | "n.[22G>A;22G>T]", 284 | ] 285 | 286 | for s in invalid_variant_strings: 287 | with self.subTest(s=s): 288 | with self.assertRaises(MaveHgvsParseError): 289 | Variant(s) 290 | 291 | 292 | class TestCreateSingleVariantFromValues(unittest.TestCase): 293 | def test_equal(self): 294 | valid_dict_tuples = [ 295 | ( 296 | { 297 | "variant_type": "equal", 298 | "prefix": "p", 299 | "position": "27", 300 | "target": "Glu", 301 | }, 302 | "p.Glu27=", 303 | ), 304 | ( 305 | { 306 | "variant_type": "equal", 307 | "prefix": "c", 308 | "start_position": "12", 309 | "end_position": "12", 310 | }, 311 | "c.12=", 312 | ), 313 | ( 314 | { 315 | "variant_type": "equal", 316 | "prefix": "c", 317 | "start_position": "1", 318 | "end_position": "3", 319 | }, 320 | "c.1_3=", 321 | ), 322 | ] 323 | 324 | for d, s in valid_dict_tuples: 325 | with self.subTest(d=d, s=s): 326 | self.assertEqual(Variant(s), Variant(d)) 327 | 328 | def test_sub(self): 329 | valid_dict_tuples = [ 330 | ( 331 | { 332 | "variant_type": "sub", 333 | "prefix": "p", 334 | "position": 27, 335 | "target": "Glu", 336 | "variant": "Trp", 337 | }, 338 | "p.Glu27Trp", 339 | ), 340 | ( 341 | { 342 | "variant_type": "sub", 343 | "prefix": "c", 344 | "position": "122-6", 345 | "target": "T", 346 | "variant": "A", 347 | }, 348 | "c.122-6T>A", 349 | ), 350 | ] 351 | 352 | for d, s in valid_dict_tuples: 353 | with self.subTest(d=d, s=s): 354 | self.assertEqual(Variant(s), Variant(d)) 355 | 356 | def test_fs(self): 357 | valid_dict_tuples = [ 358 | ( 359 | { 360 | "variant_type": "fs", 361 | "prefix": "p", 362 | "position": 27, 363 | "target": "Glu", 364 | }, 365 | "p.Glu27fs", 366 | ), 367 | ] 368 | 369 | for d, s in valid_dict_tuples: 370 | with self.subTest(d=d, s=s): 371 | self.assertEqual(Variant(s), Variant(d)) 372 | 373 | def test_ins(self): 374 | valid_dict_tuples = [ 375 | ( 376 | { 377 | "variant_type": "ins", 378 | "prefix": "p", 379 | "start_position": 12, 380 | "start_target": "Ala", 381 | 
"end_position": 13, 382 | "end_target": "Pro", 383 | "variant": "GlyProCys", 384 | }, 385 | "p.Ala12_Pro13insGlyProCys", 386 | ), 387 | ( 388 | { 389 | "variant_type": "ins", 390 | "prefix": "r", 391 | "start_position": 22, 392 | "end_position": 23, 393 | "variant": "auc", 394 | }, 395 | "r.22_23insauc", 396 | ), 397 | ] 398 | 399 | for d, s in valid_dict_tuples: 400 | with self.subTest(d=d, s=s): 401 | self.assertEqual(Variant(s), Variant(d)) 402 | 403 | def test_del(self): 404 | valid_dict_tuples = [ 405 | ( 406 | { 407 | "variant_type": "del", 408 | "prefix": "g", 409 | "start_position": 44, 410 | "end_position": 44, 411 | }, 412 | "g.44del", 413 | ), 414 | ( 415 | { 416 | "variant_type": "del", 417 | "prefix": "c", 418 | "start_position": "78+5", 419 | "end_position": "78+10", 420 | }, 421 | "c.78+5_78+10del", 422 | ), 423 | ( 424 | { 425 | "variant_type": "del", 426 | "prefix": "p", 427 | "start_position": 33, 428 | "start_target": "Arg", 429 | "end_position": 33, 430 | "end_target": "Arg", 431 | }, 432 | "p.Arg33del", 433 | ), 434 | ] 435 | 436 | for d, s in valid_dict_tuples: 437 | with self.subTest(d=d, s=s): 438 | self.assertEqual(Variant(s), Variant(d)) 439 | 440 | def test_dup(self): 441 | valid_dict_tuples = [ 442 | ( 443 | { 444 | "variant_type": "dup", 445 | "prefix": "c", 446 | "start_position": 77, 447 | "end_position": 77, 448 | }, 449 | "c.77dup", 450 | ), 451 | ( 452 | { 453 | "variant_type": "dup", 454 | "prefix": "p", 455 | "start_position": 12, 456 | "start_target": "Pro", 457 | "end_position": 18, 458 | "end_target": "Gly", 459 | }, 460 | "p.Pro12_Gly18dup", 461 | ), 462 | ] 463 | 464 | for d, s in valid_dict_tuples: 465 | with self.subTest(d=d, s=s): 466 | self.assertEqual(Variant(s), Variant(d)) 467 | 468 | def test_delins(self): 469 | valid_dict_tuples = [ 470 | ( 471 | { 472 | "variant_type": "delins", 473 | "prefix": "c", 474 | "start_position": "43-6", 475 | "end_position": "595+12", 476 | "variant": "CTT", 477 | }, 478 | 
"c.43-6_595+12delinsCTT", 479 | ), 480 | ( 481 | { 482 | "variant_type": "delins", 483 | "prefix": "c", 484 | "start_position": "45", 485 | "end_position": "45", 486 | "variant": "AGA", 487 | }, 488 | "c.45delinsAGA", 489 | ), 490 | ( 491 | { 492 | "variant_type": "delins", 493 | "prefix": "p", 494 | "start_position": 71, 495 | "start_target": "Ile", 496 | "end_position": 80, 497 | "end_target": "Cys", 498 | "variant": "Ser", 499 | }, 500 | "p.Ile71_Cys80delinsSer", 501 | ), 502 | ( 503 | { 504 | "variant_type": "delins", 505 | "prefix": "p", 506 | "start_position": 50, 507 | "start_target": "Arg", 508 | "end_position": 50, 509 | "end_target": "Arg", 510 | "variant": "AlaGly", 511 | }, 512 | "p.Arg50delinsAlaGly", 513 | ), 514 | ] 515 | 516 | invalid_dicts = [ 517 | { 518 | "variant_type": "delins", 519 | "prefix": "p", 520 | "start_position": 50, 521 | "start_target": "Arg", 522 | "end_position": 50, 523 | "end_target": "Cys", 524 | "variant": "AlaGly", 525 | }, 526 | ] 527 | 528 | for d, s in valid_dict_tuples: 529 | with self.subTest(d=d, s=s): 530 | self.assertEqual(Variant(s), Variant(d)) 531 | 532 | for d in invalid_dicts: 533 | with self.subTest(d=d): 534 | with self.assertRaises(MaveHgvsParseError): 535 | Variant(d) 536 | 537 | def test_extra_keys(self): 538 | invalid_dicts = [ 539 | { 540 | "variant_type": "sub", 541 | "prefix": "p", 542 | "position": 27, 543 | "target": "Glu", 544 | "variant": "Trp", 545 | "bonus": "data", 546 | }, 547 | { 548 | "variant_type": "sub", 549 | "prefix": "c", 550 | "position": "122-6", 551 | "start_target": "T", 552 | "target": "T", 553 | "variant": "A", 554 | }, 555 | { 556 | "variant_type": "delins", 557 | "prefix": "p", 558 | "start_target": "Ile", 559 | "end_position": 80, 560 | "end_target": "Cys", 561 | "variant": "Ser", 562 | "position": "Ala", 563 | }, 564 | ] 565 | 566 | for d in invalid_dicts: 567 | with self.subTest(d=d): 568 | with self.assertRaises(MaveHgvsParseError): 569 | Variant(d) 570 | 571 | def 
test_missing_keys(self): 572 | invalid_dicts = [ 573 | {"prefix": "p", "position": 27, "target": "Glu", "variant": "Trp"}, 574 | {"variant_type": "sub", "position": "122-6", "target": "T", "variant": "A"}, 575 | { 576 | "variant_type": "delins", 577 | "prefix": "p", 578 | "start_target": "Ile", 579 | "end_position": 80, 580 | "end_target": "Cys", 581 | "variant": "Ser", 582 | }, 583 | ] 584 | 585 | for d in invalid_dicts: 586 | with self.subTest(d=d): 587 | with self.assertRaises(MaveHgvsParseError): 588 | Variant(d) 589 | 590 | def test_invalid_keys(self): 591 | invalid_dicts = [ 592 | { 593 | "variant_type": "equal", 594 | "prefix": "p", 595 | "start_position": "27", 596 | "end_position": "27", 597 | "target": "Glu", 598 | }, 599 | {"variant_type": "dup", "prefix": "c", "position": 77}, 600 | { 601 | "variant_type": "test", 602 | "prefix": "c", 603 | "start_position": 77, 604 | "end_position": 77, 605 | }, 606 | { 607 | "variant_type": "fs", 608 | "prefix": "c", 609 | "position": "12", 610 | "target": "T", 611 | }, 612 | ] 613 | 614 | for d in invalid_dicts: 615 | with self.subTest(d=d): 616 | with self.assertRaises(MaveHgvsParseError): 617 | Variant(d) 618 | 619 | def test_invalid_type(self): 620 | invalid_values = [1234, None, 5.55, ("p", "Ile", 80, "Cys")] 621 | 622 | for v in invalid_values: 623 | with self.subTest(v=v): 624 | with self.assertRaises(ValueError): 625 | Variant(v) 626 | 627 | 628 | class TestCreateMultiVariantFromValues(unittest.TestCase): 629 | def test_create_multivariant(self): 630 | valid_dict_tuples = [ 631 | ( 632 | [ 633 | { 634 | "variant_type": "sub", 635 | "prefix": "p", 636 | "position": 27, 637 | "target": "Glu", 638 | "variant": "Trp", 639 | }, 640 | { 641 | "variant_type": "delins", 642 | "prefix": "p", 643 | "start_position": 71, 644 | "start_target": "Ile", 645 | "end_position": 80, 646 | "end_target": "Cys", 647 | "variant": "Ser", 648 | }, 649 | ], 650 | "p.[Glu27Trp;Ile71_Cys80delinsSer]", 651 | ), 652 | ( 653 | [ 654 | { 655 
| "variant_type": "dup", 656 | "prefix": "c", 657 | "start_position": 77, 658 | "end_position": 77, 659 | }, 660 | { 661 | "variant_type": "sub", 662 | "prefix": "c", 663 | "position": "122-6", 664 | "target": "T", 665 | "variant": "A", 666 | }, 667 | ], 668 | "c.[77dup;122-6T>A]", 669 | ), 670 | ] 671 | 672 | invalid_dicts = [ 673 | [ 674 | { 675 | "variant_type": "sub", 676 | "position": 27, 677 | "target": "Glu", 678 | "variant": "Trp", 679 | }, 680 | { 681 | "variant_type": "delins", 682 | "prefix": "p", 683 | "start_position": 71, 684 | "start_target": "Ile", 685 | "end_position": 80, 686 | "end_target": "Cys", 687 | "variant": "Ser", 688 | }, 689 | ], 690 | [ 691 | { 692 | "variant_type": "sub", 693 | "prefix": "p", 694 | "position": 27, 695 | "target": "Glu", 696 | "variant": "Trp", 697 | }, 698 | { 699 | "variant_type": "sub", 700 | "prefix": "c", 701 | "position": "122-6", 702 | "target": "T", 703 | "variant": "A", 704 | }, 705 | ], 706 | ] 707 | 708 | for d, s in valid_dict_tuples: 709 | with self.subTest(d=d, s=s): 710 | self.assertEqual(Variant(s), Variant(d)) 711 | 712 | for d in invalid_dicts: 713 | with self.subTest(d=d): 714 | with self.assertRaises(MaveHgvsParseError): 715 | Variant(d) 716 | 717 | 718 | class TestTargetSequenceValidation(unittest.TestCase): 719 | def test_valid_dna_equal(self): 720 | variant_tuples = [("ACGT", "c.1_2="), ("ACGT", "c.4="), ("ACGT", "c.=")] 721 | 722 | for target, s in variant_tuples: 723 | with self.subTest(target=target, s=s): 724 | v = Variant(s, targetseq=target) 725 | self.assertEqual(s, str(v)) 726 | 727 | def test_invalid_dna_equal(self): 728 | variant_tuples = [("ACGT", "c.4_5="), ("ACGT", "c.10=")] 729 | 730 | for target, s in variant_tuples: 731 | with self.subTest(target=target, s=s): 732 | with self.assertRaises(MaveHgvsParseError): 733 | Variant(s, targetseq=target) 734 | 735 | def test_matching_dna_substitution(self): 736 | variant_tuples = [ 737 | ("ACGT", "c.1A>T"), 738 | ("ACGT", "c.3G>C"), 739 | 
("ACGT", "c.[1A>T;3G>C]"), 740 | ] 741 | 742 | for target, s in variant_tuples: 743 | with self.subTest(target=target, s=s): 744 | v = Variant(s, targetseq=target) 745 | self.assertEqual(s, str(v)) 746 | 747 | def test_nonmatching_dna_substitution(self): 748 | variant_tuples = [ 749 | ("ACGT", "c.1C>T"), 750 | ("ACGT", "c.3T>C"), 751 | ("ACGT", "c.[1A>T;3T>C]"), 752 | ("ACGT", "c.5A>G"), 753 | ] 754 | 755 | for target, s in variant_tuples: 756 | with self.subTest(target=target, s=s): 757 | with self.assertRaises(MaveHgvsParseError): 758 | Variant(s, targetseq=target) 759 | 760 | def test_valid_dna_del(self): 761 | variant_tuples = [("ACGT", "c.1_3del"), ("ACGT", "c.4del")] 762 | 763 | for target, s in variant_tuples: 764 | with self.subTest(target=target, s=s): 765 | v = Variant(s, targetseq=target) 766 | self.assertEqual(s, str(v)) 767 | 768 | def test_invalid_dna_del(self): 769 | variant_tuples = [ 770 | ("ACGT", "c.1_5del"), 771 | ("ACGT", "c.6_8del"), 772 | ("ACGT", "c.7del"), 773 | ] 774 | 775 | for target, s in variant_tuples: 776 | with self.subTest(target=target, s=s): 777 | with self.assertRaises(MaveHgvsParseError): 778 | Variant(s, targetseq=target) 779 | 780 | def test_valid_dna_dup(self): 781 | variant_tuples = [("ACGT", "c.1_3dup"), ("ACGT", "c.4dup")] 782 | 783 | for target, s in variant_tuples: 784 | with self.subTest(target=target, s=s): 785 | v = Variant(s, targetseq=target) 786 | self.assertEqual(s, str(v)) 787 | 788 | def test_invalid_dna_dup(self): 789 | variant_tuples = [ 790 | ("ACGT", "c.1_5dup"), 791 | ("ACGT", "c.6_8dup"), 792 | ("ACGT", "c.7dup"), 793 | ] 794 | 795 | for target, s in variant_tuples: 796 | with self.subTest(target=target, s=s): 797 | with self.assertRaises(MaveHgvsParseError): 798 | Variant(s, targetseq=target) 799 | 800 | def test_valid_dna_ins(self): 801 | variant_tuples = [("ACGT", "c.1_2insAAA"), ("ACGT", "c.3_4insT")] 802 | 803 | for target, s in variant_tuples: 804 | with self.subTest(target=target, s=s): 805 | v = 
Variant(s, targetseq=target) 806 | self.assertEqual(s, str(v)) 807 | 808 | def test_invalid_dna_ins(self): 809 | variant_tuples = [("ACGT", "c.4_5insA"), ("ACGT", "c.10_11insTCG")] 810 | 811 | for target, s in variant_tuples: 812 | with self.subTest(target=target, s=s): 813 | with self.assertRaises(MaveHgvsParseError): 814 | Variant(s, targetseq=target) 815 | 816 | def test_valid_dna_delins(self): 817 | variant_tuples = [("ACGT", "c.1_2delinsA"), ("ACGT", "c.4delinsTAAGC")] 818 | 819 | for target, s in variant_tuples: 820 | with self.subTest(target=target, s=s): 821 | v = Variant(s, targetseq=target) 822 | self.assertEqual(s, str(v)) 823 | 824 | def test_invalid_dna_delins(self): 825 | variant_tuples = [("ACGT", "c.4_5delinsA"), ("ACGT", "c.10_delinsTCG")] 826 | 827 | for target, s in variant_tuples: 828 | with self.subTest(target=target, s=s): 829 | with self.assertRaises(MaveHgvsParseError): 830 | Variant(s, targetseq=target) 831 | 832 | def test_valid_protein_equal(self): 833 | variant_tuples = [("RCQY", "p.Arg1="), ("RCQY", "p.Tyr4="), ("RCQY", "p.=")] 834 | 835 | for target, s in variant_tuples: 836 | with self.subTest(target=target, s=s): 837 | v = Variant(s, targetseq=target) 838 | self.assertEqual(s, str(v)) 839 | 840 | def test_invalid_protein_equal(self): 841 | variant_tuples = [("RCQY", "p.Trp5=")] 842 | 843 | for target, s in variant_tuples: 844 | with self.subTest(target=target, s=s): 845 | with self.assertRaises(MaveHgvsParseError): 846 | Variant(s, targetseq=target) 847 | 848 | def test_matching_protein_substitution(self): 849 | variant_tuples = [ 850 | ("RCQY", "p.Arg1Ala"), 851 | ("RCQY", "p.Gln3Trp"), 852 | ("RCQY", "p.[Arg1Ala;Gln3Trp]"), 853 | ] 854 | 855 | for target, s in variant_tuples: 856 | with self.subTest(target=target, s=s): 857 | v = Variant(s, targetseq=target) 858 | self.assertEqual(s, str(v)) 859 | 860 | def test_nonmatching_protein_substitution(self): 861 | variant_tuples = [ 862 | ("RCQY", "p.Cys1Ala"), 863 | ("RCQY", 
"p.Ala3Trp"), 864 | ("RCQY", "p.[Arg1Ala;Cys3Trp]"), 865 | ("RCQY", "p.Asp5Glu"), 866 | ] 867 | 868 | for target, s in variant_tuples: 869 | with self.subTest(target=target, s=s): 870 | with self.assertRaises(MaveHgvsParseError): 871 | Variant(s, targetseq=target) 872 | 873 | def test_matching_protein_fs(self): 874 | variant_tuples = [ 875 | ("RCQY", "p.Arg1fs"), 876 | ("RCQY", "p.Gln3fs"), 877 | ] 878 | 879 | for target, s in variant_tuples: 880 | with self.subTest(target=target, s=s): 881 | v = Variant(s, targetseq=target) 882 | self.assertEqual(s, str(v)) 883 | 884 | def test_nonmatching_protein_fs(self): 885 | variant_tuples = [ 886 | ("RCQY", "p.Cys1fs"), 887 | ("RCQY", "p.Ala3fs"), 888 | ("RCQY", "p.Asp5fs"), 889 | ] 890 | 891 | for target, s in variant_tuples: 892 | with self.subTest(target=target, s=s): 893 | with self.assertRaises(MaveHgvsParseError): 894 | Variant(s, targetseq=target) 895 | 896 | def test_matching_protein_indel(self): 897 | variant_tuples = [ 898 | ("RCQY", "p.Arg1del"), 899 | ("RCQY", "p.Arg1_Gln3dup"), 900 | ] 901 | 902 | for target, s in variant_tuples: 903 | with self.subTest(target=target, s=s): 904 | v = Variant(s, targetseq=target) 905 | self.assertEqual(s, str(v)) 906 | 907 | def test_nonmatching_protein_indel(self): 908 | variant_tuples = [ 909 | ("RCQY", "p.Cys1del"), 910 | ("RCQY", "p.Arg1_Asp3dup"), 911 | ("RCQY", "p.Asp5del"), 912 | ] 913 | 914 | for target, s in variant_tuples: 915 | with self.subTest(target=target, s=s): 916 | with self.assertRaises(MaveHgvsParseError): 917 | Variant(s, targetseq=target) 918 | 919 | def test_skips_extended(self): 920 | variant_tuples = [ 921 | ("ACGT", "c.1+3A>T"), 922 | ("ACGT", "c.*33G>C"), 923 | ("ACGT", "c.43-6_595+12delinsCTT"), 924 | ] 925 | 926 | for target, s in variant_tuples: 927 | with self.subTest(target=target, s=s): 928 | v = Variant(s, targetseq=target) 929 | self.assertEqual(s, str(v)) 930 | 931 | 932 | class TestMiscMethods(unittest.TestCase): 933 | def 
class TestMiscMethods(unittest.TestCase):
    """Tests for Variant helper methods: is_multi_variant,
    uses_extended_positions, and components."""

    def test_is_multi_variant(self):
        single_variant_strings = [
            "p.Glu27Trp",
            "c.122-6T>A",
            "g.44del",
            "c.78+5_78+10del",
            "c.77dup",
            "p.Pro12_Gly18dup",
            "p.Ala12_Pro13insGlyProCys",
            "r.22_23insauc",
            "c.43-6_595+12delinsCTT",
            "p.Ile71_Cys80delinsSer",
            "p.=",
        ]

        # Fix: this list was empty, so the assertTrue branch below never ran.
        # These strings mirror valid multi-variants used elsewhere in this
        # module's tests.
        multi_variant_strings = [
            "p.[Glu27Trp;Ile71_Cys80delinsSer]",
            "c.[77dup;122-6T>A]",
            "r.[22g>u;35del]",
        ]

        for s in single_variant_strings:
            with self.subTest(s=s):
                v = Variant(s)
                self.assertFalse(v.is_multi_variant())

        for s in multi_variant_strings:
            with self.subTest(s=s):
                v = Variant(s)
                self.assertTrue(v.is_multi_variant())

    def test_uses_extended_positions(self):
        non_extended_variant_strings = [
            "p.Glu27Trp",
            "g.44del",
            "c.77dup",
            "p.Pro12_Gly18dup",
            "p.Ala12_Pro13insGlyProCys",
            "r.22_23insauc",
            "r.22g>u",
            "p.Ile71_Cys80delinsSer",
            "p.=",
            "p.[Pro12_Gly18dup;Glu27Trp]",
            "r.[22g>u;35del]",
        ]

        extended_variant_strings = [
            "c.122-6T>A",
            "c.78+5_78+10del",
            "c.43-6_595+12delinsCTT",
            "c.*33G>C",
            "r.33+12a>c",
            "c.[12G>T;122-6T>A]",
            "c.[43-6_595+12delinsCTT;*33G>C]",
        ]

        for s in non_extended_variant_strings:
            with self.subTest(s=s):
                v = Variant(s)
                self.assertFalse(v.uses_extended_positions())

        for s in extended_variant_strings:
            with self.subTest(s=s):
                v = Variant(s)
                self.assertTrue(v.uses_extended_positions())

    def test_components(self):
        variant_strings = [
            ("p.[Glu27Trp;Ter345Lys]", ("p.Glu27Trp", "p.Ter345Lys")),
            ("p.[Glu27Trp;Lys212fs]", ("p.Glu27Trp", "p.Lys212fs")),
            (
                "p.[Gly18del;Glu27Trp;Ter345Lys]",
                ("p.Gly18del", "p.Glu27Trp", "p.Ter345Lys"),
            ),
            (
                "p.[Gln7_Asn19del;Glu27Trp;Ter345Lys]",
                ("p.Gln7_Asn19del", "p.Glu27Trp", "p.Ter345Lys"),
            ),
            (
                "c.[1_35del;78+5_78+10del;122T>A]",
                ("c.1_35del", "c.78+5_78+10del", "c.122T>A"),
            ),
            ("p.Glu27Trp", ("p.Glu27Trp",)),
            ("NP_002002.3:p.Glu27Trp", ("NP_002002.3:p.Glu27Trp",)),
            (
                "NP_002002.3:p.[Glu27Trp;Lys212fs]",
                ("NP_002002.3:p.Glu27Trp", "NP_002002.3:p.Lys212fs"),
            ),
        ]

        for s, expected_components in variant_strings:
            with self.subTest(s=s):
                v = Variant(s)
                # Fix: the original subset check
                # all(c in expected_components for c in v.components())
                # passed vacuously if components() returned too few (or zero)
                # items. assertCountEqual requires the same elements with the
                # same multiplicity, in any order.
                self.assertCountEqual(expected_components, v.components())
"c.[1_35del;78+5_78+10del;122T>A]", 1009 | ("c.1_35del", "c.78+5_78+10del", "c.122T>A"), 1010 | ), 1011 | ("p.Glu27Trp", ("p.Glu27Trp",)), 1012 | ("NP_002002.3:p.Glu27Trp", ("NP_002002.3:p.Glu27Trp",)), 1013 | ( 1014 | "NP_002002.3:p.[Glu27Trp;Lys212fs]", 1015 | ("NP_002002.3:p.Glu27Trp", "NP_002002.3:p.Lys212fs"), 1016 | ), 1017 | ] 1018 | 1019 | for s, expected_components in variant_strings: 1020 | with self.subTest(s=s): 1021 | v = Variant(s) 1022 | self.assertTrue(all([c in expected_components for c in v.components()])) 1023 | 1024 | 1025 | # TODO: multi-variant test cases 1026 | class TestMiscProperties(unittest.TestCase): 1027 | def test_prefix(self): 1028 | variant_tuples = [(prefix, f"{prefix}.=") for prefix in tuple("gmocnr")] 1029 | 1030 | for p, s in variant_tuples: 1031 | with self.subTest(p=p, s=s): 1032 | v = Variant(s) 1033 | self.assertEqual(p, v.prefix) 1034 | 1035 | def test_variant_type(self): 1036 | variant_tuples = [ 1037 | ("sub", "p.Glu27Trp"), 1038 | ("sub", "c.122-6T>A"), 1039 | ("fs", "p.Glu27fs"), 1040 | ("del", "g.44del"), 1041 | ("del", "c.78+5_78+10del"), 1042 | ("dup", "c.77dup"), 1043 | ("dup", "p.Pro12_Gly18dup"), 1044 | ("ins", "p.Ala12_Pro13insGlyProCys"), 1045 | ("ins", "r.22_23insauc"), 1046 | ("delins", "c.43-6_595+12delinsCTT"), 1047 | ("delins", "p.Ile71_Cys80delinsSer"), 1048 | ] 1049 | 1050 | for t, s in variant_tuples: 1051 | with self.subTest(t=t, s=s): 1052 | v = Variant(s) 1053 | self.assertEqual(t, v.variant_type) 1054 | 1055 | def test_position(self): 1056 | variant_tuples = [ 1057 | (VariantPosition("Glu27"), "p.Glu27Trp"), 1058 | (VariantPosition("Glu27"), "p.Glu27fs"), 1059 | (VariantPosition("122-6"), "c.122-6T>A"), 1060 | (VariantPosition("44"), "g.44del"), 1061 | ((VariantPosition("78+5"), VariantPosition("78+10")), "c.78+5_78+10del"), 1062 | (VariantPosition("77"), "c.77dup"), 1063 | ((VariantPosition("Pro12"), VariantPosition("Gly18")), "p.Pro12_Gly18dup"), 1064 | ( 1065 | (VariantPosition("Ala12"), 
VariantPosition("Pro13")), 1066 | "p.Ala12_Pro13insGlyProCys", 1067 | ), 1068 | ((VariantPosition("22"), VariantPosition("23")), "r.22_23insauc"), 1069 | ( 1070 | (VariantPosition("43-6"), VariantPosition("595+12")), 1071 | "c.43-6_595+12delinsCTT", 1072 | ), 1073 | ( 1074 | (VariantPosition("Ile71"), VariantPosition("Cys80")), 1075 | "p.Ile71_Cys80delinsSer", 1076 | ), 1077 | ] 1078 | 1079 | for p, s in variant_tuples: 1080 | with self.subTest(p=p, s=s): 1081 | v = Variant(s) 1082 | if isinstance(p, list): # multi-variant 1083 | self.assertEqual(len(p), len(v.positions)) 1084 | for q, vp in zip(p, v.positions): 1085 | if isinstance(q, tuple): 1086 | self.assertTupleEqual(q, vp) 1087 | else: 1088 | self.assertEqual(q, vp) 1089 | if isinstance(p, tuple): 1090 | self.assertTupleEqual(p, v.positions) 1091 | else: 1092 | self.assertEqual(p, v.positions) 1093 | 1094 | def test_sequence(self): 1095 | variant_tuples = [ 1096 | (("Glu", "Trp"), "p.Glu27Trp"), 1097 | (("T", "A"), "c.122-6T>A"), 1098 | (None, "p.Glu27fs"), 1099 | (None, "g.44del"), 1100 | (None, "c.78+5_78+10del"), 1101 | (None, "c.77dup"), 1102 | (None, "p.Pro12_Gly18dup"), 1103 | ("GlyProCys", "p.Ala12_Pro13insGlyProCys"), 1104 | ("auc", "r.22_23insauc"), 1105 | ("CTT", "c.43-6_595+12delinsCTT"), 1106 | ("Ser", "p.Ile71_Cys80delinsSer"), 1107 | ] 1108 | 1109 | for seq, s in variant_tuples: 1110 | with self.subTest(seq=seq, s=s): 1111 | v = Variant(s) 1112 | self.assertEqual(seq, v.sequence) 1113 | 1114 | def test_target_id(self): 1115 | variant_tuples = [ 1116 | (None, "p.Glu27Trp"), 1117 | (None, "c.122-6T>A"), 1118 | ("GeneX", "GeneX:p.Glu27Trp"), 1119 | ("YFG1", "YFG1:c.122-6T>A"), 1120 | ("ENST00000471181.7", "ENST00000471181.7:c.122-6T>A"), 1121 | ("NM_007294.4", "NM_007294.4:c.122-6T>A"), 1122 | ("NM_007294.4", "NM_007294.4:c.[122-6T>A;153C>T]"), 1123 | ] 1124 | 1125 | for t, s in variant_tuples: 1126 | with self.subTest(t=t, s=s): 1127 | v = Variant(s) 1128 | self.assertEqual(t, v.target_id) 1129 | 
1130 | for _, s in variant_tuples: 1131 | with self.subTest(s=s): 1132 | v = Variant(s) 1133 | self.assertEqual(s, str(v)) 1134 | 1135 | 1136 | if __name__ == "__main__": 1137 | unittest.main() 1138 | --------------------------------------------------------------------------------