├── .coveragerc ├── .gitattributes ├── .github └── workflows │ └── tests.yml ├── .github_changelog_generator ├── .gitignore ├── CHANGELOG.md ├── CONTRIBUTING.md ├── LICENSE ├── MANIFEST.in ├── README.md ├── RELEASING.md ├── code-of-conduct.md ├── deploy.sh ├── examples └── varcode-quick_start.ipynb ├── lint-and-test.sh ├── lint.sh ├── pylintrc ├── requirements.txt ├── setup.py ├── test.sh ├── tests ├── __init__.py ├── benchmark_vcf_load.py ├── common.py ├── data.py ├── data │ ├── dbnsfp_validation_set.csv │ ├── different-samples.1.vcf │ ├── different-samples.2.vcf │ ├── duplicate-id.1.vcf │ ├── duplicate-id.2.vcf │ ├── duplicates.maf │ ├── duplicates.vcf │ ├── mouse_vcf_dbsnp_chr1_partial.vcf │ ├── multiallelic.vcf │ ├── mutect-example-headerless.vcf │ ├── mutect-example.vcf │ ├── ov.wustle.subset5.maf │ ├── same-samples.1.vcf │ ├── same-samples.2.vcf │ ├── simple.1.vcf │ ├── simple.2.vcf │ ├── somatic_hg19_14muts.space_in_sample_name.vcf │ ├── somatic_hg19_14muts.vcf │ ├── somatic_hg19_14muts.vcf.gz │ ├── strelka-example.vcf │ ├── tcga_ov.head.maf │ └── tcga_ov.head.xychr.maf ├── test_cli_effects.py ├── test_cli_genes.py ├── test_collection_filtering.py ├── test_common.py ├── test_cosmic_mutations.py ├── test_dbnsfp_validation.py ├── test_effect_annotation_errors.py ├── test_effect_classes.py ├── test_effect_collection.py ├── test_effect_collection_serialization.py ├── test_effects_from_mutagenix_variants.py ├── test_exonic_splice_site.py ├── test_frameshift_helpers.py ├── test_maf.py ├── test_mm10_klf6_frameshift.py ├── test_mouse.py ├── test_mutate.py ├── test_no_duplicate_variants.py ├── test_problematic_variants.py ├── test_reference.py ├── test_string_helpers.py ├── test_timings.py ├── test_variant.py ├── test_variant_collection.py ├── test_vcf.py └── test_vcf_output.py └── varcode ├── __init__.py ├── cli ├── __init__.py ├── effects_script.py ├── genes_script.py ├── logging.conf ├── variant_args.py └── version_info.py ├── common.py ├── effects ├── __init__.py 
├── common.py ├── effect_classes.py ├── effect_collection.py ├── effect_helpers.py ├── effect_ordering.py ├── effect_prediction.py ├── effect_prediction_coding.py ├── effect_prediction_coding_frameshift.py ├── effect_prediction_coding_in_frame.py ├── mutate.py ├── transcript_helpers.py └── translate.py ├── maf.py ├── nucleotides.py ├── reference.py ├── string_helpers.py ├── ucsc_reference_names.py ├── util.py ├── variant.py ├── variant_collection.py ├── vcf.py ├── vcf_output.py └── version.py /.coveragerc: -------------------------------------------------------------------------------- 1 | # .coveragerc to control coverage.py 2 | [run] 3 | omit = 4 | test/* 5 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | varcode/_version.py export-subst 2 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python 3 | 4 | # TODO: 5 | # - cache this directory $HOME/.cache/pyensembl/ 6 | # - update coveralls 7 | # - get a badge for tests passing 8 | # - download binary dependencies from conda 9 | name: Tests 10 | on: [push, pull_request] 11 | 12 | jobs: 13 | build: 14 | runs-on: ubuntu-latest 15 | strategy: 16 | fail-fast: true 17 | matrix: 18 | python-version: ["3.9", "3.10", "3.11"] 19 | 20 | steps: 21 | - name: Checkout repository 22 | uses: actions/checkout@v3 23 | - name: Set up Python ${{ matrix.python-version }} 24 | uses: actions/setup-python@v3 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | cache: "pip" 28 | - name: Install dependencies 29 | run: | 30 | python -m 
pip install --upgrade pip 31 | python -m pip install flake8 pytest pytest-cov coveralls 32 | pip install -r requirements.txt 33 | pip install . 34 | - name: Lint with flake8 35 | run: | 36 | # stop the build if there are Python syntax errors or undefined names 37 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 38 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 39 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 40 | - name: Run default linting script 41 | run: | 42 | ./lint.sh 43 | - name: Install Ensembl data 44 | run: | 45 | echo "Before installing Ensembl releases" && df -h 46 | pyensembl install --release 75 --species human --custom-mirror https://github.com/openvax/ensembl-data/releases/download/GRCh37.75/ 47 | pyensembl install --release 81 --species human --custom-mirror https://github.com/openvax/ensembl-data/releases/download/GRCh38.81/ 48 | pyensembl install --release 95 --species human --custom-mirror https://github.com/openvax/ensembl-data/releases/download/GRCh38.95/ 49 | pyensembl install --release 95 --species mouse --custom-mirror https://github.com/openvax/ensembl-data/releases/download/GRCm38.95/ 50 | echo "After installing Ensembl releases" && df -h 51 | - name: Run unit tests 52 | run: | 53 | ./test.sh 54 | - name: Publish coverage to Coveralls 55 | uses: coverallsapp/github-action@v2.2.3 56 | -------------------------------------------------------------------------------- /.github_changelog_generator: -------------------------------------------------------------------------------- 1 | unreleased=false 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | 
develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | 25 | # PyInstaller 26 | # Usually these files are written by a python script from a template 27 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 28 | *.manifest 29 | *.spec 30 | 31 | # Installer logs 32 | pip-log.txt 33 | pip-delete-this-directory.txt 34 | 35 | # Unit test / coverage reports 36 | htmlcov/ 37 | .tox/ 38 | .coverage 39 | .cache 40 | nosetests.xml 41 | coverage.xml 42 | 43 | # Translations 44 | *.mo 45 | *.pot 46 | 47 | # Django stuff: 48 | *.log 49 | 50 | # Sphinx documentation 51 | docs/_build/ 52 | 53 | # PyBuilder 54 | target/ 55 | 56 | # PyCharm 57 | .idea 58 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | Contributing to Varcode 2 | ========================== 3 | 4 | [Varcode](http://www.github.com/openvax/varcode) is open source software and 5 | we welcome your contributions. This document should help you get started 6 | contributing to Varcode. 7 | 8 | Filing Issues 9 | ------------- 10 | If you find any bugs or problems while using Varcode or have any feature requests, please feel free to file an issue against the project. When doing so, please follow the guidelines below: 11 | 12 | To report any bugs, issues, or feature requests, please [open an issue](https://github.com/openvax/varcode/issues) 13 | Please check the [current open issues](https://github.com/openvax/varcode/issues) to see if the request already exists 14 | If you are filing a bug report, please describe the version of Varcode, PyEnsembl, and Python being used. If your problem involves a particular genomic variant, please include that variant and its corresponding reference genome (e.g. "GRCh37 1:384747 AAC>T"). 
15 | 16 | Coding Guidelines 17 | ----------------- 18 | * Varcode is written in Python and adheres to the [PEP8](https://www.python.org/dev/peps/pep-0008/) 19 | style guidelines. 20 | * Contributions should come in the form of GitHub pull requests. 21 | * New features should start with a GitHub issue explaining their scope and rationale. 22 | * If the work is based on an existing issue, please reference the issue in the PR. 23 | * All new code should be accompanied by comprehensive unit tests. 24 | * If the PR fixes or implements an issue, please state "Closes #XYZ" or "Fixes #XYZ", where XYZ is the issue number. 25 | * Please ensure that your code works under Python >= 3.7. 26 | 27 | Licensing 28 | --------- 29 | Varcode is licensed under the Apache 2.0 license. Your code is assumed to be as well. 30 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include versioneer.py 2 | include varcode/_version.py 3 | include README.md 4 | include LICENSE 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Tests](https://github.com/openvax/varcode/actions/workflows/tests.yml/badge.svg)](https://github.com/openvax/varcode/actions/workflows/tests.yml) 2 | 3 | Coverage Status 4 | 5 | 6 | PyPI 7 | 8 | [![PyPI downloads](https://img.shields.io/pypi/dm/varcode.svg)](https://pypistats.org/packages/varcode) 9 | 10 | # Varcode 11 | 12 | Varcode is a library for working with genomic variant data in Python and predicting the impact of those variants on protein sequences. 
13 | 14 | ## Installation 15 | 16 | You can install varcode using [pip](https://pip.pypa.io/en/latest/quickstart.html): 17 | 18 | ```bash 19 | pip install varcode 20 | ``` 21 | 22 | You can install required reference genome data through [PyEnsembl](https://github.com/openvax/pyensembl) as follows: 23 | 24 | ```bash 25 | # Downloads and installs the Ensembl releases (75 and 76) 26 | pyensembl install --release 75 76 27 | ``` 28 | 29 | ## Example 30 | 31 | ```python 32 | import varcode 33 | 34 | # Load a TCGA MAF file containing ovarian cancer variants 35 | variants = varcode.load_maf("tcga-ovarian-cancer-variants.maf") 36 | 37 | print(variants) 38 | ### 39 | ### -- Variant(contig=1, start=69538, ref=G, alt=A, genome=GRCh37) 40 | ### -- Variant(contig=1, start=881892, ref=T, alt=G, genome=GRCh37) 41 | ### -- Variant(contig=1, start=3389714, ref=G, alt=A, genome=GRCh37) 42 | ### -- Variant(contig=1, start=3624325, ref=G, alt=T, genome=GRCh37) 43 | ### ... 44 | 45 | # you can index into a VariantCollection and get back a Variant object 46 | variant = variants[0] 47 | 48 | # groupby_gene_name returns a dictionary whose keys are gene names 49 | # and whose values are themselves VariantCollections 50 | gene_groups = variants.groupby_gene_name() 51 | 52 | # get variants which affect the TP53 gene 53 | TP53_variants = gene_groups["TP53"] 54 | 55 | # predict protein coding effect of every TP53 variant on 56 | # each transcript of the TP53 gene 57 | TP53_effects = TP53_variants.effects() 58 | 59 | print(TP53_effects) 60 | ### 61 | ### -- PrematureStop(variant=chr17 g.7574003G>A, transcript_name=TP53-001, transcript_id=ENST00000269305, effect_description=p.R342*) 62 | ### -- ThreePrimeUTR(variant=chr17 g.7574003G>A, transcript_name=TP53-005, transcript_id=ENST00000420246) 63 | ### -- PrematureStop(variant=chr17 g.7574003G>A, transcript_name=TP53-002, transcript_id=ENST00000445888, effect_description=p.R342*) 64 | ### -- FrameShift(variant=chr17 g.7574030_7574030delG, 
transcript_name=TP53-001, transcript_id=ENST00000269305, effect_description=p.R333fs) 65 | ### ... 66 | 67 | premature_stop_effect = TP53_effects[0] 68 | 69 | print(str(premature_stop_effect.mutant_protein_sequence)) 70 | ### 'MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLSPDDIEQWFTEDPGPDEAPRMPEAAPPVAPAPAAPTPAAPAPAPSWPLSSSVPSQKTYQGSYGFRLGFLHSGTAKSVTCTYSPALNKMFCQLAKTCPVQLWVDSTPPPGTRVRAMAIYKQSQHMTEVVRRCPHHERCSDSDGLAPPQHLIRVEGNLRVEYLDDRNTFRHSVVVPYEPPEVGSDCTTIHYNYMCNSSCMGGMNRRPILTIITLEDSSGNLLGRNSFEVRVCACPGRDRRTEEENLRKKGEPHHELPPGSTKRALPNNTSSSPQPKKKPLDGEYFTLQIRGRERFEMF' 71 | 72 | print(premature_stop_effect.aa_mutation_start_offset) 73 | ### 341 74 | 75 | print(premature_stop_effect.transcript) 76 | ### Transcript(id=ENST00000269305, name=TP53-001, gene_name=TP53, biotype=protein_coding, location=17:7571720-7590856) 77 | 78 | print(premature_stop_effect.gene.name) 79 | ### 'TP53' 80 | ``` 81 | 82 | If you are looking for a quick start guide, you can check out [this iPython book](./examples/varcode-quick_start.ipynb) that demonstrates simple use cases of Varcode 83 | 84 | ## Effect Types 85 | 86 | | Effect type | Description | 87 | | ---------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------- | 88 | | _AlternateStartCodon_ | Replace annotated start codon with alternative start codon (_e.g._ "ATG>CAG"). | 89 | | _ComplexSubstitution_ | Insertion and deletion of multiple amino acids. | 90 | | _Deletion_ | Coding mutation which causes deletion of amino acid(s). | 91 | | _ExonLoss_ | Deletion of entire exon, significantly disrupts protein. | 92 | | _ExonicSpliceSite_ | Mutation at the beginning or end of an exon, may affect splicing. | 93 | | _FivePrimeUTR_ | Variant affects 5' untranslated region before start codon. | 94 | | _FrameShiftTruncation_ | A frameshift which leads immediately to a stop codon (no novel amino acids created). 
| 95 | | _FrameShift_ | Out-of-frame insertion or deletion of nucleotides, causes novel protein sequence and often premature stop codon. | 96 | | _IncompleteTranscript_ | Can't determine effect since transcript annotation is incomplete (often missing either the start or stop codon). | 97 | | _Insertion_ | Coding mutation which causes insertion of amino acid(s). | 98 | | _Intergenic_ | Occurs outside of any annotated gene. | 99 | | _Intragenic_ | Within the annotated boundaries of a gene but not in a region that's transcribed into pre-mRNA. | 100 | | _IntronicSpliceSite_ | Mutation near the beginning or end of an intron but less likely to affect splicing than donor/acceptor mutations. | 101 | | _Intronic_ | Variant occurs between exons and is unlikely to affect splicing. | 102 | | _NoncodingTranscript_ | Transcript doesn't code for a protein. | 103 | | _PrematureStop_ | Insertion of stop codon, truncates protein. | 104 | | _Silent_ | Mutation in coding sequence which does not change the amino acid sequence of the translated protein. | 105 | | _SpliceAcceptor_ | Mutation in the last two nucleotides of an intron, likely to affect splicing. | 106 | | _SpliceDonor_ | Mutation in the first two nucleotides of an intron, likely to affect splicing. | 107 | | _StartLoss_ | Mutation causes loss of start codon, likely result is that an alternate start codon will be used down-stream (possibly in a different frame). | 108 | | _StopLoss_ | Loss of stop codon, causes extension of protein by translation of nucleotides from 3' UTR. | 109 | | _Substitution_ | Coding mutation which causes simple substitution of one amino acid for another. | 110 | | _ThreePrimeUTR_ | Variant affects 3' untranslated region after stop codon of mRNA. | 111 | 112 | ## Coordinate System 113 | 114 | Varcode currently uses a "base counted, one start" genomic coordinate system, to match the Ensembl annotation database. 
We are planning to switch over to "space counted, zero start" (interbase) coordinates, since that system allows for more uniform logic (no special cases for insertions). To learn more about genomic coordinate systems, read this [blog post](http://alternateallele.blogspot.com/2012/03/genome-coordinate-conventions.html). 115 | -------------------------------------------------------------------------------- /RELEASING.md: -------------------------------------------------------------------------------- 1 | # Releasing Varcode 2 | 3 | This document explains what to do once your [Pull Request](https://www.atlassian.com/git/tutorials/making-a-pull-request/) has been reviewed and all final changes applied. Now you're ready to merge your branch into master and release it to the world: 4 | 5 | 0. Assign a version to the release you are preparing and update `__version__` in `version.py` using [semantic versioning](https://semver.org/). 6 | 7 | 1. Merge your branch into master. 8 | 9 | 2. Run `deploy.sh`. 10 | 11 | -------------------------------------------------------------------------------- /code-of-conduct.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 
11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies within all project spaces, and it also applies when 49 | an individual is representing the project or its community in public spaces. 50 | Examples of representing a project or community include using an official 51 | project e-mail address, posting via an official social media account, or acting 52 | as an appointed representative at an online or offline event. 
Representation of 53 | a project may be further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at hello@openvax.org. All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 72 | 73 | [homepage]: https://www.contributor-covenant.org 74 | 75 | For answers to common questions about this code of conduct, see 76 | https://www.contributor-covenant.org/faq 77 | -------------------------------------------------------------------------------- /deploy.sh: -------------------------------------------------------------------------------- 1 | ./lint.sh && \ 2 | ./test.sh && \ 3 | python3 -m pip install --upgrade build && \ 4 | python3 -m pip install --upgrade twine && \ 5 | rm -rf dist && \ 6 | python3 -m build && \ 7 | python3 -m twine upload dist/* 8 | -------------------------------------------------------------------------------- /lint-and-test.sh: -------------------------------------------------------------------------------- 1 | ./lint.sh && ./test.sh 2 | -------------------------------------------------------------------------------- /lint.sh: -------------------------------------------------------------------------------- 1 | 
#!/bin/bash 2 | set -o errexit 3 | 4 | # getting false positives due to this issue with pylint: 5 | # https://bitbucket.org/logilab/pylint/issues/701/false-positives-with-not-an-iterable-and 6 | 7 | find varcode tests -name '*.py' \ 8 | | xargs pylint \ 9 | --errors-only \ 10 | --disable=unsubscriptable-object,not-an-iterable 11 | 12 | echo 'Passes pylint check' 13 | -------------------------------------------------------------------------------- /pylintrc: -------------------------------------------------------------------------------- 1 | [TYPECHECK] 2 | # Without ignoring this, we get errors like: 3 | # E:249,20: Module 'numpy' has no 'nan' member (no-member) 4 | ignored-modules = numpy 5 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy>=1.7,<2.0 2 | pandas>=2.0.0,<3.0.0 3 | pyensembl>=1.8.1 4 | biopython>=1.64 5 | pyvcf3>=1.0.0 6 | memoized_property>=1.0.2 7 | pylint>=1.4.4 8 | serializable>=0.2.1 9 | sercol>=0.1.4 10 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed under the Apache License, Version 2.0 (the "License"); 3 | # you may not use this file except in compliance with the License. 4 | # You may obtain a copy of the License at 5 | # 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | # 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 
import os
import re

from setuptools import setup, find_packages

# Every file this script reads is resolved relative to the directory that
# contains setup.py, so the build does not depend on the process's current
# working directory.
readme_filename = "README.md"
current_directory = os.path.dirname(__file__)
readme_path = os.path.join(current_directory, readme_filename)

# The long description is best-effort: a missing or unreadable README must
# not prevent the package from being built, so fall back to an empty string.
try:
    with open(readme_path, 'r', encoding='utf-8') as f:
        readme_markdown = f.read()
except (OSError, UnicodeError) as e:
    readme_markdown = ""
    print(e)
    print("Failed to open %s" % readme_path)

# Determine version number by scanning varcode/version.py for the
# `__version__ = "..."` assignment instead of importing the package (which
# would require its runtime dependencies to be installed already).
version_path = os.path.join(current_directory, 'varcode', 'version.py')
with open(version_path, 'r', encoding='utf-8') as f:
    version_match = re.search(
        r'^__version__\s*=\s*[\'"]([^\'"]*)[\'"]',
        f.read(),
        re.MULTILINE)
if version_match is None:
    # Fail with a readable message rather than an AttributeError on None.
    raise RuntimeError(
        "Unable to find __version__ in %s" % version_path)
version = version_match.group(1)
print("Version: %s" % version)

if __name__ == '__main__':
    setup(
        name='varcode',
        packages=find_packages(),
        package_data={'varcode.cli': ['logging.conf']},
        version=version,
        description="Variant annotation in Python",
        long_description=readme_markdown,
        long_description_content_type='text/markdown',
        url="https://github.com/openvax/varcode",
        author="Alex Rubinsteyn",
        author_email="alex.rubinsteyn@unc.edu",
        license="http://www.apache.org/licenses/LICENSE-2.0.html",
        classifiers=[
            'Development Status :: 4 - Beta',
            'Environment :: Console',
            'Operating System :: OS Independent',
            'Intended Audience :: Science/Research',
            'License :: OSI Approved :: Apache Software License',
            'Programming Language :: Python',
            'Topic :: Scientific/Engineering :: Bio-Informatics',
        ],
        # NOTE(review): requirements.txt pins pandas>=2.0.0,<3.0.0 while this
        # list still allows pandas>=0.15 -- confirm which bound is intended
        # and keep the two files in sync.
        install_requires=[
            'numpy>=1.7, <2.0',
            'pandas>=0.15',
            'pyensembl>=1.8.1',
            'biopython>=1.64',
            'pyvcf3>=1.0.0',
            'memoized_property>=1.0.2',
            'serializable>=0.2.1',
            'sercol>=0.1.4',
        ],
        entry_points={
            'console_scripts': [
                'varcode-genes = varcode.cli.genes_script:main',
                'varcode = varcode.cli.effects_script:main',
            ]
        })
-------------------------------------------------------------------------------- /test.sh: -------------------------------------------------------------------------------- 1 | pytest --cov=varcode/ --cov-report=term-missing tests 2 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | from __future__ import print_function, division, absolute_import 14 | -------------------------------------------------------------------------------- /tests/benchmark_vcf_load.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | """ 14 | Time how long it takes to open a VCF. 
def run():
    """
    Parse the command-line options, load the VCF at the given path with the
    selected loader and report how long loading took.

    Prints the number of loaded variants with the elapsed wall-clock time,
    followed by a preview of the result limited to five entries.
    """
    options = parser.parse_args()

    # `include_info=False` is only forwarded to the fast loader, and only
    # when --no-info-field was given on the command line.
    fast_loader_kwargs = {} if options.info_field else {"include_info": False}

    began = time.time()

    if not options.pyvcf:
        variants = varcode.load_vcf_fast(
            options.path,
            allow_extended_nucleotides=True,
            **fast_loader_kwargs)
    else:
        variants = varcode.load_vcf(
            options.path,
            allow_extended_nucleotides=True)

    variant_count = len(variants)
    elapsed_seconds = time.time() - began
    print("Loaded %d variants in %0.3f sec. " % (
        variant_count, elapsed_seconds))
    print(variants.to_string(limit=5))
# Attributes every effect object is expected to expose; the checks below
# read each of them once to make sure none of the accessors raises.
expected_effect_properties = [
    "gene",
    "gene_name",
    "gene_id",
    "transcript",
    "transcript_name",
    "transcript_id",
    "modifies_coding_sequence",
    "modifies_protein_sequence",
    "aa_mutation_start_offset",
    "aa_mutation_end_offset",
    "mutant_protein_sequence",
    "short_description",
]


def check_effect_properties(effect):
    """
    Sanity-check a single effect object.

    Asserts that the effect is not None, that every attribute listed in
    `expected_effect_properties` can be read, that `str` and `repr` are
    non-empty, that `short_description` is present and non-empty, and that
    the effect's class name appears in its string representation.
    """
    assert effect is not None
    # reading each attribute is the test: none of them may raise
    for property_name in expected_effect_properties:
        getattr(effect, property_name)
    assert len(str(effect)) >= 1
    assert len(repr(effect)) >= 1
    assert effect.short_description is not None, (
        "Expected effect %s to have a `short_description` property" % (
            effect,))
    assert len(effect.short_description) >= 1
    assert effect.__class__.__name__ in str(effect), (
        "Expected string representation of %s to include effect name %s" % (
            effect, effect.__class__.__name__))


def expect_effect(
        variant,
        transcript_id=None,
        effect_class=None,
        protein_sequence=None,
        **kwargs):
    """
    Assert that a variant produces the expected effect.

    When `transcript_id` is given, the effect of the variant on that
    transcript is checked; otherwise the top priority effect across all of
    the variant's effects is used. `effect_class` and `protein_sequence` are
    compared when not None, and every extra keyword argument is treated as
    an attribute name of the effect paired with its expected value.
    """
    if transcript_id is not None:
        chosen_transcript = variant.ensembl.transcript_by_id(transcript_id)
        effect = variant.effect_on_transcript(chosen_transcript)
    else:
        all_effects = variant.effects()
        effect = all_effects.top_priority_effect()

    check_effect_properties(effect)

    if effect_class is not None:
        assert effect.__class__ is effect_class, (
            "Expected effect class %s but got %s" % (
                effect_class.__name__,
                effect.__class__.__name__))

    if protein_sequence is not None:
        assert effect.mutant_protein_sequence == protein_sequence, (
            "Expected protein sequence %s but got %s" % (
                protein_sequence,
                effect.mutant_protein_sequence))

    for attribute, wanted in kwargs.items():
        found = getattr(effect, attribute)
        # choose the failure-message template by the type of the expected
        # value: ints first, then floats, everything else is quoted
        template = (
            "Expected %s=%d but got %s" if isinstance(wanted, int)
            else "Expected %s=%f but got %s" if isinstance(wanted, float)
            else "Expected %s='%s' but got '%s'")
        assert found == wanted, template % (attribute, wanted, found)


def eq_(x, y, s=None):
    """
    Assert that `x == y`, failing with message `s` when it is given and with
    a default "x != y" message otherwise.
    """
    assert x == y, ("%s != %s" % (x, y)) if s is None else s
25 | """ 26 | return os.path.join(os.path.dirname(__file__), "data", name) 27 | 28 | dbnsp_validation_df = pd.read_csv(data_path('dbnsfp_validation_set.csv')) 29 | tcga_ov_variants = load_maf(data_path("tcga_ov.head.maf")) 30 | ov_wustle_variants = load_maf(data_path("ov.wustle.subset5.maf")) 31 | 32 | snp_rs4244285 = Variant( 33 | contig=10, 34 | start=94781859, 35 | ref="G", 36 | alt="A") 37 | snp_rs1537415 = Variant( 38 | contig=9, 39 | start=135637876, 40 | ref="C", 41 | alt="G") 42 | snp_rs3892097 = Variant( 43 | contig=22, 44 | start=42524947, 45 | ref="G", 46 | alt="A") 47 | 48 | db_snp_variants = VariantCollection([ 49 | snp_rs4244285, 50 | snp_rs1537415, 51 | snp_rs3892097, 52 | ]) 53 | -------------------------------------------------------------------------------- /tests/data/dbnsfp_validation_set.csv: -------------------------------------------------------------------------------- 1 | aa_alt,aa_pos,dna_alt,chrom,ensembl_transcript,dna_position,dna_ref 2 | K,143,T,14,ENST00000379932,105675961,C 3 | L,852,A,12,ENST00000261740,110221487,C 4 | L,805,A,12,ENST00000392719,110221487,C 5 | L,792,A,12,ENST00000346520,110221487,C 6 | L,745,A,12,ENST00000544971,110221487,C 7 | L,792,A,12,ENST00000537083,110221487,C 8 | L,805,A,12,ENST00000541794,110221487,C 9 | L,818,A,12,ENST00000536838,110221487,C 10 | W,241,C,15,ENST00000288235,59516943,G 11 | N,82,G,6,ENST00000377451,27279704,C 12 | C,354,G,1,ENST00000546424,15820483,C 13 | C,354,G,1,ENST00000333868,15820483,C 14 | C,204,G,1,ENST00000348549,15820483,C 15 | C,271,G,1,ENST00000375890,15820483,C 16 | N,176,T,6,ENST00000521485,84368738,C 17 | H,178,C,1,ENST00000368764,152882807,G 18 | H,32,C,1,ENST00000392667,152882807,G 19 | K,2885,T,1,ENST00000368346,155308045,C 20 | K,2880,T,1,ENST00000392403,155308045,C 21 | P,1534,G,22,ENST00000441493,18300827,T 22 | L,32,A,7,ENST00000394507,91871355,G 23 | K,84,T,4,ENST00000296522,175439195,C 24 | Q,446,C,22,ENST00000536101,26165219,G 25 | 
D,3878,C,2,ENST00000409009,73827899,G 26 | T,10,T,16,ENST00000283025,10788703,C 27 | T,610,T,14,ENST00000331968,30093435,C 28 | G,185,C,20,ENST00000546004,5283287,G 29 | M,170,T,17,ENST00000269051,30616025,C 30 | M,162,T,17,ENST00000538145,30616025,C 31 | M,72,T,17,ENST00000536287,30616025,C 32 | M,1664,A,9,ENST00000313050,139355629,C 33 | M,1486,A,9,ENST00000371706,139355629,C 34 | M,1486,A,9,ENST00000290037,139355629,C 35 | M,1486,A,9,ENST00000431893,139355629,C 36 | A,666,C,4,ENST00000508776,128744730,G 37 | A,697,C,4,ENST00000439123,128744730,G 38 | A,666,C,4,ENST00000296464,128744730,G 39 | A,640,C,4,ENST00000505726,128744730,G 40 | L,38,T,10,ENST00000370196,102891411,C 41 | K,270,A,1,ENST00000498508,214170686,G 42 | H,110,C,16,ENST00000311620,21261217,G 43 | L,947,A,2,ENST00000419748,88857312,G 44 | L,1098,A,2,ENST00000303236,88857312,G 45 | L,260,A,10,ENST00000372873,75407959,G 46 | L,484,A,10,ENST00000394810,75407959,G 47 | E,123,C,6,ENST00000531224,136599652,G 48 | E,121,C,6,ENST00000353331,136599652,G 49 | E,123,C,6,ENST00000527536,136599652,G 50 | E,121,C,6,ENST00000392348,136599652,G 51 | L,2419,T,5,ENST00000438447,32090810,C 52 | L,2419,T,5,ENST00000282493,32090810,C 53 | K,32,T,X,ENST00000375992,51239203,C 54 | R,250,A,14,ENST00000306051,52735280,G 55 | K,467,T,X,ENST00000396992,47483685,C 56 | V,1462,G,5,ENST00000399503,56184179,T 57 | K,123,A,16,ENST00000434417,30429101,G 58 | Q,312,T,1,ENST00000427495,242271091,C 59 | Q,282,T,1,ENST00000442594,242271091,C 60 | Q,374,T,1,ENST00000536534,242271091,C 61 | L,150,T,20,ENST00000244051,49575828,C 62 | K,774,A,16,ENST00000301727,2285538,G 63 | R,85,G,10,ENST00000520547,81272659,A 64 | N,532,A,2,ENST00000393504,99013227,G 65 | N,536,A,2,ENST00000409937,99013227,G 66 | L,26,A,7,ENST00000394507,91871373,C 67 | M,2116,G,19,ENST00000352632,41073580,C 68 | M,859,G,19,ENST00000392025,41073580,C 69 | H,161,G,17,ENST00000301037,26939700,C 70 | K,170,A,20,ENST00000375994,30409276,G 71 | 
F,679,A,15,ENST00000389039,45398436,G 72 | N,342,T,8,ENST00000361421,59728265,C 73 | K,167,A,13,ENST00000376958,95264638,G 74 | K,1371,T,8,ENST00000320476,144874944,C 75 | K,1290,T,8,ENST00000377533,144874944,C 76 | K,1666,A,11,ENST00000321505,33680325,G 77 | K,1672,A,11,ENST00000389726,33680325,G 78 | A,1326,G,12,ENST00000267101,56495786,C 79 | A,683,G,12,ENST00000450146,56495786,C 80 | A,1267,G,12,ENST00000415288,56495786,C 81 | K,635,T,1,ENST00000366508,247057966,C 82 | K,609,T,1,ENST00000326225,247057966,C 83 | H,379,C,12,ENST00000547057,94691119,G 84 | H,71,C,12,ENST00000545312,94691119,G 85 | K,545,A,3,ENST00000263967,178936091,G 86 | K,402,A,16,ENST00000416441,29996834,G 87 | K,278,A,16,ENST00000389398,22128096,G 88 | Q,837,C,11,ENST00000529051,124908424,G 89 | R,407,A,12,ENST00000257963,52380684,G 90 | R,448,A,12,ENST00000541224,52380684,G 91 | R,355,A,12,ENST00000542485,52380684,G 92 | Q,10,T,6,ENST00000011619,13711709,G 93 | W,763,A,2,ENST00000281405,20136107,G 94 | C,7,G,11,ENST00000398534,71249121,C 95 | Q,119,G,12,ENST00000204726,133393177,C 96 | I,745,T,15,ENST00000356865,25958932,C 97 | K,569,A,1,ENST00000369130,150116967,G 98 | K,526,A,18,ENST00000342988,48604754,G 99 | N,91,T,11,ENST00000528117,8974698,C 100 | L,444,A,5,ENST00000507386,147020337,G 101 | L,444,A,5,ENST00000265272,147020337,G 102 | L,402,A,5,ENST00000333010,147020337,G 103 | E,190,T,15,ENST00000324324,48451958,C 104 | T,108,A,19,ENST00000392518,50203981,G 105 | P,1241,C,2,ENST00000401884,242011122,A 106 | M,951,T,3,ENST00000474889,62253472,C 107 | M,922,T,3,ENST00000295874,62253472,C 108 | E,268,A,9,ENST00000380607,17793439,G 109 | E,221,A,9,ENST00000537391,17793439,G 110 | A,508,G,12,ENST00000228437,108136084,C 111 | N,87,A,10,ENST00000373910,60124591,G 112 | -------------------------------------------------------------------------------- /tests/data/different-samples.1.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.1 2 | 
##reference=file:///projects/ngs/resources/gatk/2.3/ucsc.hg19.fasta 3 | ##INFO= 4 | ##INFO= 5 | ##FORMAT= 6 | ##contig= 7 | ##contig= 8 | ##contig= 9 | ##contig= 10 | ##contig= 11 | ##contig= 12 | ##contig= 13 | ##contig= 14 | ##contig= 15 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT metastasis 16 | chr1 53513530 . A C . . GE=SCP2;EG=6342 GT 0/1 17 | chr17 36731197 . C AAT . . GE=SRCIN1;EG=80725 GT 0/1 18 | -------------------------------------------------------------------------------- /tests/data/different-samples.2.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.1 2 | ##reference=file:///projects/ngs/resources/gatk/2.3/ucsc.hg19.fasta 3 | ##INFO= 4 | ##INFO= 5 | ##FORMAT= 6 | ##contig= 7 | ##contig= 8 | ##contig= 9 | ##contig= 10 | ##contig= 11 | ##contig= 12 | ##contig= 13 | ##contig= 14 | ##contig= 15 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT normal 16 | chr2 5 . A C . . GE=SCP2;EG=6342 GT 0/1 17 | chr7 18 . C AAT . . GE=SRCIN1;EG=80725 GT 0/1 18 | -------------------------------------------------------------------------------- /tests/data/duplicate-id.1.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.2 2 | ##reference=file:///projects/ngs/resources/gatk/2.3/ucsc.hg19.fasta 3 | #CHROM POS ID REF ALT QUAL FILTER INFO 4 | chr1 13281 1 C G . PASS VT=SNP;SOMATIC 5 | -------------------------------------------------------------------------------- /tests/data/duplicate-id.2.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.2 2 | ##reference=file:///projects/ngs/resources/gatk/2.3/ucsc.hg19.fasta 3 | #CHROM POS ID REF ALT QUAL FILTER INFO 4 | chr1 13281 1 C G,T . 
PASS VT=SNP;SOMATIC 5 | -------------------------------------------------------------------------------- /tests/data/duplicates.maf: -------------------------------------------------------------------------------- 1 | Hugo_Symbol Entrez_Gene_Id Center NCBI_Build Chromosome Start_position End_position Strand Variant_Classification Variant_Type Reference_Allele Tumor_Seq_Allele1 Tumor_Seq_Allele2 dbSNP_RS dbSNP_Val_Status Tumor_Sample_Barcode Matched_Norm_Sample_Barcode Match_Norm_Seq_Allele1 Match_Norm_Seq_Allele2 2 | A1CF 29974 . 37 10 52573692 52573692 + Missense_Mutation SNP G G T . . . . . . 3 | A1CF 29974 . 37 10 52573692 52573692 + Missense_Mutation SNP G G T . . . . . . 4 | A1CF 29974 . 37 10 52573692 52573692 + Missense_Mutation SNP G G T . . . . . . -------------------------------------------------------------------------------- /tests/data/duplicates.vcf: -------------------------------------------------------------------------------- 1 | #CHROM POS ID REF ALT QUAL FILTER INFO 2 | chr17 7675088 . G A 0 PASS . 3 | chr17 7675088 . G A 0 PASS . 4 | chr17 7675088 . G A 0 PASS . 
-------------------------------------------------------------------------------- /tests/data/multiallelic.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.1 2 | ##reference=file:///projects/ngs/resources/gatk/2.3/ucsc.hg19.fasta 3 | ##INFO= 4 | ##INFO= 5 | ##FORMAT= 6 | ##contig= 7 | ##contig= 8 | ##contig= 9 | ##contig= 10 | ##contig= 11 | ##contig= 12 | ##contig= 13 | ##contig= 14 | ##contig= 15 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT metastasis 16 | chr1 1431105 rs199599542 A C,G 593.69 PASS DP=17;GE=Wuzzle GT 0/1 17 | -------------------------------------------------------------------------------- /tests/data/ov.wustle.subset5.maf: -------------------------------------------------------------------------------- 1 | #version 2.4 2 | Hugo_Symbol Entrez_Gene_Id Center NCBI_Build Chromosome Start_position End_position Strand Variant_Classification Variant_Type Reference_Allele Tumor_Seq_Allele1 Tumor_Seq_Allele2 dbSNP_RS dbSNP_Val_Status Tumor_Sample_Barcode Matched_Norm_Sample_Barcode Match_Norm_Seq_Allele1 Match_Norm_Seq_Allele2 Tumor_Validation_Allele1 Tumor_Validation_Allele2 Match_Norm_Validation_Allele1 Match_Norm_Validation_Allele2 Verification_Status Validation_Status Mutation_Status Sequencing_Phase Sequence_Source Validation_Method Score BAM_file Sequencer Tumor_Sample_UUID Matched_Norm_Sample_UUID chromosome_name start stop reference variant type gene_name transcript_name transcript_species transcript_source transcript_version strand transcript_status trv_type c_position amino_acid_change ucsc_cons domain all_domains deletion_substructures transcript_error 3 | AGL 178 genome.wustl.edu 37 1 100349684 100349684 + Missense_Mutation SNP G G A TCGA-13-1405-01A-01W-0494-09 TCGA-13-1405-10A-01W-0495-09 G G G A G G Unknown Valid Somatic 4 WXS 454_PCR_WGA 1 dbGAP Illumina GAIIx c0d1de72-4cce-4d74-93f0-29c462dc1426 89f04056-0478-4305-b1ce-486ae469b4dd 1 100122272 100122272 G A SNP AGL NM_000028 
human genbank 54_36p 1 validated missense c.2317 p.E773K 1 NULL superfamily_Six-hairpin glycosidases;HMMPfam_GDE_C;superfamily_(Trans)glycosidases - no_errors 4 | SASS6 163786 genome.wustl.edu 37 1 100573197 100573197 + Missense_Mutation SNP G G A TCGA-04-1542-01A-01W-0553-09 TCGA-04-1542-10A-01W-0553-09 G G G A G G Unknown Valid Somatic 4 WXS 454_PCR_WGA 1 dbGAP Illumina GAIIx 317a63af-e862-43df-8ef5-7c555b2cb678 b94052a8-c3d2-4e47-81e2-62242bc0841a 1 100345785 100345785 G A SNP SASS6 NM_194292 human genbank 54_36p -1 validated missense c.1133 p.A378V 1 NULL - - no_errors 5 | LRRC39 127495 genome.wustl.edu 37 1 100618068 100618068 + Silent SNP G G A TCGA-23-1022-01A-02W-0488-09 TCGA-23-1022-10A-01W-0488-09 G G G A G G Unknown Valid Somatic 4 WXS 454_PCR_WGA 1 dbGAP Illumina GAIIx 160a0e7d-315e-4de3-a7d4-928412fd909c 6bd506d5-4f1a-4f51-a71f-e453196b245a 1 100390656 100390656 G A SNP LRRC39 NM_144620 human genbank 54_36p -1 provisional silent c.825 p.F275 1 NULL - - no_errors 6 | UBE4B 10277 genome.wustl.edu 37 1 10238758 10238758 + Silent SNP G G C TCGA-13-0920-01A-01W-0421-09 TCGA-13-0920-10A-01W-0421-09 G G G C G G Unknown Valid Somatic 4 WXS 454_PCR_WGA 1 dbGAP Illumina GAIIx 2e28969b-c9a9-41ec-80bf-f583197b7f92 83a80d56-e463-4096-8c17-a44000f80f66 1 10161345 10161345 G C SNP UBE4B NM_001105562 human genbank 54_36p 1 reviewed silent c.3582 p.G1194 0.97 NULL - - no_errors 7 | COL11A1 1301 genome.wustl.edu 37 1 103491420 103491420 + Missense_Mutation SNP T T A TCGA-13-0893-01B-01W-0494-09 TCGA-13-0893-10A-01W-0494-09 T T T A T T Unknown Valid Somatic 4 WXS 454_PCR_WGA 1 dbGAP Illumina GAIIx a335ab49-84b7-4d3b-a03d-9c3931904ca5 23f57381-b679-41b8-8197-aed711f71db4 1 103264008 103264008 T A SNP COL11A1 NM_080629 human genbank 54_36p -1 reviewed missense c.869 p.E290V 1 NULL HMMPfam_COLFI;HMMPfam_Collagen;superfamily_Concanavalin A-like lectins/glucanases;HMMPfam_Laminin_G_2;superfamily_Fibrinogen C-terminal domain-like - no_errors 8 | 
-------------------------------------------------------------------------------- /tests/data/same-samples.1.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.1 2 | ##reference=file:///projects/ngs/resources/gatk/2.3/ucsc.hg19.fasta 3 | ##INFO= 4 | ##INFO= 5 | ##FORMAT= 6 | ##contig= 7 | ##contig= 8 | ##contig= 9 | ##contig= 10 | ##contig= 11 | ##contig= 12 | ##contig= 13 | ##contig= 14 | ##contig= 15 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT metastasis 16 | chr1 53513530 . A C . . GE=SCP2;EG=6342 GT 0/1 17 | chr17 36731197 . C AAT . . GE=SRCIN1;EG=80725 GT 0/1 18 | -------------------------------------------------------------------------------- /tests/data/same-samples.2.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.1 2 | ##reference=file:///projects/ngs/resources/gatk/2.3/ucsc.hg19.fasta 3 | ##INFO= 4 | ##INFO= 5 | ##FORMAT= 6 | ##contig= 7 | ##contig= 8 | ##contig= 9 | ##contig= 10 | ##contig= 11 | ##contig= 12 | ##contig= 13 | ##contig= 14 | ##contig= 15 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT metastasis 16 | chr1 53513530 . A C . . GE=SCP2;EG=6342 GT 0/1 17 | chr17 36731197 . C AAT . . GE=SRCIN1;EG=80725 GT 0/1 18 | -------------------------------------------------------------------------------- /tests/data/simple.1.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.2 2 | ##reference=file:///projects/ngs/resources/gatk/2.3/ucsc.hg19.fasta 3 | #CHROM POS ID REF ALT QUAL FILTER INFO 4 | chr1 13281 . C G . PASS VT=SNP;SOMATIC 5 | -------------------------------------------------------------------------------- /tests/data/simple.2.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.2 2 | ##reference=file:///projects/ngs/resources/gatk/2.3/ucsc.hg19.fasta 3 | #CHROM POS ID REF ALT QUAL FILTER INFO 4 | chr1 13289 . T C . 
PASS VT=SNP;SOMATIC 5 | chr2 13289 . A G . PASS VT=SNP;SOMATIC 6 | -------------------------------------------------------------------------------- /tests/data/somatic_hg19_14muts.space_in_sample_name.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.1 2 | ##reference=file:///projects/ngs/resources/gatk/2.3/ucsc.hg19.fasta 3 | ##INFO= 4 | ##INFO= 5 | ##FORMAT= 6 | ##contig= 7 | ##contig= 8 | ##contig= 9 | ##contig= 10 | ##contig= 11 | ##contig= 12 | ##contig= 13 | ##contig= 14 | ##contig= 15 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT metastasis foo 16 | chr1 53513530 . A C . . GE=SCP2;EG=6342 GT 0/1 17 | chr1 228295398 . G T . . GE=MRPL55;EG=128308 GT 0/1 18 | chr10 49658590 . T C . . GE=ARHGAP22;EG=58504 GT 0/1 19 | chr10 51585166 . G T . . GE=NCOA4;EG=8031 GT 0/1 20 | chr10 96709040 . A C . . GE=CYP2C9;EG=1559 GT 0/1 21 | chr10 119134281 . G T . . GE=PDZD8;EG=118987 GT 0/1 22 | chr11 118244286 . G G . . GE=UBE4A;EG=9354 GT 0/1 23 | chr12 14794076 . C A . . GE=GUCY2C;EG=2984 GT 0/1 24 | chr12 25398284 . C G . . GE=KRAS;EG=3845 GT 0/1 25 | chr12 42778752 . T A . . GE=PPHLN1;EG=51535 GT 0/1 26 | chr14 31144202 . A C . . GE=SCFD1;EG=23256 GT 0/1 27 | chr16 25704209 . G A . . GE=HS3ST4;EG=9951 GT 0/1 28 | chr17 7577548 . C CA . . GE=TP53;EG=7157 GT 0/1 29 | chr17 36731197 . C AAT . . GE=SRCIN1;EG=80725 GT 0/1 30 | -------------------------------------------------------------------------------- /tests/data/somatic_hg19_14muts.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.1 2 | ##reference=file:///projects/ngs/resources/gatk/2.3/ucsc.hg19.fasta 3 | ##INFO= 4 | ##INFO= 5 | ##FORMAT= 6 | ##contig= 7 | ##contig= 8 | ##contig= 9 | ##contig= 10 | ##contig= 11 | ##contig= 12 | ##contig= 13 | ##contig= 14 | ##contig= 15 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT metastasis 16 | chr1 53513530 . A C . . GE=SCP2;EG=6342 GT 0/1 17 | chr1 228295398 . 
G T . . GE=MRPL55;EG=128308 GT 0/1 18 | chr10 49658590 . T C . . GE=ARHGAP22;EG=58504 GT 0/1 19 | chr10 51585166 . G T . . GE=NCOA4;EG=8031 GT 0/1 20 | chr10 96709040 . A C . . GE=CYP2C9;EG=1559 GT 0/1 21 | chr10 119134281 . G T . . GE=PDZD8;EG=118987 GT 0/1 22 | chr11 118244286 . G G . . GE=UBE4A;EG=9354 GT 0/1 23 | chr12 14794076 . C A . . GE=GUCY2C;EG=2984 GT 0/1 24 | chr12 25398284 . C G . . GE=KRAS;EG=3845 GT 0/1 25 | chr12 42778752 . T A . . GE=PPHLN1;EG=51535 GT 0/1 26 | chr14 31144202 . A C . . GE=SCFD1;EG=23256 GT 0/1 27 | chr16 25704209 . G A . . GE=HS3ST4;EG=9951 GT 0/1 28 | chr17 7577548 . C CA . . GE=TP53;EG=7157 GT 0/1 29 | chr17 36731197 . C AAT . . GE=SRCIN1;EG=80725 GT 0/1 30 | -------------------------------------------------------------------------------- /tests/data/somatic_hg19_14muts.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/varcode/2c1c96e0564d7ad5f66b26e33fc0a027353640f4/tests/data/somatic_hg19_14muts.vcf.gz -------------------------------------------------------------------------------- /tests/data/tcga_ov.head.maf: -------------------------------------------------------------------------------- 1 | Hugo_Symbol Entrez_Gene_Id Center NCBI_Build Chromosome Start_position End_position Strand Variant_Classification Variant_Type Reference_Allele Tumor_Seq_Allele1 Tumor_Seq_Allele2 dbSNP_RS dbSNP_Val_Status Tumor_Sample_Barcode Matched_Norm_Sample_Barcode Match_Norm_Seq_Allele1 Match_Norm_Seq_Allele2 Tumor_Validation_Allele1 Tumor_Validation_Allele2 Match_Norm_Validation_Allele1 Match_Norm_Validation_Allele2 Verification_Status Validation_Status Mutation_Status Sequencing_Phase Sequence_Source Validation_Method Score BAM_file Sequencer Tumor_Sample_UUID Matched_Norm_Sample_UUID chromosome_name start stop reference variant type gene_name transcript_name transcript_species transcript_source transcript_version strand transcript_status trv_type c_position 
amino_acid_change ucsc_cons domain all_domains deletion_substructures transcript_error 2 | CDK11A 0 - 37 1 1650797 1650797 + Missense_Mutation SNP A A G TCGA-04-1337-01A-01W-0484-10 TCGA-04-1337-11A-01W-0485-10 A A Unknown Unknown Somatic Phase_IV Capture 1 dbGAP - 1 1650797 1650797 A G SNP CDK11A ENST00000404249 human ensembl 69_37n -1 known missense c.325 p.C109R 0.971 NULL pfam_Prot_kinase_cat_dom,pfam_Ser-Thr/Tyr_kinase_cat_dom,superfamily_Kinase-like_dom,smart_Ser/Thr_dual-sp_kinase_dom,smart_Tyr_kinase_cat_dom,pfscan_Prot_kinase_cat_dom - no_errors 3 | GNPAT 0 - 37 1 231401797 231401797 + Missense_Mutation SNP A A C TCGA-04-1337-01A-01W-0484-10 TCGA-04-1337-11A-01W-0485-10 A A Unknown Unknown Somatic Phase_IV Capture 1 dbGAP - 1 231401797 231401797 A C SNP GNPAT ENST00000366647 human ensembl 69_37n +1 known missense c.810 p.R270S 0.997 pfam_Acyltransferase,smart_Acyltransferase pfam_Acyltransferase,smart_Acyltransferase - no_errors 4 | E2F2 0 - 37 1 23836447 23836447 + Silent SNP C C A TCGA-04-1337-01A-01W-0484-10 TCGA-04-1337-11A-01W-0485-10 C C Unknown Unknown Somatic Phase_IV Capture 1 dbGAP - 1 23836447 23836447 C A SNP E2F2 ENST00000361729 human ensembl 69_37n -1 known silent c.1239 p.L413 0.999 NULL pfam_E2F_TDP - no_errors 5 | VSIG2 0 - 37 11 124617502 124617502 + Missense_Mutation SNP C C G TCGA-04-1337-01A-01W-0484-10 TCGA-04-1337-11A-01W-0485-10 C C Unknown Unknown Somatic Phase_IV Capture 1 dbGAP - 11 124617502 124617502 C G SNP VSIG2 ENST00000326621 human ensembl 69_37n -1 known missense c.913 p.G305R 0.813 NULL pfam_Ig_V-set,pfam_Ig_I-set,pfam_Immunoglobulin,smart_Ig_sub,smart_Ig_sub2,smart_Ig_V-set_subgr,pfscan_Ig-like - no_errors 6 | -------------------------------------------------------------------------------- /tests/data/tcga_ov.head.xychr.maf: -------------------------------------------------------------------------------- 1 | Hugo_Symbol Entrez_Gene_Id Center NCBI_Build Chromosome Start_position End_position Strand Variant_Classification 
Variant_Type Reference_Allele Tumor_Seq_Allele1 Tumor_Seq_Allele2 dbSNP_RS dbSNP_Val_Status Tumor_Sample_Barcode Matched_Norm_Sample_Barcode Match_Norm_Seq_Allele1 Match_Norm_Seq_Allele2 Tumor_Validation_Allele1 Tumor_Validation_Allele2 Match_Norm_Validation_Allele1 Match_Norm_Validation_Allele2 Verification_Status Validation_Status Mutation_Status Sequencing_Phase Sequence_Source Validation_Method Score BAM_file Sequencer Tumor_Sample_UUID Matched_Norm_Sample_UUID chromosome_name start stop reference variant type gene_name transcript_name transcript_species transcript_source transcript_version strand transcript_status trv_type c_position amino_acid_change ucsc_cons domain all_domains deletion_substructures transcript_error 2 | CDK11A 0 - 37 X 1650797 1650797 + Missense_Mutation SNP A A G TCGA-04-1337-01A-01W-0484-10 TCGA-04-1337-11A-01W-0485-10 A A Unknown Unknown Somatic Phase_IV Capture 1 dbGAP - 1 1650797 1650797 A G SNP CDK11A ENST00000404249 human ensembl 69_37n -1 known missense c.325 p.C109R 0.971 NULL pfam_Prot_kinase_cat_dom,pfam_Ser-Thr/Tyr_kinase_cat_dom,superfamily_Kinase-like_dom,smart_Ser/Thr_dual-sp_kinase_dom,smart_Tyr_kinase_cat_dom,pfscan_Prot_kinase_cat_dom - no_errors 3 | GNPAT 0 - 37 Y 231401797 231401797 + Missense_Mutation SNP A A C TCGA-04-1337-01A-01W-0484-10 TCGA-04-1337-11A-01W-0485-10 A A Unknown Unknown Somatic Phase_IV Capture 1 dbGAP - 1 231401797 231401797 A C SNP GNPAT ENST00000366647 human ensembl 69_37n +1 known missense c.810 p.R270S 0.997 pfam_Acyltransferase,smart_Acyltransferase pfam_Acyltransferase,smart_Acyltransferase - no_errors 4 | E2F2 0 - 37 1 23836447 23836447 + Silent SNP C C A TCGA-04-1337-01A-01W-0484-10 TCGA-04-1337-11A-01W-0485-10 C C Unknown Unknown Somatic Phase_IV Capture 1 dbGAP - 1 23836447 23836447 C A SNP E2F2 ENST00000361729 human ensembl 69_37n -1 known silent c.1239 p.L413 0.999 NULL pfam_E2F_TDP - no_errors 5 | VSIG2 0 - 37 11 124617502 124617502 + Missense_Mutation SNP C C G 
TCGA-04-1337-01A-01W-0484-10 TCGA-04-1337-11A-01W-0485-10 C C Unknown Unknown Somatic Phase_IV Capture 1 dbGAP - 11 124617502 124617502 C G SNP VSIG2 ENST00000326621 human ensembl 69_37n -1 known missense c.913 p.G305R 0.813 NULL pfam_Ig_V-set,pfam_Ig_I-set,pfam_Immunoglobulin,smart_Ig_sub,smart_Ig_sub2,smart_Ig_V-set_subgr,pfscan_Ig-like - no_errors 6 | -------------------------------------------------------------------------------- /tests/test_cli_effects.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 
from tempfile import NamedTemporaryFile
import pandas as pd

from varcode.cli.effects_script import main as run_script
from varcode import Variant

from .common import eq_


def test_varcode_effects_script_kras_g12d_top_effect():
    """
    Run the effects command-line script on a single KRAS variant
    (GRCh37 12:25398284 C>T) with `--only-coding --one-per-variant` and
    check that the CSV it writes holds exactly one row, whose gene name is
    KRAS and whose effect is p.G12D.
    """
    kras_g12d_variant = Variant(
        12,
        25398284,
        "C",
        "T",
        "GRCh37")
    # the variant is passed to the script as four positional strings
    # following the --variant flag
    commandline_args = [
        "--genome", "grch37",
        "--only-coding",
        "--one-per-variant",
        "--variant",
        str(kras_g12d_variant.contig),
        str(kras_g12d_variant.start),
        str(kras_g12d_variant.original_ref),
        str(kras_g12d_variant.original_alt),
    ]
    with NamedTemporaryFile(mode="r+", delete=True) as f:
        # the script is handed the temporary file's path, so the result is
        # read back by path while the file still exists; nothing is written
        # through `f` itself
        commandline_args.extend(["--output-csv", f.name])
        run_script(commandline_args)
        df = pd.read_csv(f.name)
    eq_(len(df), 1)
    # positional access for both checks (the original mixed .loc and .iloc)
    top_row = df.iloc[0]
    eq_(top_row.gene_name, "KRAS")
    eq_(top_row.effect, "p.G12D")


# --------------------------------------------------------------------------------
# /tests/test_cli_genes.py:
# --------------------------------------------------------------------------------
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from varcode.cli.genes_script import main as run_script
from .data import ov_wustle_variants, db_snp_variants

from tempfile import NamedTemporaryFile
import pandas as pd


def test_varcode_effects_script():
    """
    Run the genes command-line script on the ovarian cancer MAF together
    with the dbSNP variants (each passed through its own --variant flag) and
    check that the output CSV has as many rows as there were input variants.
    """
    args = ["--genome", "grch37", "--maf", ov_wustle_variants.path]
    for snp in db_snp_variants:
        # each variant is four positional strings after the flag
        args += [
            "--variant",
            str(snp.contig),
            str(snp.start),
            str(snp.original_ref),
            str(snp.original_alt),
        ]
    expected_row_count = len(ov_wustle_variants) + len(db_snp_variants)
    with NamedTemporaryFile(mode="r+", delete=True) as f:
        args += ["--output-csv", f.name]
        run_script(args)
        f.flush()
        combined_variants = pd.read_csv(f.name)
        assert len(combined_variants) == expected_row_count


# --------------------------------------------------------------------------------
# /tests/test_collection_filtering.py:
# --------------------------------------------------------------------------------
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from varcode import VariantCollection
from .common import eq_
from .data import (
    snp_rs4244285,
    snp_rs1537415
)

# The two dbSNP variants that every test in this module filters.
variants = VariantCollection([
    # gene ids: ['ENSG00000165841', 'ENSG00000276490']
    # transcript_ids : ['ENST00000371321', 'ENST00000464755']
    snp_rs4244285,
    # gene ids: ['ENSG00000204007']
    # transcript ids: ['ENST00000371763', 'ENST00000613244']
    snp_rs1537415,
])

# Expression values keyed by Ensembl gene ID. Every value lies strictly
# between the two thresholds used below (0.0 and 100.0).
gene_fpkm_dict = {
    "ENSG00000165841": 10.0,
    "ENSG00000204007": 20.0,
    "ENSG00000276490": 30.0,
}

# Expression values keyed by Ensembl transcript ID, likewise all strictly
# between 0.0 and 100.0.
transcript_fpkm_dict = {
    "ENST00000371321": 10.0,
    "ENST00000464755": 20.0,
    "ENST00000371763": 30.0,
    "ENST00000613244": 40.0,
    "ENST00000645461": 5.0,
}

effects = variants.effects()

# Empty collections used as the expected result of filtering everything out.
empty_variants = VariantCollection([])
empty_effects = empty_variants.effects()


def test_filter_variants():
    """An always-true predicate keeps every variant; always-false keeps none."""
    eq_(variants.filter(lambda _: True), variants)
    eq_(variants.filter(lambda _: False), empty_variants)


def test_filter_effects():
    """An always-true predicate keeps every effect; always-false keeps none."""
    eq_(effects.filter(lambda _: True), effects)
    eq_(effects.filter(lambda _: False), empty_effects)


def test_filter_variants_by_gene_expression():
    """Expect all variants at gene threshold 0.0 and none at 100.0."""
    eq_(variants.filter_by_gene_expression(
        gene_fpkm_dict, 0.0), variants)
    eq_(variants.filter_by_gene_expression(
        gene_fpkm_dict, 100.0), empty_variants)


def test_filter_effects_by_gene_expression():
    """Expect all effects at gene threshold 0.0 and none at 100.0."""
    eq_(effects.filter_by_gene_expression(
        gene_fpkm_dict, 0.0), effects)
    eq_(effects.filter_by_gene_expression(
        gene_fpkm_dict, 100.0), empty_effects)


def test_filter_variants_by_transcript_expression():
    """Expect all variants at transcript threshold 0.0 and none at 100.0."""
    # This test previously called filter_by_gene_expression with
    # gene_fpkm_dict, which only repeated the gene-level test above and left
    # the transcript-level variant filter untested.
    expect_all = variants.filter_by_transcript_expression(
        transcript_fpkm_dict, 0.0)
    eq_(expect_all, variants)
    expect_none = variants.filter_by_transcript_expression(
        transcript_fpkm_dict, 100.0)
    eq_(expect_none, empty_variants)


def test_filter_effects_by_transcript_expression():
    """Expect all effects at transcript threshold 0.0 and none at 100.0."""
    expect_all = effects.filter_by_transcript_expression(
        transcript_fpkm_dict, 0.0)
    eq_(expect_all, effects)
    expect_none = effects.filter_by_transcript_expression(
        transcript_fpkm_dict, 100.0)
    eq_(expect_none, empty_effects)


def test_filter_silent_effects():
    # all dbSNP entries in the collection are silent
    assert len(effects.drop_silent_and_noncoding()) == 0


# --------------------------------------------------------------------------------
# /tests/test_common.py:
# --------------------------------------------------------------------------------
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import varcode
from .common import eq_


def test_memoize():
    """
    A plain method runs on every call, while the wrapper returned by
    `varcode.common.memoize` is expected to run the wrapped method only once
    across repeated zero-argument calls.
    """
    class Counter(object):
        def __init__(self):
            self.x = 0

        def incr(self):
            self.x += 1

    # baseline: two direct calls increment twice
    direct = Counter()
    for _ in range(2):
        direct.incr()
    eq_(direct.x, 2)

    # memoized: two calls through the wrapper should increment only once
    wrapped = Counter()
    incr_once = varcode.common.memoize(wrapped.incr)
    for _ in range(2):
        incr_once()
    eq_(wrapped.x, 1)


def test_groupby_field():
    """
    `varcode.common.groupby_field` is expected to group records by the named
    attribute, keeping the input order within each group.
    """
    class Record(object):
        def __init__(self, x, y):
            self.x = x
            self.y = y

        def __eq__(self, other):
            return self.x == other.x and self.y == other.y

        def __str__(self):
            return "Record(%s, %s)" % (self.x, self.y)

        def __repr__(self):
            return str(self)

    first = Record(1, 2)
    second = Record(10, 20)
    third = Record(1, 3)
    grouped = varcode.common.groupby_field([first, second, third], 'x')
    eq_(tuple(sorted(grouped)), (1, 10))
    eq_(grouped[1], [first, third])
    eq_(grouped[10], [second])


# --------------------------------------------------------------------------------
# /tests/test_cosmic_mutations.py:
# --------------------------------------------------------------------------------
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
12 | 13 | from pyensembl import ensembl_grch37 as ensembl 14 | from varcode import Variant 15 | from varcode.effects import ( 16 | Substitution, 17 | Deletion, 18 | Insertion, 19 | FrameShift, 20 | Silent, 21 | ExonicSpliceSite, 22 | ) 23 | 24 | def _get_effect(chrom, pos, dna_ref, dna_alt, transcript_id): 25 | variant = Variant(chrom, pos, dna_ref, dna_alt, ensembl=ensembl) 26 | effects = variant.effects() 27 | transcript_dict = effects.top_priority_effect_per_transcript_id() 28 | assert transcript_id in transcript_dict, \ 29 | "Expected transcript ID %s for variant %s not found in %s" % ( 30 | transcript_id, variant, transcript_dict) 31 | effect = transcript_dict[transcript_id] 32 | 33 | # COSMIC seems to ignore exonic splice sites 34 | if isinstance(effect, ExonicSpliceSite): 35 | return effect.alternate_effect 36 | else: 37 | return effect 38 | 39 | def _substitution(chrom, pos, dna_ref, dna_alt, transcript_id, aa_ref, aa_alt): 40 | effect = _get_effect(chrom, pos, dna_ref, dna_alt, transcript_id) 41 | assert isinstance(effect, Substitution), \ 42 | "Expected effect to be substitution, got %s" % (effect,) 43 | 44 | assert effect.aa_ref == aa_ref, \ 45 | "Expected aa_ref='%s' : %s but got %s : %s from %s" % ( 46 | aa_ref, type(aa_ref), 47 | effect.aa_ref, type(effect.aa_ref), 48 | effect) 49 | assert effect.aa_alt == aa_alt, \ 50 | "Expected aa_alt='%s' but got %s" % ( 51 | aa_alt, effect) 52 | 53 | def _silent(chrom, pos, dna_ref, dna_alt, transcript_id, aa_ref): 54 | effect = _get_effect(chrom, pos, dna_ref, dna_alt, transcript_id) 55 | assert isinstance(effect, Silent), \ 56 | "Expected effect to be silent, got %s" % (effect,) 57 | assert effect.aa_ref == aa_ref, "Expected aa_ref='%s', got '%s'" % ( 58 | aa_ref, effect.aa_ref) 59 | 60 | def _deletion(chrom, pos, dna_ref, dna_alt, transcript_id, deleted): 61 | effect = _get_effect(chrom, pos, dna_ref, dna_alt, transcript_id) 62 | assert isinstance(effect, Deletion), \ 63 | "Expected deletion, got %s" % 
(effect,) 64 | assert effect.aa_ref == deleted, \ 65 | "Expected deletion of '%s' but got deletion of '%s' for %s:%d%s>%s" % ( 66 | deleted, effect.aa_ref, chrom, pos, dna_ref, dna_alt) 67 | 68 | def _insertion(chrom, pos, dna_ref, dna_alt, transcript_id, inserted): 69 | effect = _get_effect(chrom, pos, dna_ref, dna_alt, transcript_id) 70 | assert isinstance(effect, Insertion), \ 71 | "Expected insertion, got %s" % (effect,) 72 | assert effect.aa_alt == inserted, \ 73 | "Expected insertion of '%s' but got %s for %s:%d%s>%s" % ( 74 | inserted, 75 | effect.short_description(), 76 | chrom, 77 | pos, 78 | dna_ref, 79 | dna_alt) 80 | 81 | def _frameshift( 82 | chrom, 83 | pos, 84 | dna_ref, 85 | dna_alt, 86 | transcript_id, 87 | aa_pos, 88 | aa_ref): 89 | effect = _get_effect(chrom, pos, dna_ref, dna_alt, transcript_id) 90 | assert isinstance(effect, FrameShift), \ 91 | "Expected frameshift, got %s" % (effect,) 92 | effect_aa_pos = effect.aa_mutation_start_offset 93 | assert effect.aa_ref[0] == aa_ref and effect_aa_pos + 1 == aa_pos, \ 94 | ("Expected frameshift to replace p.%d%s but instead got %s" % ( 95 | aa_pos, aa_ref, effect)) 96 | 97 | def test_COSM3939556_silent(): 98 | # 22 19222059 COSM3939556 G>T 99 | # GENE=CLTCL1_ENST00000427926 100 | # STRAND=- 101 | # CDS=c.1140C>A 102 | # AA=p.A380A 103 | _silent("22", 19222059, "G", "T", "ENST00000427926", "A") 104 | 105 | def test_COSM3747785_NBPF10_Q363L(): 106 | # 1 145311839 COSM3747785 A>T 107 | # GENE=NBPF10_ENST00000369338 108 | # STRAND=+ 109 | # CDS=c.1088A>T 110 | # AA=p.Q363L 111 | _substitution("1", 145311839, "A", "T", "ENST00000369338", "Q", "L") 112 | 113 | def test_COSM3368867_SMUG1_Q133L(): 114 | # 12 54576295 COSM3368867 T>A 115 | # GENE=SMUG1_ENST00000513838 116 | # STRAND=- 117 | # CDS=c.398A>T 118 | # AA=p.Q133L 119 | _substitution("12", 54576295, "T", "A", "ENST00000513838", "Q", "L") 120 | 121 | def test_COSM3508871_FBRS_K224N(): 122 | # 16 30676364 COSM3508871 A>T 123 | # 
GENE=FBRS_ENST00000356166 124 | # STRAND=+ 125 | # CDS=c.1572A>T 126 | # AA=p.K524N 127 | _substitution("16", 30676364, "A", "T", "ENST00000356166", "K", "N") 128 | 129 | def test_COSM1616161_L1724R(): 130 | # 21 46932218 COSM1616161 T>G 131 | # GENE=COL18A1_ENST00000359759 132 | # STRAND=+ 133 | # CDS=c.5171T>G 134 | # AA=p.L1724R 135 | _substitution("21", 46932218, "T", "G", "ENST00000359759", "L", "R") 136 | 137 | def test_COSM1651074_IL9R_D148Y(): 138 | # X 155234091 COSM1651074 TGG>TCT 139 | # GENE=IL9R_ENST00000244174 140 | # STRAND=+ 141 | # CDS=c.441_442GG>CT 142 | # AA=p.D148Y 143 | _substitution("X", 155234091, "TGG", "TCT", "ENST00000244174", "D", "Y") 144 | 145 | def test_COSM3682816_RBMY1D_V193A(): 146 | # Y 24030663 COSM3682816 A>G 147 | # GENE=RBMY1D_ENST00000382680 148 | # STRAND=- 149 | # CDS=c.578T>C 150 | # AA=p.V193A 151 | _substitution("Y", 24030663, "A", "G", "ENST00000382680", "V", "A") 152 | 153 | def test_COSM1333672_BCL9_Q1150delQ(): 154 | """ 155 | test_COSM1333672_BCL9_Q1150delQ : in-frame deletion of 3 nucleotides 156 | """ 157 | # 1 147095923 COSM1333672 ACAG> A 158 | # GENE=BCL9_ENST00000234739 159 | # STRAND=+ 160 | # CDS=c.3445_3447delCAG 161 | # AA=p.Q1150delQ 162 | _deletion("1", 147095923, "ACAG", "A", "ENST00000234739", "Q") 163 | 164 | def test_COSM1190996_FBX011_P57insQQQ(): 165 | """ 166 | test_COSM1190996_FBX011_P57insQQQ : in-frame insertion of 9 nucleotides 167 | """ 168 | # 2 48132713 COSM1190996 C>CTGCTGCTGC 169 | # GENE=FBXO11_ENST00000403359 170 | # STRAND=- 171 | # CDS=c.146_147insGCAGCAGCA 172 | # AA=p.Q56_P57insQQQ;CNT=1 173 | _insertion("2", 48132713, "C", "CTGCTGCTGC", "ENST00000403359", "QQQ") 174 | 175 | def test_COSM1732848_CCDC109B_F264fs(): 176 | """ 177 | test_COSM1732848_CCDC109B_F264fs : frame shift from nucleotide deletion 178 | """ 179 | # 4 110605772 COSM1732848 CT>C 180 | # GENE=CCDC109B_ENST00000394650 181 | # STRAND=+ 182 | # CDS=c.787delT 183 | # AA=p.F264fs*5 184 | _frameshift( 185 | "4", 
110605772, "CT", "C", "ENST00000394650", 186 | aa_pos=264, 187 | aa_ref="F") 188 | 189 | def test_COSM87531_SYNE1_E4738fs(): 190 | """ 191 | test_COSM87531_SYNE1_E4738fs : frame shift from nucleotide insertion 192 | """ 193 | # The given genomic mutation is: 194 | # 6 152651608 COSM87531 C>CA 195 | # but through some painful manual checking I realized that 196 | # the nucleotides here are *not* the correct ones for the 197 | # forward strand (SYNE1 is on the negative strand) and instead 198 | # it should be: 199 | # 6 152651608 COSM87531 C>CT 200 | # GENE=SYNE1_ENST00000265368 201 | # STRAND=- 202 | # CDS=c.14211_14212insA 203 | # AA=p.E4738fs*34 204 | _frameshift( 205 | "6", 152651608, "C", "GT", "ENST00000265368", 206 | aa_pos=4738, 207 | aa_ref="E") 208 | 209 | def test_COSM27279_CTNNB1_Q4H(): 210 | """ 211 | test_COSM27279_CTNNB1_Q4H : Apply Cosmic mutation COSM27279 212 | transcript = 'ENST00000405570' 213 | pos: 41265571, 214 | ref : A, alt : T 215 | amino acids = Q -> H @ pos 4 (mutation = Q4H) 216 | """ 217 | _substitution("3", 41265571, "A", "T", "ENST00000405570", "Q", "H") 218 | -------------------------------------------------------------------------------- /tests/test_dbnsfp_validation.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 
12 | 13 | import pytest 14 | from pyensembl import ensembl_grch37 15 | from varcode import Variant 16 | from varcode.effects import ( 17 | ExonicSpliceSite, 18 | Substitution, 19 | TranscriptMutationEffect 20 | ) 21 | import pandas as pd 22 | 23 | from .data import data_path 24 | 25 | 26 | def generate_dbnsfp_validation_set(): 27 | # check that amino acid substitution gives 28 | # same answer as subset of dbNSFP entries (using Ensembl 75) 29 | 30 | # columns for validation dataset: 31 | # - aa_pos : base-1 position within protein 32 | # - dna_alt : non-reference DNA nucleotide 33 | # - chrom : choromosome 34 | # - ensembl_transcript : transcript ID 35 | # - dna_position : base-1 position within chromosome 36 | # - dna_ref : reference DNA nucleotide 37 | 38 | # pylint: disable=no-member 39 | # pylint gets confused by read_csv 40 | validation_set = pd.read_csv(data_path('dbnsfp_validation_set.csv')) 41 | for _, row in validation_set.iterrows(): 42 | args = ( 43 | row['ensembl_transcript'], 44 | row['chrom'], 45 | row['dna_position'], 46 | row['dna_ref'], 47 | row['dna_alt'], 48 | row['aa_pos'], 49 | row['aa_alt'] 50 | ) 51 | # making this a generator so every row shows up as its 52 | # owns test in nose 53 | yield args 54 | 55 | 56 | 57 | @pytest.mark.parametrize([ 58 | 'ensembl_transcript_id', 59 | 'chrom', 60 | 'dna_position', 61 | 'dna_ref', 62 | 'dna_alt', 63 | 'aa_pos', 64 | 'aa_alt'], generate_dbnsfp_validation_set()) 65 | def test_dbnsfp_validation_set_transcript_mutation( 66 | ensembl_transcript_id, 67 | chrom, 68 | dna_position, 69 | dna_ref, 70 | dna_alt, 71 | aa_pos, 72 | aa_alt): 73 | variant = Variant(chrom, dna_position, dna_ref, dna_alt, ensembl_grch37) 74 | effects = variant.effects() 75 | transcript_id_dict = { 76 | effect.transcript.id: effect 77 | for effect in effects 78 | if isinstance(effect, TranscriptMutationEffect) 79 | } 80 | assert ensembl_transcript_id in transcript_id_dict, \ 81 | "%s not found in %s" % (ensembl_transcript_id, 
transcript_id_dict) 82 | effect = transcript_id_dict[ensembl_transcript_id] 83 | 84 | if isinstance(effect, ExonicSpliceSite): 85 | # exonic splice site mutations carry with them an alternate effect 86 | # which is what we check against dbNSFP (since that database seemed 87 | # to ignore exonic splicing mutations) 88 | effect = effect.alternate_effect 89 | 90 | assert isinstance(effect, Substitution), \ 91 | "Expected substitution (aa_pos=%d, aa_alt=%s) but got %s" % ( 92 | aa_pos, aa_alt, effect) 93 | effect_aa_pos = effect.aa_mutation_start_offset 94 | effect_aa_alt = effect.mutant_protein_sequence[effect_aa_pos] 95 | assert ( 96 | effect_aa_pos + 1 == aa_pos and 97 | effect_aa_alt == aa_alt), \ 98 | "Mutant amino acid %s not found at %d for chr%s:%s %s>%s : %s" % ( 99 | aa_alt, 100 | aa_pos, 101 | chrom, 102 | dna_position, 103 | dna_ref, 104 | dna_alt, 105 | effect) 106 | -------------------------------------------------------------------------------- /tests/test_effect_collection.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 
12 | 13 | """ 14 | Test properties of EffectCollection 15 | """ 16 | 17 | 18 | from varcode.effects import IncompleteTranscript, Substitution 19 | from .common import eq_ 20 | from .data import tcga_ov_variants, ov_wustle_variants 21 | tcga_ov_effects = tcga_ov_variants.effects() 22 | ov_wustle_effects = ov_wustle_variants.effects() 23 | 24 | def test_to_dataframe(): 25 | df = tcga_ov_effects.to_dataframe() 26 | eq_(len(tcga_ov_effects), len(df)) 27 | 28 | def test_effect_collection_gene_counts(): 29 | # test that each gene is counted just once 30 | for gene, count in ov_wustle_effects.gene_counts().items(): 31 | assert count > 1, \ 32 | "Expected more than 1 effect for %s (got %d)" % (gene, count) 33 | 34 | def test_effect_collection_groupby_gene(): 35 | genes = ov_wustle_effects.groupby_gene().keys() 36 | # make sure that the IDs attached to Gene objects are the same as IDs 37 | # of groupby_gene_id 38 | gene_ids = set(ov_wustle_effects.groupby_gene_id().keys()) 39 | eq_({gene.id for gene in genes}, gene_ids) 40 | 41 | def test_effect_collection_groupby_gene_id(): 42 | gene_ids = set(ov_wustle_effects.groupby_gene_id().keys()) 43 | eq_(gene_ids, { 44 | 'ENSG00000060718', 45 | 'ENSG00000156876', 46 | 'ENSG00000130939', 47 | 'ENSG00000122477', 48 | 'ENSG00000162688' 49 | }) 50 | 51 | def test_effect_collection_groupby_gene_name(): 52 | gene_names = set(ov_wustle_effects.groupby_gene_name().keys()) 53 | eq_(gene_names, {"AGL", "SASS6", "LRRC39", "UBE4B", "COL11A1"}) 54 | 55 | def test_effect_collection_groupby_variant(): 56 | variants = set(ov_wustle_effects.groupby_variant().keys()) 57 | # make sure that all the original variants are still present 58 | # in the group keys 59 | eq_(variants, set(ov_wustle_variants)) 60 | 61 | def test_effect_collection_filter_by_effect_priority(): 62 | # every effect should be at least the same priority as "incomplete" 63 | eq_( 64 | tcga_ov_effects, 65 | tcga_ov_effects.filter_by_effect_priority(IncompleteTranscript)) 66 | assert 
len(tcga_ov_effects) > len( 67 | tcga_ov_effects.filter_by_effect_priority(Substitution)) 68 | 69 | def test_effect_collection_drop_silent_and_noncoding(): 70 | # some of the predicted effects are non-coding so should get dropped 71 | assert len(tcga_ov_effects) > len(tcga_ov_effects.drop_silent_and_noncoding()) 72 | -------------------------------------------------------------------------------- /tests/test_effect_collection_serialization.py: -------------------------------------------------------------------------------- 1 | 2 | # Licensed under the Apache License, Version 2.0 (the "License"); 3 | # you may not use this file except in compliance with the License. 4 | # You may obtain a copy of the License at 5 | # 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | # 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 
13 | 14 | import pickle 15 | 16 | from varcode import EffectCollection 17 | 18 | from .common import eq_ 19 | from .data import tcga_ov_variants, ov_wustle_variants 20 | 21 | tcga_ov_effects = tcga_ov_variants.effects() 22 | ov_wustle_effects = ov_wustle_variants.effects() 23 | 24 | def test_tcga_effect_collection_to_dict(): 25 | eq_( 26 | tcga_ov_effects, 27 | EffectCollection.from_dict(tcga_ov_effects.to_dict())) 28 | 29 | def test_wustle_effect_collection_to_dict(): 30 | eq_( 31 | ov_wustle_effects, 32 | EffectCollection.from_dict(ov_wustle_effects.to_dict())) 33 | 34 | def test_tcga_effect_collection_to_json(): 35 | eq_(tcga_ov_effects, EffectCollection.from_json(tcga_ov_effects.to_json())) 36 | 37 | def test_wustle_effect_collection_to_json(): 38 | eq_( 39 | ov_wustle_effects, 40 | EffectCollection.from_json(ov_wustle_effects.to_json())) 41 | 42 | def test_tcga_effect_collection_pickling(): 43 | reconstructed = pickle.loads(pickle.dumps(tcga_ov_effects)) 44 | eq_(tcga_ov_effects, reconstructed) 45 | 46 | def test_wustle_effect_collection_pickling(): 47 | reconstructed = pickle.loads(pickle.dumps(ov_wustle_effects)) 48 | eq_(ov_wustle_effects, reconstructed) -------------------------------------------------------------------------------- /tests/test_effects_from_mutagenix_variants.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 
12 | 13 | """ 14 | List of variants copied from: 15 | https://mutagenetix.utsouthwestern.edu 16 | /incidental/incidental_rec.cfm? 17 | mid=&so=rb&ac=1&r0=0&nr=100&rn=29&rl=1&scd=IGL01779&mid=153891 18 | """ 19 | 20 | from varcode import Variant 21 | from varcode.effects import Substitution 22 | 23 | from .common import expect_effect 24 | 25 | def test_substitution_Akt1_chr12_112657169_C_T_G286R(): 26 | expect_effect( 27 | variant=Variant("chr12", 112657169, "C", "T", "mm10"), 28 | effect_class=Substitution, 29 | aa_mutation_start_offset=285, 30 | aa_ref="G", 31 | aa_alt="R") 32 | 33 | def test_substitution_Apof_chr10_128269477_A_G_I167V(): 34 | expect_effect( 35 | variant=Variant("chr10", 128269477, "A", "G", "mm10"), 36 | effect_class=Substitution, 37 | aa_mutation_start_offset=166, 38 | aa_ref="I", 39 | aa_alt="V") 40 | 41 | def test_substitution_Csmd3_chr15_47857894_A_T_V1551D(): 42 | expect_effect( 43 | variant=Variant("chr15", 47857894, "A", "T", "mm10"), 44 | effect_class=Substitution, 45 | aa_mutation_start_offset=1550, 46 | aa_ref="V", 47 | aa_alt="D") 48 | 49 | def test_substitution_Pprc1_chr19_46062202_T_A_I130N(): 50 | expect_effect( 51 | variant=Variant("chr19", 46062202, "T", "A", "mm10"), 52 | effect_class=Substitution, 53 | aa_mutation_start_offset=129, 54 | aa_ref="I", 55 | aa_alt="N") 56 | 57 | def test_substitution_Vipr1_chr9_121664630_T_C_F249S(): 58 | expect_effect( 59 | variant=Variant("chr9", 121664630, "T", "C", "mm10"), 60 | effect_class=Substitution, 61 | aa_mutation_start_offset=248, 62 | aa_ref="F", 63 | aa_alt="S") 64 | -------------------------------------------------------------------------------- /tests/test_exonic_splice_site.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 
3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | from varcode import Variant 14 | from varcode.effects import ExonicSpliceSite, PrematureStop 15 | 16 | 17 | def test_STAT1_stop_gain_at_exon_boundary(): 18 | # top priority effect for this variant should be PrematureStop, 19 | # even though it's also ExonicSpliceSite 20 | stat1_variant = Variant("2", "191872291", "G", "A", "GRCh37") 21 | effects = stat1_variant.effects() 22 | print(effects) 23 | assert any([e.__class__ is ExonicSpliceSite for e in effects]) 24 | top_effect = effects.top_priority_effect() 25 | print(top_effect) 26 | assert top_effect.__class__ is PrematureStop 27 | -------------------------------------------------------------------------------- /tests/test_frameshift_helpers.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 
12 | 13 | from varcode.effects.effect_prediction_coding_frameshift import ( 14 | cdna_codon_sequence_after_insertion_frameshift, 15 | cdna_codon_sequence_after_deletion_or_substitution_frameshift, 16 | ) 17 | 18 | from .common import eq_ 19 | 20 | def test_cdna_codon_sequence_after_insertion_frameshift_before_codon(): 21 | # insertion: T_ATGCCCTAG 22 | i, s = cdna_codon_sequence_after_insertion_frameshift( 23 | sequence_from_start_codon="ATGCCCTAG", 24 | cds_offset_before_insertion=-1, 25 | inserted_nucleotides="T") 26 | eq_(i, 0) 27 | eq_(s, "TATGCCCTAG") 28 | 29 | def test_cdna_codon_sequence_after_insertion_frameshift_in_middle_of_codon(): 30 | # insertion: A_T_TGCCCTAG 31 | i, s = cdna_codon_sequence_after_insertion_frameshift( 32 | sequence_from_start_codon="ATGCCCTAG", 33 | cds_offset_before_insertion=0, 34 | inserted_nucleotides="T") 35 | eq_(i, 0) 36 | eq_(s, "ATTGCCCTAG") 37 | 38 | def test_cdna_codon_sequence_after_insertion_frameshift_at_end_of_codon(): 39 | # insertion: AT_T_GCCCTAG 40 | i, s = cdna_codon_sequence_after_insertion_frameshift( 41 | sequence_from_start_codon="ATGCCCTAG", 42 | cds_offset_before_insertion=1, 43 | inserted_nucleotides="T") 44 | eq_(i, 0) 45 | eq_(s, "ATTGCCCTAG") 46 | 47 | def test_cdna_codon_sequence_after_insertion_frameshift_after_codon(): 48 | # insertion: ATG_T_CCCTAG 49 | i, s = cdna_codon_sequence_after_insertion_frameshift( 50 | sequence_from_start_codon="ATGCCCTAG", 51 | cds_offset_before_insertion=2, 52 | inserted_nucleotides="T") 53 | eq_(i, 1) 54 | eq_(s, "TCCCTAG") 55 | 56 | def test_cdna_codon_sequence_after_deletion_or_substitution_frameshift_delA(): 57 | i, s = cdna_codon_sequence_after_deletion_or_substitution_frameshift( 58 | sequence_from_start_codon="ATGCCCTAG", 59 | cds_offset=0, 60 | trimmed_cdna_ref="A", 61 | trimmed_cdna_alt="") 62 | eq_(i, 0) 63 | eq_(s, "TGCCCTAG") 64 | 65 | 66 | def test_cdna_codon_sequence_after_deletion_or_substitution_frameshift_AT_to_C(): 67 | i, s = 
cdna_codon_sequence_after_deletion_or_substitution_frameshift( 68 | sequence_from_start_codon="ATGCCCTAG", 69 | cds_offset=0, 70 | trimmed_cdna_ref="AT", 71 | trimmed_cdna_alt="C") 72 | eq_(i, 0) 73 | eq_(s, "CGCCCTAG") 74 | -------------------------------------------------------------------------------- /tests/test_maf.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | import pytest 14 | 15 | from pyensembl import ensembl_grch37 as ensembl 16 | 17 | from varcode import Variant, load_maf, load_maf_dataframe 18 | 19 | import pandas as pd 20 | 21 | from .common import eq_ 22 | from .data import tcga_ov_variants, ov_wustle_variants, data_path 23 | 24 | def test_maf(): 25 | expected_tcga_ov_variants = [ 26 | Variant(1, 1650797, "A", "G", ensembl), 27 | Variant(1, 23836447, "C", "A", ensembl), 28 | Variant(1, 231401797, "A", "C", ensembl), 29 | Variant(11, 124617502, "C", "G", ensembl), 30 | ] 31 | eq_(len(tcga_ov_variants), len(expected_tcga_ov_variants)) 32 | for v_expect, v_maf in zip(expected_tcga_ov_variants, tcga_ov_variants): 33 | eq_(v_expect, v_maf) 34 | gene_name = tcga_ov_variants.metadata[v_maf]['Hugo_Symbol'] 35 | assert any(gene.name == gene_name for gene in v_maf.genes), \ 36 | "Expected gene name %s but got %s" % (gene_name, v_maf.genes) 37 | 38 | 39 | def generate_maf_aa_changes(): 40 | # Parse a MAF file and make sure we're annotating 
the protein amino acid 41 | # changes in the same way. 42 | # 43 | # The data file used also contains spaces, which is good to test the parser 44 | # on. 45 | assert len(ov_wustle_variants) == 5 46 | 47 | expected_changes = {} 48 | # pylint: disable=no-member 49 | # pylint gets confused by read_csv 50 | maf_fields = pd.read_csv( 51 | ov_wustle_variants.path, 52 | sep="\t", 53 | comment="#") 54 | for _, row in maf_fields.iterrows(): 55 | key = (str(row.Chromosome), row.Start_position) 56 | change = row.amino_acid_change 57 | # silent mutations just specificy which amino acid they affect via 58 | # e.g. "p.G384" 59 | if change[-1].isdigit(): 60 | expected_changes[key] = "silent" 61 | else: 62 | expected_changes[key] = change 63 | 64 | for variant in ov_wustle_variants: 65 | key = (variant.contig, variant.start) 66 | expected = expected_changes[key] 67 | yield (variant, expected) 68 | 69 | @pytest.mark.parametrize(['variant', 'expected_aa_change'], generate_maf_aa_changes()) 70 | def test_maf_aa_changes(variant, expected_aa_change): 71 | effect = variant.effects().top_priority_effect() 72 | change = effect.short_description 73 | eq_( 74 | change, 75 | expected_aa_change, 76 | "MAF file had annotation %s but Varcode gave %s" % ( 77 | expected_aa_change, change)) 78 | 79 | 80 | def test_maf_number_entries_duplicates(): 81 | # There are 3 duplicated mutations listed in the MAF 82 | path_to_maf_with_duplicates = data_path("duplicates.maf") 83 | variants = load_maf(path_to_maf_with_duplicates, distinct=True) 84 | assert len(variants) == 1 85 | variants = load_maf(path_to_maf_with_duplicates, distinct=False) 86 | assert len(variants) == 3 87 | 88 | def test_load_maf(): 89 | for raise_on_error in [True, False]: 90 | variants = load_maf( 91 | data_path("ov.wustle.subset5.maf"), raise_on_error=raise_on_error) 92 | eq_(len(variants), 5) 93 | 94 | 95 | def test_load_maf_dataframe(): 96 | for raise_on_error in [True, False]: 97 | variants_df = load_maf_dataframe( 98 | 
data_path("ov.wustle.subset5.maf"), raise_on_error=raise_on_error) 99 | eq_(len(variants_df), 5) 100 | 101 | 102 | def test_xy_contigs(): 103 | """ 104 | Test MAFs with X and Y chromosomes rather than just numerical chromosomes. 105 | """ 106 | for raise_on_error in [True, False]: 107 | variants = load_maf( 108 | data_path("tcga_ov.head.xychr.maf"), raise_on_error=True) 109 | eq_(len(variants), 4) 110 | 111 | 112 | def test_load_utf8(): 113 | """ 114 | Test MAFs loaded with utf-8 encoding. 115 | """ 116 | for raise_on_error in [True, False]: 117 | variants = load_maf( 118 | data_path("ov.wustle.subset5.maf"), raise_on_error=True, encoding="utf-8") 119 | eq_(len(variants), 5) 120 | # Make sure we avoid "TypeError: character mapping must return integer, None or unicode" 121 | # from Bio.Seq. 122 | _ = variants.effects() 123 | -------------------------------------------------------------------------------- /tests/test_mm10_klf6_frameshift.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 
12 | 13 | from varcode import Variant 14 | from varcode.effects import FrameShift 15 | from varcode.effects.effect_prediction_coding_frameshift import ( 16 | predict_frameshift_coding_effect, 17 | cdna_codon_sequence_after_insertion_frameshift, 18 | ) 19 | 20 | from .common import eq_ 21 | 22 | 23 | def validate_effect_values(effect): 24 | eq_(effect.__class__, FrameShift) 25 | transcript = effect.transcript 26 | eq_(transcript.name, "Klf6-201") 27 | eq_(transcript.spliced_offset(5864876), 469) 28 | eq_(effect.shifted_sequence, "GEEGGIRTEDFF") 29 | 30 | 31 | def test_mm10_Klf6_frameshift(): 32 | variant = Variant("chr13", 5864876, "", "G", "mm10") 33 | effects = variant.effects().drop_silent_and_noncoding() 34 | eq_(len(effects), 1) 35 | validate_effect_values(effects[0]) 36 | 37 | 38 | def test_mm10_Klf6_frameshift_coding_effect_fn(): 39 | variant = Variant("chr13", 5864876, "", "G", "mm10") 40 | transcripts = variant.transcripts 41 | coding_transcripts = [ 42 | t for t in transcripts 43 | if t.biotype == "protein_coding" 44 | ] 45 | eq_(len(coding_transcripts), 1) 46 | t = coding_transcripts[0] 47 | eq_(t.name, "Klf6-201") 48 | # first start codon offset is 157 49 | # mutation occurs after offset 469 50 | effect = predict_frameshift_coding_effect( 51 | trimmed_cdna_ref="", 52 | trimmed_cdna_alt="G", 53 | cds_offset=469 - 157, 54 | sequence_from_start_codon=t.sequence[157:], 55 | variant=variant, 56 | transcript=t) 57 | validate_effect_values(effect) 58 | 59 | 60 | def test_mm10_Klf6_frameshift_cdna_codon_sequence(): 61 | variant = Variant("chr13", 5864876, "", "G", "mm10") 62 | transcripts = variant.transcripts 63 | coding_transcripts = [ 64 | t for t in transcripts 65 | if t.biotype == "protein_coding" 66 | ] 67 | eq_(len(coding_transcripts), 1) 68 | t = coding_transcripts[0] 69 | eq_(t.name, "Klf6-201") 70 | mutant_codon_index, seq_after_mutated_codon = \ 71 | cdna_codon_sequence_after_insertion_frameshift( 72 | sequence_from_start_codon=t.sequence[157:], 73 | 
cds_offset_before_insertion=469 - 157, 74 | inserted_nucleotides="G") 75 | eq_(mutant_codon_index, 104) 76 | expected_sequence = t.sequence[469] + "G" + t.sequence[470:] 77 | eq_(seq_after_mutated_codon, expected_sequence) 78 | -------------------------------------------------------------------------------- /tests/test_mouse.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | from .common import eq_ 14 | 15 | from varcode import load_vcf, load_vcf_fast, Variant 16 | from varcode.effects import Substitution 17 | from pyensembl import Genome, EnsemblRelease 18 | from .data import data_path 19 | 20 | MOUSE_ENSEMBL_RELEASE = 95 21 | SERVER = "ftp://ftp.ensembl.org" 22 | MOUSE_GTF_PATH = \ 23 | SERVER + "/pub/release-%d/gtf/mus_musculus/Mus_musculus.GRCm38.%d.gtf.gz" % ( 24 | MOUSE_ENSEMBL_RELEASE, MOUSE_ENSEMBL_RELEASE) 25 | MOUSE_TRANSCRIPT_FASTA_PATH = \ 26 | SERVER + "/pub/release-%d/fasta/mus_musculus/cdna/Mus_musculus.GRCm38.cdna.all.fa.gz" 27 | MOUSE_PROTEIN_FASTA_PATH = \ 28 | SERVER + "/pub/release-%d/fasta/mus_musculus/pep/Mus_musculus.GRCm38.pep.all.fa.gz" % ( 29 | MOUSE_ENSEMBL_RELEASE) 30 | 31 | MOUSE_VCF = data_path("mouse_vcf_dbsnp_chr1_partial.vcf") 32 | 33 | explicit_url_genome = Genome( 34 | reference_name="GRCm38", 35 | annotation_name="ensembl", 36 | annotation_version=MOUSE_ENSEMBL_RELEASE, 37 | gtf_path_or_url=MOUSE_GTF_PATH, 38 | 
transcript_fasta_paths_or_urls=[MOUSE_TRANSCRIPT_FASTA_PATH], 39 | protein_fasta_paths_or_urls=[MOUSE_PROTEIN_FASTA_PATH]) 40 | 41 | ensembl_mouse_genome = EnsemblRelease(MOUSE_ENSEMBL_RELEASE, species="mouse") 42 | 43 | def test_load_vcf_mouse_with_explicit_urls(): 44 | variants = load_vcf(MOUSE_VCF, genome=explicit_url_genome) 45 | eq_(len(variants), 217) 46 | 47 | def test_load_vcf_mouse_with_ensembl_release(): 48 | variants = load_vcf(MOUSE_VCF, genome=ensembl_mouse_genome) 49 | eq_(len(variants), 217) 50 | 51 | def test_load_vcf_mouse_with_inferred_genome(): 52 | variants = load_vcf(MOUSE_VCF) 53 | eq_(len(variants), 217) 54 | 55 | def test_specific_variant_mouse_with_explicit_urls(): 56 | # Exon #2 at http://useast.ensembl.org/Mus_musculus/Transcript/Exons? 57 | # db=core;g=ENSMUSG00000017167;r=11:101170523-101190724;t=ENSMUST00000103109 58 | variant = Variant( 59 | contig=11, 60 | start=101177240, 61 | ref="G", 62 | alt="T", 63 | ensembl=explicit_url_genome) 64 | effects = variant.effects() 65 | eq_(len(effects), 2) 66 | substitution_effects = [ 67 | effect 68 | for effect in effects 69 | if isinstance(effect, Substitution) 70 | ] 71 | eq_(len(substitution_effects), 1) 72 | substitution_effect = substitution_effects[0] 73 | # The coding sequence through the sub: 74 | # ATGATGAGTCTCCGGCTCTTCAGCATCCTGCTCGCCACG 75 | # GTGGTCTCTGGAGCTTGGGGCTGGGGCTACTACGGTTGC 76 | # (The final G is the sub: the 77th nucleotide) 77 | # TGC (C) -> TTC (F) 78 | # 78 / 3 = 26 79 | # 0-base = 25 80 | eq_(substitution_effect.mutant_protein_sequence[25], "F") 81 | eq_(substitution_effect.original_protein_sequence[25], "C") 82 | 83 | 84 | def test_specific_variant_mouse_with_ensembl_genome(): 85 | # Exon #2 at http://useast.ensembl.org/Mus_musculus/Transcript/Exons? 
86 | # db=core;g=ENSMUSG00000017167;r=11:101170523-101190724;t=ENSMUST00000103109 87 | variant = Variant( 88 | contig=11, 89 | start=101177240, 90 | ref="G", 91 | alt="T", 92 | ensembl=ensembl_mouse_genome) 93 | effects = variant.effects() 94 | eq_(len(effects), 2) 95 | substitution_effects = [ 96 | effect 97 | for effect in effects 98 | if isinstance(effect, Substitution) 99 | ] 100 | eq_(len(substitution_effects), 1) 101 | substitution_effect = substitution_effects[0] 102 | # The coding sequence through the sub: 103 | # ATGATGAGTCTCCGGCTCTTCAGCATCCTGCTCGCCACG 104 | # GTGGTCTCTGGAGCTTGGGGCTGGGGCTACTACGGTTGC 105 | # (The final G is the sub: the 77th nucleotide) 106 | # TGC (C) -> TTC (F) 107 | # 78 / 3 = 26 108 | # 0-base = 25 109 | eq_(substitution_effect.mutant_protein_sequence[25], "F") 110 | eq_(substitution_effect.original_protein_sequence[25], "C") 111 | -------------------------------------------------------------------------------- /tests/test_mutate.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 
12 | 13 | from varcode.effects import mutate 14 | from .common import eq_ 15 | 16 | def test_snp_mutation(): 17 | seq = "AACCTT" 18 | mutated = mutate.substitute(seq, 1, "A", "G") 19 | eq_(mutated, "AGCCTT") 20 | 21 | def test_deletion_mutation(): 22 | seq = "AACT" 23 | mutated = mutate.substitute(seq, 1, "ACT", "T") 24 | eq_(mutated, "AT") 25 | 26 | def test_insert_before(): 27 | mutated = mutate.insert_before("AACT", 1, "GG") 28 | eq_(mutated, "AGGACT") 29 | 30 | def test_insert_after(): 31 | mutated = mutate.insert_after("AACT", 1, "GG") 32 | eq_(mutated, "AAGGCT") 33 | -------------------------------------------------------------------------------- /tests/test_no_duplicate_variants.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 
12 | 13 | from pyensembl import EnsemblRelease 14 | from varcode import Variant, VariantCollection 15 | 16 | def test_drop_duplicates(): 17 | ensembl = EnsemblRelease(78) 18 | v1 = Variant("1", 3000, "A", "G", ensembl=ensembl) 19 | v1_copy = Variant("1", 3000, "A", "G", ensembl=ensembl) 20 | v2 = Variant("2", 10, "G", "T", ensembl=ensembl) 21 | collection_without_duplicates = VariantCollection( 22 | variants=[v1, v1, v1_copy, v2]) 23 | assert len(collection_without_duplicates) == 2 24 | -------------------------------------------------------------------------------- /tests/test_problematic_variants.py: -------------------------------------------------------------------------------- 1 | """ 2 | Any variants which are encountered in the wild and either cause Varcode 3 | to crash or return an incorrect annotation should be added to this 4 | test module. 5 | """ 6 | 7 | import pytest 8 | from varcode import Variant 9 | 10 | from .common import check_effect_properties 11 | 12 | # variants which have previously resulted in raised exceptions 13 | # during effect annotation 14 | should_not_crash_variants = [ 15 | # error message: 16 | # "Couldn't find position 92979124 on any exon of ENST00000540033" 17 | Variant( 18 | contig=1, 19 | start=92979092, 20 | ref="ATATATATATATATATATATATATATATATATG", 21 | alt="A", 22 | genome="GRCh37"), 23 | # error message: 24 | # "Expect non-silent stop-loss variant to cause longer variant protein" 25 | # "" but got len(original) = 653, len(variant) = 653" 26 | Variant( 27 | contig=1, 28 | start=167385324, 29 | ref="TAA", 30 | alt="T", 31 | genome="GRCh37"), 32 | # error message: 33 | # "Variant which span 5' UTR and CDS not supported" 34 | Variant( 35 | contig=19, 36 | start=44351166, 37 | ref="GGGAGAT", 38 | alt="G", 39 | genome="GRCh37"), 40 | # error message: 41 | # "Can't have ref = '' and alt = 'E' at aa_pos = 445, cds_pos = 1335" 42 | Variant( 43 | contig=1, 44 | start=1684347, 45 | ref="", 46 | alt="CCT", 47 | genome="GRCh37"), 48 | 
Variant( 49 | contig=11, 50 | start=47640416, 51 | ref="", 52 | alt="TCTTT", 53 | genome="GRCh37"), 54 | Variant( 55 | contig=12, 56 | start=98880902, 57 | ref="A", 58 | alt="", 59 | genome="GRCh37"), 60 | Variant( 61 | contig=19, 62 | start=52803670, 63 | ref="TG", 64 | alt="", 65 | genome="GRCh37"), 66 | Variant( 67 | contig=1, 68 | start=109792735, 69 | ref="", 70 | alt="CGC", 71 | genome="GRCh37"), 72 | # error message: 73 | # "expected ref 'GATGTCGG' at offset 1412 of ENST00000297524...CDS has 'G'" 74 | Variant( 75 | contig=8, 76 | start=87226635, 77 | ref="CCGACATC", 78 | alt="", 79 | genome="GRCh37"), 80 | # error message: "Can't have empty aa_ref and aa_alt" 81 | Variant( 82 | contig=8, 83 | start=141488566, 84 | ref="T", 85 | alt="C", 86 | genome="GRCh38"), 87 | # error message: "len(aa_alt) = 0" 88 | Variant( 89 | contig=11, 90 | start=57741870, 91 | ref="G", 92 | alt="C", 93 | genome="GRCh38"), 94 | # error message: "IndexError: string index out of range" 95 | Variant( 96 | contig=11, 97 | start=63676705, 98 | ref="T", alt="", 99 | genome="GRCh37"), 100 | # AssertionError: aa_ref and aa_alt can't both be empty string 101 | Variant( 102 | contig=1, 103 | start=56962223, 104 | ref='C', 105 | alt='T', 106 | genome="GRCh37"), 107 | # AssertionError: aa_ref and aa_alt can't both be empty string 108 | Variant( 109 | contig=1, 110 | start=56962223, 111 | ref="C", 112 | alt="T", 113 | genome="GRCh37"), 114 | # AssertionError: aa_ref and aa_alt can't both be empty string 115 | Variant( 116 | contig=1, 117 | start=151314663, 118 | ref="C", 119 | alt="T", 120 | genome="GRCh37"), 121 | # AssertionError: aa_ref and aa_alt can't both be empty string 122 | Variant( 123 | contig=1, 124 | start=153409535, 125 | ref="C", 126 | alt="T", 127 | genome="GRCh37"), 128 | # AssertionError: aa_ref and aa_alt can't both be empty string 129 | Variant( 130 | contig=10, 131 | start=105791994, 132 | ref="C", 133 | alt="T", 134 | genome="GRCh37"), 135 | # Expected frameshift_insertion 
to be before stop codon 136 | # for Variant(contig=1, start=109925189, ref=., alt=A, genome=GRCh38) 137 | # on transcript_id=ENST00000329608 138 | # len(protein) = 554, aa_pos = 554 139 | Variant( 140 | contig=1, 141 | start=109925189, 142 | ref="", 143 | alt="A", 144 | genome="GRCh38"), 145 | Variant( 146 | contig=7, 147 | start=117120188, 148 | ref="A", 149 | alt="AAGT", 150 | genome="GRCh37"), 151 | # had problems with end coordinate loading this one from a MAF but also 152 | # want to make sure it doesn't cause other trouble 153 | Variant( 154 | contig=1, 155 | start=109461324, 156 | ref="GG", 157 | alt="TT", 158 | genome="GRCh37") 159 | ] 160 | 161 | 162 | @pytest.mark.parametrize(['variant'], [(v,) for v in should_not_crash_variants]) 163 | def test_crashing_variants(variant): 164 | effect = variant.effects().top_priority_effect() 165 | check_effect_properties(effect) -------------------------------------------------------------------------------- /tests/test_reference.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 
12 | 13 | 14 | import warnings 15 | 16 | import pytest 17 | 18 | from varcode.reference import infer_reference_name, ensembl_reference_aliases, most_recent_assembly_name 19 | from .common import eq_ 20 | 21 | ## test cases are given as 22 | ## expected response: list of inputs 23 | reference_test_cases = { 24 | 'NCBI36': [ 25 | 'ncbi36p2.fasta', 26 | 'b36.fasta', 27 | '##reference=file:///var/lib/cwl/ncbi36/homo_sapiens.d1.vd1.fa'], 28 | 'GRCh38': [ 29 | 'grch38p2.fasta', 30 | '##reference=file:///var/lib/cwl/job367935311_index_001zdr/GRCh38.d1.vd1.fa', 31 | '##reference=file:///var/lib/cwl/job367935311_index_001zdr/GRCh38.job36.d1.vd1.fa', 32 | ], 33 | } 34 | 35 | def test_most_recent_assembly(): 36 | eq_(most_recent_assembly_name(['ncbi36', 'grch38']), 'grch38') 37 | eq_(most_recent_assembly_name(['ncbi36', 'grch38', '37mm']), 'grch38') 38 | eq_(most_recent_assembly_name(['ncbi36']), 'ncbi36') 39 | eq_(most_recent_assembly_name(['ncbi36', '35']), 'ncbi36') 40 | def generate_reference_name_aliases(): 41 | with warnings.catch_warnings(record=True) as w: 42 | for assembly_name, aliases in ensembl_reference_aliases.items(): 43 | candidate_list = [assembly_name] + list(aliases) 44 | for candidate in candidate_list: 45 | yield ( 46 | candidate, 47 | assembly_name 48 | ) 49 | 50 | @pytest.mark.parametrize(['candidate', 'assembly_name'], generate_reference_name_aliases()) 51 | def test_infer_reference_name_aliases(candidate, assembly_name): 52 | eq_(infer_reference_name(candidate), assembly_name) 53 | 54 | def generate_reference_name_fasta_filenames(): 55 | with warnings.catch_warnings(record=True): 56 | for assembly_name, aliases in reference_test_cases.items(): 57 | candidate_list = [assembly_name] + list(aliases) 58 | for candidate in candidate_list: 59 | yield ( 60 | candidate, 61 | assembly_name 62 | ) 63 | 64 | @pytest.mark.parametrize(['candidate', 'assembly_name'], generate_reference_name_fasta_filenames()) 65 | def test_reference_name_fasta_filenames(candidate, 
assembly_name): 66 | eq_(infer_reference_name(candidate), assembly_name) 67 | 68 | -------------------------------------------------------------------------------- /tests/test_string_helpers.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | from .common import eq_ 14 | 15 | from varcode.string_helpers import trim_shared_flanking_strings 16 | 17 | def test_trim_shared_string_endings(): 18 | # empty strings 19 | eq_(trim_shared_flanking_strings("", "A"), ("", "A", "", "")) 20 | eq_(trim_shared_flanking_strings("A", ""), ("A", "", "", "")) 21 | 22 | # string pairs with shared prefixes 23 | eq_(trim_shared_flanking_strings("AA", "AA"), ("", "", "AA", "")) 24 | eq_(trim_shared_flanking_strings("AB", "AA"), ("B", "A", "A", "")) 25 | eq_(trim_shared_flanking_strings("AA", "AB"), ("A", "B", "A", "")) 26 | eq_(trim_shared_flanking_strings("AB", "A"), ("B", "", "A", "")) 27 | eq_(trim_shared_flanking_strings("AB", "A"), ("B", "", "A", "")) 28 | eq_(trim_shared_flanking_strings("A", "AB"), ("", "B", "A", "")) 29 | 30 | # string pairs with shared suffixes 31 | eq_(trim_shared_flanking_strings("CCAT", "GT"), 32 | ("CCA", "G", "", "T")) 33 | eq_(trim_shared_flanking_strings("CCAT", "GT"), 34 | ("CCA", "G", "", "T")) 35 | 36 | # string pairs with shared prefixes+suffixes 37 | eq_(trim_shared_flanking_strings( 38 | "AATG", "AACG"), ("T", "C", "AA", "G")) 39 | 
eq_(trim_shared_flanking_strings( 40 | "ABG", "AG"), ("B", "", "A", "G")) 41 | eq_(trim_shared_flanking_strings( 42 | "AG", "ABG"), ("", "B", "A", "G")) 43 | -------------------------------------------------------------------------------- /tests/test_timings.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | from __future__ import print_function, division, absolute_import 14 | import time 15 | 16 | from varcode.util import random_variants 17 | 18 | def _time_variant_annotation(variant_collection): 19 | start_t = time.time() 20 | effects = variant_collection.effects() 21 | end_t = time.time() 22 | assert len(effects.groupby_variant()) == len(variant_collection) 23 | elapsed_t = end_t - start_t 24 | return elapsed_t 25 | 26 | 27 | def test_effect_timing( 28 | n_variants=100, 29 | random_seed=0, 30 | n_warmup_variants=5): 31 | warmup_collection = random_variants( 32 | n_warmup_variants, 33 | random_seed=None) 34 | warmup_collection.effects() 35 | 36 | variant_collection = random_variants( 37 | n_variants, 38 | random_seed=random_seed) 39 | elapsed_t = _time_variant_annotation(variant_collection) 40 | print("Elapsed: %0.4f for %d variants" % (elapsed_t, n_variants)) 41 | assert elapsed_t / n_variants < 0.1, \ 42 | "Should be faster than 100ms / variant!" 
43 | 44 | if __name__ == "__main__": 45 | test_effect_timing() 46 | -------------------------------------------------------------------------------- /tests/test_variant.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | """ 14 | Test simple properties of Variant objects, such as their trimming 15 | of shared prefix/suffix strings from ref/alt fields. 16 | """ 17 | 18 | import pickle 19 | from pyensembl import ensembl_grch38 20 | 21 | from varcode import Variant 22 | from .common import eq_ 23 | 24 | def test_insertion_shared_prefix(): 25 | variant = Variant(1, start=10, ref="AA", alt="AAT") 26 | eq_(variant.contig, "1") 27 | eq_(variant.original_ref, "AA") 28 | eq_(variant.original_alt, "AAT") 29 | eq_(variant.original_start, 10) 30 | # since this variant is just an insertion of a "T", get rid of 31 | # the prefix context 32 | eq_(variant.ref, "") 33 | eq_(variant.alt, "T") 34 | # the [start,end] interval for an insertion is just the base we're 35 | # inserting after, which in this case is the 11th position 36 | eq_(variant.start, 11) 37 | eq_(variant.end, 11) 38 | eq_(variant.short_description, "chr1 g.11_12insT") 39 | assert variant.is_indel 40 | assert variant.is_insertion 41 | assert not variant.is_deletion 42 | 43 | def test_insertion_no_prefix(): 44 | variant = Variant(1, start=11, ref="", alt="T") 45 | eq_(variant.contig, "1") 46 | 
eq_(variant.original_ref, "") 47 | eq_(variant.original_alt, "T") 48 | eq_(variant.original_start, 11) 49 | eq_(variant.ref, "") 50 | eq_(variant.alt, "T") 51 | eq_(variant.start, 11) 52 | eq_(variant.end, 11) 53 | eq_(variant.short_description, "chr1 g.11_12insT") 54 | assert variant.is_indel 55 | assert variant.is_insertion 56 | assert not variant.is_deletion 57 | 58 | def test_substitution_no_prefix(): 59 | variant = Variant(1, start=11, ref="A", alt="T") 60 | eq_(variant.contig, "1") 61 | eq_(variant.original_ref, "A") 62 | eq_(variant.original_alt, "T") 63 | eq_(variant.original_start, 11) 64 | eq_(variant.ref, "A") 65 | eq_(variant.alt, "T") 66 | eq_(variant.start, 11) 67 | eq_(variant.end, 11) 68 | eq_(variant.short_description, "chr1 g.11A>T") 69 | assert not variant.is_indel 70 | assert not variant.is_insertion 71 | assert not variant.is_deletion 72 | 73 | def test_substitution_shared_prefix(): 74 | variant = Variant(1, start=10, ref="AA", alt="AT") 75 | eq_(variant.contig, "1") 76 | eq_(variant.original_ref, "AA") 77 | eq_(variant.original_alt, "AT") 78 | eq_(variant.original_start, 10) 79 | eq_(variant.ref, "A") 80 | eq_(variant.alt, "T") 81 | eq_(variant.start, 11) 82 | eq_(variant.end, 11) 83 | eq_(variant.short_description, "chr1 g.11A>T") 84 | assert not variant.is_indel 85 | assert not variant.is_insertion 86 | assert not variant.is_deletion 87 | 88 | def test_deletion_shared_suffix(): 89 | variant = Variant(1, start=10, ref="AAC", alt="C") 90 | eq_(variant.contig, "1") 91 | eq_(variant.original_ref, "AAC") 92 | eq_(variant.original_alt, "C") 93 | eq_(variant.original_start, 10) 94 | eq_(variant.ref, "AA") 95 | eq_(variant.alt, "") 96 | eq_(variant.start, 10) 97 | eq_(variant.end, 11) 98 | eq_(variant.short_description, "chr1 g.10_11delAA") 99 | assert variant.is_indel 100 | assert not variant.is_insertion 101 | assert variant.is_deletion 102 | 103 | def test_deletion_no_suffix(): 104 | variant = Variant(1, start=10, ref="AA", alt="") 105 | 
eq_(variant.contig, "1") 106 | eq_(variant.original_ref, "AA") 107 | eq_(variant.original_alt, "") 108 | eq_(variant.original_start, 10) 109 | eq_(variant.ref, "AA") 110 | eq_(variant.alt, "") 111 | eq_(variant.start, 10) 112 | eq_(variant.end, 11) 113 | eq_(variant.short_description, "chr1 g.10_11delAA") 114 | assert variant.is_indel 115 | assert not variant.is_insertion 116 | assert variant.is_deletion 117 | 118 | def test_serialization(): 119 | variants = [ 120 | Variant( 121 | 1, start=10, ref="AA", alt="AAT", genome=ensembl_grch38), 122 | Variant(10, start=15, ref="A", alt="G"), 123 | Variant(20, start=150, ref="", alt="G"), 124 | ] 125 | for original in variants: 126 | # This causes the variant's ensembl object to make a SQL connection, 127 | # which makes the ensembl object non-serializable. By calling this 128 | # method, we are checking that we don't attempt to directly serialize 129 | # the ensembl object. 130 | original.effects() 131 | 132 | # Test pickling. 133 | serialized = pickle.dumps(original) 134 | reconstituted = pickle.loads(serialized) 135 | eq_(original, reconstituted) 136 | 137 | eq_(original.contig, reconstituted.contig) 138 | eq_(original.ref, reconstituted.ref) 139 | eq_(original.alt, reconstituted.alt) 140 | eq_(original.start, reconstituted.start) 141 | eq_(original.end, reconstituted.end) 142 | eq_(original.original_ref, reconstituted.original_ref) 143 | eq_(original.original_alt, reconstituted.original_alt) 144 | eq_(original.original_start, reconstituted.original_start) 145 | 146 | # Test json. 
147 | serialized = original.to_json() 148 | reconstituted = Variant.from_json(serialized) 149 | eq_(original, reconstituted) 150 | 151 | def test_deserialization_old_keywords(): 152 | old_variant_representation_json = """ 153 | { 154 | "ref": "T", 155 | "contig": "22", 156 | "start": 23230319, 157 | "__class__": { 158 | "__name__": "Variant", 159 | "__module__": "varcode.variant" 160 | }, 161 | "normalize_contig_name": true, 162 | "alt": "G", 163 | "allow_extended_nucleotides": false, 164 | "ensembl": { 165 | "__class__": { 166 | "__name__": "EnsemblRelease", 167 | "__module__": "pyensembl.ensembl_release" 168 | }, 169 | "release": 75, 170 | "server": "ftp://ftp.ensembl.org", 171 | "species": { 172 | "__class__": { 173 | "__name__": "Species", 174 | "__module__": "pyensembl.species" 175 | }, 176 | "latin_name": "homo_sapiens" 177 | } 178 | } 179 | } 180 | """ 181 | variant = Variant.from_json(old_variant_representation_json) 182 | eq_(variant.contig, "22") 183 | eq_(variant.ref, "T") 184 | eq_(variant.alt, "G") 185 | eq_(variant.reference_name, "GRCh37") 186 | eq_(variant.normalize_contig_names, True) 187 | eq_(variant.allow_extended_nucleotides, False) 188 | 189 | def test_hg19_chromosome_names(): 190 | # trimming of mitochondrial name 191 | eq_(Variant("M", 1, "A", "G", "hg19", convert_ucsc_contig_names=True).contig, "MT") 192 | eq_(Variant("M", 1, "A", "G", "hg19", convert_ucsc_contig_names=False).contig, "M") 193 | 194 | eq_(Variant("chrM", 1, "A", "G", "hg19", convert_ucsc_contig_names=True).contig, "MT") 195 | eq_(Variant("chrM", 1, "A", "G", "hg19", convert_ucsc_contig_names=False).contig, "chrM") 196 | 197 | # uppercase 198 | eq_(Variant("chrm", 1, "A", "G", "hg19", convert_ucsc_contig_names=True).contig, "MT") 199 | eq_(Variant("chrm", 1, "A", "G", "hg19", convert_ucsc_contig_names=False).contig, "chrM") 200 | 201 | # trimming of 'chr' prefix from hg19 202 | eq_(Variant("chr1", 1, "A", "G", "hg19", convert_ucsc_contig_names=True).contig, "1") 203 | 
eq_(Variant("chr1", 1, "A", "G", "hg19", convert_ucsc_contig_names=False).contig, "chr1") 204 | 205 | def test_contig_name_normalization(): 206 | eq_(Variant(1, 1, "A", "G", normalize_contig_names=True).contig, "1") 207 | eq_(Variant(1, 1, "A", "G", normalize_contig_names=False).contig, 1) 208 | 209 | # uppercase 210 | eq_(Variant( 211 | "chrm", 1, "A", "G", normalize_contig_names=True, convert_ucsc_contig_names=False).contig, "chrM") 212 | eq_(Variant( 213 | "chrm", 1, "A", "G", normalize_contig_names=False, convert_ucsc_contig_names=False).contig, "chrm") 214 | 215 | 216 | def test_snv_transition_transversion(): 217 | ref_variant = Variant(1, start=100, ref="C", alt="C") 218 | assert not ref_variant.is_snv 219 | 220 | variant = Variant(1, start=100, ref="C", alt="T") 221 | assert variant.is_snv 222 | assert variant.is_transition 223 | assert not variant.is_transversion 224 | 225 | transversion = Variant(1, start=100, ref="C", alt="A") 226 | assert transversion.is_snv 227 | assert not transversion.is_transition 228 | assert transversion.is_transversion 229 | -------------------------------------------------------------------------------- /tests/test_variant_collection.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 
12 | 13 | """ 14 | Test properties of VariantCollection objects other than effect annotations 15 | """ 16 | from collections import Counter 17 | import pickle 18 | 19 | from .common import eq_ 20 | from .data import ov_wustle_variants, tcga_ov_variants 21 | 22 | from varcode import VariantCollection, Variant 23 | 24 | def test_variant_collection_union(): 25 | combined = ov_wustle_variants.union(tcga_ov_variants) 26 | eq_(set(combined.sources), {ov_wustle_variants.source, tcga_ov_variants.source}) 27 | eq_(len(combined), len(ov_wustle_variants) + len(tcga_ov_variants)) 28 | 29 | def test_variant_collection_intersection(): 30 | combined = ov_wustle_variants.intersection(tcga_ov_variants) 31 | eq_(set(combined.sources), {ov_wustle_variants.source, tcga_ov_variants.source}) 32 | eq_(len(combined), 0) 33 | 34 | def test_variant_collection_gene_counts(): 35 | gene_counts = ov_wustle_variants.gene_counts() 36 | # test that each gene is counted just once 37 | eq_(list(gene_counts.values()), [1] * len(gene_counts)) 38 | 39 | def test_variant_collection_groupby_gene(): 40 | genes = ov_wustle_variants.groupby_gene().keys() 41 | # make sure that the IDs attached to Gene objects are the same as IDs 42 | # of groupby_gene_id 43 | gene_ids = set(ov_wustle_variants.groupby_gene_id().keys()) 44 | eq_({gene.id for gene in genes}, gene_ids) 45 | 46 | def test_variant_collection_groupby_gene_id(): 47 | gene_ids = set(ov_wustle_variants.groupby_gene_id().keys()) 48 | eq_(gene_ids, { 49 | 'ENSG00000060718', 50 | 'ENSG00000156876', 51 | 'ENSG00000130939', 52 | 'ENSG00000122477', 53 | 'ENSG00000162688' 54 | }) 55 | 56 | def test_variant_collection_groupby_gene_name(): 57 | gene_names = set(ov_wustle_variants.groupby_gene_name().keys()) 58 | eq_(gene_names, {"AGL", "SASS6", "LRRC39", "UBE4B", "COL11A1"}) 59 | 60 | def test_reference_names(): 61 | eq_(ov_wustle_variants.reference_names(), {"GRCh37"}) 62 | 63 | def test_to_string(): 64 | string_repr = str(ov_wustle_variants) 65 | assert 
"start=10238758, ref='G', alt='C'" in string_repr, \ 66 | "Expected variant g.10238758 G>C in __str__:\n%s" % ( 67 | string_repr,) 68 | 69 | def test_detailed_string(): 70 | detailed_string = ov_wustle_variants.detailed_string() 71 | # expect one of the gene names from the MAF to be in the summary string 72 | assert "UBE4B" in detailed_string, \ 73 | "Expected gene name UBE4B in detailed_string():\n%s" % detailed_string 74 | assert "start=10238758, ref='G', alt='C'" in detailed_string, \ 75 | "Expected variant g.10238758 G>C in detailed_string():\n%s" % ( 76 | detailed_string,) 77 | 78 | def test_gene_counts(): 79 | expected_coding_gene_counts = Counter() 80 | expected_coding_gene_counts["CDK11A"] = 1 81 | expected_coding_gene_counts["GNPAT"] = 1 82 | expected_coding_gene_counts["E2F2"] = 1 83 | expected_coding_gene_counts["VSIG2"] = 1 84 | all_gene_counts = tcga_ov_variants.gene_counts() 85 | assert len(all_gene_counts) > len(expected_coding_gene_counts), \ 86 | ("Gene counts for all genes must contain more elements than" 87 | " gene counts for only coding genes.") 88 | for (gene_name, count) in expected_coding_gene_counts.items(): 89 | eq_(count, all_gene_counts[gene_name]) 90 | 91 | # TODO: add `only_coding` parameter to gene_counts and then test 92 | # for exact equality between `coding_gene_counts` and 93 | # `expected_counts` 94 | # 95 | # coding_gene_counts = variants.gene_counts(only_coding=True) 96 | # eq_(coding_gene_counts, expected_counts) 97 | 98 | def test_variant_collection_serialization(): 99 | variant_list = [ 100 | Variant( 101 | 1, start=10, ref="AA", alt="AAT"), 102 | Variant(10, start=15, ref="A", alt="G"), 103 | Variant(20, start=150, ref="", alt="G"), 104 | ] 105 | original = VariantCollection( 106 | variant_list, 107 | source_to_metadata_dict={ 108 | "test_data": 109 | {variant: {"a": "b", "bar": 2} for variant in variant_list}}) 110 | 111 | # This causes the variants' ensembl objects to make a SQL connection, 112 | # which makes the ensembl 
object non-serializable. By calling this 113 | # method, we are checking that we don't attempt to directly serialize 114 | # the ensembl object. 115 | original.effects() 116 | 117 | original_first_variant = original[0] 118 | original_metadata = original.metadata 119 | 120 | # Test pickling 121 | reconstructed = pickle.loads(pickle.dumps(original)) 122 | eq_(original, reconstructed) 123 | eq_(reconstructed[0], original_first_variant) 124 | eq_(reconstructed.metadata[original_first_variant], 125 | original_metadata[original_first_variant]) 126 | 127 | merged = original.intersection(original) 128 | merged_reconstructed = pickle.loads(pickle.dumps(merged)) 129 | eq_(merged, merged_reconstructed) 130 | 131 | # Test JSON serialization 132 | variants_from_json = VariantCollection.from_json(original.to_json()) 133 | eq_(original, variants_from_json) 134 | 135 | eq_(variants_from_json[0], original_first_variant) 136 | 137 | # pylint: disable=no-member 138 | eq_(variants_from_json.metadata[original_first_variant], 139 | original_metadata[original_first_variant]) 140 | 141 | def test_merged_variant_collection_serialization(): 142 | intersection = ov_wustle_variants.intersection(tcga_ov_variants) 143 | eq_(intersection, pickle.loads(pickle.dumps(intersection))) 144 | 145 | union = ov_wustle_variants.union(tcga_ov_variants) 146 | eq_(union, pickle.loads(pickle.dumps(union))) 147 | -------------------------------------------------------------------------------- /tests/test_vcf.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 
# Set to 1 to enable, 0 to disable.
# TODO: consider running in an in-process HTTP server instead for these tests.
RUN_TESTS_REQUIRING_INTERNET = bool(int(
    os.environ.get("RUN_TESTS_REQUIRING_INTERNET", 0)))

HG19_VCF_FILENAME = data_path("somatic_hg19_14muts.vcf")

# NOTE(review): the fixture lives under tests/data in this repository (the
# previous URL pointed at test/data); confirm the owner/branch still serve it.
HG19_VCF_EXTERNAL_URL = (
    "https://raw.githubusercontent.com/hammerlab/varcode/master/tests/data/somatic_hg19_14muts.vcf")

# To load from the branch that introduced these changes:
# (needed before this gets merged to master, can be removed after)
# VCF_EXTERNAL_URL = (
#    "https://raw.githubusercontent.com/hammerlab/varcode/faster-vcf-parsing/test/data/somatic_hg19_14muts.vcf")


def test_load_vcf_local():
    """The hg19 fixture loads identically from every local path spelling."""
    local_paths = [
        HG19_VCF_FILENAME,
        HG19_VCF_FILENAME + ".gz",
        "file://%s" % HG19_VCF_FILENAME,
        "file://%s.gz" % HG19_VCF_FILENAME,
        # An extra slash before an absolute path can confuse URL parsing.
        # Test that it can still be opened:
        "/%s" % HG19_VCF_FILENAME,
    ]
    for path in local_paths:
        variants = load_vcf(path)
        assert variants.reference_names() == {"GRCh37"}, path
        assert len(variants) == 14, path


@pytest.mark.skipif(
    not RUN_TESTS_REQUIRING_INTERNET,
    reason="set RUN_TESTS_REQUIRING_INTERNET=1 to run network tests")
def test_load_vcf_external():
    """The same fixture loads over HTTP, both plain and gzipped."""
    # Previously this loaded HG19_VCF_FILENAME, so the "external" test never
    # actually exercised remote loading.
    variants = load_vcf(HG19_VCF_EXTERNAL_URL)
    eq_(variants.reference_names(), {"GRCh37"})
    eq_(variants.original_reference_names(), {"hg19"})
    eq_(len(variants), 14)

    variants = load_vcf(HG19_VCF_EXTERNAL_URL + ".gz")
    eq_(variants.reference_names(), {"GRCh37"})
    eq_(len(variants), 14)


def test_vcf_reference_name():
    variants = load_vcf(HG19_VCF_FILENAME)

    # after normalization, hg19 should be remapped to GRCh37
    assert variants.reference_names() == {"GRCh37"}


def test_genome_arg_to_load_vcf_hg19():
    eq_(load_vcf(HG19_VCF_FILENAME),
        load_vcf(HG19_VCF_FILENAME, genome="hg19"))


def test_genome_arg_to_load_vcf_int_75():
    # if we use Ensembl 75 -- which is backed by GRCh37 -- then the two variant
    # collections will be the same as long as we also convert the contig names
    eq_(load_vcf(HG19_VCF_FILENAME),
        load_vcf(HG19_VCF_FILENAME, genome=75, convert_ucsc_contig_names=True))

    assert load_vcf(HG19_VCF_FILENAME) != load_vcf(
        HG19_VCF_FILENAME,
        genome=75,
        convert_ucsc_contig_names=False)


def test_genome_arg_to_load_vcf_cached_75():
    eq_(load_vcf(HG19_VCF_FILENAME),
        load_vcf(HG19_VCF_FILENAME,
                 genome=cached_release(75), convert_ucsc_contig_names=True))
    assert load_vcf(HG19_VCF_FILENAME) != load_vcf(
        HG19_VCF_FILENAME,
        genome=cached_release(75),
        convert_ucsc_contig_names=False)


def test_genome_arg_to_load_vcf_grch37():
    # the genome name is accepted in either capitalization
    for genome_name in ("grch37", "GRCh37"):
        eq_(load_vcf(HG19_VCF_FILENAME),
            load_vcf(
                HG19_VCF_FILENAME,
                genome=genome_name,
                convert_ucsc_contig_names=True))

    assert load_vcf(HG19_VCF_FILENAME) != load_vcf(
        HG19_VCF_FILENAME,
        genome="grch37",
        convert_ucsc_contig_names=False)


def test_genome_arg_to_load_vcf_b37():
    eq_(load_vcf(HG19_VCF_FILENAME),
        load_vcf(HG19_VCF_FILENAME, genome="b37", convert_ucsc_contig_names=True))


def test_vcf_number_entries():
    # there are 14 mutations listed in the VCF, make sure they are all parsed
    variants = load_vcf(HG19_VCF_FILENAME)
    assert len(variants) == 14, \
        "Expected 14 mutations, got %d" % (len(variants),)


def test_vcf_number_entries_duplicates():
    # There are 3 duplicated mutations listed in the VCF
    path_to_vcf_with_duplicates = data_path("duplicates.vcf")
    variants = load_vcf(
        path_to_vcf_with_duplicates,
        genome='hg38',
        distinct=True)
    assert len(variants) == 1
    variants = load_vcf(
        path_to_vcf_with_duplicates,
        genome='hg38',
        distinct=False)
    assert len(variants) == 3


def generate_vcf_gene_names():
    """Yield one (collection, variant) pair per variant in the hg19 fixture."""
    variants = load_vcf(HG19_VCF_FILENAME)
    for variant in variants:
        yield (variants, variant)


@pytest.mark.parametrize(['collection', 'variant'], generate_vcf_gene_names())
def test_vcf_gene_names(collection, variant):
    # the fixture stores the expected gene names in the INFO field "GE"
    expected_gene_names = collection.metadata[variant]['info']['GE']
    assert variant.gene_names == expected_gene_names, \
        "Expected gene name %s for variant %s, got %s" % (
            expected_gene_names, variant, variant.gene_names)


def test_multiple_alleles_per_line():
    variants = load_vcf(data_path("multiallelic.vcf"))
    assert len(variants) == 2, "Expected 2 variants but got %s" % variants
    variant_list = list(variants)
    expected_variants = [
        Variant(1, 1431105, "A", "C", genome="GRCh37"),
        Variant(1, 1431105, "A", "G", genome="GRCh37"),
    ]
    eq_(set(variant_list), set(expected_variants))


def test_sample_info_genotype():
    variants = load_vcf(data_path("multiallelic.vcf"))
    assert len(variants) == 2, "Expected 2 variants but got %s" % variants
    eq_(variants.metadata[variants[0]]['sample_info']['metastasis']['GT'],
        '0/1')
    eq_(variants.metadata[variants[1]]['sample_info']['metastasis']['GT'],
        '0/1')
TEST_FILENAMES_HUMAN = [
    'duplicates.maf',
    'multiallelic.vcf',
    'mutect-example.vcf',
    'ov.wustle.subset5.maf',
    'somatic_hg19_14muts.space_in_sample_name.vcf',
    'somatic_hg19_14muts.vcf',
    'strelka-example.vcf',
    'tcga_ov.head.maf',
    'tcga_ov.head.xychr.maf',
    # 'dbnsfp_validation_set.csv',        # csv
    # 'duplicates.vcf',                   # no ref genome header
    # 'mutect-example-headerless.vcf',    # no ref genome header
    # 'somatic_hg19_14muts.vcf.gz',       # gzip
]

TEST_FILENAMES_MOUSE = [
    'mouse_vcf_dbsnp_chr1_partial.vcf',
]

TEST_FILENAMES = TEST_FILENAMES_HUMAN + TEST_FILENAMES_MOUSE


def _merge_metadata_naive(variants):
    """Flatten the per-source metadata dicts of a collection into one dict.

    Later sources overwrite earlier ones when the same key appears twice.
    """
    merged = {}
    for per_source_metadata in variants.source_to_metadata_dict.values():
        merged.update(per_source_metadata)
    return merged


def _do_roundtrip_test(filenames, convert_ucsc_to_grch37=False):
    """Load `filenames`, write them out as one VCF, reload and compare.

    Parameters
    ----------
    filenames : list of str
        Names of VCF/MAF fixtures, resolved through `data_path`.

    convert_ucsc_to_grch37 : bool
        If True, call `clone_without_ucsc_data()` on the merged collection
        before writing it out.

    Returns a tuple (original variants, reparsed variants).
    """
    # local import: keeps this helper self-contained for temp-file cleanup
    import os

    # pick the loader from the file extension
    loaders = {
        'vcf': load_vcf,
        'maf': load_maf,
    }

    collections = [
        loaders[filename.split('.')[-1]](data_path(filename))
        for filename in filenames
    ]
    variants = collections[0].union(*collections[1:])
    if convert_ucsc_to_grch37:
        variants = variants.clone_without_ucsc_data()

    tmp = tempfile.NamedTemporaryFile(mode='w', delete=False)
    try:
        # the file has to be closed (flushed) before it is read back
        with tmp as f:
            metadata = _merge_metadata_naive(variants)
            variants_to_vcf(variants, metadata, out=f)
        reparsed_variants = load_vcf(tmp.name)
    finally:
        # delete=False means nobody else cleans this up for us
        os.remove(tmp.name)

    # `==` checks the reference genome, which won't necessarily match.
    for (v1, v2) in zip(variants, reparsed_variants):
        # previously `start` was compared twice and `alt` not at all
        assert (
            v1.contig == v2.contig and
            v1.start == v2.start and
            v1.ref == v2.ref and
            v1.alt == v2.alt), (v1, v2)

    return (variants, reparsed_variants)

    # TODO:
    # There is definitely more opportunity here to compare metadata
    # fields, with caveats.
    # ---
    # First, any variants from non-VCF sources (e.g., MAF files) will inevitably
    # lose some information through the change in representation (more importantly,
    # even if there is no loss in data, that data will be in a different format in
    # the new metadata dictionary). Thus, we should either ignore such variants
    # or only check certain fields.
    # ---
    # Second, without the original metadata headers in the VCF file, all metadata
    # information will be parsed as strings. Thus, for a simple comparison between
    # metadata (without the need to individually convert fields), we'd need to add
    # these headers to the output VCF file. See `vcf_output.py` for more info.


@pytest.mark.parametrize(['filename'], [(f,) for f in TEST_FILENAMES])
def test_roundtrip_serialization_single_file(filename):
    _do_roundtrip_test([filename])


FILENAME_PAIRS = (
    ['simple.1.vcf', 'simple.2.vcf'],  # basic multi-file VCF test
    ['duplicates.maf', 'ov.wustle.subset5.maf'],  # multiple MAF files
    ['duplicate-id.1.vcf', 'duplicate-id.2.vcf'],
)


@pytest.mark.parametrize(['file_group'], [(f,) for f in FILENAME_PAIRS])
def test_multiple_file_roundtrip_conversion(file_group):
    _do_roundtrip_test(file_group)


def test_multiple_file_roundtrip_conversion_mixed_references():
    # testing roundtrip serialization of hg19 VCF files
    # converted to GRCh37 combined with b37 MAFs
    _do_roundtrip_test(TEST_FILENAMES_HUMAN, convert_ucsc_to_grch37=True)


def test_same_samples_produce_samples():
    """test_same_samples_produce_samples

    Ensures that, if a set of variants have the same samples, the reparsed
    collection will output these samples.
    """
    (variants, reparsed_variants) = _do_roundtrip_test(
        ['same-samples.1.vcf', 'same-samples.2.vcf'])

    original_metadata = _merge_metadata_naive(variants)
    reparsed_metadata = _merge_metadata_naive(reparsed_variants)

    # sample names of the first original record are the reference set
    first_record = list(original_metadata.values())[0]
    sample_names = set(first_record['sample_info'].keys())
    assert all(
        set(d.get('sample_info', {}).keys()) == sample_names
        for d in reparsed_metadata.values())


def test_different_samples_produce_no_samples():
    """test_different_samples_produce_no_samples

    Ensures that, if a set of variants have different samples, the reparsed
    collection will not output any samples.

    See `vcf_output.py` for details as to why this is the way it's done for now.
    """
    (_, reparsed_variants) = _do_roundtrip_test(
        ['different-samples.1.vcf', 'different-samples.2.vcf'])

    metadata = _merge_metadata_naive(reparsed_variants)
    assert all(d.get('sample_info') is None for d in metadata.values())
13 | 14 | from .variant import Variant 15 | from .variant_collection import VariantCollection 16 | from .maf import load_maf, load_maf_dataframe 17 | from .vcf import load_vcf, load_vcf_fast 18 | from .effects import ( 19 | effect_priority, 20 | top_priority_effect, 21 | EffectCollection, 22 | MutationEffect, 23 | NonsilentCodingMutation, 24 | ) 25 | from .version import __version__ 26 | 27 | __all__ = [ 28 | "__version__", 29 | 30 | # basic classes 31 | "Variant", 32 | "EffectCollection", 33 | "VariantCollection", 34 | 35 | # effects 36 | "effect_priority", 37 | "top_priority_effect", 38 | "MutationEffect", 39 | "NonsilentCodingMutation", 40 | 41 | # file loading 42 | "load_maf", 43 | "load_maf_dataframe", 44 | "load_vcf", 45 | "load_vcf_fast", 46 | ] 47 | -------------------------------------------------------------------------------- /varcode/cli/__init__.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 
12 | 13 | from .variant_args import ( 14 | add_variant_args, 15 | make_variants_parser, 16 | variant_collection_from_args, 17 | ) 18 | 19 | __all__ = [ 20 | "add_variant_args", 21 | "make_variants_parser", 22 | "variant_collection_from_args", 23 | ] 24 | -------------------------------------------------------------------------------- /varcode/cli/effects_script.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 
# configure logging from the logging.conf file shipped next to this module
logging.config.fileConfig(pkg_resources.resource_filename(__name__, 'logging.conf'))
logger = logging.getLogger(__name__)

arg_parser = make_variants_parser(
    description="Annotate variants with overlapping gene names and predicted coding effects")

arg_parser.add_argument("--output-csv", help="Output path to CSV")

arg_parser.add_argument(
    "--one-per-variant",
    default=False,
    action="store_true",
    help=(
        "Only return highest priority effect overlapping a variant, "
        "otherwise all overlapping transcripts are returned."))

arg_parser.add_argument(
    "--only-coding",
    default=False,
    action="store_true",
    help="Filter silent and non-coding effects")


def main(args_list=None):
    r"""
    Script which loads variants and annotates them with overlapping genes
    and predicted coding effects.

    Example usage:
        varcode \
            --vcf mutect.vcf \
            --vcf strelka.vcf \
            --maf tcga_brca.maf \
            --variant chr1 498584 C G \
            --json-variants more_variants.json

    Parameters
    ----------
    args_list : list of str, optional
        Command-line arguments to parse; defaults to sys.argv[1:].
    """
    # Raw docstring: in a regular string the backslash-newline pairs above are
    # line continuations and would collapse the example into a single line.
    print_version_info()
    if args_list is None:
        args_list = sys.argv[1:]

    args = arg_parser.parse_args(args_list)
    variants = variant_collection_from_args(args)
    effects = variants.effects()
    if args.only_coding:
        effects = effects.drop_silent_and_noncoding()
    if args.one_per_variant:
        # keep only the single highest-priority effect of each variant
        variant_to_effect_dict = effects.top_priority_effect_per_variant()
        effects = effects.clone_with_new_elements(list(variant_to_effect_dict.values()))

    effects_dataframe = effects.to_dataframe()
    logger.info('\n%s', effects)
    if args.output_csv:
        effects_dataframe.to_csv(args.output_csv, index=False)
# configure logging from the logging.conf file shipped next to this module
logging.config.fileConfig(pkg_resources.resource_filename(__name__, 'logging.conf'))
logger = logging.getLogger(__name__)

arg_parser = make_variants_parser(
    description="Annotate variants with overlapping gene names")
arg_parser.add_argument("--output-csv", help="Output path to CSV")


def main(args_list=None):
    r"""
    Script which loads variants and annotates them with overlapping genes.

    Example usage:
        varcode-genes \
            --vcf mutect.vcf \
            --vcf strelka.vcf \
            --maf tcga_brca.maf \
            --variant chr1 498584 C G \
            --json-variants more_variants.json

    Parameters
    ----------
    args_list : list of str, optional
        Command-line arguments to parse; defaults to sys.argv[1:].
    """
    # Raw docstring: in a regular string the backslash-newline pairs above are
    # line continuations and would collapse the example into a single line.
    print_version_info()
    if args_list is None:
        args_list = sys.argv[1:]
    args = arg_parser.parse_args(args_list)
    variants = variant_collection_from_args(args)
    variants_dataframe = variants.to_dataframe()
    logger.info('\n%s', variants_dataframe)
    if args.output_csv:
        variants_dataframe.to_csv(args.output_csv, index=False)
26 | [formatter_simpleFormatter] 27 | format=%(asctime)s - %(name)s - %(levelname)s - %(message)s 28 | datefmt= 29 | 30 | # varcode 31 | 32 | [logger_varcode] 33 | level=DEBUG 34 | qualname=varcode 35 | handlers=consoleHandler 36 | 37 | # pyensembl 38 | 39 | [logger_pyensembl] 40 | level=DEBUG 41 | qualname=pyensembl 42 | handlers=consoleHandler 43 | 44 | # datacache 45 | 46 | [logger_datacache] 47 | level=DEBUG 48 | qualname=datacache 49 | handlers=consoleHandler 50 | -------------------------------------------------------------------------------- /varcode/cli/variant_args.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 
def add_variant_args(arg_parser):
    """
    Extends an ArgumentParser instance with the following commandline arguments:
        --vcf
        --genome
        --maf
        --variant
        --json-variants
        --download-reference-genome-data

    Returns the argument group that the options were added to.
    """
    variant_arg_group = arg_parser.add_argument_group(
        title="Variants",
        description="Genomic variant files")

    variant_arg_group.add_argument(
        "--vcf",
        default=[],
        action="append",
        help="Genomic variants in VCF format")

    variant_arg_group.add_argument(
        "--maf",
        default=[],
        action="append",
        help="Genomic variants in TCGA's MAF format",)

    variant_arg_group.add_argument(
        "--variant",
        default=[],
        action="append",
        nargs=4,
        metavar=("CHR", "POS", "REF", "ALT"),
        help=(
            "Individual variant as 4 arguments giving chromosome, position, ref,"
            " and alt. Example: chr1 3848 C G. Use '.' to indicate empty alleles"
            " for insertions or deletions."))

    variant_arg_group.add_argument(
        "--genome",
        type=str,
        help=(
            "What reference assembly your variant coordinates are using. "
            "Examples: 'hg19', 'GRCh38', or 'mm9'. "
            "This argument is ignored for MAF files, since each row includes "
            "the reference. "
            "For VCF files, this is used if specified, and otherwise is guessed from "
            "the header. For variants specified on the commandline with --variant, "
            "this option is required."))

    variant_arg_group.add_argument(
        "--download-reference-genome-data",
        action="store_true",
        default=False,
        help=(
            ("Automatically download genome reference data required for "
             "annotation using PyEnsembl. Otherwise you must first run "
             "'pyensembl install' for the release/species corresponding "
             "to the genome used in your VCF.")))

    variant_arg_group.add_argument(
        "--json-variants",
        default=[],
        action="append",
        help="Path to Varcode.VariantCollection object serialized as a JSON file.")

    return variant_arg_group


def make_variants_parser(**kwargs):
    """
    Creates argparse.ArgumentParser instance with options needed for loading
    variants from VCF, MAF, or JSON files.

    Parameters
    ----------
    **kwargs : dict
        Passed directly to argparse.ArgumentParser
    """
    parser = ArgumentParser(**kwargs)
    add_variant_args(parser)
    return parser


def download_and_install_reference_data(variant_collections):
    """
    Download and index reference data for every distinct genome used by the
    variants of the given collections, skipping genomes whose required local
    files already exist.
    """
    unique_genomes = {
        variant.ensembl
        for variant_collection in variant_collections
        for variant in variant_collection
    }
    for genome in unique_genomes:
        if not genome.required_local_files_exist():
            genome.download()
            genome.index()


def variant_collection_from_args(args, required=True):
    """
    Build a single VariantCollection from parsed commandline arguments.

    Parameters
    ----------
    args : argparse.Namespace
        Result of parsing with a parser extended by `add_variant_args`.

    required : bool
        If True, raise ValueError when no variant source was given.

    Returns the union of the collections loaded from --vcf, --maf, --variant
    and --json-variants.

    Raises ValueError if --variant is used without --genome, or if
    `required` is True and nothing was loaded.
    """
    variant_collections = []

    for vcf_path in args.vcf:
        variant_collections.append(
            load_vcf(vcf_path, genome=args.genome))

    for maf_path in args.maf:
        variant_collections.append(load_maf(maf_path))

    if args.variant:
        if not args.genome:
            raise ValueError(
                "--genome must be specified when using --variant")

        variants = [
            Variant(
                chromosome,
                start=position,
                ref=ref,
                alt=alt,
                genome=args.genome)
            for (chromosome, position, ref, alt)
            in args.variant
        ]
        variant_collection = VariantCollection(variants)
        variant_collections.append(variant_collection)

    for json_path in args.json_variants:
        with open(json_path, 'r') as f:
            variant_collections.append(
                VariantCollection.from_json(f.read()))

    if not variant_collections:
        if required:
            raise ValueError(
                "No variants loaded (use --maf, --vcf, --variant, or --json-variants options)")
        # Nothing to merge: calling union() with no collections would fail
        # with a TypeError, so hand back an empty collection instead.
        return VariantCollection([])

    if args.download_reference_genome_data:
        download_and_install_reference_data(variant_collections)

    # pylint: disable=no-value-for-parameter
    return VariantCollection.union(*variant_collections)
def apply_groupby(records, fn, skip_none=False):
    """
    Group records into a dictionary keyed by the result of calling fn
    on each record.

    Parameters
    ----------
    records : list

    fn : function

    skip_none : bool
        If False, then None can be a key in the returned dictionary,
        otherwise records whose key value is None get skipped.

    Returns dict.
    """
    grouped = {}
    for item in records:
        key = fn(item)
        if skip_none and key is None:
            continue
        # start a fresh list the first time a key is seen
        grouped.setdefault(key, []).append(item)
    return grouped


def groupby_field(records, field_name, skip_none=True):
    """
    Group records into a dictionary keyed by the distinct values of the
    attribute called field_name.
    """
    def read_field(obj):
        return getattr(obj, field_name)

    return apply_groupby(records, read_field, skip_none=skip_none)


def memoize(fn):
    """
    Simple memoization decorator for functions and methods,
    assumes that all arguments to the function can be hashed and
    compared.
    """
    cache = {}

    @wraps(fn)
    def wrapped_fn(*args, **kwargs):
        # keyword arguments are sorted so their order does not affect the key;
        # with no keyword arguments this is just (args, ())
        cache_key = (args, tuple(sorted(kwargs.items())))
        if cache_key not in cache:
            cache[cache_key] = fn(*args, **kwargs)
        return cache[cache_key]

    return wrapped_fn
from __future__ import print_function, division, absolute_import

from .effect_collection import EffectCollection
from .effect_ordering import (
    effect_priority,
    top_priority_effect,
)
from .effect_prediction import (
    predict_variant_effects,
    predict_variant_effect_on_transcript,
    predict_variant_effect_on_transcript_or_failure,
)
from .effect_classes import (
    MutationEffect,
    TranscriptMutationEffect,
    NonsilentCodingMutation,
    Failure,
    IncompleteTranscript,
    Intergenic,
    Intragenic,
    NoncodingTranscript,
    Intronic,
    ThreePrimeUTR,
    FivePrimeUTR,
    Silent,
    Substitution,
    Insertion,
    Deletion,
    ComplexSubstitution,
    AlternateStartCodon,
    IntronicSpliceSite,
    ExonicSpliceSite,
    StopLoss,
    SpliceDonor,
    SpliceAcceptor,
    PrematureStop,
    FrameShiftTruncation,
    StartLoss,
    FrameShift,
    ExonLoss,
)

# Public API of the effects package; every name appears exactly once.
__all__ = [
    "EffectCollection",
    # effect ordering
    "effect_priority",
    "top_priority_effect",

    # prediction functions
    "predict_variant_effects",
    "predict_variant_effect_on_transcript",
    "predict_variant_effect_on_transcript_or_failure",

    # effect classes
    "MutationEffect",
    "TranscriptMutationEffect",
    "Failure",
    "IncompleteTranscript",
    "Intergenic",
    "Intragenic",
    "NoncodingTranscript",
    "ThreePrimeUTR",
    "FivePrimeUTR",
    "Intronic",
    "Silent",
    "NonsilentCodingMutation",
    "Substitution",
    "Insertion",
    "Deletion",
    "ComplexSubstitution",
    "AlternateStartCodon",
    "IntronicSpliceSite",
    "ExonicSpliceSite",
    "StopLoss",
    "SpliceDonor",
    "SpliceAcceptor",
    "PrematureStop",
    "FrameShiftTruncation",
    "StartLoss",
    "FrameShift",
    "ExonLoss",
]
def bio_seq_to_str(seq):
    """
    Return `seq` as a string.

    Strings (including instances of str subclasses) are returned unchanged;
    any other object is converted with str().
    """
    if isinstance(seq, str):
        return seq
    return str(seq)
def variant_overlaps_interval(
        variant_start,
        n_ref_bases,
        interval_start,
        interval_end):
    """
    Does a variant overlap a given interval on the same chromosome?

    Parameters
    ----------
    variant_start : int
        Inclusive base-1 position of variant's starting location
        (or location before an insertion)

    n_ref_bases : int
        Number of reference bases affect by variant (used to compute
        end coordinate or determine whether variant is an insertion)

    interval_start : int
        Interval's inclusive base-1 start position

    interval_end : int
        Interval's inclusive base-1 end position

    Returns bool.
    """

    if n_ref_bases == 0:
        # insertions only overlap intervals which start before and
        # end after the insertion point, they must be fully contained
        # by the other interval
        return interval_start <= variant_start and interval_end >= variant_start

    # NOTE(review): with inclusive coordinates the last affected base would be
    # variant_start + n_ref_bases - 1; this value is one past it, so an
    # interval starting right after the variant is reported as overlapping.
    # Left unchanged because callers may rely on it -- confirm intent.
    variant_end = variant_start + n_ref_bases

    # overlap means other interval starts before this variant ends
    # and the interval ends after this variant starts
    return interval_start <= variant_end and interval_end >= variant_start


def matches_exon_end_pattern(seq):
    """Does the end of the nucleotide string `seq` match the canonical splice
    signal for the 3' end of an exon: "MAG", where M is either amino base.
    """
    if len(seq) < 3:
        return False
    return seq[-3] in AMINO_NUCLEOTIDES and seq[-2] == "A" and seq[-1] == "G"
al's 105 | # Ab initio prediction of mutation-induced cryptic 106 | # splice-site activation and exon skipping 107 | # (http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2947103/) 108 | # 109 | # 5' splice site: MAG|GURAGU consensus 110 | # M is A or C; R is purine; | is the exon-intron boundary 111 | # 112 | # 3' splice site: YAG|R 113 | # 114 | if exon_number > 1 and transcript_offset == exon_start_offset: 115 | # if this is any exon past the first, check to see if it lost 116 | # the purine on its left side 117 | # 118 | # the 3' splice site sequence has just a single purine on 119 | # the exon side 120 | if len(transcript_ref) > 0 and transcript_ref[0] in PURINE_NUCLEOTIDES: 121 | if len(transcript_alt) > 0: 122 | if transcript_alt[0] not in PURINE_NUCLEOTIDES: 123 | return True 124 | else: 125 | # if the mutation is a deletion, are there ref nucleotides 126 | # afterward? 127 | offset_after_deletion = transcript_offset + len(transcript_ref) 128 | if len(transcript.sequence) > offset_after_deletion: 129 | next_base = transcript.sequence[offset_after_deletion] 130 | if next_base not in PURINE_NUCLEOTIDES: 131 | return True 132 | 133 | if exon_number < len(transcript.exons): 134 | # if the mutation affects an exon whose right end gets spliced 135 | # to a next exon, check if the variant alters the exon side of 136 | # 5' consensus splicing sequence 137 | # 138 | # splicing sequence: 139 | # MAG|GURAGU 140 | # M is A or C; R is purine; | is the exon-intron boundary 141 | # 142 | # TODO: check for overlap of two intervals instead of just 143 | # seeing if the mutation starts inside the exonic splice site 144 | if variant_overlaps_interval( 145 | variant_start=transcript_offset, 146 | n_ref_bases=len(transcript_ref), 147 | interval_start=exon_end_offset - 2, 148 | interval_end=exon_end_offset): 149 | end_of_reference_exon = transcript.sequence[ 150 | exon_end_offset - 2:exon_end_offset + 1] 151 | 152 | if matches_exon_end_pattern(end_of_reference_exon): 153 | # if the 
last three nucleotides conform to the consensus 154 | # sequence then treat any deviation as an ExonicSpliceSite 155 | # mutation 156 | end_of_variant_exon = end_of_reference_exon 157 | if matches_exon_end_pattern(end_of_variant_exon): 158 | # end of exon matches splicing signal, check if it still 159 | # does after the mutation 160 | return True 161 | -------------------------------------------------------------------------------- /varcode/effects/effect_prediction_coding.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | from .effect_prediction_coding_frameshift import predict_frameshift_coding_effect 14 | from .effect_prediction_coding_in_frame import predict_in_frame_coding_effect 15 | 16 | 17 | def predict_variant_coding_effect_on_transcript( 18 | variant, 19 | transcript, 20 | trimmed_cdna_ref, 21 | trimmed_cdna_alt, 22 | transcript_offset): 23 | """ 24 | Given a minimal cDNA ref/alt nucleotide string pair and an offset into a 25 | given transcript, determine the coding effect of this nucleotide substitution 26 | onto the translated protein. 
27 | 28 | Parameters 29 | ---------- 30 | variant : Variant 31 | 32 | transcript : Transcript 33 | 34 | trimmed_cdna_ref : str 35 | Reference nucleotides we expect to find in the transcript's CDS 36 | 37 | trimmed_cdna_alt : str 38 | Alternate nucleotides we're replacing the reference with 39 | 40 | transcript_offset : int 41 | Offset into the full transcript sequence of the ref->alt substitution 42 | """ 43 | if not transcript.complete: 44 | raise ValueError( 45 | ("Can't annotate coding effect for %s" 46 | " on incomplete transcript %s" % (variant, transcript))) 47 | 48 | sequence = transcript.sequence 49 | 50 | n_ref = len(trimmed_cdna_ref) 51 | n_alt = len(trimmed_cdna_alt) 52 | 53 | # reference nucleotides found on the transcript, if these don't match 54 | # what we were told to expect from the variant then raise an exception 55 | ref_nucleotides_from_transcript = str( 56 | sequence[transcript_offset:transcript_offset + n_ref]) 57 | 58 | # Make sure that the reference sequence agrees with what we expected 59 | # from the VCF 60 | assert ref_nucleotides_from_transcript == trimmed_cdna_ref, \ 61 | "%s: expected ref '%s' at offset %d of %s, transcript has '%s'" % ( 62 | variant, 63 | trimmed_cdna_ref, 64 | transcript_offset, 65 | transcript, 66 | ref_nucleotides_from_transcript) 67 | 68 | start_codon_offset = transcript.first_start_codon_spliced_offset 69 | stop_codon_offset = transcript.last_stop_codon_spliced_offset 70 | 71 | cds_len = stop_codon_offset - start_codon_offset + 1 72 | 73 | if cds_len < 3: 74 | raise ValueError( 75 | "Coding sequence for %s is too short: '%s'" % ( 76 | transcript, 77 | transcript.sequence[start_codon_offset:stop_codon_offset + 1])) 78 | 79 | if n_ref == 0 and transcript.strand == "-": 80 | # By convention, genomic insertions happen *after* their base 1 position on 81 | # a chromosome. On the reverse strand, however, an insertion has to go 82 | # before the nucleotide at some transcript offset. 
83 | # Example: 84 | # chromosome sequence: 85 | # TTT|GATCTCGTA|CCC 86 | # transcript on reverse strand: 87 | # CCC|ATGCTCTAG|TTT 88 | # where the CDS is emphasized: 89 | # ATGCTCTAG 90 | # If we have a genomic insertion g.6insATT 91 | # the genomic sequence becomes: 92 | # TTT|GAT_ATT_CTCGTA|CCC 93 | # (insert the "ATT" after the "T" at position 6) 94 | # On the reverse strand this becomes: 95 | # CCC|ATGCTC_TTA_TAG|TTT 96 | # (insert the "ATT" *before* the "T" at position 10) 97 | # 98 | # To preserve the interpretation of the start offset as the base 99 | # before the insertion, need to subtract one 100 | cds_offset = transcript_offset - start_codon_offset - 1 101 | else: 102 | cds_offset = transcript_offset - start_codon_offset 103 | 104 | assert cds_offset < cds_len, \ 105 | "Expected CDS offset (%d) < |CDS| (%d) for %s on %s" % ( 106 | cds_offset, cds_len, variant, transcript) 107 | 108 | sequence_from_start_codon = str(sequence[start_codon_offset:]) 109 | 110 | # is this an in-frame mutations? 111 | if (n_ref - n_alt) % 3 == 0: 112 | return predict_in_frame_coding_effect( 113 | variant=variant, 114 | transcript=transcript, 115 | trimmed_cdna_ref=trimmed_cdna_ref, 116 | trimmed_cdna_alt=trimmed_cdna_alt, 117 | cds_offset=cds_offset, 118 | sequence_from_start_codon=sequence_from_start_codon) 119 | else: 120 | return predict_frameshift_coding_effect( 121 | variant=variant, 122 | transcript=transcript, 123 | trimmed_cdna_ref=trimmed_cdna_ref, 124 | trimmed_cdna_alt=trimmed_cdna_alt, 125 | cds_offset=cds_offset, 126 | sequence_from_start_codon=sequence_from_start_codon) 127 | -------------------------------------------------------------------------------- /varcode/effects/effect_prediction_coding_frameshift.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 
def create_frameshift_effect(
        mutated_codon_index,
        sequence_from_mutated_codon,
        variant,
        transcript):
    """
    Classify a frameshift inside a coding sequence (it may hit the start
    codon, the stop codon, or anything in between).

    Parameters
    ----------
    mutated_codon_index : int
        Codon offset (starting from 0 = start codon) of first non-reference
        amino acid in the variant protein

    sequence_from_mutated_codon: Bio.Seq
        Sequence of mutated cDNA, starting from first mutated codon, until
        the end of the transcript

    variant : Variant

    transcript : transcript

    Returns one of StartLoss, Silent, StopLoss, FrameShiftTruncation or
    FrameShift.
    """
    assert transcript.protein_sequence is not None, \
        "Expect transcript %s to have protein sequence" % transcript

    reference_protein = transcript.protein_sequence
    reference_length = len(reference_protein)

    translated_suffix = translate(
        nucleotide_sequence=sequence_from_mutated_codon,
        first_codon_is_start=False,
        to_stop=True,
        truncate=True)

    if mutated_codon_index == 0:
        # TODO: scan through sequence_from_mutated_codon for
        # Kozak sequence + start codon to choose the new start
        return StartLoss(variant=variant, transcript=transcript)

    # the frameshifted sequence may begin with amino acids identical to the
    # original protein; strip those so only the novel part remains
    _, novel_suffix, shared_prefix = trim_shared_prefix(
        ref=reference_protein[mutated_codon_index:],
        alt=translated_suffix)
    n_shared = len(shared_prefix)
    first_changed_offset = mutated_codon_index + n_shared
    has_novel_residues = len(novel_suffix) != 0

    if first_changed_offset >= reference_length:
        # every remaining reference residue was reproduced: the protein is
        # either unchanged or extended past its original end
        aa_ref = reference_protein[-n_shared:] if n_shared != 0 else ""
        if has_novel_residues:
            # identical up to the old end and then longer, so the stop
            # codon must have been lost
            return StopLoss(
                variant=variant,
                transcript=transcript,
                aa_ref=aa_ref,
                aa_alt=novel_suffix)
        # the frameshift left the protein unchanged, most likely by turning
        # one stop codon into another stop codon
        return Silent(
            variant=variant,
            transcript=transcript,
            aa_pos=mutated_codon_index,
            aa_ref=aa_ref)

    if not has_novel_residues:
        # a frameshift which creates no new amino acids must have run
        # straight into a stop codon
        return FrameShiftTruncation(
            variant=variant,
            transcript=transcript,
            stop_codon_offset=first_changed_offset)

    return FrameShift(
        variant=variant,
        transcript=transcript,
        aa_mutation_start_offset=first_changed_offset,
        shifted_sequence=str(novel_suffix))


def cdna_codon_sequence_after_insertion_frameshift(
        sequence_from_start_codon,
        cds_offset_before_insertion,
        inserted_nucleotides):
    """
    Returns index of mutated codon and nucleotide sequence starting at the
    first mutated codon.

    `cds_offset_before_insertion` is the offset (into
    `sequence_from_start_codon`) of the last reference nucleotide before the
    inserted nucleotides.
    """
    codon_index, position_in_codon = divmod(cds_offset_before_insertion, 3)
    reference_tail = sequence_from_start_codon[cds_offset_before_insertion + 1:]

    if position_in_codon == 2:
        # insertion sits after the last nucleotide of a codon, so that codon
        # is left intact and the next one is the first to change
        codon_index += 1
        reference_head = ""
    elif position_in_codon == 1:
        # insertion sits after the 2nd nucleotide of a codon: the first
        # returned codon keeps two reference nucleotides
        reference_head = sequence_from_start_codon[
            cds_offset_before_insertion - 1:cds_offset_before_insertion + 1]
    else:
        # insertion sits after the 1st nucleotide of a codon: the first
        # returned codon keeps one reference nucleotide
        reference_head = sequence_from_start_codon[cds_offset_before_insertion]

    return (
        codon_index,
        reference_head + inserted_nucleotides + reference_tail)


def cdna_codon_sequence_after_deletion_or_substitution_frameshift(
        sequence_from_start_codon,
        cds_offset,
        trimmed_cdna_ref,
        trimmed_cdna_alt):
    """
    Logic for any frameshift which isn't an insertion.

    Insertions are handled separately because the offset means something
    different for them:
        cds_offset = base before insertion
    Whereas in this case:
        cds_offset = first reference base affected by a variant

    Returns index of first modified codon and sequence from that codon
    onward.
    """
    # the variant's ref nucleotides start 0, 1, or 2 nucleotides into the
    # first modified codon
    codon_index, offset_in_codon = divmod(cds_offset, 3)
    reference_from_codon = sequence_from_start_codon[codon_index * 3:]
    mutated_from_codon = substitute(
        sequence=reference_from_codon,
        offset=offset_in_codon,
        ref=trimmed_cdna_ref,
        alt=trimmed_cdna_alt)
    return codon_index, mutated_from_codon


def predict_frameshift_coding_effect(
        variant,
        transcript,
        trimmed_cdna_ref,
        trimmed_cdna_alt,
        cds_offset,
        sequence_from_start_codon):
    """
    Coding effect of a frameshift mutation.

    Parameters
    ----------
    variant : Variant

    transcript : Transcript

    trimmed_cdna_ref : nucleotide sequence
        Reference nucleotides in the coding sequence of the given transcript.

    trimmed_cdna_alt : nucleotide sequence
        Alternate nucleotides introduced by mutation

    cds_offset : int
        Offset into the CDS of first ref nucleotide. For insertions, this
        is the offset of the last ref nucleotide before the insertion.

    sequence_from_start_codon : nucleotide sequence
        Nucleotides of the coding sequence and 3' UTR

    Returns the effect object built by `create_frameshift_effect`.
    """
    if len(trimmed_cdna_ref) == 0:
        # pure insertion: no reference nucleotides are replaced
        codon_index, mutated_sequence = \
            cdna_codon_sequence_after_insertion_frameshift(
                sequence_from_start_codon=sequence_from_start_codon,
                cds_offset_before_insertion=cds_offset,
                inserted_nucleotides=trimmed_cdna_alt)
    else:
        codon_index, mutated_sequence = \
            cdna_codon_sequence_after_deletion_or_substitution_frameshift(
                sequence_from_start_codon=sequence_from_start_codon,
                cds_offset=cds_offset,
                trimmed_cdna_ref=trimmed_cdna_ref,
                trimmed_cdna_alt=trimmed_cdna_alt)
    return create_frameshift_effect(
        mutated_codon_index=codon_index,
        sequence_from_mutated_codon=mutated_sequence,
        variant=variant,
        transcript=transcript)
def insert_before(sequence, offset, new_residues):
    """Return a copy of `sequence` with `new_residues` inserted before
    `offset`.

    Parameters
    ----------
    sequence : sequence
        String of amino acids or DNA bases

    offset : int
        Base 0 offset from start of sequence, before which `new_residues`
        is inserted. Must satisfy 0 < offset <= len(sequence).

    new_residues : sequence
    """
    assert 0 < offset <= len(sequence), \
        "Invalid position %d for sequence of length %d" % (
            offset, len(sequence))
    return sequence[:offset] + new_residues + sequence[offset:]


def insert_after(sequence, offset, new_residues):
    """Return a copy of `sequence` with `new_residues` inserted after
    `offset`.

    Parameters
    ----------
    sequence : sequence
        String of amino acids or DNA bases

    offset : int
        Base 0 offset from start of sequence, after which `new_residues`
        is inserted. Must satisfy 0 <= offset < len(sequence).

    new_residues : sequence
    """
    assert 0 <= offset < len(sequence), \
        "Invalid position %d for sequence of length %d" % (
            offset, len(sequence))
    split_at = offset + 1
    return sequence[:split_at] + new_residues + sequence[split_at:]


def substitute(sequence, offset, ref, alt):
    """Return a copy of `sequence` in which the `ref` residues found at
    `offset` are replaced by `alt`.

    Parameters
    ----------
    sequence : sequence
        String of amino acids or DNA bases

    offset : int
        Base 0 offset from start of `sequence`

    ref : sequence or str
        Residues expected at `offset`; an AssertionError is raised when the
        sequence holds something else there.

    alt : sequence or str
        Alternate sequence to insert
    """
    ref_end = offset + len(ref)
    found_ref = sequence[offset:ref_end]
    assert str(found_ref) == str(ref), \
        "Reference %s at offset %d != expected reference %s" % \
        (found_ref, offset, ref)
    return sequence[:offset] + alt + sequence[ref_end:]


def interval_offset_on_transcript(start, end, transcript):
    """
    Given an interval [start:end] and a particular transcript,
    return the start offset of the interval relative to the
    chromosomal positions of the transcript.

    Raises ValueError when start > end or when the interval lies entirely
    outside the transcript. Parts of the interval sticking out of the
    transcript are clipped before the offset is looked up.
    """
    if start > end:
        raise ValueError(
            "start_pos %d shouldn't be greater than end_pos %d" % (
                start, end))

    # the interval must overlap the transcript's chromosomal span
    transcript_span = (transcript, transcript.start, transcript.end)
    if start > transcript.end:
        raise ValueError(
            "Range %d:%d starts after transcript %s (%d:%d)" % (
                (start, end) + transcript_span))
    if end < transcript.start:
        raise ValueError(
            "Range %d:%d ends before transcript %s (%d:%d)" % (
                (start, end) + transcript_span))

    # clip both ends to the transcript boundaries
    clipped_start = max(start, transcript.start)
    clipped_end = min(end, transcript.end)

    # earliest offset into the spliced transcript, whichever end yields it
    return min(
        transcript.spliced_offset(clipped_start),
        transcript.spliced_offset(clipped_end))
def translate_codon(codon, aa_pos):
    """Translate a single codon into a single amino acid or stop '*'

    Parameters
    ----------
    codon : str
        Expected to be of length 3
    aa_pos : int
        Codon/amino acid offset into the protein (starting from 0)

    Raises KeyError if `codon` is neither a stop codon nor present in
    DNA_CODON_TABLE.
    """
    # any codon listed in START_CODONS is reported as methionine when it
    # is the first codon of the protein
    if aa_pos == 0 and codon in START_CODONS:
        return "M"
    elif codon in STOP_CODONS:
        return "*"
    else:
        return DNA_CODON_TABLE[codon]


def translate(
        nucleotide_sequence,
        first_codon_is_start=True,
        to_stop=True,
        truncate=False):
    """Translates cDNA coding sequence into amino acid protein sequence.

    Should typically start with a start codon but allowing non-methionine
    first residues since the CDS we're translating might have been affected
    by a start loss mutation.

    The sequence may include the 3' UTR but will stop translation at the first
    encountered stop codon.

    Parameters
    ----------
    nucleotide_sequence : BioPython Seq
        cDNA sequence (anything else is wrapped in a Seq first)

    first_codon_is_start : bool
        Treat the beginning of nucleotide_sequence as a start codon
        (which translates to methionine)

    to_stop : bool
        Passed through to BioPython's `Seq.translate`

    truncate : bool
        Truncate sequence if it's not a multiple of 3 (default = False)

    Returns BioPython Seq of amino acids

    Raises
    ------
    AssertionError
        If the (possibly truncated) sequence length is not a multiple of 3.
    ValueError
        If `first_codon_is_start` is set, the translation doesn't begin with
        "M", and the first codon is not one of START_CODONS.
    """
    if not isinstance(nucleotide_sequence, Seq):
        nucleotide_sequence = Seq(nucleotide_sequence)

    if truncate:
        # if sequence isn't a multiple of 3, truncate it so BioPython
        # doesn't complain; floor division keeps this in exact integer
        # arithmetic
        n_nucleotides = (len(nucleotide_sequence) // 3) * 3
        nucleotide_sequence = nucleotide_sequence[:n_nucleotides]
    else:
        n_nucleotides = len(nucleotide_sequence)

    assert n_nucleotides % 3 == 0, \
        ("Expected nucleotide sequence to be multiple of 3"
         " but got %s of length %d") % (
            nucleotide_sequence,
            n_nucleotides)

    # passing cds=False to translate since we may want to deal with premature
    # stop codons
    protein_sequence = nucleotide_sequence.translate(to_stop=to_stop, cds=False)

    if first_codon_is_start and (
            len(protein_sequence) == 0 or protein_sequence[0] != "M"):
        # START_CODONS holds plain strings; convert the Seq slice so the
        # membership test doesn't depend on how Seq objects hash
        if str(nucleotide_sequence[:3]) in START_CODONS:
            # TODO: figure out when these should be made into methionines
            # and when left as whatever amino acid they normally code for
            # e.g. Leucine start codons
            # See: DOI: 10.1371/journal.pbio.0020397
            return "M" + protein_sequence[1:]
        else:
            raise ValueError(
                ("Expected first codon of %s to be start codon"
                 " (one of %s) but got %s") % (
                    protein_sequence[:10],
                    START_CODONS,
                    nucleotide_sequence))

    return protein_sequence


def find_first_stop_codon(nucleotide_sequence):
    """
    Given a sequence of codons (expected to have length multiple of three),
    return index of first stop codon, or -1 if none is in the sequence.

    Trailing nucleotides which don't fill a whole codon are ignored.
    """
    n_mutant_codons = len(nucleotide_sequence) // 3
    for i in range(n_mutant_codons):
        codon = nucleotide_sequence[3 * i:3 * i + 3]
        # STOP_CODONS holds plain strings, so compare against a plain string
        # even if the input is a sequence object
        if str(codon) in STOP_CODONS:
            return i
    return -1


def translate_in_frame_mutation(
        transcript,
        ref_codon_start_offset,
        ref_codon_end_offset,
        mutant_codons):
    """
    Returns:
        - mutant amino acid sequence
        - offset of first stop codon in the mutant sequence (or -1 if there was none)
        - boolean flag indicating whether any codons from the 3' UTR were used

    Parameters
    ----------
    transcript : pyensembl.Transcript
        Reference transcript to which a cDNA mutation should be applied.

    ref_codon_start_offset : int
        Starting (base 0) integer offset into codons (character triplets) of the
        transcript's reference coding sequence.

    ref_codon_end_offset : int
        Final (base 0) integer offset into codons of the transcript's
        reference coding sequence.

    mutant_codons : str
        Nucleotide sequence to replace the reference codons with
        (expected to have length that is a multiple of three)
    """
    mutant_stop_codon_index = find_first_stop_codon(mutant_codons)

    using_three_prime_utr = False

    if mutant_stop_codon_index != -1:
        # drop the stop codon and everything after it
        mutant_codons = mutant_codons[:3 * mutant_stop_codon_index]
    elif ref_codon_end_offset > len(transcript.protein_sequence):
        # if the mutant codons didn't contain a stop but did mutate the
        # true reference stop codon then the translated sequence might involve
        # the 3' UTR
        three_prime_utr = transcript.three_prime_utr_sequence
        n_utr_codons = len(three_prime_utr) // 3
        # trim the 3' UTR sequence to have a length that is a multiple of 3
        truncated_utr_sequence = three_prime_utr[:n_utr_codons * 3]

        # index of the first stop codon within the 3' UTR (or -1)
        first_utr_stop_codon_index = find_first_stop_codon(truncated_utr_sequence)

        if first_utr_stop_codon_index > 0:
            # if there is a stop codon in the 3' UTR sequence and it's not the
            # very first codon
            using_three_prime_utr = True
            n_mutant_codons_before_utr = len(mutant_codons) // 3
            mutant_stop_codon_index = n_mutant_codons_before_utr + first_utr_stop_codon_index
            # combine the in-frame mutant codons with the truncated sequence of
            # the 3' UTR
            mutant_codons += truncated_utr_sequence[:first_utr_stop_codon_index * 3]
        elif first_utr_stop_codon_index == -1:
            # if there is no stop codon in the 3' UTR sequence
            using_three_prime_utr = True
            mutant_codons += truncated_utr_sequence
        # NOTE(review): when the 3' UTR *begins* with a stop codon
        # (index 0) nothing is appended and the returned stop index stays
        # -1 -- confirm that callers expect this.

    amino_acids = translate(
        mutant_codons,
        first_codon_is_start=(ref_codon_start_offset == 0))

    return amino_acids, mutant_stop_codon_index, using_three_prime_utr
def load_maf_dataframe(path, nrows=None, raise_on_error=True, encoding=None):
    """
    Load the guaranteed columns of a TCGA MAF file into a DataFrame

    Parameters
    ----------
    path : str
        Path to MAF file

    nrows : int
        Optional limit to number of rows loaded

    raise_on_error : bool
        Raise an exception upon encountering an error or log a warning

    encoding : str, optional
        Encoding to use for UTF when reading MAF file.

    Returns
    -------
    pandas.DataFrame
        Columns whose names differ from MAF_COLUMN_NAMES only by
        capitalization are re-added under the canonical name.

    Raises
    ------
    ValueError
        If `raise_on_error` is set and the file has too few columns or a
        column name doesn't match the expected one.
    """
    require_string(path, "Path to MAF")

    n_basic_columns = len(MAF_COLUMN_NAMES)

    # pylint: disable=no-member
    # pylint gets confused by read_csv
    df = pandas.read_csv(
        path,
        comment="#",
        sep="\t",
        low_memory=False,
        skip_blank_lines=True,
        header=0,
        nrows=nrows,
        encoding=encoding)

    if len(df.columns) < n_basic_columns:
        error_message = (
            "Too few columns in MAF file %s, expected %d but got %d : %s" % (
                path, n_basic_columns, len(df.columns), df.columns))
        if raise_on_error:
            raise ValueError(error_message)
        else:
            # logging.warn is a deprecated alias (removed in Python 3.13)
            logging.warning(error_message)

    # check each pair of expected/actual column names to make sure they match
    for expected, actual in zip(MAF_COLUMN_NAMES, df.columns):
        if expected != actual:
            # MAFs in the wild have capitalization differences in their
            # column names, normalize them to always use the names above
            if expected.lower() == actual.lower():
                # using DataFrame.rename in Python 2.7.x doesn't seem to
                # work for some files, possibly because Pandas treats
                # unicode vs. str columns as different?
                df[expected] = df[actual]
                del df[actual]
            else:
                error_message = (
                    "Expected column %s but got %s" % (expected, actual))
                if raise_on_error:
                    raise ValueError(error_message)
                else:
                    logging.warning(error_message)

    return df
128 | 129 | optional_cols : list, optional 130 | A list of MAF columns to include as metadata if they are present in the MAF. 131 | Does not result in an error if those columns are not present. 132 | 133 | sort_key : fn 134 | Function which maps each element to a sorting criterion. 135 | Set to None to not to sort the variants. 136 | 137 | distinct : bool 138 | Don't keep repeated variants 139 | 140 | raise_on_error : bool 141 | Raise an exception upon encountering an error or just log a warning. 142 | 143 | encoding : str, optional 144 | Encoding to use for UTF when reading MAF file. 145 | 146 | nrows : int, optional 147 | Limit to number of rows loaded 148 | """ 149 | # pylint: disable=no-member 150 | # pylint gets confused by read_csv inside load_maf_dataframe 151 | maf_df = load_maf_dataframe( 152 | path, 153 | nrows=nrows, 154 | raise_on_error=raise_on_error, 155 | encoding=encoding) 156 | 157 | if len(maf_df) == 0 and raise_on_error: 158 | raise ValueError("Empty MAF file %s" % path) 159 | 160 | ensembl_objects = {} 161 | variants = [] 162 | metadata = {} 163 | for _, x in maf_df.iterrows(): 164 | contig = x.Chromosome 165 | if isnull(contig): 166 | error_message = "Invalid contig name: %s" % (contig,) 167 | if raise_on_error: 168 | raise ValueError(error_message) 169 | else: 170 | logging.warn(error_message) 171 | continue 172 | 173 | start_pos = x.Start_Position 174 | ref = x.Reference_Allele 175 | 176 | # it's possible in a MAF file to have multiple Ensembl releases 177 | # mixed in a single MAF file (the genome assembly is 178 | # specified by the NCBI_Build column) 179 | ncbi_build = x.NCBI_Build 180 | if ncbi_build in ensembl_objects: 181 | genome = ensembl_objects[ncbi_build] 182 | else: 183 | if isinstance(ncbi_build, int): 184 | reference_name = "B%d" % ncbi_build 185 | else: 186 | reference_name = str(ncbi_build) 187 | genome, _ = infer_genome(reference_name) 188 | ensembl_objects[ncbi_build] = genome 189 | 190 | # have to try both Tumor_Seq_Allele1 
and Tumor_Seq_Allele2 191 | # to figure out which is different from the reference allele 192 | if x.Tumor_Seq_Allele1 != ref: 193 | alt = x.Tumor_Seq_Allele1 194 | else: 195 | if x.Tumor_Seq_Allele2 == ref: 196 | error_message = ( 197 | "Both tumor alleles agree with reference %s: %s" % ( 198 | ref, x,)) 199 | if raise_on_error: 200 | raise ValueError(error_message) 201 | else: 202 | logging.warn(error_message) 203 | continue 204 | alt = x.Tumor_Seq_Allele2 205 | 206 | variant = Variant( 207 | contig, 208 | start_pos, 209 | str(ref), 210 | str(alt), 211 | genome) 212 | 213 | # keep metadata about the variant and its TCGA annotation 214 | metadata[variant] = { 215 | 'Hugo_Symbol': x.Hugo_Symbol, 216 | 'Center': x.Center, 217 | 'Strand': x.Strand, 218 | 'Variant_Classification': x.Variant_Classification, 219 | 'Variant_Type': x.Variant_Type, 220 | 'dbSNP_RS': x.dbSNP_RS, 221 | 'dbSNP_Val_Status': x.dbSNP_Val_Status, 222 | 'Tumor_Sample_Barcode': x.Tumor_Sample_Barcode, 223 | 'Matched_Norm_Sample_Barcode': x.Matched_Norm_Sample_Barcode, 224 | } 225 | for optional_col in optional_cols: 226 | if optional_col in x: 227 | metadata[variant][optional_col] = x[optional_col] 228 | 229 | variants.append(variant) 230 | 231 | return VariantCollection( 232 | variants=variants, 233 | source_to_metadata_dict={path: metadata}, 234 | sort_key=sort_key, 235 | distinct=distinct) 236 | -------------------------------------------------------------------------------- /varcode/nucleotides.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 
# the four canonical DNA bases
STANDARD_NUCLEOTIDES = {'A', 'C', 'T', 'G'}

PURINE_NUCLEOTIDES = {'A', 'G'}

PYRIMIDINE_NUCLEOTIDES = {'C', 'T'}

AMINO_NUCLEOTIDES = {'A', 'C'}

KETO_NUCLEOTIDES = {'T', 'G'}

STRONG_NUCLEOTIDES = {'G', 'C'}

WEAK_NUCLEOTIDES = {'A', 'T'}

# standard bases plus all pseudonucleotides encoding uncertain bases
EXTENDED_NUCLEOTIDES = {
    'A', 'C', 'T', 'G',
    'Y',  # Pyrimidine (C or T)
    'R',  # Purine (A or G)
    'W',  # weak (A or T)
    'S',  # strong (G or C)
    'K',  # keto (T or G)
    'M',  # amino (C or A)
    'D',  # A, G, T (not C)
    'V',  # A, C, G (not T)
    'H',  # A, C, T (not G)
    'B',  # C, G, T (not A)
    'X',  # any base
    'N',  # any base
}


def is_purine(nucleotide, allow_extended_nucleotides=False):
    """
    Is the nucleotide a purine ('A' or 'G')?

    Parameters
    ----------
    nucleotide : str
        Single nucleotide character

    allow_extended_nucleotides : bool
        If False, any character outside of STANDARD_NUCLEOTIDES raises
        a ValueError. If True, such characters simply return False.

    Raises
    ------
    ValueError
        If the nucleotide is non-standard and extended nucleotides
        are not allowed.
    """
    if not allow_extended_nucleotides and nucleotide not in STANDARD_NUCLEOTIDES:
        raise ValueError(
            "{} is a non-standard nucleotide, neither purine or pyrimidine".format(nucleotide))
    return nucleotide in PURINE_NUCLEOTIDES


def all_standard_nucleotides(nucleotides):
    """Return True if every character is one of 'A', 'C', 'T', 'G'."""
    return all(base in STANDARD_NUCLEOTIDES for base in nucleotides)


def normalize_nucleotide_string(
        nucleotides,
        allow_extended_nucleotides=False,
        empty_chars=".-",
        treat_nan_as_empty=True):
    """
    Normalizes a nucleotide string by converting various ways of encoding empty
    strings into "", making all letters upper case, and checking to make sure
    all letters in the string are actually nucleotides.

    Parameters
    ----------
    nucleotides : str
        Sequence of nucleotides, e.g. "ACCTG"

    allow_extended_nucleotides : bool
        Allow non-canonical nucleotide characters like 'X' for unknown base

    empty_chars : str
        Characters which encode empty strings, such as "." used in VCF format
        or "-" used in MAF format

    treat_nan_as_empty : bool
        Some MAF files represent deletions/insertions with NaN ref/alt values

    Returns
    -------
    str
        Upper-case nucleotide string, or "" for an empty encoding.

    Raises
    ------
    ValueError
        If the string contains characters outside of the allowed nucleotides.
    """
    # The NaN check has to come before the substring test below: evaluating
    # `<float> in <str>` raises a TypeError, which previously made this
    # branch unreachable for float NaN values.
    if treat_nan_as_empty and isinstance(nucleotides, float) and np.isnan(nucleotides):
        return ""

    # substring test: matches "", any single empty character, and any
    # contiguous run of `empty_chars` itself
    if nucleotides in empty_chars:
        return ""

    require_string(nucleotides, name="nucleotide string")

    nucleotides = nucleotides.upper()

    if allow_extended_nucleotides:
        valid_nucleotides = EXTENDED_NUCLEOTIDES
    else:
        valid_nucleotides = STANDARD_NUCLEOTIDES

    invalid_characters = set(nucleotides) - valid_nucleotides
    if invalid_characters:
        raise ValueError(
            "Invalid character(s) in nucleotide string: %s" % (
                ",".join(invalid_characters),))

    return nucleotides
def trim_shared_prefix(ref, alt):
    """
    Split off the leading characters which `ref` and `alt` have in common.

    Mutations are sometimes written with a shared prefix, e.g.
    C>CT (nucleotides) or GYFP>G (amino acids).

    Returns a tuple of (remainder of ref, remainder of alt, shared prefix).
    """
    n_shared = 0
    for ref_char, alt_char in zip(ref, alt):
        if ref_char != alt_char:
            break
        n_shared += 1

    # both strings are identical up to n_shared, so the prefix
    # can be sliced out of either one
    return ref[n_shared:], alt[n_shared:], ref[:n_shared]


def trim_shared_suffix(ref, alt):
    """
    Split off the trailing characters which `ref` and `alt` have in common.

    Given ref='ABC' and alt='BC' the result is:
        ('A', '', 'BC')

    Returns a tuple of (remainder of ref, remainder of alt, shared suffix).
    """
    n_shared = 0
    for ref_char, alt_char in zip(reversed(ref), reversed(alt)):
        if ref_char != alt_char:
            break
        n_shared += 1

    if n_shared == 0:
        # slicing with -0 would select from the wrong end,
        # so the no-overlap case is handled separately
        return (ref, alt, '')

    ref_remainder = ref[:-n_shared]
    alt_remainder = alt[:-n_shared]
    shared_suffix = ref[-n_shared:]
    return (ref_remainder, alt_remainder, shared_suffix)


def trim_shared_flanking_strings(ref, alt):
    """
    Strip the common prefix and then the common suffix from two
    nucleotide or amino acid strings.

    Returns a tuple of (unique part of ref, unique part of alt,
    shared prefix, shared suffix).

    For example, ref = "SYFFQGR" and alt = "SYMLLFIFQGR" gives:
        ("F", "MLLFI", "SY", "FQGR")
    """
    ref_without_prefix, alt_without_prefix, prefix = trim_shared_prefix(ref, alt)
    ref_core, alt_core, suffix = trim_shared_suffix(
        ref_without_prefix, alt_without_prefix)
    return ref_core, alt_core, prefix, suffix
# cache lists of all transcript IDs for different Ensembl releases
_transcript_ids_cache = {}


def random_variants(
        count,
        genome_name="GRCh38",
        deletions=True,
        insertions=True,
        random_seed=None):
    """
    Generate a VariantCollection with random variants that overlap
    at least one complete coding transcript.

    Parameters
    ----------
    count : int
        Number of variants to generate

    genome_name : str
        Reference name passed to pyensembl's genome_for_reference_name

    deletions : bool
        Allow the empty string as an alternate allele

    insertions : bool
        Allow two-nucleotide alternate alleles

    random_seed : int, optional
        Seed for the random number generator

    Returns
    -------
    VariantCollection

    Raises
    ------
    ValueError
        If fewer than `count` variants could be generated within
        `count * 100` attempts.
    """
    rng = random.Random(random_seed)
    ensembl = genome_for_reference_name(genome_name)

    if ensembl in _transcript_ids_cache:
        transcript_ids = _transcript_ids_cache[ensembl]
    else:
        transcript_ids = ensembl.transcript_ids()
        _transcript_ids_cache[ensembl] = transcript_ids

    # Iteration order of a set of strings varies between interpreter runs
    # (string hashes are randomized), so sort the nucleotides to make the
    # draws reproducible for a fixed random_seed.
    standard_nucleotides = sorted(STANDARD_NUCLEOTIDES)
    nucleotide_pairs = [
        x + y
        for x in standard_nucleotides
        for y in standard_nucleotides
    ]

    variants = []

    # we should finish way before this loop is over but just in case
    # something is wrong with PyEnsembl we want to avoid an infinite loop
    for _ in range(count * 100):
        if len(variants) >= count:
            break

        transcript_id = rng.choice(transcript_ids)
        transcript = ensembl.transcript_by_id(transcript_id)

        if not transcript.complete:
            continue

        exon = rng.choice(transcript.exons)
        base1_genomic_position = rng.randint(exon.start, exon.end)
        transcript_offset = transcript.spliced_offset(base1_genomic_position)
        seq = transcript.sequence

        ref = str(seq[transcript_offset])
        if transcript.on_backward_strand:
            ref = reverse_complement(ref)

        alt_nucleotides = [x for x in standard_nucleotides if x != ref]

        if insertions:
            alt_nucleotides.extend(nucleotide_pairs)
        if deletions:
            alt_nucleotides.append("")
        alt = rng.choice(alt_nucleotides)
        variant = Variant(
            transcript.contig,
            base1_genomic_position,
            ref=ref,
            alt=alt,
            ensembl=ensembl)
        variants.append(variant)

    # Checked after the loop so that a final variant produced on the very
    # last attempt (or a request for zero variants) is not reported as a
    # failure.
    if len(variants) < count:
        raise ValueError(
            ("Unable to generate %d random variants, "
             "there may be a problem with PyEnsembl") % count)
    return VariantCollection(variants)
--------------------------------------------------------------------------------