├── .coveragerc
├── .gitattributes
├── .github
└── workflows
│ └── tests.yml
├── .github_changelog_generator
├── .gitignore
├── CHANGELOG.md
├── CONTRIBUTING.md
├── LICENSE
├── MANIFEST.in
├── README.md
├── RELEASING.md
├── code-of-conduct.md
├── deploy.sh
├── examples
└── varcode-quick_start.ipynb
├── lint-and-test.sh
├── lint.sh
├── pylintrc
├── requirements.txt
├── setup.py
├── test.sh
├── tests
├── __init__.py
├── benchmark_vcf_load.py
├── common.py
├── data.py
├── data
│ ├── dbnsfp_validation_set.csv
│ ├── different-samples.1.vcf
│ ├── different-samples.2.vcf
│ ├── duplicate-id.1.vcf
│ ├── duplicate-id.2.vcf
│ ├── duplicates.maf
│ ├── duplicates.vcf
│ ├── mouse_vcf_dbsnp_chr1_partial.vcf
│ ├── multiallelic.vcf
│ ├── mutect-example-headerless.vcf
│ ├── mutect-example.vcf
│ ├── ov.wustle.subset5.maf
│ ├── same-samples.1.vcf
│ ├── same-samples.2.vcf
│ ├── simple.1.vcf
│ ├── simple.2.vcf
│ ├── somatic_hg19_14muts.space_in_sample_name.vcf
│ ├── somatic_hg19_14muts.vcf
│ ├── somatic_hg19_14muts.vcf.gz
│ ├── strelka-example.vcf
│ ├── tcga_ov.head.maf
│ └── tcga_ov.head.xychr.maf
├── test_cli_effects.py
├── test_cli_genes.py
├── test_collection_filtering.py
├── test_common.py
├── test_cosmic_mutations.py
├── test_dbnsfp_validation.py
├── test_effect_annotation_errors.py
├── test_effect_classes.py
├── test_effect_collection.py
├── test_effect_collection_serialization.py
├── test_effects_from_mutagenix_variants.py
├── test_exonic_splice_site.py
├── test_frameshift_helpers.py
├── test_maf.py
├── test_mm10_klf6_frameshift.py
├── test_mouse.py
├── test_mutate.py
├── test_no_duplicate_variants.py
├── test_problematic_variants.py
├── test_reference.py
├── test_string_helpers.py
├── test_timings.py
├── test_variant.py
├── test_variant_collection.py
├── test_vcf.py
└── test_vcf_output.py
└── varcode
├── __init__.py
├── cli
├── __init__.py
├── effects_script.py
├── genes_script.py
├── logging.conf
├── variant_args.py
└── version_info.py
├── common.py
├── effects
├── __init__.py
├── common.py
├── effect_classes.py
├── effect_collection.py
├── effect_helpers.py
├── effect_ordering.py
├── effect_prediction.py
├── effect_prediction_coding.py
├── effect_prediction_coding_frameshift.py
├── effect_prediction_coding_in_frame.py
├── mutate.py
├── transcript_helpers.py
└── translate.py
├── maf.py
├── nucleotides.py
├── reference.py
├── string_helpers.py
├── ucsc_reference_names.py
├── util.py
├── variant.py
├── variant_collection.py
├── vcf.py
├── vcf_output.py
└── version.py
/.coveragerc:
--------------------------------------------------------------------------------
# .coveragerc to control coverage.py
[run]
# The test suite lives in the top-level `tests/` directory, so that is the
# path that has to be excluded from coverage measurement.
omit =
    tests/*
5 |
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | varcode/_version.py export-subst
2 |
--------------------------------------------------------------------------------
/.github/workflows/tests.yml:
--------------------------------------------------------------------------------
1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions
2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
3 |
4 | # TODO:
5 | # - cache this directory $HOME/.cache/pyensembl/
6 | # - update coveralls
7 | # - get a badge for tests passing
8 | # - download binary dependencies from conda
9 | name: Tests
10 | on: [push, pull_request]
11 |
12 | jobs:
13 | build:
14 | runs-on: ubuntu-latest
15 | strategy:
16 | fail-fast: true
17 | matrix:
18 | python-version: ["3.9", "3.10", "3.11"]
19 |
20 | steps:
21 | - name: Checkout repository
22 | uses: actions/checkout@v3
23 | - name: Set up Python ${{ matrix.python-version }}
24 | uses: actions/setup-python@v3
25 | with:
26 | python-version: ${{ matrix.python-version }}
27 | cache: "pip"
28 | - name: Install dependencies
29 | run: |
30 | python -m pip install --upgrade pip
31 | python -m pip install flake8 pytest pytest-cov coveralls
32 | pip install -r requirements.txt
33 | pip install .
34 | - name: Lint with flake8
35 | run: |
36 | # stop the build if there are Python syntax errors or undefined names
37 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
38 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
39 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
40 | - name: Run default linting script
41 | run: |
42 | ./lint.sh
43 | - name: Install Ensembl data
44 | run: |
45 | echo "Before installing Ensembl releases" && df -h
46 | pyensembl install --release 75 --species human --custom-mirror https://github.com/openvax/ensembl-data/releases/download/GRCh37.75/
47 | pyensembl install --release 81 --species human --custom-mirror https://github.com/openvax/ensembl-data/releases/download/GRCh38.81/
48 | pyensembl install --release 95 --species human --custom-mirror https://github.com/openvax/ensembl-data/releases/download/GRCh38.95/
49 | pyensembl install --release 95 --species mouse --custom-mirror https://github.com/openvax/ensembl-data/releases/download/GRCm38.95/
50 | echo "After installing Ensembl releases" && df -h
51 | - name: Run unit tests
52 | run: |
53 | ./test.sh
54 | - name: Publish coverage to Coveralls
55 | uses: coverallsapp/github-action@v2.2.3
56 |
--------------------------------------------------------------------------------
/.github_changelog_generator:
--------------------------------------------------------------------------------
1 | unreleased=false
2 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 |
5 | # C extensions
6 | *.so
7 |
8 | # Distribution / packaging
9 | .Python
10 | env/
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | lib/
17 | lib64/
18 | parts/
19 | sdist/
20 | var/
21 | *.egg-info/
22 | .installed.cfg
23 | *.egg
24 |
25 | # PyInstaller
26 | # Usually these files are written by a python script from a template
27 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
28 | *.manifest
29 | *.spec
30 |
31 | # Installer logs
32 | pip-log.txt
33 | pip-delete-this-directory.txt
34 |
35 | # Unit test / coverage reports
36 | htmlcov/
37 | .tox/
38 | .coverage
39 | .cache
40 | nosetests.xml
41 | coverage.xml
42 |
43 | # Translations
44 | *.mo
45 | *.pot
46 |
47 | # Django stuff:
48 | *.log
49 |
50 | # Sphinx documentation
51 | docs/_build/
52 |
53 | # PyBuilder
54 | target/
55 |
56 | # PyCharm
57 | .idea
58 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | Contributing to Varcode
2 | ==========================
3 |
4 | [Varcode](http://www.github.com/openvax/varcode) is open source software and
5 | we welcome your contributions. This document should help you get started
6 | contributing to Varcode.
7 |
8 | Filing Issues
9 | -------------
10 | If you find any bugs or problems while using Varcode or have any feature requests, please feel free to file an issue against the project. When doing so, please follow the guidelines below:
11 |
12 | * To report any bugs, issues, or feature requests, please [open an issue](https://github.com/openvax/varcode/issues).
13 | * Please check the [current open issues](https://github.com/openvax/varcode/issues) to see if the request already exists.
14 | * If you are filing a bug report, please describe the version of Varcode, PyEnsembl, and Python being used. If your problem involves a particular genomic variant, please include that variant and its corresponding reference genome (e.g. "GRCh37 1:384747 AAC>T").
15 |
16 | Coding Guidelines
17 | -----------------
18 | * Varcode is written in Python and adheres to the [PEP8](https://www.python.org/dev/peps/pep-0008/)
19 | style guidelines.
20 | * Contributions should come in the form of GitHub pull requests.
21 | * New features should start with a GitHub issue explaining their scope and rationale.
22 | * If the work is based on an existing issue, please reference the issue in the PR.
23 | * All new code should be accompanied by comprehensive unit tests.
24 | * If the PR fixes or implements an issue, please state "Closes #XYZ" or "Fixes #XYZ", where XYZ is the issue number.
25 | * Please ensure that your code works under Python >= 3.7.
26 |
27 | Licensing
28 | ---------
29 | Varcode is licensed under the Apache 2.0 license. Your code is assumed to be as well.
30 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include versioneer.py
2 | include varcode/_version.py
3 | include README.md
4 | include LICENSE
5 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [![Tests](https://github.com/openvax/varcode/actions/workflows/tests.yml/badge.svg)](https://github.com/openvax/varcode/actions/workflows/tests.yml)
2 |
3 |
4 |
5 |
6 |
7 |
8 | [![PyPI downloads](https://img.shields.io/pypi/dm/varcode.svg)](https://pypistats.org/packages/varcode)
9 |
10 | # Varcode
11 |
12 | Varcode is a library for working with genomic variant data in Python and predicting the impact of those variants on protein sequences.
13 |
14 | ## Installation
15 |
16 | You can install varcode using [pip](https://pip.pypa.io/en/latest/quickstart.html):
17 |
18 | ```bash
19 | pip install varcode
20 | ```
21 |
22 | You can install required reference genome data through [PyEnsembl](https://github.com/openvax/pyensembl) as follows:
23 |
24 | ```bash
25 | # Downloads and installs the Ensembl releases (75 and 76)
26 | pyensembl install --release 75 76
27 | ```
28 |
29 | ## Example
30 |
31 | ```python
32 | import varcode
33 |
34 | # Load a TCGA MAF file containing ovarian cancer variants
35 | variants = varcode.load_maf("tcga-ovarian-cancer-variants.maf")
36 |
37 | print(variants)
38 | ###
39 | ### -- Variant(contig=1, start=69538, ref=G, alt=A, genome=GRCh37)
40 | ### -- Variant(contig=1, start=881892, ref=T, alt=G, genome=GRCh37)
41 | ### -- Variant(contig=1, start=3389714, ref=G, alt=A, genome=GRCh37)
42 | ### -- Variant(contig=1, start=3624325, ref=G, alt=T, genome=GRCh37)
43 | ### ...
44 |
45 | # you can index into a VariantCollection and get back a Variant object
46 | variant = variants[0]
47 |
48 | # groupby_gene_name returns a dictionary whose keys are gene names
49 | # and whose values are themselves VariantCollections
50 | gene_groups = variants.groupby_gene_name()
51 |
52 | # get variants which affect the TP53 gene
53 | TP53_variants = gene_groups["TP53"]
54 |
55 | # predict protein coding effect of every TP53 variant on
56 | # each transcript of the TP53 gene
57 | TP53_effects = TP53_variants.effects()
58 |
59 | print(TP53_effects)
60 | ###
61 | ### -- PrematureStop(variant=chr17 g.7574003G>A, transcript_name=TP53-001, transcript_id=ENST00000269305, effect_description=p.R342*)
62 | ### -- ThreePrimeUTR(variant=chr17 g.7574003G>A, transcript_name=TP53-005, transcript_id=ENST00000420246)
63 | ### -- PrematureStop(variant=chr17 g.7574003G>A, transcript_name=TP53-002, transcript_id=ENST00000445888, effect_description=p.R342*)
64 | ### -- FrameShift(variant=chr17 g.7574030_7574030delG, transcript_name=TP53-001, transcript_id=ENST00000269305, effect_description=p.R333fs)
65 | ### ...
66 |
67 | premature_stop_effect = TP53_effects[0]
68 |
69 | print(str(premature_stop_effect.mutant_protein_sequence))
70 | ### 'MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLSPDDIEQWFTEDPGPDEAPRMPEAAPPVAPAPAAPTPAAPAPAPSWPLSSSVPSQKTYQGSYGFRLGFLHSGTAKSVTCTYSPALNKMFCQLAKTCPVQLWVDSTPPPGTRVRAMAIYKQSQHMTEVVRRCPHHERCSDSDGLAPPQHLIRVEGNLRVEYLDDRNTFRHSVVVPYEPPEVGSDCTTIHYNYMCNSSCMGGMNRRPILTIITLEDSSGNLLGRNSFEVRVCACPGRDRRTEEENLRKKGEPHHELPPGSTKRALPNNTSSSPQPKKKPLDGEYFTLQIRGRERFEMF'
71 |
72 | print(premature_stop_effect.aa_mutation_start_offset)
73 | ### 341
74 |
75 | print(premature_stop_effect.transcript)
76 | ### Transcript(id=ENST00000269305, name=TP53-001, gene_name=TP53, biotype=protein_coding, location=17:7571720-7590856)
77 |
78 | print(premature_stop_effect.gene.name)
79 | ### 'TP53'
80 | ```
81 |
82 | If you are looking for a quick start guide, you can check out [this Jupyter notebook](./examples/varcode-quick_start.ipynb), which demonstrates simple use cases of Varcode.
83 |
84 | ## Effect Types
85 |
86 | | Effect type | Description |
87 | | ---------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------- |
88 | | _AlternateStartCodon_ | Replace annotated start codon with alternative start codon (_e.g._ "ATG>CAG"). |
89 | | _ComplexSubstitution_ | Insertion and deletion of multiple amino acids. |
90 | | _Deletion_ | Coding mutation which causes deletion of amino acid(s). |
91 | | _ExonLoss_ | Deletion of entire exon, significantly disrupts protein. |
92 | | _ExonicSpliceSite_ | Mutation at the beginning or end of an exon, may affect splicing. |
93 | | _FivePrimeUTR_ | Variant affects 5' untranslated region before start codon. |
94 | | _FrameShiftTruncation_ | A frameshift which leads immediately to a stop codon (no novel amino acids created). |
95 | | _FrameShift_ | Out-of-frame insertion or deletion of nucleotides, causes novel protein sequence and often premature stop codon. |
96 | | _IncompleteTranscript_ | Can't determine effect since transcript annotation is incomplete (often missing either the start or stop codon). |
97 | | _Insertion_ | Coding mutation which causes insertion of amino acid(s). |
98 | | _Intergenic_ | Occurs outside of any annotated gene. |
99 | | _Intragenic_ | Within the annotated boundaries of a gene but not in a region that's transcribed into pre-mRNA. |
100 | | _IntronicSpliceSite_ | Mutation near the beginning or end of an intron but less likely to affect splicing than donor/acceptor mutations. |
101 | | _Intronic_ | Variant occurs between exons and is unlikely to affect splicing. |
102 | | _NoncodingTranscript_ | Transcript doesn't code for a protein. |
103 | | _PrematureStop_ | Insertion of stop codon, truncates protein. |
104 | | _Silent_ | Mutation in coding sequence which does not change the amino acid sequence of the translated protein. |
105 | | _SpliceAcceptor_ | Mutation in the last two nucleotides of an intron, likely to affect splicing. |
106 | | _SpliceDonor_ | Mutation in the first two nucleotides of an intron, likely to affect splicing. |
107 | | _StartLoss_ | Mutation causes loss of start codon, likely result is that an alternate start codon will be used down-stream (possibly in a different frame). |
108 | | _StopLoss_ | Loss of stop codon, causes extension of protein by translation of nucleotides from 3' UTR. |
109 | | _Substitution_ | Coding mutation which causes simple substitution of one amino acid for another. |
110 | | _ThreePrimeUTR_ | Variant affects 3' untranslated region after stop codon of mRNA. |
111 |
112 | ## Coordinate System
113 |
114 | Varcode currently uses a "base counted, one start" genomic coordinate system, to match the Ensembl annotation database. We are planning to switch over to "space counted, zero start" (interbase) coordinates, since that system allows for more uniform logic (no special cases for insertions). To learn more about genomic coordinate systems, read this [blog post](http://alternateallele.blogspot.com/2012/03/genome-coordinate-conventions.html).
115 |
--------------------------------------------------------------------------------
/RELEASING.md:
--------------------------------------------------------------------------------
1 | # Releasing Varcode
2 |
3 | This document explains what to do once your [Pull Request](https://www.atlassian.com/git/tutorials/making-a-pull-request/) has been reviewed and all final changes have been applied. Now you're ready to merge your branch into master and release it to the world:
4 |
5 | 0. Assign a version to the release you are preparing and update `__version__` in `version.py` using [semantic versioning](https://semver.org/).
6 |
7 | 1. Merge your branch into master.
8 |
9 | 2. Run `deploy.sh`.
10 |
11 |
--------------------------------------------------------------------------------
/code-of-conduct.md:
--------------------------------------------------------------------------------
1 | # Contributor Covenant Code of Conduct
2 |
3 | ## Our Pledge
4 |
5 | In the interest of fostering an open and welcoming environment, we as
6 | contributors and maintainers pledge to making participation in our project and
7 | our community a harassment-free experience for everyone, regardless of age, body
8 | size, disability, ethnicity, sex characteristics, gender identity and expression,
9 | level of experience, education, socio-economic status, nationality, personal
10 | appearance, race, religion, or sexual identity and orientation.
11 |
12 | ## Our Standards
13 |
14 | Examples of behavior that contributes to creating a positive environment
15 | include:
16 |
17 | * Using welcoming and inclusive language
18 | * Being respectful of differing viewpoints and experiences
19 | * Gracefully accepting constructive criticism
20 | * Focusing on what is best for the community
21 | * Showing empathy towards other community members
22 |
23 | Examples of unacceptable behavior by participants include:
24 |
25 | * The use of sexualized language or imagery and unwelcome sexual attention or
26 | advances
27 | * Trolling, insulting/derogatory comments, and personal or political attacks
28 | * Public or private harassment
29 | * Publishing others' private information, such as a physical or electronic
30 | address, without explicit permission
31 | * Other conduct which could reasonably be considered inappropriate in a
32 | professional setting
33 |
34 | ## Our Responsibilities
35 |
36 | Project maintainers are responsible for clarifying the standards of acceptable
37 | behavior and are expected to take appropriate and fair corrective action in
38 | response to any instances of unacceptable behavior.
39 |
40 | Project maintainers have the right and responsibility to remove, edit, or
41 | reject comments, commits, code, wiki edits, issues, and other contributions
42 | that are not aligned to this Code of Conduct, or to ban temporarily or
43 | permanently any contributor for other behaviors that they deem inappropriate,
44 | threatening, offensive, or harmful.
45 |
46 | ## Scope
47 |
48 | This Code of Conduct applies within all project spaces, and it also applies when
49 | an individual is representing the project or its community in public spaces.
50 | Examples of representing a project or community include using an official
51 | project e-mail address, posting via an official social media account, or acting
52 | as an appointed representative at an online or offline event. Representation of
53 | a project may be further defined and clarified by project maintainers.
54 |
55 | ## Enforcement
56 |
57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
58 | reported by contacting the project team at hello@openvax.org. All
59 | complaints will be reviewed and investigated and will result in a response that
60 | is deemed necessary and appropriate to the circumstances. The project team is
61 | obligated to maintain confidentiality with regard to the reporter of an incident.
62 | Further details of specific enforcement policies may be posted separately.
63 |
64 | Project maintainers who do not follow or enforce the Code of Conduct in good
65 | faith may face temporary or permanent repercussions as determined by other
66 | members of the project's leadership.
67 |
68 | ## Attribution
69 |
70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
72 |
73 | [homepage]: https://www.contributor-covenant.org
74 |
75 | For answers to common questions about this code of conduct, see
76 | https://www.contributor-covenant.org/faq
77 |
--------------------------------------------------------------------------------
/deploy.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Lint, test, build and upload a release.
# errexit stops the script at the first failing step, so nothing is
# uploaded unless linting, testing and building all succeeded.
set -o errexit

./lint.sh
./test.sh
python3 -m pip install --upgrade build
python3 -m pip install --upgrade twine
# Remove stale artifacts so only the freshly built files are uploaded.
rm -rf dist
python3 -m build
python3 -m twine upload dist/*
8 |
--------------------------------------------------------------------------------
/lint-and-test.sh:
--------------------------------------------------------------------------------
1 | ./lint.sh && ./test.sh
2 |
--------------------------------------------------------------------------------
/lint.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Run pylint in errors-only mode over the package and its tests.
set -o errexit
# Without pipefail a failing `find` would be masked by xargs' exit status.
set -o pipefail

# getting false positives due to this issue with pylint:
# https://bitbucket.org/logilab/pylint/issues/701/false-positives-with-not-an-iterable-and

# NUL-delimited file names so paths containing whitespace reach pylint intact.
find varcode tests -name '*.py' -print0 \
    | xargs -0 pylint \
        --errors-only \
        --disable=unsubscriptable-object,not-an-iterable

echo 'Passes pylint check'
13 |
--------------------------------------------------------------------------------
/pylintrc:
--------------------------------------------------------------------------------
1 | [TYPECHECK]
2 | # Without ignoring this, we get errors like:
3 | # E:249,20: Module 'numpy' has no 'nan' member (no-member)
4 | ignored-modules = numpy
5 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy>=1.7,<2.0
2 | pandas>=2.0.0,<3.0.0
3 | pyensembl>=1.8.1
4 | biopython>=1.64
5 | pyvcf3>=1.0.0
6 | memoized_property>=1.0.2
7 | pylint>=1.4.4
8 | serializable>=0.2.1
9 | sercol>=0.1.4
10 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed under the Apache License, Version 2.0 (the "License");
3 | # you may not use this file except in compliance with the License.
4 | # You may obtain a copy of the License at
5 | #
6 | # http://www.apache.org/licenses/LICENSE-2.0
7 | #
8 | # Unless required by applicable law or agreed to in writing, software
9 | # distributed under the License is distributed on an "AS IS" BASIS,
10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | # See the License for the specific language governing permissions and
12 | # limitations under the License.
13 |
14 | import os
15 | import re
16 |
17 | from setuptools import setup, find_packages
18 |
readme_filename = "README.md"
# Resolve every path against this file so the script behaves the same no
# matter which working directory it is launched from.
current_directory = os.path.dirname(os.path.abspath(__file__))
readme_path = os.path.join(current_directory, readme_filename)

try:
    with open(readme_path, 'r', encoding='utf-8') as f:
        readme_markdown = f.read()
except (OSError, UnicodeError) as e:
    # Best effort: an unreadable README only costs the long description,
    # it should not prevent installation.
    readme_markdown = ""
    print(e)
    print("Failed to open %s" % readme_path)

# Determine version number by scanning varcode/version.py for the
# `__version__ = "..."` assignment (the package is not imported).
version_path = os.path.join(current_directory, 'varcode', 'version.py')
with open(version_path, 'r', encoding='utf-8') as f:
    version_match = re.search(
        r'^__version__\s*=\s*[\'"]([^\'"]*)[\'"]',
        f.read(),
        re.MULTILINE)
if version_match is None:
    # Fail with a readable message instead of an AttributeError on None.
    raise RuntimeError("Unable to find __version__ in %s" % version_path)
version = version_match.group(1)
print("Version: %s" % version)

if __name__ == '__main__':
    setup(
        name='varcode',
        packages=find_packages(),
        package_data={'varcode.cli': ['logging.conf']},
        version=version,
        description="Variant annotation in Python",
        long_description=readme_markdown,
        long_description_content_type='text/markdown',
        url="https://github.com/openvax/varcode",
        author="Alex Rubinsteyn",
        author_email="alex.rubinsteyn@unc.edu",
        license="http://www.apache.org/licenses/LICENSE-2.0.html",
        classifiers=[
            'Development Status :: 4 - Beta',
            'Environment :: Console',
            'Operating System :: OS Independent',
            'Intended Audience :: Science/Research',
            'License :: OSI Approved :: Apache Software License',
            'Programming Language :: Python',
            'Topic :: Scientific/Engineering :: Bio-Informatics',
        ],
        # NOTE(review): requirements.txt pins pandas>=2.0.0,<3.0.0 while
        # this list still allows pandas>=0.15 - confirm which is intended.
        install_requires=[
            'numpy>=1.7, <2.0',
            'pandas>=0.15',
            'pyensembl>=1.8.1',
            'biopython>=1.64',
            'pyvcf3>=1.0.0',
            'memoized_property>=1.0.2',
            'serializable>=0.2.1',
            'sercol>=0.1.4',
        ],
        entry_points={
            'console_scripts': [
                'varcode-genes = varcode.cli.genes_script:main',
                'varcode = varcode.cli.effects_script:main',
            ]
        })
77 |
--------------------------------------------------------------------------------
/test.sh:
--------------------------------------------------------------------------------
1 | pytest --cov=varcode/ --cov-report=term-missing tests
2 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | from __future__ import print_function, division, absolute_import
14 |
--------------------------------------------------------------------------------
/tests/benchmark_vcf_load.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | """
14 | Time how long it takes to open a VCF.
15 |
16 | Run as:
17 | python -m profile -s cumtime %(prog)s
18 |
19 | to get profiling output.
20 |
21 | """
22 | import argparse
23 | import time
24 |
25 | import varcode
26 |
# Command-line interface of the benchmark; the module docstring is reused
# as the description shown by --help.
parser = argparse.ArgumentParser(description=__doc__)

parser.add_argument(
    "path",
    help="Path or URL to VCF")

parser.add_argument(
    "--profile",
    action="store_true",
    default=False,
    help="Run in a profiler.")

parser.add_argument(
    "--no-info-field",
    dest="info_field",
    action="store_false",
    default=True,
    help="Pass include_info=False to load_vcf_fast (no effect with --pyvcf)")

parser.add_argument(
    "--pyvcf",
    help="use pyvcf implementation",
    action="store_true",
    default=False)
48 |
def _load_variants(args):
    """Load ``args.path`` with the loader selected on the command line."""
    if args.pyvcf:
        return varcode.load_vcf(
            args.path,
            allow_extended_nucleotides=True)
    extra_args = {}
    if not args.info_field:
        extra_args["include_info"] = False
    return varcode.load_vcf_fast(
        args.path,
        allow_extended_nucleotides=True,
        **extra_args)


def run():
    """
    Parse the command line, load the VCF once and report the elapsed time.

    Prints the number of loaded variants, the load time in seconds and the
    first five entries of the result. With ``--profile`` the load runs under
    cProfile and the statistics, sorted by cumulative time, are printed
    afterwards.
    """
    args = parser.parse_args()

    profiler = None
    if args.profile:
        # Imported lazily: only needed when profiling was requested.
        import cProfile
        profiler = cProfile.Profile()

    # perf_counter is monotonic and high-resolution, unlike wall-clock time.
    start = time.perf_counter()
    if profiler is None:
        result = _load_variants(args)
    else:
        result = profiler.runcall(_load_variants, args)
    elapsed = time.perf_counter() - start

    print("Loaded %d variants in %0.3f sec. " % (len(result), elapsed))
    print(result.to_string(limit=5))

    if profiler is not None:
        import pstats
        pstats.Stats(profiler).sort_stats("cumtime").print_stats()


if __name__ == '__main__':
    run()
74 |
--------------------------------------------------------------------------------
/tests/common.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 |
# Attribute names that check_effect_properties reads from every effect.
expected_effect_properties = [
    'gene', 'gene_name', 'gene_id',
    'transcript', 'transcript_name', 'transcript_id',
    'modifies_coding_sequence', 'modifies_protein_sequence',
    'aa_mutation_start_offset', 'aa_mutation_end_offset',
    'mutant_protein_sequence',
    'short_description',
]


def check_effect_properties(effect):
    """
    Sanity-check an effect object.

    Reads every attribute named in `expected_effect_properties` (an
    attribute that raises makes the check fail) and asserts that the
    effect has non-empty `str`, `repr` and `short_description` values and
    that its class name appears in its string form.
    """
    assert effect is not None

    # Only the access matters here, the values themselves are ignored.
    for name in expected_effect_properties:
        getattr(effect, name)

    assert len(str(effect)) > 0
    assert len(repr(effect)) > 0

    description = effect.short_description
    assert description is not None, \
        "Expected effect %s to have a `short_description` property" % (effect,)
    assert len(description) > 0

    class_name = effect.__class__.__name__
    assert class_name in str(effect), \
        "Expected string representation of %s to include effect name %s" % (
            effect, class_name)
42 |
def expect_effect(
        variant,
        transcript_id=None,
        effect_class=None,
        protein_sequence=None,
        **kwargs):
    """
    Assert that `variant` yields an effect with the expected properties.

    Parameters
    ----------
    variant
        Object providing `effects()`, `effect_on_transcript()` and
        `ensembl.transcript_by_id()`.
    transcript_id
        If given, the effect on that transcript is checked; otherwise the
        top priority effect of `variant.effects()` is used.
    effect_class
        If given, the effect's class must be exactly this class.
    protein_sequence
        If given, must equal the effect's `mutant_protein_sequence`.
    **kwargs
        Additional `attribute=expected_value` pairs compared with `==`.

    Raises
    ------
    AssertionError
        If any expectation is not met.
    """
    if transcript_id is None:
        effects = variant.effects()
        effect = effects.top_priority_effect()
    else:
        transcript = variant.ensembl.transcript_by_id(transcript_id)
        effect = variant.effect_on_transcript(transcript)

    check_effect_properties(effect)

    if effect_class is not None:
        assert effect.__class__ is effect_class, \
            "Expected effect class %s but got %s" % (
                effect_class.__name__,
                effect.__class__.__name__)

    if protein_sequence is not None:
        assert effect.mutant_protein_sequence == protein_sequence, \
            "Expected protein sequence %s but got %s" % (
                protein_sequence,
                effect.mutant_protein_sequence)

    for field, expected_value in kwargs.items():
        actual_value = getattr(effect, field)
        # %r shows both values unambiguously: bools are not rendered as 0/1
        # and floats are not rounded, so differing values never print alike.
        assert actual_value == expected_value, \
            "Expected %s=%r but got %r" % (field, expected_value, actual_value)
75 |
def eq_(x, y, s=None):
    """
    Assert that `x == y`.

    The failure message is `s` when given, otherwise "<x> != <y>".
    """
    # The message expression is only evaluated when the assertion fails.
    assert x == y, ("%s != %s" % (x, y)) if s is None else s
--------------------------------------------------------------------------------
/tests/data.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | """
14 | Helper functions and shared datasets for tests
15 | """
16 |
17 | import os
18 | from varcode import Variant, VariantCollection, load_maf
19 | import pandas as pd
20 |
def data_path(name):
    """
    Return the absolute path to a file in the "data" directory located
    next to this module.

    The name specified should be relative to that "data" directory.
    """
    # __file__ is made absolute first so the result is absolute as
    # documented, even if the module was loaded through a relative path.
    module_dir = os.path.dirname(os.path.abspath(__file__))
    return os.path.join(module_dir, "data", name)
27 |
# Shared datasets, loaded once at import time from the "data" directory
# next to this module.
dbnsp_validation_df = pd.read_csv(data_path("dbnsfp_validation_set.csv"))
tcga_ov_variants = load_maf(data_path("tcga_ov.head.maf"))
ov_wustle_variants = load_maf(data_path("ov.wustle.subset5.maf"))

# Individual single-nucleotide variants; the names presumably follow their
# dbSNP rs identifiers (not verifiable from this module).
snp_rs4244285 = Variant(contig=10, start=94781859, ref="G", alt="A")
snp_rs1537415 = Variant(contig=9, start=135637876, ref="C", alt="G")
snp_rs3892097 = Variant(contig=22, start=42524947, ref="G", alt="A")

# Collection bundling the three variants above.
db_snp_variants = VariantCollection(
    [snp_rs4244285, snp_rs1537415, snp_rs3892097])
53 |
--------------------------------------------------------------------------------
/tests/data/dbnsfp_validation_set.csv:
--------------------------------------------------------------------------------
1 | aa_alt,aa_pos,dna_alt,chrom,ensembl_transcript,dna_position,dna_ref
2 | K,143,T,14,ENST00000379932,105675961,C
3 | L,852,A,12,ENST00000261740,110221487,C
4 | L,805,A,12,ENST00000392719,110221487,C
5 | L,792,A,12,ENST00000346520,110221487,C
6 | L,745,A,12,ENST00000544971,110221487,C
7 | L,792,A,12,ENST00000537083,110221487,C
8 | L,805,A,12,ENST00000541794,110221487,C
9 | L,818,A,12,ENST00000536838,110221487,C
10 | W,241,C,15,ENST00000288235,59516943,G
11 | N,82,G,6,ENST00000377451,27279704,C
12 | C,354,G,1,ENST00000546424,15820483,C
13 | C,354,G,1,ENST00000333868,15820483,C
14 | C,204,G,1,ENST00000348549,15820483,C
15 | C,271,G,1,ENST00000375890,15820483,C
16 | N,176,T,6,ENST00000521485,84368738,C
17 | H,178,C,1,ENST00000368764,152882807,G
18 | H,32,C,1,ENST00000392667,152882807,G
19 | K,2885,T,1,ENST00000368346,155308045,C
20 | K,2880,T,1,ENST00000392403,155308045,C
21 | P,1534,G,22,ENST00000441493,18300827,T
22 | L,32,A,7,ENST00000394507,91871355,G
23 | K,84,T,4,ENST00000296522,175439195,C
24 | Q,446,C,22,ENST00000536101,26165219,G
25 | D,3878,C,2,ENST00000409009,73827899,G
26 | T,10,T,16,ENST00000283025,10788703,C
27 | T,610,T,14,ENST00000331968,30093435,C
28 | G,185,C,20,ENST00000546004,5283287,G
29 | M,170,T,17,ENST00000269051,30616025,C
30 | M,162,T,17,ENST00000538145,30616025,C
31 | M,72,T,17,ENST00000536287,30616025,C
32 | M,1664,A,9,ENST00000313050,139355629,C
33 | M,1486,A,9,ENST00000371706,139355629,C
34 | M,1486,A,9,ENST00000290037,139355629,C
35 | M,1486,A,9,ENST00000431893,139355629,C
36 | A,666,C,4,ENST00000508776,128744730,G
37 | A,697,C,4,ENST00000439123,128744730,G
38 | A,666,C,4,ENST00000296464,128744730,G
39 | A,640,C,4,ENST00000505726,128744730,G
40 | L,38,T,10,ENST00000370196,102891411,C
41 | K,270,A,1,ENST00000498508,214170686,G
42 | H,110,C,16,ENST00000311620,21261217,G
43 | L,947,A,2,ENST00000419748,88857312,G
44 | L,1098,A,2,ENST00000303236,88857312,G
45 | L,260,A,10,ENST00000372873,75407959,G
46 | L,484,A,10,ENST00000394810,75407959,G
47 | E,123,C,6,ENST00000531224,136599652,G
48 | E,121,C,6,ENST00000353331,136599652,G
49 | E,123,C,6,ENST00000527536,136599652,G
50 | E,121,C,6,ENST00000392348,136599652,G
51 | L,2419,T,5,ENST00000438447,32090810,C
52 | L,2419,T,5,ENST00000282493,32090810,C
53 | K,32,T,X,ENST00000375992,51239203,C
54 | R,250,A,14,ENST00000306051,52735280,G
55 | K,467,T,X,ENST00000396992,47483685,C
56 | V,1462,G,5,ENST00000399503,56184179,T
57 | K,123,A,16,ENST00000434417,30429101,G
58 | Q,312,T,1,ENST00000427495,242271091,C
59 | Q,282,T,1,ENST00000442594,242271091,C
60 | Q,374,T,1,ENST00000536534,242271091,C
61 | L,150,T,20,ENST00000244051,49575828,C
62 | K,774,A,16,ENST00000301727,2285538,G
63 | R,85,G,10,ENST00000520547,81272659,A
64 | N,532,A,2,ENST00000393504,99013227,G
65 | N,536,A,2,ENST00000409937,99013227,G
66 | L,26,A,7,ENST00000394507,91871373,C
67 | M,2116,G,19,ENST00000352632,41073580,C
68 | M,859,G,19,ENST00000392025,41073580,C
69 | H,161,G,17,ENST00000301037,26939700,C
70 | K,170,A,20,ENST00000375994,30409276,G
71 | F,679,A,15,ENST00000389039,45398436,G
72 | N,342,T,8,ENST00000361421,59728265,C
73 | K,167,A,13,ENST00000376958,95264638,G
74 | K,1371,T,8,ENST00000320476,144874944,C
75 | K,1290,T,8,ENST00000377533,144874944,C
76 | K,1666,A,11,ENST00000321505,33680325,G
77 | K,1672,A,11,ENST00000389726,33680325,G
78 | A,1326,G,12,ENST00000267101,56495786,C
79 | A,683,G,12,ENST00000450146,56495786,C
80 | A,1267,G,12,ENST00000415288,56495786,C
81 | K,635,T,1,ENST00000366508,247057966,C
82 | K,609,T,1,ENST00000326225,247057966,C
83 | H,379,C,12,ENST00000547057,94691119,G
84 | H,71,C,12,ENST00000545312,94691119,G
85 | K,545,A,3,ENST00000263967,178936091,G
86 | K,402,A,16,ENST00000416441,29996834,G
87 | K,278,A,16,ENST00000389398,22128096,G
88 | Q,837,C,11,ENST00000529051,124908424,G
89 | R,407,A,12,ENST00000257963,52380684,G
90 | R,448,A,12,ENST00000541224,52380684,G
91 | R,355,A,12,ENST00000542485,52380684,G
92 | Q,10,T,6,ENST00000011619,13711709,G
93 | W,763,A,2,ENST00000281405,20136107,G
94 | C,7,G,11,ENST00000398534,71249121,C
95 | Q,119,G,12,ENST00000204726,133393177,C
96 | I,745,T,15,ENST00000356865,25958932,C
97 | K,569,A,1,ENST00000369130,150116967,G
98 | K,526,A,18,ENST00000342988,48604754,G
99 | N,91,T,11,ENST00000528117,8974698,C
100 | L,444,A,5,ENST00000507386,147020337,G
101 | L,444,A,5,ENST00000265272,147020337,G
102 | L,402,A,5,ENST00000333010,147020337,G
103 | E,190,T,15,ENST00000324324,48451958,C
104 | T,108,A,19,ENST00000392518,50203981,G
105 | P,1241,C,2,ENST00000401884,242011122,A
106 | M,951,T,3,ENST00000474889,62253472,C
107 | M,922,T,3,ENST00000295874,62253472,C
108 | E,268,A,9,ENST00000380607,17793439,G
109 | E,221,A,9,ENST00000537391,17793439,G
110 | A,508,G,12,ENST00000228437,108136084,C
111 | N,87,A,10,ENST00000373910,60124591,G
112 |
--------------------------------------------------------------------------------
/tests/data/different-samples.1.vcf:
--------------------------------------------------------------------------------
1 | ##fileformat=VCFv4.1
2 | ##reference=file:///projects/ngs/resources/gatk/2.3/ucsc.hg19.fasta
3 | ##INFO=
4 | ##INFO=
5 | ##FORMAT=
6 | ##contig=
7 | ##contig=
8 | ##contig=
9 | ##contig=
10 | ##contig=
11 | ##contig=
12 | ##contig=
13 | ##contig=
14 | ##contig=
15 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT metastasis
16 | chr1 53513530 . A C . . GE=SCP2;EG=6342 GT 0/1
17 | chr17 36731197 . C AAT . . GE=SRCIN1;EG=80725 GT 0/1
18 |
--------------------------------------------------------------------------------
/tests/data/different-samples.2.vcf:
--------------------------------------------------------------------------------
1 | ##fileformat=VCFv4.1
2 | ##reference=file:///projects/ngs/resources/gatk/2.3/ucsc.hg19.fasta
3 | ##INFO=
4 | ##INFO=
5 | ##FORMAT=
6 | ##contig=
7 | ##contig=
8 | ##contig=
9 | ##contig=
10 | ##contig=
11 | ##contig=
12 | ##contig=
13 | ##contig=
14 | ##contig=
15 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT normal
16 | chr2 5 . A C . . GE=SCP2;EG=6342 GT 0/1
17 | chr7 18 . C AAT . . GE=SRCIN1;EG=80725 GT 0/1
18 |
--------------------------------------------------------------------------------
/tests/data/duplicate-id.1.vcf:
--------------------------------------------------------------------------------
1 | ##fileformat=VCFv4.2
2 | ##reference=file:///projects/ngs/resources/gatk/2.3/ucsc.hg19.fasta
3 | #CHROM POS ID REF ALT QUAL FILTER INFO
4 | chr1 13281 1 C G . PASS VT=SNP;SOMATIC
5 |
--------------------------------------------------------------------------------
/tests/data/duplicate-id.2.vcf:
--------------------------------------------------------------------------------
1 | ##fileformat=VCFv4.2
2 | ##reference=file:///projects/ngs/resources/gatk/2.3/ucsc.hg19.fasta
3 | #CHROM POS ID REF ALT QUAL FILTER INFO
4 | chr1 13281 1 C G,T . PASS VT=SNP;SOMATIC
5 |
--------------------------------------------------------------------------------
/tests/data/duplicates.maf:
--------------------------------------------------------------------------------
1 | Hugo_Symbol Entrez_Gene_Id Center NCBI_Build Chromosome Start_position End_position Strand Variant_Classification Variant_Type Reference_Allele Tumor_Seq_Allele1 Tumor_Seq_Allele2 dbSNP_RS dbSNP_Val_Status Tumor_Sample_Barcode Matched_Norm_Sample_Barcode Match_Norm_Seq_Allele1 Match_Norm_Seq_Allele2
2 | A1CF 29974 . 37 10 52573692 52573692 + Missense_Mutation SNP G G T . . . . . .
3 | A1CF 29974 . 37 10 52573692 52573692 + Missense_Mutation SNP G G T . . . . . .
4 | A1CF 29974 . 37 10 52573692 52573692 + Missense_Mutation SNP G G T . . . . . .
--------------------------------------------------------------------------------
/tests/data/duplicates.vcf:
--------------------------------------------------------------------------------
1 | #CHROM POS ID REF ALT QUAL FILTER INFO
2 | chr17 7675088 . G A 0 PASS .
3 | chr17 7675088 . G A 0 PASS .
4 | chr17 7675088 . G A 0 PASS .
--------------------------------------------------------------------------------
/tests/data/multiallelic.vcf:
--------------------------------------------------------------------------------
1 | ##fileformat=VCFv4.1
2 | ##reference=file:///projects/ngs/resources/gatk/2.3/ucsc.hg19.fasta
3 | ##INFO=
4 | ##INFO=
5 | ##FORMAT=
6 | ##contig=
7 | ##contig=
8 | ##contig=
9 | ##contig=
10 | ##contig=
11 | ##contig=
12 | ##contig=
13 | ##contig=
14 | ##contig=
15 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT metastasis
16 | chr1 1431105 rs199599542 A C,G 593.69 PASS DP=17;GE=Wuzzle GT 0/1
17 |
--------------------------------------------------------------------------------
/tests/data/ov.wustle.subset5.maf:
--------------------------------------------------------------------------------
1 | #version 2.4
2 | Hugo_Symbol Entrez_Gene_Id Center NCBI_Build Chromosome Start_position End_position Strand Variant_Classification Variant_Type Reference_Allele Tumor_Seq_Allele1 Tumor_Seq_Allele2 dbSNP_RS dbSNP_Val_Status Tumor_Sample_Barcode Matched_Norm_Sample_Barcode Match_Norm_Seq_Allele1 Match_Norm_Seq_Allele2 Tumor_Validation_Allele1 Tumor_Validation_Allele2 Match_Norm_Validation_Allele1 Match_Norm_Validation_Allele2 Verification_Status Validation_Status Mutation_Status Sequencing_Phase Sequence_Source Validation_Method Score BAM_file Sequencer Tumor_Sample_UUID Matched_Norm_Sample_UUID chromosome_name start stop reference variant type gene_name transcript_name transcript_species transcript_source transcript_version strand transcript_status trv_type c_position amino_acid_change ucsc_cons domain all_domains deletion_substructures transcript_error
3 | AGL 178 genome.wustl.edu 37 1 100349684 100349684 + Missense_Mutation SNP G G A TCGA-13-1405-01A-01W-0494-09 TCGA-13-1405-10A-01W-0495-09 G G G A G G Unknown Valid Somatic 4 WXS 454_PCR_WGA 1 dbGAP Illumina GAIIx c0d1de72-4cce-4d74-93f0-29c462dc1426 89f04056-0478-4305-b1ce-486ae469b4dd 1 100122272 100122272 G A SNP AGL NM_000028 human genbank 54_36p 1 validated missense c.2317 p.E773K 1 NULL superfamily_Six-hairpin glycosidases;HMMPfam_GDE_C;superfamily_(Trans)glycosidases - no_errors
4 | SASS6 163786 genome.wustl.edu 37 1 100573197 100573197 + Missense_Mutation SNP G G A TCGA-04-1542-01A-01W-0553-09 TCGA-04-1542-10A-01W-0553-09 G G G A G G Unknown Valid Somatic 4 WXS 454_PCR_WGA 1 dbGAP Illumina GAIIx 317a63af-e862-43df-8ef5-7c555b2cb678 b94052a8-c3d2-4e47-81e2-62242bc0841a 1 100345785 100345785 G A SNP SASS6 NM_194292 human genbank 54_36p -1 validated missense c.1133 p.A378V 1 NULL - - no_errors
5 | LRRC39 127495 genome.wustl.edu 37 1 100618068 100618068 + Silent SNP G G A TCGA-23-1022-01A-02W-0488-09 TCGA-23-1022-10A-01W-0488-09 G G G A G G Unknown Valid Somatic 4 WXS 454_PCR_WGA 1 dbGAP Illumina GAIIx 160a0e7d-315e-4de3-a7d4-928412fd909c 6bd506d5-4f1a-4f51-a71f-e453196b245a 1 100390656 100390656 G A SNP LRRC39 NM_144620 human genbank 54_36p -1 provisional silent c.825 p.F275 1 NULL - - no_errors
6 | UBE4B 10277 genome.wustl.edu 37 1 10238758 10238758 + Silent SNP G G C TCGA-13-0920-01A-01W-0421-09 TCGA-13-0920-10A-01W-0421-09 G G G C G G Unknown Valid Somatic 4 WXS 454_PCR_WGA 1 dbGAP Illumina GAIIx 2e28969b-c9a9-41ec-80bf-f583197b7f92 83a80d56-e463-4096-8c17-a44000f80f66 1 10161345 10161345 G C SNP UBE4B NM_001105562 human genbank 54_36p 1 reviewed silent c.3582 p.G1194 0.97 NULL - - no_errors
7 | COL11A1 1301 genome.wustl.edu 37 1 103491420 103491420 + Missense_Mutation SNP T T A TCGA-13-0893-01B-01W-0494-09 TCGA-13-0893-10A-01W-0494-09 T T T A T T Unknown Valid Somatic 4 WXS 454_PCR_WGA 1 dbGAP Illumina GAIIx a335ab49-84b7-4d3b-a03d-9c3931904ca5 23f57381-b679-41b8-8197-aed711f71db4 1 103264008 103264008 T A SNP COL11A1 NM_080629 human genbank 54_36p -1 reviewed missense c.869 p.E290V 1 NULL HMMPfam_COLFI;HMMPfam_Collagen;superfamily_Concanavalin A-like lectins/glucanases;HMMPfam_Laminin_G_2;superfamily_Fibrinogen C-terminal domain-like - no_errors
8 |
--------------------------------------------------------------------------------
/tests/data/same-samples.1.vcf:
--------------------------------------------------------------------------------
1 | ##fileformat=VCFv4.1
2 | ##reference=file:///projects/ngs/resources/gatk/2.3/ucsc.hg19.fasta
3 | ##INFO=
4 | ##INFO=
5 | ##FORMAT=
6 | ##contig=
7 | ##contig=
8 | ##contig=
9 | ##contig=
10 | ##contig=
11 | ##contig=
12 | ##contig=
13 | ##contig=
14 | ##contig=
15 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT metastasis
16 | chr1 53513530 . A C . . GE=SCP2;EG=6342 GT 0/1
17 | chr17 36731197 . C AAT . . GE=SRCIN1;EG=80725 GT 0/1
18 |
--------------------------------------------------------------------------------
/tests/data/same-samples.2.vcf:
--------------------------------------------------------------------------------
1 | ##fileformat=VCFv4.1
2 | ##reference=file:///projects/ngs/resources/gatk/2.3/ucsc.hg19.fasta
3 | ##INFO=
4 | ##INFO=
5 | ##FORMAT=
6 | ##contig=
7 | ##contig=
8 | ##contig=
9 | ##contig=
10 | ##contig=
11 | ##contig=
12 | ##contig=
13 | ##contig=
14 | ##contig=
15 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT metastasis
16 | chr1 53513530 . A C . . GE=SCP2;EG=6342 GT 0/1
17 | chr17 36731197 . C AAT . . GE=SRCIN1;EG=80725 GT 0/1
18 |
--------------------------------------------------------------------------------
/tests/data/simple.1.vcf:
--------------------------------------------------------------------------------
1 | ##fileformat=VCFv4.2
2 | ##reference=file:///projects/ngs/resources/gatk/2.3/ucsc.hg19.fasta
3 | #CHROM POS ID REF ALT QUAL FILTER INFO
4 | chr1 13281 . C G . PASS VT=SNP;SOMATIC
5 |
--------------------------------------------------------------------------------
/tests/data/simple.2.vcf:
--------------------------------------------------------------------------------
1 | ##fileformat=VCFv4.2
2 | ##reference=file:///projects/ngs/resources/gatk/2.3/ucsc.hg19.fasta
3 | #CHROM POS ID REF ALT QUAL FILTER INFO
4 | chr1 13289 . T C . PASS VT=SNP;SOMATIC
5 | chr2 13289 . A G . PASS VT=SNP;SOMATIC
6 |
--------------------------------------------------------------------------------
/tests/data/somatic_hg19_14muts.space_in_sample_name.vcf:
--------------------------------------------------------------------------------
1 | ##fileformat=VCFv4.1
2 | ##reference=file:///projects/ngs/resources/gatk/2.3/ucsc.hg19.fasta
3 | ##INFO=
4 | ##INFO=
5 | ##FORMAT=
6 | ##contig=
7 | ##contig=
8 | ##contig=
9 | ##contig=
10 | ##contig=
11 | ##contig=
12 | ##contig=
13 | ##contig=
14 | ##contig=
15 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT metastasis foo
16 | chr1 53513530 . A C . . GE=SCP2;EG=6342 GT 0/1
17 | chr1 228295398 . G T . . GE=MRPL55;EG=128308 GT 0/1
18 | chr10 49658590 . T C . . GE=ARHGAP22;EG=58504 GT 0/1
19 | chr10 51585166 . G T . . GE=NCOA4;EG=8031 GT 0/1
20 | chr10 96709040 . A C . . GE=CYP2C9;EG=1559 GT 0/1
21 | chr10 119134281 . G T . . GE=PDZD8;EG=118987 GT 0/1
22 | chr11 118244286 . G G . . GE=UBE4A;EG=9354 GT 0/1
23 | chr12 14794076 . C A . . GE=GUCY2C;EG=2984 GT 0/1
24 | chr12 25398284 . C G . . GE=KRAS;EG=3845 GT 0/1
25 | chr12 42778752 . T A . . GE=PPHLN1;EG=51535 GT 0/1
26 | chr14 31144202 . A C . . GE=SCFD1;EG=23256 GT 0/1
27 | chr16 25704209 . G A . . GE=HS3ST4;EG=9951 GT 0/1
28 | chr17 7577548 . C CA . . GE=TP53;EG=7157 GT 0/1
29 | chr17 36731197 . C AAT . . GE=SRCIN1;EG=80725 GT 0/1
30 |
--------------------------------------------------------------------------------
/tests/data/somatic_hg19_14muts.vcf:
--------------------------------------------------------------------------------
1 | ##fileformat=VCFv4.1
2 | ##reference=file:///projects/ngs/resources/gatk/2.3/ucsc.hg19.fasta
3 | ##INFO=
4 | ##INFO=
5 | ##FORMAT=
6 | ##contig=
7 | ##contig=
8 | ##contig=
9 | ##contig=
10 | ##contig=
11 | ##contig=
12 | ##contig=
13 | ##contig=
14 | ##contig=
15 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT metastasis
16 | chr1 53513530 . A C . . GE=SCP2;EG=6342 GT 0/1
17 | chr1 228295398 . G T . . GE=MRPL55;EG=128308 GT 0/1
18 | chr10 49658590 . T C . . GE=ARHGAP22;EG=58504 GT 0/1
19 | chr10 51585166 . G T . . GE=NCOA4;EG=8031 GT 0/1
20 | chr10 96709040 . A C . . GE=CYP2C9;EG=1559 GT 0/1
21 | chr10 119134281 . G T . . GE=PDZD8;EG=118987 GT 0/1
22 | chr11 118244286 . G G . . GE=UBE4A;EG=9354 GT 0/1
23 | chr12 14794076 . C A . . GE=GUCY2C;EG=2984 GT 0/1
24 | chr12 25398284 . C G . . GE=KRAS;EG=3845 GT 0/1
25 | chr12 42778752 . T A . . GE=PPHLN1;EG=51535 GT 0/1
26 | chr14 31144202 . A C . . GE=SCFD1;EG=23256 GT 0/1
27 | chr16 25704209 . G A . . GE=HS3ST4;EG=9951 GT 0/1
28 | chr17 7577548 . C CA . . GE=TP53;EG=7157 GT 0/1
29 | chr17 36731197 . C AAT . . GE=SRCIN1;EG=80725 GT 0/1
30 |
--------------------------------------------------------------------------------
/tests/data/somatic_hg19_14muts.vcf.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openvax/varcode/2c1c96e0564d7ad5f66b26e33fc0a027353640f4/tests/data/somatic_hg19_14muts.vcf.gz
--------------------------------------------------------------------------------
/tests/data/tcga_ov.head.maf:
--------------------------------------------------------------------------------
1 | Hugo_Symbol Entrez_Gene_Id Center NCBI_Build Chromosome Start_position End_position Strand Variant_Classification Variant_Type Reference_Allele Tumor_Seq_Allele1 Tumor_Seq_Allele2 dbSNP_RS dbSNP_Val_Status Tumor_Sample_Barcode Matched_Norm_Sample_Barcode Match_Norm_Seq_Allele1 Match_Norm_Seq_Allele2 Tumor_Validation_Allele1 Tumor_Validation_Allele2 Match_Norm_Validation_Allele1 Match_Norm_Validation_Allele2 Verification_Status Validation_Status Mutation_Status Sequencing_Phase Sequence_Source Validation_Method Score BAM_file Sequencer Tumor_Sample_UUID Matched_Norm_Sample_UUID chromosome_name start stop reference variant type gene_name transcript_name transcript_species transcript_source transcript_version strand transcript_status trv_type c_position amino_acid_change ucsc_cons domain all_domains deletion_substructures transcript_error
2 | CDK11A 0 - 37 1 1650797 1650797 + Missense_Mutation SNP A A G TCGA-04-1337-01A-01W-0484-10 TCGA-04-1337-11A-01W-0485-10 A A Unknown Unknown Somatic Phase_IV Capture 1 dbGAP - 1 1650797 1650797 A G SNP CDK11A ENST00000404249 human ensembl 69_37n -1 known missense c.325 p.C109R 0.971 NULL pfam_Prot_kinase_cat_dom,pfam_Ser-Thr/Tyr_kinase_cat_dom,superfamily_Kinase-like_dom,smart_Ser/Thr_dual-sp_kinase_dom,smart_Tyr_kinase_cat_dom,pfscan_Prot_kinase_cat_dom - no_errors
3 | GNPAT 0 - 37 1 231401797 231401797 + Missense_Mutation SNP A A C TCGA-04-1337-01A-01W-0484-10 TCGA-04-1337-11A-01W-0485-10 A A Unknown Unknown Somatic Phase_IV Capture 1 dbGAP - 1 231401797 231401797 A C SNP GNPAT ENST00000366647 human ensembl 69_37n +1 known missense c.810 p.R270S 0.997 pfam_Acyltransferase,smart_Acyltransferase pfam_Acyltransferase,smart_Acyltransferase - no_errors
4 | E2F2 0 - 37 1 23836447 23836447 + Silent SNP C C A TCGA-04-1337-01A-01W-0484-10 TCGA-04-1337-11A-01W-0485-10 C C Unknown Unknown Somatic Phase_IV Capture 1 dbGAP - 1 23836447 23836447 C A SNP E2F2 ENST00000361729 human ensembl 69_37n -1 known silent c.1239 p.L413 0.999 NULL pfam_E2F_TDP - no_errors
5 | VSIG2 0 - 37 11 124617502 124617502 + Missense_Mutation SNP C C G TCGA-04-1337-01A-01W-0484-10 TCGA-04-1337-11A-01W-0485-10 C C Unknown Unknown Somatic Phase_IV Capture 1 dbGAP - 11 124617502 124617502 C G SNP VSIG2 ENST00000326621 human ensembl 69_37n -1 known missense c.913 p.G305R 0.813 NULL pfam_Ig_V-set,pfam_Ig_I-set,pfam_Immunoglobulin,smart_Ig_sub,smart_Ig_sub2,smart_Ig_V-set_subgr,pfscan_Ig-like - no_errors
6 |
--------------------------------------------------------------------------------
/tests/data/tcga_ov.head.xychr.maf:
--------------------------------------------------------------------------------
1 | Hugo_Symbol Entrez_Gene_Id Center NCBI_Build Chromosome Start_position End_position Strand Variant_Classification Variant_Type Reference_Allele Tumor_Seq_Allele1 Tumor_Seq_Allele2 dbSNP_RS dbSNP_Val_Status Tumor_Sample_Barcode Matched_Norm_Sample_Barcode Match_Norm_Seq_Allele1 Match_Norm_Seq_Allele2 Tumor_Validation_Allele1 Tumor_Validation_Allele2 Match_Norm_Validation_Allele1 Match_Norm_Validation_Allele2 Verification_Status Validation_Status Mutation_Status Sequencing_Phase Sequence_Source Validation_Method Score BAM_file Sequencer Tumor_Sample_UUID Matched_Norm_Sample_UUID chromosome_name start stop reference variant type gene_name transcript_name transcript_species transcript_source transcript_version strand transcript_status trv_type c_position amino_acid_change ucsc_cons domain all_domains deletion_substructures transcript_error
2 | CDK11A 0 - 37 X 1650797 1650797 + Missense_Mutation SNP A A G TCGA-04-1337-01A-01W-0484-10 TCGA-04-1337-11A-01W-0485-10 A A Unknown Unknown Somatic Phase_IV Capture 1 dbGAP - 1 1650797 1650797 A G SNP CDK11A ENST00000404249 human ensembl 69_37n -1 known missense c.325 p.C109R 0.971 NULL pfam_Prot_kinase_cat_dom,pfam_Ser-Thr/Tyr_kinase_cat_dom,superfamily_Kinase-like_dom,smart_Ser/Thr_dual-sp_kinase_dom,smart_Tyr_kinase_cat_dom,pfscan_Prot_kinase_cat_dom - no_errors
3 | GNPAT 0 - 37 Y 231401797 231401797 + Missense_Mutation SNP A A C TCGA-04-1337-01A-01W-0484-10 TCGA-04-1337-11A-01W-0485-10 A A Unknown Unknown Somatic Phase_IV Capture 1 dbGAP - 1 231401797 231401797 A C SNP GNPAT ENST00000366647 human ensembl 69_37n +1 known missense c.810 p.R270S 0.997 pfam_Acyltransferase,smart_Acyltransferase pfam_Acyltransferase,smart_Acyltransferase - no_errors
4 | E2F2 0 - 37 1 23836447 23836447 + Silent SNP C C A TCGA-04-1337-01A-01W-0484-10 TCGA-04-1337-11A-01W-0485-10 C C Unknown Unknown Somatic Phase_IV Capture 1 dbGAP - 1 23836447 23836447 C A SNP E2F2 ENST00000361729 human ensembl 69_37n -1 known silent c.1239 p.L413 0.999 NULL pfam_E2F_TDP - no_errors
5 | VSIG2 0 - 37 11 124617502 124617502 + Missense_Mutation SNP C C G TCGA-04-1337-01A-01W-0484-10 TCGA-04-1337-11A-01W-0485-10 C C Unknown Unknown Somatic Phase_IV Capture 1 dbGAP - 11 124617502 124617502 C G SNP VSIG2 ENST00000326621 human ensembl 69_37n -1 known missense c.913 p.G305R 0.813 NULL pfam_Ig_V-set,pfam_Ig_I-set,pfam_Immunoglobulin,smart_Ig_sub,smart_Ig_sub2,smart_Ig_V-set_subgr,pfscan_Ig-like - no_errors
6 |
--------------------------------------------------------------------------------
/tests/test_cli_effects.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | from tempfile import NamedTemporaryFile
14 | import pandas as pd
15 |
16 | from varcode.cli.effects_script import main as run_script
17 | from varcode import Variant
18 |
19 | from .common import eq_
def test_varcode_effects_script_kras_g12d_top_effect():
    """
    Run the varcode effects script on a single KRAS variant passed via
    --variant (with --only-coding and --one-per-variant) and check that the
    output CSV contains exactly one row, for gene KRAS with effect p.G12D.
    """
    kras_g12d_variant = Variant(
        12,
        25398284,
        "C",
        "T",
        "GRCh37")
    commandline_args = [
        "--genome", "grch37",
        "--only-coding",
        "--one-per-variant",
        "--variant",
        str(kras_g12d_variant.contig),
        str(kras_g12d_variant.start),
        str(kras_g12d_variant.original_ref),
        str(kras_g12d_variant.original_alt),
    ]
    with NamedTemporaryFile(mode="r+", delete=True) as f:
        # The script writes its CSV to the temporary file's path; the file
        # is then read back by name while the handle is still open.
        commandline_args.extend(["--output-csv", f.name])
        run_script(commandline_args)
        f.flush()
        df = pd.read_csv(f.name)
        eq_(len(df), 1)
        # Use positional access for both checks on the single row.
        first_row = df.iloc[0]
        eq_(first_row.gene_name, "KRAS")
        eq_(first_row.effect, "p.G12D")
45 |
46 |
--------------------------------------------------------------------------------
/tests/test_cli_genes.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | from varcode.cli.genes_script import main as run_script
14 | from .data import ov_wustle_variants, db_snp_variants
15 |
16 | from tempfile import NamedTemporaryFile
17 | import pandas as pd
18 |
19 |
def test_varcode_effects_script():
    """
    Run the varcode genes script on the ovarian cancer test MAF combined
    with a small number of dbSNP variants passed individually via --variant,
    and check that the output CSV has as many rows as both inputs together.
    """
    commandline_args = ["--genome", "grch37"]
    commandline_args.extend(["--maf", ov_wustle_variants.path])
    for variant in db_snp_variants:
        commandline_args.extend([
            "--variant",
            str(variant.contig),
            str(variant.start),
            str(variant.original_ref),
            str(variant.original_alt),
        ])
    with NamedTemporaryFile(mode="r+", delete=True) as f:
        # The script writes its CSV to the temporary file's path; the file
        # is then read back by name while the handle is still open.
        commandline_args.extend(["--output-csv", f.name])
        run_script(commandline_args)
        f.flush()
        combined_variants = pd.read_csv(f.name)
        expected_rows = len(ov_wustle_variants) + len(db_snp_variants)
        assert len(combined_variants) == expected_rows, \
            "Expected %d rows but got %d" % (
                expected_rows,
                len(combined_variants))
39 |
--------------------------------------------------------------------------------
/tests/test_collection_filtering.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 |
14 | from varcode import VariantCollection
15 | from .common import eq_
16 | from .data import (
17 | snp_rs4244285,
18 | snp_rs1537415
19 | )
20 |
# Variants shared by every filtering test in this module.
#   snp_rs4244285
#     gene ids: ['ENSG00000165841', 'ENSG00000276490']
#     transcript ids: ['ENST00000371321', 'ENST00000464755']
#   snp_rs1537415
#     gene ids: ['ENSG00000204007']
#     transcript ids: ['ENST00000371763', 'ENST00000613244']
variants = VariantCollection([snp_rs4244285, snp_rs1537415])

# Expression values keyed by Ensembl gene ID.
gene_fpkm_dict = {
    "ENSG00000165841": 10.0,
    "ENSG00000204007": 20.0,
    "ENSG00000276490": 30.0,
}

# Expression values keyed by Ensembl transcript ID.
transcript_fpkm_dict = {
    "ENST00000371321": 10.0,
    "ENST00000464755": 20.0,
    "ENST00000371763": 30.0,
    "ENST00000613244": 40.0,
    "ENST00000645461": 5.0,
}

# Effects predicted for the variants above.
effects = variants.effects()

# Empty counterparts used as the expected result of "filter everything out".
empty_variants = VariantCollection([])
empty_effects = empty_variants.effects()
48 |
49 |
def test_filter_variants():
    """
    A predicate that is always true keeps every variant; one that is always
    false yields an empty collection.
    """
    kept_everything = variants.filter(lambda _: True)
    eq_(kept_everything, variants)
    kept_nothing = variants.filter(lambda _: False)
    eq_(kept_nothing, empty_variants)
53 |
54 |
def test_filter_effects():
    """
    A predicate that is always true keeps every effect; one that is always
    false yields an empty collection.
    """
    kept_everything = effects.filter(lambda _: True)
    eq_(kept_everything, effects)
    kept_nothing = effects.filter(lambda _: False)
    eq_(kept_nothing, empty_effects)
58 |
59 |
def test_filter_variants_by_gene_expression():
    """
    Gene-expression filtering of variants: a threshold of 0.0 keeps all of
    them, a threshold of 100.0 keeps none.
    """
    expect_all = variants.filter_by_gene_expression(gene_fpkm_dict, 0.0)
    eq_(expect_all, variants)
    expect_none = variants.filter_by_gene_expression(gene_fpkm_dict, 100.0)
    eq_(expect_none, empty_variants)
65 |
66 |
def test_filter_effects_by_gene_expression():
    """
    Gene-expression filtering of effects: a threshold of 0.0 keeps all of
    them, a threshold of 100.0 keeps none.
    """
    expect_all = effects.filter_by_gene_expression(gene_fpkm_dict, 0.0)
    eq_(expect_all, effects)
    expect_none = effects.filter_by_gene_expression(gene_fpkm_dict, 100.0)
    eq_(expect_none, empty_effects)
72 |
73 |
def test_filter_variants_by_transcript_expression():
    """
    Transcript-expression filtering of variants: a threshold of 0.0 keeps
    all of them, a threshold of 100.0 keeps none.
    """
    # Filter on transcript expression with the transcript-keyed dictionary;
    # calling the gene-level filter here would merely duplicate
    # test_filter_variants_by_gene_expression.
    expect_all = variants.filter_by_transcript_expression(
        transcript_fpkm_dict, 0.0)
    eq_(expect_all, variants)
    expect_none = variants.filter_by_transcript_expression(
        transcript_fpkm_dict, 100.0)
    eq_(expect_none, empty_variants)
81 |
82 |
def test_filter_effects_by_transcript_expression():
    """
    Transcript-expression filtering of effects: a threshold of 0.0 keeps
    all of them, a threshold of 100.0 keeps none.
    """
    for threshold, expected in ((0.0, effects), (100.0, empty_effects)):
        filtered = effects.filter_by_transcript_expression(
            transcript_fpkm_dict, threshold)
        eq_(filtered, expected)
91 |
92 |
def test_filter_silent_effects():
    """
    Dropping silent and noncoding effects leaves nothing behind.
    """
    # all dbSNP entries in the collection are silent
    remaining = effects.drop_silent_and_noncoding()
    assert len(remaining) == 0
96 |
--------------------------------------------------------------------------------
/tests/test_common.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | import varcode
14 | from .common import eq_
15 |
def test_memoize():
    """
    A zero-argument bound method wrapped in varcode.common.memoize runs its
    body only once, however many times the wrapper is called.
    """
    class State(object):
        def __init__(self):
            self.x = 0

        def incr(self):
            self.x += 1

    # Baseline: two direct calls increment the counter twice.
    plain = State()
    for _ in range(2):
        plain.incr()
    eq_(plain.x, 2)

    # Memoized: two calls through the wrapper increment only once.
    cached = State()
    memoized = varcode.common.memoize(cached.incr)
    for _ in range(2):
        memoized()
    eq_(cached.x, 1)
36 |
def test_groupby_field():
    """
    varcode.common.groupby_field groups records by the named attribute and
    keeps the records of each group in their input order.
    """
    class Record(object):
        def __init__(self, x, y):
            self.x = x
            self.y = y

        def __eq__(self, other):
            return self.x == other.x and self.y == other.y

        def __str__(self):
            return "Record(%s, %s)" % (self.x, self.y)

        # repr and str render identically
        __repr__ = __str__

    first, second, third = Record(1, 2), Record(10, 20), Record(1, 3)
    groups = varcode.common.groupby_field([first, second, third], 'x')
    eq_(tuple(sorted(groups)), (1, 10))
    eq_(groups[1], [first, third])
    eq_(groups[10], [second])
60 |
--------------------------------------------------------------------------------
/tests/test_cosmic_mutations.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | from pyensembl import ensembl_grch37 as ensembl
14 | from varcode import Variant
15 | from varcode.effects import (
16 | Substitution,
17 | Deletion,
18 | Insertion,
19 | FrameShift,
20 | Silent,
21 | ExonicSpliceSite,
22 | )
23 |
def _get_effect(chrom, pos, dna_ref, dna_alt, transcript_id):
    """
    Build a Variant on the module-level `ensembl` genome and return its top
    priority effect for `transcript_id`.

    Fails with an assertion error if the transcript has no effect. When the
    effect is an ExonicSpliceSite its `alternate_effect` is returned instead.
    """
    variant = Variant(chrom, pos, dna_ref, dna_alt, ensembl=ensembl)
    effects_by_transcript = \
        variant.effects().top_priority_effect_per_transcript_id()
    assert transcript_id in effects_by_transcript, \
        "Expected transcript ID %s for variant %s not found in %s" % (
            transcript_id, variant, effects_by_transcript)
    found = effects_by_transcript[transcript_id]

    # COSMIC seems to ignore exonic splice sites
    if not isinstance(found, ExonicSpliceSite):
        return found
    return found.alternate_effect
38 |
def _substitution(chrom, pos, dna_ref, dna_alt, transcript_id, aa_ref, aa_alt):
    """
    Assert that the variant's effect on `transcript_id` is a Substitution
    with reference amino acids `aa_ref` and alternate amino acids `aa_alt`.
    """
    result = _get_effect(chrom, pos, dna_ref, dna_alt, transcript_id)
    assert isinstance(result, Substitution), \
        "Expected effect to be substitution, got %s" % (result,)

    assert result.aa_ref == aa_ref, \
        "Expected aa_ref='%s' : %s but got %s : %s from %s" % (
            aa_ref,
            type(aa_ref),
            result.aa_ref,
            type(result.aa_ref),
            result)
    assert result.aa_alt == aa_alt, \
        "Expected aa_alt='%s' but got %s" % (aa_alt, result)
52 |
def _silent(chrom, pos, dna_ref, dna_alt, transcript_id, aa_ref):
    """
    Assert that the variant's effect on `transcript_id` is Silent and that
    the unchanged amino acid(s) equal `aa_ref`.
    """
    result = _get_effect(chrom, pos, dna_ref, dna_alt, transcript_id)
    assert isinstance(result, Silent), \
        "Expected effect to be silent, got %s" % (result,)
    assert result.aa_ref == aa_ref, \
        "Expected aa_ref='%s', got '%s'" % (aa_ref, result.aa_ref)
59 |
def _deletion(chrom, pos, dna_ref, dna_alt, transcript_id, deleted):
    """
    Assert that the variant's effect on `transcript_id` is a Deletion whose
    removed amino acids (`aa_ref`) equal `deleted`.
    """
    result = _get_effect(chrom, pos, dna_ref, dna_alt, transcript_id)
    assert isinstance(result, Deletion), \
        "Expected deletion, got %s" % (result,)
    assert result.aa_ref == deleted, \
        "Expected deletion of '%s' but got deletion of '%s' for %s:%d%s>%s" % (
            deleted,
            result.aa_ref,
            chrom,
            pos,
            dna_ref,
            dna_alt)
67 |
def _insertion(chrom, pos, dna_ref, dna_alt, transcript_id, inserted):
    """
    Assert that the variant's effect on `transcript_id` is an Insertion
    whose inserted amino acids (`aa_alt`) equal `inserted`.
    """
    effect = _get_effect(chrom, pos, dna_ref, dna_alt, transcript_id)
    assert isinstance(effect, Insertion), \
        "Expected insertion, got %s" % (effect,)
    # short_description is read as a plain string attribute elsewhere in the
    # test suite (tests/test_maf.py); calling it here would raise TypeError
    # while building the failure message and hide the real assertion error.
    assert effect.aa_alt == inserted, \
        "Expected insertion of '%s' but got %s for %s:%d%s>%s" % (
            inserted,
            effect.short_description,
            chrom,
            pos,
            dna_ref,
            dna_alt)
80 |
def _frameshift(chrom, pos, dna_ref, dna_alt, transcript_id, aa_pos, aa_ref):
    """
    Assert that the variant's effect on `transcript_id` is a FrameShift.

    `aa_pos` is the base-1 protein position expected for the first affected
    residue and `aa_ref` the reference amino acid expected there.
    """
    result = _get_effect(chrom, pos, dna_ref, dna_alt, transcript_id)
    assert isinstance(result, FrameShift), \
        "Expected frameshift, got %s" % (result,)
    # the effect reports a base-0 offset, COSMIC positions are base-1
    zero_based_start = result.aa_mutation_start_offset
    assert result.aa_ref[0] == aa_ref and zero_based_start + 1 == aa_pos, \
        ("Expected frameshift to replace p.%d%s but instead got %s" % (
            aa_pos, aa_ref, result))
96 |
def test_COSM3939556_silent():
    """
    COSMIC record under test:
        22 19222059 COSM3939556 G>T
        GENE=CLTCL1_ENST00000427926
        STRAND=-
        CDS=c.1140C>A
        AA=p.A380A
    """
    _silent(
        chrom="22",
        pos=19222059,
        dna_ref="G",
        dna_alt="T",
        transcript_id="ENST00000427926",
        aa_ref="A")
104 |
def test_COSM3747785_NBPF10_Q363L():
    """
    COSMIC record under test:
        1 145311839 COSM3747785 A>T
        GENE=NBPF10_ENST00000369338
        STRAND=+
        CDS=c.1088A>T
        AA=p.Q363L
    """
    _substitution(
        chrom="1",
        pos=145311839,
        dna_ref="A",
        dna_alt="T",
        transcript_id="ENST00000369338",
        aa_ref="Q",
        aa_alt="L")
112 |
def test_COSM3368867_SMUG1_Q133L():
    """
    COSMIC record under test:
        12 54576295 COSM3368867 T>A
        GENE=SMUG1_ENST00000513838
        STRAND=-
        CDS=c.398A>T
        AA=p.Q133L
    """
    _substitution(
        chrom="12",
        pos=54576295,
        dna_ref="T",
        dna_alt="A",
        transcript_id="ENST00000513838",
        aa_ref="Q",
        aa_alt="L")
120 |
def test_COSM3508871_FBRS_K224N():
    """
    COSMIC record under test:
        16 30676364 COSM3508871 A>T
        GENE=FBRS_ENST00000356166
        STRAND=+
        CDS=c.1572A>T
        AA=p.K524N

    NOTE(review): the function name says K224N while the COSMIC annotation
    is p.K524N; the name looks like a typo but is kept unchanged.
    """
    _substitution(
        chrom="16",
        pos=30676364,
        dna_ref="A",
        dna_alt="T",
        transcript_id="ENST00000356166",
        aa_ref="K",
        aa_alt="N")
128 |
def test_COSM1616161_L1724R():
    """
    COSMIC record under test:
        21 46932218 COSM1616161 T>G
        GENE=COL18A1_ENST00000359759
        STRAND=+
        CDS=c.5171T>G
        AA=p.L1724R
    """
    _substitution(
        chrom="21",
        pos=46932218,
        dna_ref="T",
        dna_alt="G",
        transcript_id="ENST00000359759",
        aa_ref="L",
        aa_alt="R")
136 |
def test_COSM1651074_IL9R_D148Y():
    """
    COSMIC record under test (multi-nucleotide change):
        X 155234091 COSM1651074 TGG>TCT
        GENE=IL9R_ENST00000244174
        STRAND=+
        CDS=c.441_442GG>CT
        AA=p.D148Y
    """
    _substitution(
        chrom="X",
        pos=155234091,
        dna_ref="TGG",
        dna_alt="TCT",
        transcript_id="ENST00000244174",
        aa_ref="D",
        aa_alt="Y")
144 |
def test_COSM3682816_RBMY1D_V193A():
    """
    COSMIC record under test:
        Y 24030663 COSM3682816 A>G
        GENE=RBMY1D_ENST00000382680
        STRAND=-
        CDS=c.578T>C
        AA=p.V193A
    """
    _substitution(
        chrom="Y",
        pos=24030663,
        dna_ref="A",
        dna_alt="G",
        transcript_id="ENST00000382680",
        aa_ref="V",
        aa_alt="A")
152 |
def test_COSM1333672_BCL9_Q1150delQ():
    """
    In-frame deletion of 3 nucleotides.

    COSMIC record under test:
        1 147095923 COSM1333672 ACAG> A
        GENE=BCL9_ENST00000234739
        STRAND=+
        CDS=c.3445_3447delCAG
        AA=p.Q1150delQ
    """
    _deletion(
        chrom="1",
        pos=147095923,
        dna_ref="ACAG",
        dna_alt="A",
        transcript_id="ENST00000234739",
        deleted="Q")
163 |
def test_COSM1190996_FBX011_P57insQQQ():
    """
    In-frame insertion of 9 nucleotides.

    COSMIC record under test:
        2 48132713 COSM1190996 C>CTGCTGCTGC
        GENE=FBXO11_ENST00000403359
        STRAND=-
        CDS=c.146_147insGCAGCAGCA
        AA=p.Q56_P57insQQQ;CNT=1
    """
    _insertion(
        chrom="2",
        pos=48132713,
        dna_ref="C",
        dna_alt="CTGCTGCTGC",
        transcript_id="ENST00000403359",
        inserted="QQQ")
174 |
def test_COSM1732848_CCDC109B_F264fs():
    """
    Frame shift caused by a single nucleotide deletion.

    COSMIC record under test:
        4 110605772 COSM1732848 CT>C
        GENE=CCDC109B_ENST00000394650
        STRAND=+
        CDS=c.787delT
        AA=p.F264fs*5
    """
    _frameshift(
        chrom="4",
        pos=110605772,
        dna_ref="CT",
        dna_alt="C",
        transcript_id="ENST00000394650",
        aa_pos=264,
        aa_ref="F")
188 |
def test_COSM87531_SYNE1_E4738fs():
    """
    test_COSM87531_SYNE1_E4738fs : frame shift from nucleotide insertion
    """
    # The given genomic mutation is:
    #   6 152651608 COSM87531 C>CA
    # but through some painful manual checking I realized that
    # the nucleotides here are *not* the correct ones for the
    # forward strand (SYNE1 is on the negative strand) and instead
    # it should be:
    #   6 152651608 COSM87531 C>CT
    #   GENE=SYNE1_ENST00000265368
    #   STRAND=-
    #   CDS=c.14211_14212insA
    #   AA=p.E4738fs*34
    #
    # The alt allele must keep the reference base "C" followed by the
    # inserted "T"; an alt of "GT" would not be the insertion described above.
    _frameshift(
        "6", 152651608, "C", "CT", "ENST00000265368",
        aa_pos=4738,
        aa_ref="E")
208 |
def test_COSM27279_CTNNB1_Q4H():
    """
    Apply COSMIC mutation COSM27279 to transcript ENST00000405570.

    Genomic change: position 41265571, ref A, alt T.
    Expected protein change: Q -> H at position 4 (Q4H).
    """
    _substitution(
        chrom="3",
        pos=41265571,
        dna_ref="A",
        dna_alt="T",
        transcript_id="ENST00000405570",
        aa_ref="Q",
        aa_alt="H")
218 |
--------------------------------------------------------------------------------
/tests/test_dbnsfp_validation.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | import pytest
14 | from pyensembl import ensembl_grch37
15 | from varcode import Variant
16 | from varcode.effects import (
17 | ExonicSpliceSite,
18 | Substitution,
19 | TranscriptMutationEffect
20 | )
21 | import pandas as pd
22 |
23 | from .data import data_path
24 |
25 |
def generate_dbnsfp_validation_set():
    """
    Yield one argument tuple per row of dbnsfp_validation_set.csv.

    The rows are used to check that predicted amino acid substitutions give
    the same answer as a subset of dbNSFP entries (using Ensembl 75).

    Each tuple holds, in order, these columns of the validation dataset:
      - ensembl_transcript : transcript ID
      - chrom : chromosome
      - dna_position : base-1 position within chromosome
      - dna_ref : reference DNA nucleotide
      - dna_alt : non-reference DNA nucleotide
      - aa_pos : base-1 position within protein
      - aa_alt : read from the `aa_alt` column
    """
    column_order = (
        'ensembl_transcript',
        'chrom',
        'dna_position',
        'dna_ref',
        'dna_alt',
        'aa_pos',
        'aa_alt',
    )
    # pylint: disable=no-member
    # pylint gets confused by read_csv
    validation_set = pd.read_csv(data_path('dbnsfp_validation_set.csv'))
    # written as a generator so that every row becomes its own test case
    for _, row in validation_set.iterrows():
        yield tuple(row[column] for column in column_order)
54 |
55 |
56 |
@pytest.mark.parametrize([
    'ensembl_transcript_id',
    'chrom',
    'dna_position',
    'dna_ref',
    'dna_alt',
    'aa_pos',
    'aa_alt'], generate_dbnsfp_validation_set())
def test_dbnsfp_validation_set_transcript_mutation(
        ensembl_transcript_id,
        chrom,
        dna_position,
        dna_ref,
        dna_alt,
        aa_pos,
        aa_alt):
    """
    Check one dbNSFP validation row: the variant must produce a Substitution
    on the given transcript whose base-1 position is `aa_pos` and whose
    mutant residue at that position is `aa_alt`.
    """
    variant = Variant(chrom, dna_position, dna_ref, dna_alt, ensembl_grch37)

    # index the transcript-level effects by transcript ID
    transcript_id_dict = {}
    for candidate in variant.effects():
        if isinstance(candidate, TranscriptMutationEffect):
            transcript_id_dict[candidate.transcript.id] = candidate

    assert ensembl_transcript_id in transcript_id_dict, \
        "%s not found in %s" % (ensembl_transcript_id, transcript_id_dict)
    effect = transcript_id_dict[ensembl_transcript_id]

    # exonic splice site mutations carry with them an alternate effect
    # which is what we check against dbNSFP (since that database seemed
    # to ignore exonic splicing mutations)
    if isinstance(effect, ExonicSpliceSite):
        effect = effect.alternate_effect

    assert isinstance(effect, Substitution), \
        "Expected substitution (aa_pos=%d, aa_alt=%s) but got %s" % (
            aa_pos, aa_alt, effect)

    zero_based_pos = effect.aa_mutation_start_offset
    observed_aa_alt = effect.mutant_protein_sequence[zero_based_pos]
    assert zero_based_pos + 1 == aa_pos and observed_aa_alt == aa_alt, \
        "Mutant amino acid %s not found at %d for chr%s:%s %s>%s : %s" % (
            aa_alt,
            aa_pos,
            chrom,
            dna_position,
            dna_ref,
            dna_alt,
            effect)
106 |
--------------------------------------------------------------------------------
/tests/test_effect_collection.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | """
14 | Test properties of EffectCollection
15 | """
16 |
17 |
18 | from varcode.effects import IncompleteTranscript, Substitution
19 | from .common import eq_
20 | from .data import tcga_ov_variants, ov_wustle_variants
# Effect predictions are computed once at import time and shared by the tests
tcga_ov_effects, ov_wustle_effects = (
    tcga_ov_variants.effects(),
    ov_wustle_variants.effects(),
)
23 |
def test_to_dataframe():
    """The DataFrame view should have one row per effect in the collection."""
    frame = tcga_ov_effects.to_dataframe()
    eq_(len(tcga_ov_effects), len(frame))
27 |
def test_effect_collection_gene_counts():
    """Every gene reported by gene_counts() must have more than one effect."""
    counts = ov_wustle_effects.gene_counts()
    for gene, count in counts.items():
        assert count > 1, \
            "Expected more than 1 effect for %s (got %d)" % (gene, count)
33 |
def test_effect_collection_groupby_gene():
    """
    The IDs of the Gene objects used as groupby_gene() keys must match the
    keys of groupby_gene_id().
    """
    gene_objects = ov_wustle_effects.groupby_gene().keys()
    expected_ids = set(ov_wustle_effects.groupby_gene_id().keys())
    ids_from_objects = {gene.id for gene in gene_objects}
    eq_(ids_from_objects, expected_ids)
40 |
def test_effect_collection_groupby_gene_id():
    """groupby_gene_id() should be keyed by exactly these five gene IDs."""
    expected_gene_ids = {
        'ENSG00000060718',
        'ENSG00000156876',
        'ENSG00000130939',
        'ENSG00000122477',
        'ENSG00000162688',
    }
    observed_gene_ids = set(ov_wustle_effects.groupby_gene_id().keys())
    eq_(observed_gene_ids, expected_gene_ids)
50 |
def test_effect_collection_groupby_gene_name():
    """groupby_gene_name() should be keyed by exactly these five gene names."""
    expected_names = {"AGL", "SASS6", "LRRC39", "UBE4B", "COL11A1"}
    observed_names = set(ov_wustle_effects.groupby_gene_name().keys())
    eq_(observed_names, expected_names)
54 |
def test_effect_collection_groupby_variant():
    """All original variants must still be present as groupby_variant() keys."""
    grouped_variants = set(ov_wustle_effects.groupby_variant().keys())
    original_variants = set(ov_wustle_variants)
    eq_(grouped_variants, original_variants)
60 |
def test_effect_collection_filter_by_effect_priority():
    """
    Filtering at IncompleteTranscript priority keeps everything, while
    filtering at Substitution priority must drop at least one effect.
    """
    # every effect should be at least the same priority as "incomplete"
    at_least_incomplete = \
        tcga_ov_effects.filter_by_effect_priority(IncompleteTranscript)
    eq_(tcga_ov_effects, at_least_incomplete)

    at_least_substitution = \
        tcga_ov_effects.filter_by_effect_priority(Substitution)
    assert len(tcga_ov_effects) > len(at_least_substitution)
68 |
def test_effect_collection_drop_silent_and_noncoding():
    """Dropping silent/noncoding effects must shrink the TCGA collection."""
    # some of the predicted effects are non-coding so should get dropped
    kept = tcga_ov_effects.drop_silent_and_noncoding()
    assert len(tcga_ov_effects) > len(kept)
72 |
--------------------------------------------------------------------------------
/tests/test_effect_collection_serialization.py:
--------------------------------------------------------------------------------
1 |
2 | # Licensed under the Apache License, Version 2.0 (the "License");
3 | # you may not use this file except in compliance with the License.
4 | # You may obtain a copy of the License at
5 | #
6 | # http://www.apache.org/licenses/LICENSE-2.0
7 | #
8 | # Unless required by applicable law or agreed to in writing, software
9 | # distributed under the License is distributed on an "AS IS" BASIS,
10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | # See the License for the specific language governing permissions and
12 | # limitations under the License.
13 |
14 | import pickle
15 |
16 | from varcode import EffectCollection
17 |
18 | from .common import eq_
19 | from .data import tcga_ov_variants, ov_wustle_variants
20 |
# Effect predictions are computed once at import time and shared by the tests
tcga_ov_effects, ov_wustle_effects = (
    tcga_ov_variants.effects(),
    ov_wustle_variants.effects(),
)
23 |
def test_tcga_effect_collection_to_dict():
    """Round-trip the TCGA effects through to_dict() / from_dict()."""
    as_dict = tcga_ov_effects.to_dict()
    restored = EffectCollection.from_dict(as_dict)
    eq_(tcga_ov_effects, restored)
28 |
def test_wustle_effect_collection_to_dict():
    """Round-trip the WUSTL effects through to_dict() / from_dict()."""
    as_dict = ov_wustle_effects.to_dict()
    restored = EffectCollection.from_dict(as_dict)
    eq_(ov_wustle_effects, restored)
33 |
def test_tcga_effect_collection_to_json():
    """Round-trip the TCGA effects through to_json() / from_json()."""
    as_json = tcga_ov_effects.to_json()
    restored = EffectCollection.from_json(as_json)
    eq_(tcga_ov_effects, restored)
36 |
def test_wustle_effect_collection_to_json():
    """Round-trip the WUSTL effects through to_json() / from_json()."""
    as_json = ov_wustle_effects.to_json()
    restored = EffectCollection.from_json(as_json)
    eq_(ov_wustle_effects, restored)
41 |
def test_tcga_effect_collection_pickling():
    """The TCGA effects must survive a pickle dumps/loads round trip."""
    serialized = pickle.dumps(tcga_ov_effects)
    eq_(tcga_ov_effects, pickle.loads(serialized))
45 |
def test_wustle_effect_collection_pickling():
    """The WUSTL effects must survive a pickle dumps/loads round trip."""
    serialized = pickle.dumps(ov_wustle_effects)
    eq_(ov_wustle_effects, pickle.loads(serialized))
--------------------------------------------------------------------------------
/tests/test_effects_from_mutagenix_variants.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | """
14 | List of variants copied from:
15 | https://mutagenetix.utsouthwestern.edu
16 | /incidental/incidental_rec.cfm?
17 | mid=&so=rb&ac=1&r0=0&nr=100&rn=29&rl=1&scd=IGL01779&mid=153891
18 | """
19 |
20 | from varcode import Variant
21 | from varcode.effects import Substitution
22 |
23 | from .common import expect_effect
24 |
def test_substitution_Akt1_chr12_112657169_C_T_G286R():
    """mm10 chr12:112657169 C>T should be a G->R substitution at offset 285."""
    variant = Variant("chr12", 112657169, "C", "T", "mm10")
    expect_effect(
        variant=variant,
        effect_class=Substitution,
        aa_ref="G",
        aa_alt="R",
        aa_mutation_start_offset=285)
32 |
def test_substitution_Apof_chr10_128269477_A_G_I167V():
    """mm10 chr10:128269477 A>G should be an I->V substitution at offset 166."""
    variant = Variant("chr10", 128269477, "A", "G", "mm10")
    expect_effect(
        variant=variant,
        effect_class=Substitution,
        aa_ref="I",
        aa_alt="V",
        aa_mutation_start_offset=166)
40 |
def test_substitution_Csmd3_chr15_47857894_A_T_V1551D():
    """mm10 chr15:47857894 A>T should be a V->D substitution at offset 1550."""
    variant = Variant("chr15", 47857894, "A", "T", "mm10")
    expect_effect(
        variant=variant,
        effect_class=Substitution,
        aa_ref="V",
        aa_alt="D",
        aa_mutation_start_offset=1550)
48 |
def test_substitution_Pprc1_chr19_46062202_T_A_I130N():
    """mm10 chr19:46062202 T>A should be an I->N substitution at offset 129."""
    variant = Variant("chr19", 46062202, "T", "A", "mm10")
    expect_effect(
        variant=variant,
        effect_class=Substitution,
        aa_ref="I",
        aa_alt="N",
        aa_mutation_start_offset=129)
56 |
def test_substitution_Vipr1_chr9_121664630_T_C_F249S():
    """mm10 chr9:121664630 T>C should be an F->S substitution at offset 248."""
    variant = Variant("chr9", 121664630, "T", "C", "mm10")
    expect_effect(
        variant=variant,
        effect_class=Substitution,
        aa_ref="F",
        aa_alt="S",
        aa_mutation_start_offset=248)
64 |
--------------------------------------------------------------------------------
/tests/test_exonic_splice_site.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | from varcode import Variant
14 | from varcode.effects import ExonicSpliceSite, PrematureStop
15 |
16 |
def test_STAT1_stop_gain_at_exon_boundary():
    """
    The top priority effect for this STAT1 variant should be PrematureStop,
    even though an ExonicSpliceSite effect is also predicted.
    """
    variant = Variant("2", "191872291", "G", "A", "GRCh37")
    predicted = variant.effects()
    print(predicted)
    assert any(e.__class__ is ExonicSpliceSite for e in predicted)

    best = predicted.top_priority_effect()
    print(best)
    assert best.__class__ is PrematureStop
27 |
--------------------------------------------------------------------------------
/tests/test_frameshift_helpers.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | from varcode.effects.effect_prediction_coding_frameshift import (
14 | cdna_codon_sequence_after_insertion_frameshift,
15 | cdna_codon_sequence_after_deletion_or_substitution_frameshift,
16 | )
17 |
18 | from .common import eq_
19 |
def test_cdna_codon_sequence_after_insertion_frameshift_before_codon():
    """Insert "T" at offset -1, i.e. in front of the start codon: T_ATGCCCTAG."""
    codon_index, sequence = cdna_codon_sequence_after_insertion_frameshift(
        inserted_nucleotides="T",
        cds_offset_before_insertion=-1,
        sequence_from_start_codon="ATGCCCTAG")
    eq_(codon_index, 0)
    eq_(sequence, "TATGCCCTAG")
28 |
def test_cdna_codon_sequence_after_insertion_frameshift_in_middle_of_codon():
    """Insert "T" after offset 0, inside the first codon: A_T_TGCCCTAG."""
    codon_index, sequence = cdna_codon_sequence_after_insertion_frameshift(
        inserted_nucleotides="T",
        cds_offset_before_insertion=0,
        sequence_from_start_codon="ATGCCCTAG")
    eq_(codon_index, 0)
    eq_(sequence, "ATTGCCCTAG")
37 |
def test_cdna_codon_sequence_after_insertion_frameshift_at_end_of_codon():
    """Insert "T" after offset 1, before the codon's last base: AT_T_GCCCTAG."""
    codon_index, sequence = cdna_codon_sequence_after_insertion_frameshift(
        inserted_nucleotides="T",
        cds_offset_before_insertion=1,
        sequence_from_start_codon="ATGCCCTAG")
    eq_(codon_index, 0)
    eq_(sequence, "ATTGCCCTAG")
46 |
def test_cdna_codon_sequence_after_insertion_frameshift_after_codon():
    """Insert "T" after offset 2, right after the first codon: ATG_T_CCCTAG."""
    codon_index, sequence = cdna_codon_sequence_after_insertion_frameshift(
        inserted_nucleotides="T",
        cds_offset_before_insertion=2,
        sequence_from_start_codon="ATGCCCTAG")
    # the first codon is untouched, so the result starts at codon index 1
    eq_(codon_index, 1)
    eq_(sequence, "TCCCTAG")
55 |
def test_cdna_codon_sequence_after_deletion_or_substitution_frameshift_delA():
    """Delete the leading "A" of ATGCCCTAG at CDS offset 0."""
    codon_index, sequence = \
        cdna_codon_sequence_after_deletion_or_substitution_frameshift(
            trimmed_cdna_ref="A",
            trimmed_cdna_alt="",
            cds_offset=0,
            sequence_from_start_codon="ATGCCCTAG")
    eq_(codon_index, 0)
    eq_(sequence, "TGCCCTAG")
64 |
65 |
def test_cdna_codon_sequence_after_deletion_or_substitution_frameshift_AT_to_C():
    """Replace the leading "AT" of ATGCCCTAG with "C" at CDS offset 0."""
    codon_index, sequence = \
        cdna_codon_sequence_after_deletion_or_substitution_frameshift(
            trimmed_cdna_ref="AT",
            trimmed_cdna_alt="C",
            cds_offset=0,
            sequence_from_start_codon="ATGCCCTAG")
    eq_(codon_index, 0)
    eq_(sequence, "CGCCCTAG")
74 |
--------------------------------------------------------------------------------
/tests/test_maf.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | import pytest
14 |
15 | from pyensembl import ensembl_grch37 as ensembl
16 |
17 | from varcode import Variant, load_maf, load_maf_dataframe
18 |
19 | import pandas as pd
20 |
21 | from .common import eq_
22 | from .data import tcga_ov_variants, ov_wustle_variants, data_path
23 |
def test_maf():
    """
    The TCGA MAF fixture must load as exactly the four expected variants, in
    order, and each variant's genes must include the MAF's Hugo_Symbol.
    """
    expected_loci = [
        (1, 1650797, "A", "G"),
        (1, 23836447, "C", "A"),
        (1, 231401797, "A", "C"),
        (11, 124617502, "C", "G"),
    ]
    expected_tcga_ov_variants = [
        Variant(contig, position, ref, alt, ensembl)
        for (contig, position, ref, alt) in expected_loci
    ]
    eq_(len(tcga_ov_variants), len(expected_tcga_ov_variants))
    for expected, loaded in zip(expected_tcga_ov_variants, tcga_ov_variants):
        eq_(expected, loaded)
        gene_name = tcga_ov_variants.metadata[loaded]['Hugo_Symbol']
        assert any(gene.name == gene_name for gene in loaded.genes), \
            "Expected gene name %s but got %s" % (gene_name, loaded.genes)
37 |
38 |
def generate_maf_aa_changes():
    """
    Yield (variant, expected_change) pairs for the ov_wustle MAF variants.

    The MAF file is re-read with pandas so that its own amino acid
    annotation can be compared against the predicted effect. The data file
    used also contains spaces, which is good to test the parser on.
    """
    assert len(ov_wustle_variants) == 5

    # pylint: disable=no-member
    # pylint gets confused by read_csv
    maf_fields = pd.read_csv(ov_wustle_variants.path, sep="\t", comment="#")

    expected_changes = {}
    for _, row in maf_fields.iterrows():
        locus = (str(row.Chromosome), row.Start_position)
        annotation = row.amino_acid_change
        # silent mutations just specify which amino acid they affect via
        # e.g. "p.G384", so their annotation ends in a digit
        expected_changes[locus] = \
            "silent" if annotation[-1].isdigit() else annotation

    for variant in ov_wustle_variants:
        yield (variant, expected_changes[(variant.contig, variant.start)])
68 |
@pytest.mark.parametrize(['variant', 'expected_aa_change'], generate_maf_aa_changes())
def test_maf_aa_changes(variant, expected_aa_change):
    """The top priority effect's short description must match the MAF."""
    top_effect = variant.effects().top_priority_effect()
    observed = top_effect.short_description
    failure_message = "MAF file had annotation %s but Varcode gave %s" % (
        expected_aa_change, observed)
    eq_(observed, expected_aa_change, failure_message)
78 |
79 |
def test_maf_number_entries_duplicates():
    """
    duplicates.maf lists the same mutation 3 times: loading with
    distinct=True gives 1 variant, with distinct=False all 3 entries.
    """
    maf_path = data_path("duplicates.maf")
    deduplicated = load_maf(maf_path, distinct=True)
    assert len(deduplicated) == 1
    with_duplicates = load_maf(maf_path, distinct=False)
    assert len(with_duplicates) == 3
87 |
def test_load_maf():
    """load_maf should return 5 variants for either raise_on_error setting."""
    for flag in (True, False):
        loaded = load_maf(
            data_path("ov.wustle.subset5.maf"),
            raise_on_error=flag)
        eq_(len(loaded), 5)
93 |
94 |
def test_load_maf_dataframe():
    """load_maf_dataframe should return 5 rows for either raise_on_error setting."""
    for flag in (True, False):
        frame = load_maf_dataframe(
            data_path("ov.wustle.subset5.maf"),
            raise_on_error=flag)
        eq_(len(frame), 5)
100 |
101 |
def test_xy_contigs():
    """
    Test MAFs with X and Y chromosomes rather than just numerical chromosomes.

    The file is loaded once with raise_on_error=True and once with
    raise_on_error=False; both must yield 4 variants.
    """
    for raise_on_error in [True, False]:
        # pass the loop variable through so both settings are exercised
        variants = load_maf(
            data_path("tcga_ov.head.xychr.maf"),
            raise_on_error=raise_on_error)
        eq_(len(variants), 4)
110 |
111 |
def test_load_utf8():
    """
    Test MAFs loaded with utf-8 encoding.

    The file is loaded once with raise_on_error=True and once with
    raise_on_error=False; both must yield 5 variants whose effects can be
    predicted.
    """
    for raise_on_error in [True, False]:
        # pass the loop variable through so both settings are exercised
        variants = load_maf(
            data_path("ov.wustle.subset5.maf"),
            raise_on_error=raise_on_error,
            encoding="utf-8")
        eq_(len(variants), 5)
        # Make sure we avoid "TypeError: character mapping must return integer, None or unicode"
        # from Bio.Seq.
        _ = variants.effects()
123 |
--------------------------------------------------------------------------------
/tests/test_mm10_klf6_frameshift.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | from varcode import Variant
14 | from varcode.effects import FrameShift
15 | from varcode.effects.effect_prediction_coding_frameshift import (
16 | predict_frameshift_coding_effect,
17 | cdna_codon_sequence_after_insertion_frameshift,
18 | )
19 |
20 | from .common import eq_
21 |
22 |
def validate_effect_values(effect):
    """
    Shared checks for the Klf6 frameshift: the effect must be a FrameShift
    on transcript Klf6-201, genomic position 5864876 must map to spliced
    offset 469, and the shifted protein sequence must be "GEEGGIRTEDFF".
    """
    eq_(effect.__class__, FrameShift)
    klf6_transcript = effect.transcript
    eq_(klf6_transcript.name, "Klf6-201")
    eq_(klf6_transcript.spliced_offset(5864876), 469)
    eq_(effect.shifted_sequence, "GEEGGIRTEDFF")
29 |
30 |
def test_mm10_Klf6_frameshift():
    """Inserting "G" at mm10 chr13:5864876 should yield one coding effect."""
    insertion = Variant("chr13", 5864876, "", "G", "mm10")
    coding_effects = insertion.effects().drop_silent_and_noncoding()
    eq_(len(coding_effects), 1)
    validate_effect_values(coding_effects[0])
36 |
37 |
def test_mm10_Klf6_frameshift_coding_effect_fn():
    """
    Call predict_frameshift_coding_effect directly for the Klf6 insertion
    and check it gives the same FrameShift as the full effect prediction.
    """
    # first start codon offset is 157
    start_codon_offset = 157
    # mutation occurs after offset 469
    mutation_offset = 469

    variant = Variant("chr13", 5864876, "", "G", "mm10")
    coding_transcripts = [
        transcript for transcript in variant.transcripts
        if transcript.biotype == "protein_coding"
    ]
    eq_(len(coding_transcripts), 1)
    klf6 = coding_transcripts[0]
    eq_(klf6.name, "Klf6-201")

    effect = predict_frameshift_coding_effect(
        variant=variant,
        transcript=klf6,
        trimmed_cdna_ref="",
        trimmed_cdna_alt="G",
        cds_offset=mutation_offset - start_codon_offset,
        sequence_from_start_codon=klf6.sequence[start_codon_offset:])
    validate_effect_values(effect)
58 |
59 |
def test_mm10_Klf6_frameshift_cdna_codon_sequence():
    """
    Check the codon index and remaining cDNA returned by
    cdna_codon_sequence_after_insertion_frameshift for the Klf6 insertion.
    """
    start_codon_offset = 157
    mutation_offset = 469

    variant = Variant("chr13", 5864876, "", "G", "mm10")
    coding_transcripts = [
        transcript for transcript in variant.transcripts
        if transcript.biotype == "protein_coding"
    ]
    eq_(len(coding_transcripts), 1)
    klf6 = coding_transcripts[0]
    eq_(klf6.name, "Klf6-201")

    codon_index, remaining_sequence = \
        cdna_codon_sequence_after_insertion_frameshift(
            inserted_nucleotides="G",
            cds_offset_before_insertion=mutation_offset - start_codon_offset,
            sequence_from_start_codon=klf6.sequence[start_codon_offset:])
    eq_(codon_index, 104)
    # the base at the mutation offset, then the inserted "G", then the rest
    expected_sequence = klf6.sequence[469] + "G" + klf6.sequence[470:]
    eq_(remaining_sequence, expected_sequence)
78 |
--------------------------------------------------------------------------------
/tests/test_mouse.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | from .common import eq_
14 |
15 | from varcode import load_vcf, load_vcf_fast, Variant
16 | from varcode.effects import Substitution
17 | from pyensembl import Genome, EnsemblRelease
18 | from .data import data_path
19 |
# Ensembl release used for every explicit mouse URL below.
MOUSE_ENSEMBL_RELEASE = 95
SERVER = "ftp://ftp.ensembl.org"
# Each URL template contains a "release-%d" component which has to be filled
# in with the release number; "%" binds tighter than "+", so the formatting is
# applied to the template before SERVER is prepended.
MOUSE_GTF_PATH = \
    SERVER + "/pub/release-%d/gtf/mus_musculus/Mus_musculus.GRCm38.%d.gtf.gz" % (
        MOUSE_ENSEMBL_RELEASE, MOUSE_ENSEMBL_RELEASE)
# Previously this template was never formatted, leaving a literal "%d" in
# the URL.
MOUSE_TRANSCRIPT_FASTA_PATH = \
    SERVER + "/pub/release-%d/fasta/mus_musculus/cdna/Mus_musculus.GRCm38.cdna.all.fa.gz" % (
        MOUSE_ENSEMBL_RELEASE)
MOUSE_PROTEIN_FASTA_PATH = \
    SERVER + "/pub/release-%d/fasta/mus_musculus/pep/Mus_musculus.GRCm38.pep.all.fa.gz" % (
        MOUSE_ENSEMBL_RELEASE)
30 |
# Mouse VCF fixture bundled with the test data directory.
MOUSE_VCF = data_path("mouse_vcf_dbsnp_chr1_partial.vcf")

# Genome assembled from explicitly specified GTF / FASTA locations.
explicit_url_genome = Genome(
    reference_name="GRCm38",
    annotation_name="ensembl",
    annotation_version=MOUSE_ENSEMBL_RELEASE,
    gtf_path_or_url=MOUSE_GTF_PATH,
    transcript_fasta_paths_or_urls=[
        MOUSE_TRANSCRIPT_FASTA_PATH,
    ],
    protein_fasta_paths_or_urls=[
        MOUSE_PROTEIN_FASTA_PATH,
    ],
)

# The same release requested through pyensembl's EnsemblRelease wrapper.
ensembl_mouse_genome = EnsemblRelease(
    MOUSE_ENSEMBL_RELEASE,
    species="mouse")
42 |
def test_load_vcf_mouse_with_explicit_urls():
    """The mouse VCF loaded with the explicit-URL genome has 217 variants."""
    n_variants = len(load_vcf(MOUSE_VCF, genome=explicit_url_genome))
    eq_(n_variants, 217)
46 |
def test_load_vcf_mouse_with_ensembl_release():
    """The mouse VCF loaded with the EnsemblRelease genome has 217 variants."""
    n_variants = len(load_vcf(MOUSE_VCF, genome=ensembl_mouse_genome))
    eq_(n_variants, 217)
50 |
def test_load_vcf_mouse_with_inferred_genome():
    """The mouse VCF loaded without an explicit genome has 217 variants."""
    n_variants = len(load_vcf(MOUSE_VCF))
    eq_(n_variants, 217)
54 |
def test_specific_variant_mouse_with_explicit_urls():
    """A G>T SNV on mouse chr11 is a C->F substitution (explicit-URL genome)."""
    # Exon #2 at http://useast.ensembl.org/Mus_musculus/Transcript/Exons?
    # db=core;g=ENSMUSG00000017167;r=11:101170523-101190724;t=ENSMUST00000103109
    snv = Variant(
        contig=11,
        start=101177240,
        ref="G",
        alt="T",
        ensembl=explicit_url_genome)
    effects = snv.effects()
    eq_(len(effects), 2)
    substitution_effects = list(
        filter(lambda effect: isinstance(effect, Substitution), effects))
    eq_(len(substitution_effects), 1)
    substitution = substitution_effects[0]
    # The coding sequence through the sub:
    # ATGATGAGTCTCCGGCTCTTCAGCATCCTGCTCGCCACG
    # GTGGTCTCTGGAGCTTGGGGCTGGGGCTACTACGGTTGC
    # (The final G is the sub: the 77th nucleotide)
    # TGC (C) -> TTC (F)
    # 78 / 3 = 26
    # 0-base = 25
    mutated_residue_index = 25
    eq_(substitution.mutant_protein_sequence[mutated_residue_index], "F")
    eq_(substitution.original_protein_sequence[mutated_residue_index], "C")
82 |
83 |
def test_specific_variant_mouse_with_ensembl_genome():
    """A G>T SNV on mouse chr11 is a C->F substitution (EnsemblRelease genome)."""
    # Exon #2 at http://useast.ensembl.org/Mus_musculus/Transcript/Exons?
    # db=core;g=ENSMUSG00000017167;r=11:101170523-101190724;t=ENSMUST00000103109
    snv = Variant(
        contig=11,
        start=101177240,
        ref="G",
        alt="T",
        ensembl=ensembl_mouse_genome)
    effects = snv.effects()
    eq_(len(effects), 2)
    substitution_effects = list(
        filter(lambda effect: isinstance(effect, Substitution), effects))
    eq_(len(substitution_effects), 1)
    substitution = substitution_effects[0]
    # The coding sequence through the sub:
    # ATGATGAGTCTCCGGCTCTTCAGCATCCTGCTCGCCACG
    # GTGGTCTCTGGAGCTTGGGGCTGGGGCTACTACGGTTGC
    # (The final G is the sub: the 77th nucleotide)
    # TGC (C) -> TTC (F)
    # 78 / 3 = 26
    # 0-base = 25
    mutated_residue_index = 25
    eq_(substitution.mutant_protein_sequence[mutated_residue_index], "F")
    eq_(substitution.original_protein_sequence[mutated_residue_index], "C")
111 |
--------------------------------------------------------------------------------
/tests/test_mutate.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | from varcode.effects import mutate
14 | from .common import eq_
15 |
def test_snp_mutation():
    """Substituting A with G at index 1 of "AACCTT" gives "AGCCTT"."""
    eq_(mutate.substitute("AACCTT", 1, "A", "G"), "AGCCTT")
20 |
def test_deletion_mutation():
    """Substituting "ACT" with "T" at index 1 of "AACT" gives "AT"."""
    eq_(mutate.substitute("AACT", 1, "ACT", "T"), "AT")
25 |
def test_insert_before():
    """Inserting "GG" before index 1 of "AACT" gives "AGGACT"."""
    eq_(mutate.insert_before("AACT", 1, "GG"), "AGGACT")
29 |
def test_insert_after():
    """Inserting "GG" after index 1 of "AACT" gives "AAGGCT"."""
    eq_(mutate.insert_after("AACT", 1, "GG"), "AAGGCT")
33 |
--------------------------------------------------------------------------------
/tests/test_no_duplicate_variants.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | from pyensembl import EnsemblRelease
14 | from varcode import Variant, VariantCollection
15 |
def test_drop_duplicates():
    """A collection built from repeated and equal variants keeps 2 entries."""
    ensembl = EnsemblRelease(78)
    first = Variant("1", 3000, "A", "G", ensembl=ensembl)
    # A separately constructed variant with the same fields as `first`.
    first_copy = Variant("1", 3000, "A", "G", ensembl=ensembl)
    second = Variant("2", 10, "G", "T", ensembl=ensembl)
    variants_with_repeats = [first, first, first_copy, second]
    collection_without_duplicates = VariantCollection(
        variants=variants_with_repeats)
    assert len(collection_without_duplicates) == 2
24 |
--------------------------------------------------------------------------------
/tests/test_problematic_variants.py:
--------------------------------------------------------------------------------
1 | """
2 | Any variants which are encountered in the wild and either cause Varcode
3 | to crash or return an incorrect annotation should be added to this
4 | test module.
5 | """
6 |
7 | import pytest
8 | from varcode import Variant
9 |
10 | from .common import check_effect_properties
11 |
# Variants which have previously resulted in raised exceptions during
# effect annotation. Each entry is listed once; the comment above an entry
# quotes the error it used to trigger, where one was recorded.
should_not_crash_variants = [
    # error message:
    # "Couldn't find position 92979124 on any exon of ENST00000540033"
    Variant(
        contig=1,
        start=92979092,
        ref="ATATATATATATATATATATATATATATATATG",
        alt="A",
        genome="GRCh37"),
    # error message:
    # "Expect non-silent stop-loss variant to cause longer variant protein
    # but got len(original) = 653, len(variant) = 653"
    Variant(
        contig=1,
        start=167385324,
        ref="TAA",
        alt="T",
        genome="GRCh37"),
    # error message:
    # "Variant which span 5' UTR and CDS not supported"
    Variant(
        contig=19,
        start=44351166,
        ref="GGGAGAT",
        alt="G",
        genome="GRCh37"),
    # error message:
    # "Can't have ref = '' and alt = 'E' at aa_pos = 445, cds_pos = 1335"
    Variant(
        contig=1,
        start=1684347,
        ref="",
        alt="CCT",
        genome="GRCh37"),
    Variant(
        contig=11,
        start=47640416,
        ref="",
        alt="TCTTT",
        genome="GRCh37"),
    Variant(
        contig=12,
        start=98880902,
        ref="A",
        alt="",
        genome="GRCh37"),
    Variant(
        contig=19,
        start=52803670,
        ref="TG",
        alt="",
        genome="GRCh37"),
    Variant(
        contig=1,
        start=109792735,
        ref="",
        alt="CGC",
        genome="GRCh37"),
    # error message:
    # "expected ref 'GATGTCGG' at offset 1412 of ENST00000297524...CDS has 'G'"
    Variant(
        contig=8,
        start=87226635,
        ref="CCGACATC",
        alt="",
        genome="GRCh37"),
    # error message: "Can't have empty aa_ref and aa_alt"
    Variant(
        contig=8,
        start=141488566,
        ref="T",
        alt="C",
        genome="GRCh38"),
    # error message: "len(aa_alt) = 0"
    Variant(
        contig=11,
        start=57741870,
        ref="G",
        alt="C",
        genome="GRCh38"),
    # error message: "IndexError: string index out of range"
    Variant(
        contig=11,
        start=63676705,
        ref="T",
        alt="",
        genome="GRCh37"),
    # AssertionError: aa_ref and aa_alt can't both be empty string
    # (this variant used to be listed twice; the exact duplicate was removed)
    Variant(
        contig=1,
        start=56962223,
        ref="C",
        alt="T",
        genome="GRCh37"),
    # AssertionError: aa_ref and aa_alt can't both be empty string
    Variant(
        contig=1,
        start=151314663,
        ref="C",
        alt="T",
        genome="GRCh37"),
    # AssertionError: aa_ref and aa_alt can't both be empty string
    Variant(
        contig=1,
        start=153409535,
        ref="C",
        alt="T",
        genome="GRCh37"),
    # AssertionError: aa_ref and aa_alt can't both be empty string
    Variant(
        contig=10,
        start=105791994,
        ref="C",
        alt="T",
        genome="GRCh37"),
    # Expected frameshift_insertion to be before stop codon
    # for Variant(contig=1, start=109925189, ref=., alt=A, genome=GRCh38)
    # on transcript_id=ENST00000329608
    # len(protein) = 554, aa_pos = 554
    Variant(
        contig=1,
        start=109925189,
        ref="",
        alt="A",
        genome="GRCh38"),
    Variant(
        contig=7,
        start=117120188,
        ref="A",
        alt="AAGT",
        genome="GRCh37"),
    # had problems with end coordinate loading this one from a MAF but also
    # want to make sure it doesn't cause other trouble
    Variant(
        contig=1,
        start=109461324,
        ref="GG",
        alt="TT",
        genome="GRCh37"),
]
160 |
161 |
@pytest.mark.parametrize("variant", should_not_crash_variants)
def test_crashing_variants(variant):
    """Annotate one formerly problematic variant and check its top effect."""
    top_effect = variant.effects().top_priority_effect()
    check_effect_properties(top_effect)
--------------------------------------------------------------------------------
/tests/test_reference.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 |
14 | import warnings
15 |
16 | import pytest
17 |
18 | from varcode.reference import infer_reference_name, ensembl_reference_aliases, most_recent_assembly_name
19 | from .common import eq_
20 |
# Maps each expected reference name to the input strings (FASTA file names
# or VCF "##reference=" header lines) that should be inferred as that name.
reference_test_cases = {
    "NCBI36": [
        "ncbi36p2.fasta",
        "b36.fasta",
        "##reference=file:///var/lib/cwl/ncbi36/homo_sapiens.d1.vd1.fa",
    ],
    "GRCh38": [
        "grch38p2.fasta",
        "##reference=file:///var/lib/cwl/job367935311_index_001zdr/GRCh38.d1.vd1.fa",
        "##reference=file:///var/lib/cwl/job367935311_index_001zdr/GRCh38.job36.d1.vd1.fa",
    ],
}
34 |
def test_most_recent_assembly():
    """most_recent_assembly_name picks the expected name from each list."""
    cases = [
        (['ncbi36', 'grch38'], 'grch38'),
        (['ncbi36', 'grch38', '37mm'], 'grch38'),
        (['ncbi36'], 'ncbi36'),
        (['ncbi36', '35'], 'ncbi36'),
    ]
    for assembly_names, expected in cases:
        eq_(most_recent_assembly_name(assembly_names), expected)
def generate_reference_name_aliases():
    """Yield (candidate, assembly_name) pairs for every known alias.

    For each entry of ``ensembl_reference_aliases`` the assembly name itself
    is yielded first, followed by each of its aliases, always paired with
    the assembly name the candidate is expected to resolve to.
    """
    # Warnings raised while the pairs are generated are recorded rather than
    # shown; the recorded list itself is not needed, so it is not bound.
    with warnings.catch_warnings(record=True):
        for assembly_name, aliases in ensembl_reference_aliases.items():
            candidate_list = [assembly_name] + list(aliases)
            for candidate in candidate_list:
                yield (
                    candidate,
                    assembly_name
                )
49 |
@pytest.mark.parametrize(
    "candidate,assembly_name",
    generate_reference_name_aliases())
def test_infer_reference_name_aliases(candidate, assembly_name):
    """Each alias (and each assembly name) is inferred as its assembly."""
    inferred = infer_reference_name(candidate)
    eq_(inferred, assembly_name)
53 |
def generate_reference_name_fasta_filenames():
    """Yield (candidate, assembly_name) pairs from ``reference_test_cases``.

    The expected name itself is yielded first for every entry, then each of
    the listed input strings.
    """
    with warnings.catch_warnings(record=True):
        for expected_name, inputs in reference_test_cases.items():
            for candidate in [expected_name] + list(inputs):
                yield (candidate, expected_name)
63 |
@pytest.mark.parametrize(
    "candidate,assembly_name",
    generate_reference_name_fasta_filenames())
def test_reference_name_fasta_filenames(candidate, assembly_name):
    """Each file name / header line is inferred as its expected assembly."""
    inferred = infer_reference_name(candidate)
    eq_(inferred, assembly_name)
67 |
68 |
--------------------------------------------------------------------------------
/tests/test_string_helpers.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | from .common import eq_
14 |
15 | from varcode.string_helpers import trim_shared_flanking_strings
16 |
def test_trim_shared_string_endings():
    """Check trim_shared_flanking_strings on prefix, suffix and mixed cases.

    Every expected tuple has the form
    (trimmed first string, trimmed second string, shared prefix, shared suffix).
    """
    # empty strings
    eq_(trim_shared_flanking_strings("", "A"), ("", "A", "", ""))
    eq_(trim_shared_flanking_strings("A", ""), ("A", "", "", ""))

    # string pairs with shared prefixes
    eq_(trim_shared_flanking_strings("AA", "AA"), ("", "", "AA", ""))
    eq_(trim_shared_flanking_strings("AB", "AA"), ("B", "A", "A", ""))
    eq_(trim_shared_flanking_strings("AA", "AB"), ("A", "B", "A", ""))
    eq_(trim_shared_flanking_strings("AB", "A"), ("B", "", "A", ""))
    eq_(trim_shared_flanking_strings("A", "AB"), ("", "B", "A", ""))

    # string pairs with shared suffixes, in both argument orders (the second
    # assertion used to be an exact repeat of the first)
    eq_(trim_shared_flanking_strings("CCAT", "GT"),
        ("CCA", "G", "", "T"))
    eq_(trim_shared_flanking_strings("GT", "CCAT"),
        ("G", "CCA", "", "T"))

    # string pairs with shared prefixes+suffixes
    eq_(trim_shared_flanking_strings(
        "AATG", "AACG"), ("T", "C", "AA", "G"))
    eq_(trim_shared_flanking_strings(
        "ABG", "AG"), ("B", "", "A", "G"))
    eq_(trim_shared_flanking_strings(
        "AG", "ABG"), ("", "B", "A", "G"))
43 |
--------------------------------------------------------------------------------
/tests/test_timings.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | from __future__ import print_function, division, absolute_import
14 | import time
15 |
16 | from varcode.util import random_variants
17 |
18 | def _time_variant_annotation(variant_collection):
19 | start_t = time.time()
20 | effects = variant_collection.effects()
21 | end_t = time.time()
22 | assert len(effects.groupby_variant()) == len(variant_collection)
23 | elapsed_t = end_t - start_t
24 | return elapsed_t
25 |
26 |
def test_effect_timing(
        n_variants=100,
        random_seed=0,
        n_warmup_variants=5):
    """Annotating random variants must average under 100ms per variant."""
    # Annotate a small unseeded collection first so that the timed run
    # below is not the first annotation performed.
    random_variants(n_warmup_variants, random_seed=None).effects()

    timed_collection = random_variants(n_variants, random_seed=random_seed)
    elapsed_t = _time_variant_annotation(timed_collection)
    print("Elapsed: %0.4f for %d variants" % (elapsed_t, n_variants))
    seconds_per_variant = elapsed_t / n_variants
    assert seconds_per_variant < 0.1, \
        "Should be faster than 100ms / variant!"


if __name__ == "__main__":
    test_effect_timing()
46 |
--------------------------------------------------------------------------------
/tests/test_variant.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | """
14 | Test simple properties of Variant objects, such as their trimming
15 | of shared prefix/suffix strings from ref/alt fields.
16 | """
17 |
18 | import pickle
19 | from pyensembl import ensembl_grch38
20 |
21 | from varcode import Variant
22 | from .common import eq_
23 |
def test_insertion_shared_prefix():
    """An insertion given with a shared "AA" prefix is trimmed to a bare "T"."""
    variant = Variant(1, start=10, ref="AA", alt="AAT")
    expected_attributes = [
        ("contig", "1"),
        ("original_ref", "AA"),
        ("original_alt", "AAT"),
        ("original_start", 10),
        # since this variant is just an insertion of a "T", the shared
        # prefix context is dropped from ref/alt
        ("ref", ""),
        ("alt", "T"),
        # the [start,end] interval for an insertion is just the base we're
        # inserting after, which in this case is the 11th position
        ("start", 11),
        ("end", 11),
        ("short_description", "chr1 g.11_12insT"),
    ]
    for attribute, expected in expected_attributes:
        eq_(getattr(variant, attribute), expected)
    assert variant.is_indel
    assert variant.is_insertion
    assert not variant.is_deletion
42 |
def test_insertion_no_prefix():
    """An insertion given with an empty ref keeps its fields unchanged."""
    variant = Variant(1, start=11, ref="", alt="T")
    expected_attributes = [
        ("contig", "1"),
        ("original_ref", ""),
        ("original_alt", "T"),
        ("original_start", 11),
        ("ref", ""),
        ("alt", "T"),
        ("start", 11),
        ("end", 11),
        ("short_description", "chr1 g.11_12insT"),
    ]
    for attribute, expected in expected_attributes:
        eq_(getattr(variant, attribute), expected)
    assert variant.is_indel
    assert variant.is_insertion
    assert not variant.is_deletion
57 |
def test_substitution_no_prefix():
    """A single-base A>T substitution keeps its fields unchanged."""
    variant = Variant(1, start=11, ref="A", alt="T")
    expected_attributes = [
        ("contig", "1"),
        ("original_ref", "A"),
        ("original_alt", "T"),
        ("original_start", 11),
        ("ref", "A"),
        ("alt", "T"),
        ("start", 11),
        ("end", 11),
        ("short_description", "chr1 g.11A>T"),
    ]
    for attribute, expected in expected_attributes:
        eq_(getattr(variant, attribute), expected)
    assert not variant.is_indel
    assert not variant.is_insertion
    assert not variant.is_deletion
72 |
def test_substitution_shared_prefix():
    """A substitution given as AA>AT is trimmed to A>T at position 11."""
    variant = Variant(1, start=10, ref="AA", alt="AT")
    expected_attributes = [
        ("contig", "1"),
        ("original_ref", "AA"),
        ("original_alt", "AT"),
        ("original_start", 10),
        ("ref", "A"),
        ("alt", "T"),
        ("start", 11),
        ("end", 11),
        ("short_description", "chr1 g.11A>T"),
    ]
    for attribute, expected in expected_attributes:
        eq_(getattr(variant, attribute), expected)
    assert not variant.is_indel
    assert not variant.is_insertion
    assert not variant.is_deletion
87 |
def test_deletion_shared_suffix():
    """A deletion given as AAC>C is trimmed to a deletion of "AA"."""
    variant = Variant(1, start=10, ref="AAC", alt="C")
    expected_attributes = [
        ("contig", "1"),
        ("original_ref", "AAC"),
        ("original_alt", "C"),
        ("original_start", 10),
        ("ref", "AA"),
        ("alt", ""),
        ("start", 10),
        ("end", 11),
        ("short_description", "chr1 g.10_11delAA"),
    ]
    for attribute, expected in expected_attributes:
        eq_(getattr(variant, attribute), expected)
    assert variant.is_indel
    assert not variant.is_insertion
    assert variant.is_deletion
102 |
def test_deletion_no_suffix():
    """A deletion given with an empty alt keeps its fields unchanged."""
    variant = Variant(1, start=10, ref="AA", alt="")
    expected_attributes = [
        ("contig", "1"),
        ("original_ref", "AA"),
        ("original_alt", ""),
        ("original_start", 10),
        ("ref", "AA"),
        ("alt", ""),
        ("start", 10),
        ("end", 11),
        ("short_description", "chr1 g.10_11delAA"),
    ]
    for attribute, expected in expected_attributes:
        eq_(getattr(variant, attribute), expected)
    assert variant.is_indel
    assert not variant.is_insertion
    assert variant.is_deletion
117 |
def test_serialization():
    """Variants survive a pickle round trip and a JSON round trip."""
    variants = [
        Variant(
            1, start=10, ref="AA", alt="AAT", genome=ensembl_grch38),
        Variant(10, start=15, ref="A", alt="G"),
        Variant(20, start=150, ref="", alt="G"),
    ]
    compared_fields = (
        "contig",
        "ref",
        "alt",
        "start",
        "end",
        "original_ref",
        "original_alt",
        "original_start",
    )
    for original in variants:
        # This causes the variant's ensembl object to make a SQL connection,
        # which makes the ensembl object non-serializable. By calling this
        # method, we are checking that we don't attempt to directly serialize
        # the ensembl object.
        original.effects()

        # Test pickling.
        unpickled = pickle.loads(pickle.dumps(original))
        eq_(original, unpickled)
        for field in compared_fields:
            eq_(getattr(original, field), getattr(unpickled, field))

        # Test json.
        from_json = Variant.from_json(original.to_json())
        eq_(original, from_json)
150 |
def test_deserialization_old_keywords():
    """JSON written with the older keyword names can still be loaded."""
    # The payload uses "normalize_contig_name" and "ensembl" as keys; the
    # assertions below read normalize_contig_names and reference_name.
    old_variant_representation_json = """
{
"ref": "T",
"contig": "22",
"start": 23230319,
"__class__": {
"__name__": "Variant",
"__module__": "varcode.variant"
},
"normalize_contig_name": true,
"alt": "G",
"allow_extended_nucleotides": false,
"ensembl": {
"__class__": {
"__name__": "EnsemblRelease",
"__module__": "pyensembl.ensembl_release"
},
"release": 75,
"server": "ftp://ftp.ensembl.org",
"species": {
"__class__": {
"__name__": "Species",
"__module__": "pyensembl.species"
},
"latin_name": "homo_sapiens"
}
}
}
"""
    loaded = Variant.from_json(old_variant_representation_json)
    expected_attributes = [
        ("contig", "22"),
        ("ref", "T"),
        ("alt", "G"),
        ("reference_name", "GRCh37"),
        ("normalize_contig_names", True),
        ("allow_extended_nucleotides", False),
    ]
    for attribute, expected in expected_attributes:
        eq_(getattr(loaded, attribute), expected)
188 |
def test_hg19_chromosome_names():
    """Check contig names of hg19 variants with UCSC conversion on and off."""
    # (input contig, convert_ucsc_contig_names, expected contig)
    cases = [
        # mitochondrial name
        ("M", True, "MT"),
        ("M", False, "M"),
        ("chrM", True, "MT"),
        ("chrM", False, "chrM"),
        # lowercase input
        ("chrm", True, "MT"),
        ("chrm", False, "chrM"),
        # trimming of 'chr' prefix from hg19
        ("chr1", True, "1"),
        ("chr1", False, "chr1"),
    ]
    for contig, convert, expected in cases:
        variant = Variant(
            contig, 1, "A", "G", "hg19", convert_ucsc_contig_names=convert)
        eq_(variant.contig, expected)
204 |
def test_contig_name_normalization():
    """Check contig values with normalize_contig_names switched on and off."""
    normalized = Variant(1, 1, "A", "G", normalize_contig_names=True)
    eq_(normalized.contig, "1")
    unnormalized = Variant(1, 1, "A", "G", normalize_contig_names=False)
    eq_(unnormalized.contig, 1)

    # lowercase "chrm" input, with UCSC name conversion disabled
    for normalize, expected in ((True, "chrM"), (False, "chrm")):
        variant = Variant(
            "chrm", 1, "A", "G",
            normalize_contig_names=normalize,
            convert_ucsc_contig_names=False)
        eq_(variant.contig, expected)
214 |
215 |
def test_snv_transition_transversion():
    """Check is_snv / is_transition / is_transversion on C>C, C>T and C>A."""
    # identical ref and alt is not an SNV
    unchanged = Variant(1, start=100, ref="C", alt="C")
    assert not unchanged.is_snv

    transition = Variant(1, start=100, ref="C", alt="T")
    assert transition.is_snv
    assert transition.is_transition
    assert not transition.is_transversion

    transversion = Variant(1, start=100, ref="C", alt="A")
    assert transversion.is_snv
    assert not transversion.is_transition
    assert transversion.is_transversion
229 |
--------------------------------------------------------------------------------
/tests/test_variant_collection.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | """
14 | Test properties of VariantCollection objects other than effect annotations
15 | """
16 | from collections import Counter
17 | import pickle
18 |
19 | from .common import eq_
20 | from .data import ov_wustle_variants, tcga_ov_variants
21 |
22 | from varcode import VariantCollection, Variant
23 |
def test_variant_collection_union():
    """The union keeps both sources and every variant of both collections."""
    union = ov_wustle_variants.union(tcga_ov_variants)
    eq_(
        set(union.sources),
        {ov_wustle_variants.source, tcga_ov_variants.source})
    expected_size = len(ov_wustle_variants) + len(tcga_ov_variants)
    eq_(len(union), expected_size)
28 |
def test_variant_collection_intersection():
    """The intersection keeps both sources but contains no variants."""
    intersection = ov_wustle_variants.intersection(tcga_ov_variants)
    eq_(
        set(intersection.sources),
        {ov_wustle_variants.source, tcga_ov_variants.source})
    eq_(len(intersection), 0)
33 |
def test_variant_collection_gene_counts():
    """Every gene in the ov_wustle collection has a count of exactly one."""
    gene_counts = ov_wustle_variants.gene_counts()
    observed_counts = list(gene_counts.values())
    # test that each gene is counted just once
    eq_(observed_counts, [1] * len(gene_counts))
38 |
def test_variant_collection_groupby_gene():
    """groupby_gene and groupby_gene_id agree on the set of gene IDs."""
    genes = ov_wustle_variants.groupby_gene().keys()
    # make sure that the IDs attached to Gene objects are the same as IDs
    # of groupby_gene_id
    ids_from_groupby_gene_id = set(ov_wustle_variants.groupby_gene_id().keys())
    ids_from_gene_objects = {gene.id for gene in genes}
    eq_(ids_from_gene_objects, ids_from_groupby_gene_id)
45 |
def test_variant_collection_groupby_gene_id():
    """groupby_gene_id yields exactly the five expected Ensembl gene IDs."""
    expected_gene_ids = {
        'ENSG00000060718',
        'ENSG00000156876',
        'ENSG00000130939',
        'ENSG00000122477',
        'ENSG00000162688',
    }
    observed_gene_ids = set(ov_wustle_variants.groupby_gene_id().keys())
    eq_(observed_gene_ids, expected_gene_ids)
55 |
def test_variant_collection_groupby_gene_name():
    """groupby_gene_name yields exactly the five expected gene names."""
    observed_names = set(ov_wustle_variants.groupby_gene_name().keys())
    eq_(observed_names, {"AGL", "SASS6", "LRRC39", "UBE4B", "COL11A1"})
59 |
def test_reference_names():
    """The ov_wustle collection reports GRCh37 as its only reference name."""
    observed = ov_wustle_variants.reference_names()
    eq_(observed, {"GRCh37"})
62 |
def test_to_string():
    """str() of the collection mentions the G>C variant at 10238758."""
    string_repr = str(ov_wustle_variants)
    expected_fragment = "start=10238758, ref='G', alt='C'"
    assert expected_fragment in string_repr, \
        "Expected variant g.10238758 G>C in __str__:\n%s" % (
            string_repr,)
68 |
def test_detailed_string():
    """detailed_string() mentions gene UBE4B and the G>C variant at 10238758."""
    detailed_string = ov_wustle_variants.detailed_string()
    # expect one of the gene names from the MAF to be in the summary string
    assert "UBE4B" in detailed_string, \
        "Expected gene name UBE4B in detailed_string():\n%s" % detailed_string
    expected_fragment = "start=10238758, ref='G', alt='C'"
    assert expected_fragment in detailed_string, \
        "Expected variant g.10238758 G>C in detailed_string():\n%s" % (
            detailed_string,)
77 |
def test_gene_counts():
    """Four known coding genes are each counted once in the TCGA collection."""
    expected_coding_gene_counts = Counter({
        "CDK11A": 1,
        "GNPAT": 1,
        "E2F2": 1,
        "VSIG2": 1,
    })
    all_gene_counts = tcga_ov_variants.gene_counts()
    assert len(all_gene_counts) > len(expected_coding_gene_counts), \
        ("Gene counts for all genes must contain more elements than"
         " gene counts for only coding genes.")
    for gene_name, expected_count in expected_coding_gene_counts.items():
        eq_(expected_count, all_gene_counts[gene_name])

    # TODO: add `only_coding` parameter to gene_counts and then test
    # for exact equality between `coding_gene_counts` and
    # `expected_counts`
    #
    # coding_gene_counts = variants.gene_counts(only_coding=True)
    # eq_(coding_gene_counts, expected_counts)
97 |
def test_variant_collection_serialization():
    """A collection with metadata survives pickle and JSON round trips."""
    variant_list = [
        Variant(
            1, start=10, ref="AA", alt="AAT"),
        Variant(10, start=15, ref="A", alt="G"),
        Variant(20, start=150, ref="", alt="G"),
    ]
    per_variant_metadata = {
        variant: {"a": "b", "bar": 2} for variant in variant_list
    }
    original = VariantCollection(
        variant_list,
        source_to_metadata_dict={"test_data": per_variant_metadata})

    # This causes the variants' ensembl objects to make a SQL connection,
    # which makes the ensembl object non-serializable. By calling this
    # method, we are checking that we don't attempt to directly serialize
    # the ensembl object.
    original.effects()

    original_first_variant = original[0]
    original_metadata = original.metadata
    expected_first_metadata = original_metadata[original_first_variant]

    # Test pickling
    unpickled = pickle.loads(pickle.dumps(original))
    eq_(original, unpickled)
    eq_(unpickled[0], original_first_variant)
    eq_(unpickled.metadata[original_first_variant], expected_first_metadata)

    merged = original.intersection(original)
    eq_(merged, pickle.loads(pickle.dumps(merged)))

    # Test JSON serialization
    variants_from_json = VariantCollection.from_json(original.to_json())
    eq_(original, variants_from_json)

    eq_(variants_from_json[0], original_first_variant)

    # pylint: disable=no-member
    eq_(variants_from_json.metadata[original_first_variant],
        expected_first_metadata)
140 |
def test_merged_variant_collection_serialization():
    """Intersection and union results both survive a pickle round trip."""
    for combine in (ov_wustle_variants.intersection, ov_wustle_variants.union):
        merged = combine(tcga_ov_variants)
        eq_(merged, pickle.loads(pickle.dumps(merged)))
147 |
--------------------------------------------------------------------------------
/tests/test_vcf.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | import os
14 |
15 | import pytest
16 |
17 | from pyensembl import cached_release
18 | from varcode import load_vcf, Variant
19 |
20 | from .common import eq_
21 | from .data import data_path
22 |
23 |
24 |
25 | # Set to 1 to enable, 0 to disable.
26 | # TODO: consider running in an in-process HTTP server instead for these tests.
27 | RUN_TESTS_REQUIRING_INTERNET = bool(int(
28 | os.environ.get("RUN_TESTS_REQUIRING_INTERNET", 0)))
29 |
30 | HG19_VCF_FILENAME = data_path("somatic_hg19_14muts.vcf")
31 | HG19_VCF_EXTERNAL_URL = (
32 | "https://raw.githubusercontent.com/hammerlab/varcode/master/test/data/somatic_hg19_14muts.vcf")
33 |
34 | # To load from the branch that introduced these changes:
35 | # (needed before this gets merged to master, can be removed after)
36 | # VCF_EXTERNAL_URL = (
37 | # "https://raw.githubusercontent.com/hammerlab/varcode/faster-vcf-parsing/test/data/somatic_hg19_14muts.vcf")
38 |
def test_load_vcf_local():
    """Load the bundled hg19 VCF through each supported local path spelling."""
    local_sources = [
        HG19_VCF_FILENAME,
        HG19_VCF_FILENAME + ".gz",
        "file://%s" % HG19_VCF_FILENAME,
        "file://%s.gz" % HG19_VCF_FILENAME,
        # An extra slash before an absolute path can confuse URL parsing.
        # Test that it can still be opened:
        "/%s" % HG19_VCF_FILENAME,
    ]
    for source in local_sources:
        variants = load_vcf(source)
        assert variants.reference_names() == {"GRCh37"}
        assert len(variants) == 14
61 |
if RUN_TESTS_REQUIRING_INTERNET:
    def test_load_vcf_external():
        """
        Load the hg19 example VCF, plain and gzipped, from its remote URL.

        Only defined when RUN_TESTS_REQUIRING_INTERNET is enabled.
        """
        # Use the remote URL rather than the local path: loading
        # HG19_VCF_FILENAME here would only repeat test_load_vcf_local and
        # never exercise remote loading.
        variants = load_vcf(HG19_VCF_EXTERNAL_URL)
        eq_(variants.reference_names(), {"GRCh37"})
        eq_(variants.original_reference_names(), {"hg19"})
        eq_(len(variants), 14)

        variants = load_vcf(HG19_VCF_EXTERNAL_URL + ".gz")
        eq_(variants.reference_names(), {"GRCh37"})
        eq_(len(variants), 14)
72 |
def test_vcf_reference_name():
    """After normalization, hg19 should be remapped to GRCh37."""
    reference_names = load_vcf(HG19_VCF_FILENAME).reference_names()
    assert reference_names == {"GRCh37"}
78 |
def test_genome_arg_to_load_vcf_hg19():
    """Passing genome="hg19" explicitly gives the same result as the default."""
    default_genome = load_vcf(HG19_VCF_FILENAME)
    explicit_genome = load_vcf(HG19_VCF_FILENAME, genome="hg19")
    eq_(default_genome, explicit_genome)
82 |
def test_genome_arg_to_load_vcf_int_75():
    """
    If we use Ensembl 75 -- which is backed by GRCh37 -- then the two variant
    collections will be the same as long as we also convert the contig names.
    """
    default_genome = load_vcf(HG19_VCF_FILENAME)
    converted = load_vcf(
        HG19_VCF_FILENAME, genome=75, convert_ucsc_contig_names=True)
    eq_(default_genome, converted)

    default_genome = load_vcf(HG19_VCF_FILENAME)
    unconverted = load_vcf(
        HG19_VCF_FILENAME, genome=75, convert_ucsc_contig_names=False)
    assert default_genome != unconverted
93 |
def test_genome_arg_to_load_vcf_cached_75():
    """
    A cached_release(75) genome object matches the default load only when
    UCSC contig names are converted.
    """
    default_genome = load_vcf(HG19_VCF_FILENAME)
    converted = load_vcf(
        HG19_VCF_FILENAME,
        genome=cached_release(75),
        convert_ucsc_contig_names=True)
    eq_(default_genome, converted)

    default_genome = load_vcf(HG19_VCF_FILENAME)
    unconverted = load_vcf(
        HG19_VCF_FILENAME,
        genome=cached_release(75),
        convert_ucsc_contig_names=False)
    assert default_genome != unconverted
102 |
def test_genome_arg_to_load_vcf_grch37():
    """
    "grch37" and "GRCh37" both match the default load when UCSC contig
    names are converted, and "grch37" differs from it when they are not.
    """
    for genome_name in ("grch37", "GRCh37"):
        default_genome = load_vcf(HG19_VCF_FILENAME)
        converted = load_vcf(
            HG19_VCF_FILENAME,
            genome=genome_name,
            convert_ucsc_contig_names=True)
        eq_(default_genome, converted)

    default_genome = load_vcf(HG19_VCF_FILENAME)
    unconverted = load_vcf(
        HG19_VCF_FILENAME,
        genome="grch37",
        convert_ucsc_contig_names=False)
    assert default_genome != unconverted
118 |
def test_genome_arg_to_load_vcf_b37():
    """genome="b37" with contig-name conversion equals the default load."""
    default_genome = load_vcf(HG19_VCF_FILENAME)
    b37_genome = load_vcf(
        HG19_VCF_FILENAME, genome="b37", convert_ucsc_contig_names=True)
    eq_(default_genome, b37_genome)
122 |
def test_vcf_number_entries():
    """All 14 mutations listed in the VCF are parsed."""
    num_variants = len(load_vcf(HG19_VCF_FILENAME))
    assert num_variants == 14, \
        "Expected 14 mutations, got %d" % (num_variants,)
128 |
def test_vcf_number_entries_duplicates():
    """
    The VCF lists 3 duplicated mutations: `distinct=True` collapses them to
    one variant, `distinct=False` keeps all three.
    """
    vcf_path = data_path("duplicates.vcf")
    expected_count_by_distinct = [(True, 1), (False, 3)]
    for distinct, expected_count in expected_count_by_distinct:
        variants = load_vcf(vcf_path, genome='hg38', distinct=distinct)
        assert len(variants) == expected_count
142 |
def generate_vcf_gene_names():
    """
    Yield one (collection, variant) pair for every variant in the hg19
    example VCF; used as the parameter source for test_vcf_gene_names.
    """
    collection = load_vcf(HG19_VCF_FILENAME)
    for entry in collection:
        yield (collection, entry)
147 |
@pytest.mark.parametrize(['collection', 'variant'], generate_vcf_gene_names())
def test_vcf_gene_names(collection, variant):
    """The 'GE' INFO entry of each record equals the variant's gene names."""
    annotated_gene_names = collection.metadata[variant]['info']['GE']
    computed_gene_names = variant.gene_names
    assert computed_gene_names == annotated_gene_names, \
        "Expected gene name %s for variant %s, got %s" % (
            annotated_gene_names, variant, computed_gene_names)
154 |
155 |
def test_multiple_alleles_per_line():
    """A VCF line with two alt alleles yields two separate variants."""
    loaded = load_vcf(data_path("multiallelic.vcf"))
    assert len(loaded) == 2, "Expected 2 variants but got %s" % loaded
    expected = {
        Variant(1, 1431105, "A", "C", genome="GRCh37"),
        Variant(1, 1431105, "A", "G", genome="GRCh37"),
    }
    eq_(set(loaded), expected)
165 |
def test_sample_info_genotype():
    """Both variants of the multiallelic VCF carry GT '0/1' for 'metastasis'."""
    variants = load_vcf(data_path("multiallelic.vcf"))
    assert len(variants) == 2, "Expected 2 variants but got %s" % variants
    for index in (0, 1):
        sample_info = variants.metadata[variants[index]]['sample_info']
        eq_(sample_info['metastasis']['GT'], '0/1')
173 |
--------------------------------------------------------------------------------
/tests/test_vcf_output.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | import tempfile
14 |
15 | import pytest
16 |
17 | from varcode import load_vcf, load_maf
18 | from varcode.vcf_output import variants_to_vcf
19 |
20 | from .data import data_path
21 |
22 |
23 | TEST_FILENAMES_HUMAN = [
24 | 'duplicates.maf',
25 | 'multiallelic.vcf',
26 | 'mutect-example.vcf',
27 | 'ov.wustle.subset5.maf',
28 | 'somatic_hg19_14muts.space_in_sample_name.vcf',
29 | 'somatic_hg19_14muts.vcf',
30 | 'strelka-example.vcf',
31 | 'tcga_ov.head.maf',
32 | 'tcga_ov.head.xychr.maf',
33 | # 'dbnsfp_validation_set.csv', # csv
34 | # 'duplicates.vcf', # no ref genome header
35 | # 'mutect-example-headerless.vcf', # no ref genome header
36 | # 'somatic_hg19_14muts.vcf.gz', # gzip
37 | ]
38 |
39 | TEST_FILENAMES_MOUSE = [
40 | 'mouse_vcf_dbsnp_chr1_partial.vcf',
41 | ]
42 |
43 | TEST_FILENAMES = TEST_FILENAMES_HUMAN + TEST_FILENAMES_MOUSE
44 |
45 |
def _merge_metadata_naive(variants):
    """
    Flatten `variants.source_to_metadata_dict` into one dictionary.

    The per-source dictionaries are merged in iteration order, so when the
    same key appears under several sources the last one wins.
    """
    merged = {}
    for per_source in variants.source_to_metadata_dict.values():
        for key, value in per_source.items():
            merged[key] = value
    return merged
52 |
53 |
54 |
def _do_roundtrip_test(filenames, convert_ucsc_to_grch37=False):
    """
    Load variants from test data files, write them to a temporary VCF,
    parse that VCF again and check that the variants survived.

    Parameters
    ----------
    filenames : list of str
        Names of files under the test data directory. The extension
        ('vcf' or 'maf') selects the loader.

    convert_ucsc_to_grch37 : bool
        If True, call `clone_without_ucsc_data()` on the combined
        collection before writing it out.

    Returns a tuple (variants, reparsed_variants).
    """
    import os

    loaders = {
        'vcf': load_vcf,
        'maf': load_maf
    }

    def load_fn(filename):
        return loaders[filename.split('.')[-1]]

    def load_variants():
        variant_collections = [
            load_fn(filename)(data_path(filename))
            for filename in filenames
        ]
        return variant_collections[0].union(*variant_collections[1:])

    variants = load_variants()
    if convert_ucsc_to_grch37:
        variants = variants.clone_without_ucsc_data()

    # delete=False so the file can be reopened by name after it is closed.
    with tempfile.NamedTemporaryFile(mode='w', delete=False) as f:
        metadata = _merge_metadata_naive(variants)
        variants_to_vcf(variants, metadata, out=f)
        tmp_name = f.name
    try:
        reparsed_variants = load_vcf(tmp_name)
    finally:
        # The temporary file is not removed automatically; clean it up so
        # repeated test runs don't leave files behind.
        os.remove(tmp_name)

    # `==` checks the reference genome, which won't necessarily match,
    # so compare the coordinate and allele fields individually.
    for (v1, v2) in zip(variants, reparsed_variants):
        assert (
            v1.contig == v2.contig and
            v1.start == v2.start and
            v1.ref == v2.ref and
            v1.alt == v2.alt), (v1, v2)

    return (variants, reparsed_variants)
88 |
89 | # TODO:
90 | # There is definitely more opportunity here to compare metadata
91 | # fields, with caveats.
92 | # ---
93 | # First, any variants from non-VCF sources (e.g., MAF files) will inevitably
94 | # lose some information through the change in representation (more importantly,
95 | # even if there is no loss in data, that data will be in a different format in
96 | # the new metadata dictionary). Thus, we should either ignore such variants
97 | # or only check certain fields.
98 | # ---
99 | # Second, without the original metadata headers in the VCF file, all metadata
100 | # information will be parsed as strings. Thus, for a simple comparison between
101 | # metadata (without the need to individually convert fields), we'd need to add
102 | # these headers to the output VCF file. See `vcf_output.py` for more info.
103 |
104 |
@pytest.mark.parametrize(['filename'], [(f,) for f in TEST_FILENAMES])
def test_roundtrip_serialization_single_file(filename):
    """Round-trip each test data file on its own through VCF output."""
    single_file_group = [filename]
    _do_roundtrip_test(single_file_group)
108 |
109 | FILENAME_PAIRS = (
110 | ['simple.1.vcf', 'simple.2.vcf'], # basic multi-file VCF test
111 | ['duplicates.maf', 'ov.wustle.subset5.maf'], # multiple MAF files
112 | ['duplicate-id.1.vcf', 'duplicate-id.2.vcf'],
113 | )
114 |
@pytest.mark.parametrize(['file_group'], [(f,) for f in FILENAME_PAIRS])
def test_multiple_file_roundtrip_conversion(file_group):
    """Round-trip the union of each file pair in FILENAME_PAIRS."""
    _do_roundtrip_test(filenames=file_group)
118 |
def test_multiple_file_roundtrip_conversion_mixed_references():
    """
    Round-trip hg19 VCF files converted to GRCh37 combined with b37 MAFs.
    """
    _do_roundtrip_test(
        filenames=TEST_FILENAMES_HUMAN,
        convert_ucsc_to_grch37=True)
123 |
def test_same_samples_produce_samples():
    """test_same_samples_produce_samples

    Ensures that, if a set of variants have the same samples, the reparsed
    collection will output these samples.
    """
    input_files = ['same-samples.1.vcf', 'same-samples.2.vcf']
    (variants, reparsed_variants) = _do_roundtrip_test(input_files)

    original_metadata = _merge_metadata_naive(variants)
    reparsed_metadata = _merge_metadata_naive(reparsed_variants)

    # Take the sample names of the first original variant as the reference.
    first_entry = list(original_metadata.values())[0]
    sample_names = set(first_entry['sample_info'].keys())

    for entry in reparsed_metadata.values():
        reparsed_names = set(entry.get('sample_info', {}).keys())
        assert reparsed_names == sample_names
140 |
141 |
def test_different_samples_produce_no_samples():
    """test_different_samples_produce_no_samples

    Ensures that, if a set of variants have different samples, the reparsed
    collection will not output any samples.

    See `vcf_output.py` for details as to why this is the way it's done for now.
    """
    input_files = ['different-samples.1.vcf', 'different-samples.2.vcf']
    (_, reparsed_variants) = _do_roundtrip_test(input_files)

    reparsed_metadata = _merge_metadata_naive(reparsed_variants)
    for entry in reparsed_metadata.values():
        assert entry.get('sample_info') is None
155 |
--------------------------------------------------------------------------------
/varcode/__init__.py:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed under the Apache License, Version 2.0 (the "License");
3 | # you may not use this file except in compliance with the License.
4 | # You may obtain a copy of the License at
5 | #
6 | # http://www.apache.org/licenses/LICENSE-2.0
7 | #
8 | # Unless required by applicable law or agreed to in writing, software
9 | # distributed under the License is distributed on an "AS IS" BASIS,
10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | # See the License for the specific language governing permissions and
12 | # limitations under the License.
13 |
14 | from .variant import Variant
15 | from .variant_collection import VariantCollection
16 | from .maf import load_maf, load_maf_dataframe
17 | from .vcf import load_vcf, load_vcf_fast
18 | from .effects import (
19 | effect_priority,
20 | top_priority_effect,
21 | EffectCollection,
22 | MutationEffect,
23 | NonsilentCodingMutation,
24 | )
25 | from .version import __version__
26 |
27 | __all__ = [
28 | "__version__",
29 |
30 | # basic classes
31 | "Variant",
32 | "EffectCollection",
33 | "VariantCollection",
34 |
35 | # effects
36 | "effect_priority",
37 | "top_priority_effect",
38 | "MutationEffect",
39 | "NonsilentCodingMutation",
40 |
41 | # file loading
42 | "load_maf",
43 | "load_maf_dataframe",
44 | "load_vcf",
45 | "load_vcf_fast",
46 | ]
47 |
--------------------------------------------------------------------------------
/varcode/cli/__init__.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | from .variant_args import (
14 | add_variant_args,
15 | make_variants_parser,
16 | variant_collection_from_args,
17 | )
18 |
19 | __all__ = [
20 | "add_variant_args",
21 | "make_variants_parser",
22 | "variant_collection_from_args",
23 | ]
24 |
--------------------------------------------------------------------------------
/varcode/cli/effects_script.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | import logging.config
14 | import pkg_resources
15 | import sys
16 |
17 | from .version_info import print_version_info
18 | from .variant_args import make_variants_parser, variant_collection_from_args
19 |
20 |
21 | logging.config.fileConfig(pkg_resources.resource_filename(__name__, 'logging.conf'))
22 | logger = logging.getLogger(__name__)
23 |
24 | arg_parser = make_variants_parser(
25 | description="Annotate variants with overlapping gene names and predicted coding effects")
26 |
27 | arg_parser.add_argument("--output-csv", help="Output path to CSV")
28 |
29 | arg_parser.add_argument(
30 | "--one-per-variant",
31 | default=False,
32 | action="store_true",
33 | help=(
34 | "Only return highest priority effect overlapping a variant, "
35 | "otherwise all overlapping transcripts are returned."))
36 |
37 | arg_parser.add_argument(
38 | "--only-coding",
39 | default=False,
40 | action="store_true",
41 | help="Filter silent and non-coding effects")
42 |
def main(args_list=None):
    r"""
    Script which loads variants and annotates them with overlapping genes
    and predicted coding effects.

    Parameters
    ----------
    args_list : list of str, optional
        Commandline arguments to parse; `sys.argv[1:]` is used when None.

    Example usage:
        varcode \
            --vcf mutect.vcf \
            --vcf strelka.vcf \
            --maf tcga_brca.maf \
            --variant chr1 498584 C G \
            --json-variants more_variants.json
    """
    print_version_info()
    argv = sys.argv[1:] if args_list is None else args_list
    args = arg_parser.parse_args(argv)

    effects = variant_collection_from_args(args).effects()
    if args.only_coding:
        effects = effects.drop_silent_and_noncoding()
    if args.one_per_variant:
        # Keep only the highest priority effect of each variant.
        top_effects = effects.top_priority_effect_per_variant().values()
        effects = effects.clone_with_new_elements(list(top_effects))

    effects_dataframe = effects.to_dataframe()
    logger.info('\n%s', effects)
    if args.output_csv:
        effects_dataframe.to_csv(args.output_csv, index=False)
73 |
--------------------------------------------------------------------------------
/varcode/cli/genes_script.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | import logging
14 | import logging.config
15 | import pkg_resources
16 | import sys
17 |
18 | from .version_info import print_version_info
19 | from .variant_args import make_variants_parser, variant_collection_from_args
20 |
21 |
22 | logging.config.fileConfig(pkg_resources.resource_filename(__name__, 'logging.conf'))
23 | logger = logging.getLogger(__name__)
24 |
25 | arg_parser = make_variants_parser(
26 | description="Annotate variants with overlapping gene names")
27 | arg_parser.add_argument("--output-csv", help="Output path to CSV")
28 |
def main(args_list=None):
    r"""
    Script which loads variants and annotates them with overlapping genes.

    Parameters
    ----------
    args_list : list of str, optional
        Commandline arguments to parse; `sys.argv[1:]` is used when None.

    Example usage:
        varcode-genes \
            --vcf mutect.vcf \
            --vcf strelka.vcf \
            --maf tcga_brca.maf \
            --variant chr1 498584 C G \
            --json-variants more_variants.json
    """
    print_version_info()
    argv = sys.argv[1:] if args_list is None else args_list
    args = arg_parser.parse_args(argv)

    variants_dataframe = variant_collection_from_args(args).to_dataframe()
    logger.info('\n%s', variants_dataframe)

    csv_path = args.output_csv
    if csv_path:
        variants_dataframe.to_csv(csv_path, index=False)
50 |
--------------------------------------------------------------------------------
/varcode/cli/logging.conf:
--------------------------------------------------------------------------------
1 | [loggers]
2 | keys=root,varcode,pyensembl,datacache
3 |
4 | [formatters]
5 | keys=simpleFormatter
6 |
7 | [handlers]
8 | keys=consoleHandler,consoleHandlerCritical
9 |
10 | [logger_root]
11 | level=INFO
12 | handlers=consoleHandlerCritical
13 |
14 | [handler_consoleHandler]
15 | class=StreamHandler
16 | level=INFO
17 | formatter=simpleFormatter
18 | args=(sys.stdout,)
19 |
20 | [handler_consoleHandlerCritical] # only for root logger: essentially silent
21 | class=StreamHandler
22 | level=CRITICAL
23 | formatter=simpleFormatter
24 | args=(sys.stdout,)
25 |
26 | [formatter_simpleFormatter]
27 | format=%(asctime)s - %(name)s - %(levelname)s - %(message)s
28 | datefmt=
29 |
30 | # varcode
31 |
32 | [logger_varcode]
33 | level=DEBUG
34 | qualname=varcode
35 | handlers=consoleHandler
36 |
37 | # pyensembl
38 |
39 | [logger_pyensembl]
40 | level=DEBUG
41 | qualname=pyensembl
42 | handlers=consoleHandler
43 |
44 | # datacache
45 |
46 | [logger_datacache]
47 | level=DEBUG
48 | qualname=datacache
49 | handlers=consoleHandler
50 |
--------------------------------------------------------------------------------
/varcode/cli/variant_args.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | from argparse import ArgumentParser
14 |
15 | from ..vcf import load_vcf
16 | from ..maf import load_maf
17 | from ..variant_collection import VariantCollection
18 | from ..variant import Variant
19 |
20 |
def add_variant_args(arg_parser):
    """
    Extends an ArgumentParser instance with the following commandline arguments:
        --vcf
        --maf
        --variant
        --genome
        --download-reference-genome-data
        --json-variants

    Parameters
    ----------
    arg_parser : argparse.ArgumentParser
        Parser to which a "Variants" argument group is added.

    Returns the newly created argument group.
    """
    variant_arg_group = arg_parser.add_argument_group(
        title="Variants",
        description="Genomic variant files")

    # File and variant options may be repeated; each use appends to a list.
    variant_arg_group.add_argument(
        "--vcf",
        default=[],
        action="append",
        help="Genomic variants in VCF format")

    variant_arg_group.add_argument(
        "--maf",
        default=[],
        action="append",
        help="Genomic variants in TCGA's MAF format")

    variant_arg_group.add_argument(
        "--variant",
        default=[],
        action="append",
        nargs=4,
        metavar=("CHR", "POS", "REF", "ALT"),
        help=(
            "Individual variant as 4 arguments giving chromosome, position, ref,"
            " and alt. Example: chr1 3848 C G. Use '.' to indicate empty alleles"
            " for insertions or deletions."))

    variant_arg_group.add_argument(
        "--genome",
        type=str,
        help=(
            "What reference assembly your variant coordinates are using. "
            "Examples: 'hg19', 'GRCh38', or 'mm9'. "
            "This argument is ignored for MAF files, since each row includes "
            "the reference. "
            "For VCF files, this is used if specified, and otherwise is guessed from "
            "the header. For variants specified on the commandline with --variant, "
            "this option is required."))

    variant_arg_group.add_argument(
        "--download-reference-genome-data",
        action="store_true",
        default=False,
        help=(
            "Automatically download genome reference data required for "
            "annotation using PyEnsembl. Otherwise you must first run "
            "'pyensembl install' for the release/species corresponding "
            "to the genome used in your VCF."))

    variant_arg_group.add_argument(
        "--json-variants",
        default=[],
        action="append",
        help="Path to Varcode.VariantCollection object serialized as a JSON file.")

    return variant_arg_group
86 |
87 |
def make_variants_parser(**kwargs):
    """
    Creates argparse.ArgumentParser instance with options needed for loading
    variants from VCF, MAF, or JSON files.

    Parameters
    ----------
    **kwargs : dict
        Passed directly to argparse.ArgumentParser
    """
    variants_parser = ArgumentParser(**kwargs)
    add_variant_args(variants_parser)
    return variants_parser
101 |
102 |
def download_and_install_reference_data(variant_collections):
    """
    Download and index reference data for every distinct genome used by
    the given variant collections, skipping genomes whose required local
    files already exist.

    Parameters
    ----------
    variant_collections : iterable of iterables of variants
        Each variant must expose its genome as `variant.ensembl`.
    """
    genomes = set()
    for collection in variant_collections:
        for variant in collection:
            genomes.add(variant.ensembl)

    for genome in genomes:
        if genome.required_local_files_exist():
            continue
        genome.download()
        genome.index()
113 |
114 |
def variant_collection_from_args(args, required=True):
    """
    Build a single VariantCollection from parsed commandline arguments.

    Parameters
    ----------
    args : argparse.Namespace
        Parsed arguments of a parser extended with `add_variant_args`.
        Reads `vcf`, `maf`, `variant`, `genome`, `json_variants` and
        `download_reference_genome_data`.

    required : bool
        If True, raise ValueError when no variant source was given.
        If False and no source was given, an empty VariantCollection
        is returned.

    Returns the union of all loaded variant collections.

    Raises ValueError if --variant is used without --genome, or if
    `required` is True and nothing was loaded.
    """
    variant_collections = []

    for vcf_path in args.vcf:
        variant_collections.append(
            load_vcf(vcf_path, genome=args.genome))

    for maf_path in args.maf:
        variant_collections.append(load_maf(maf_path))

    if args.variant:
        if not args.genome:
            raise ValueError(
                "--genome must be specified when using --variant")

        variants = [
            Variant(
                chromosome,
                start=position,
                ref=ref,
                alt=alt,
                genome=args.genome)
            for (chromosome, position, ref, alt)
            in args.variant
        ]
        variant_collections.append(VariantCollection(variants))

    for json_path in args.json_variants:
        with open(json_path, 'r') as f:
            variant_collections.append(
                VariantCollection.from_json(f.read()))

    if not variant_collections:
        if required:
            raise ValueError(
                "No variants loaded (use --maf, --vcf, --variant, or --json-variants options)")
        # With nothing to combine, an unbound `VariantCollection.union()`
        # call would fail for lack of a `self` argument; return an empty
        # collection instead.
        return VariantCollection([])

    if args.download_reference_genome_data:
        download_and_install_reference_data(variant_collections)

    first, rest = variant_collections[0], variant_collections[1:]
    return first.union(*rest)
157 |
--------------------------------------------------------------------------------
/varcode/cli/version_info.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | from collections import OrderedDict
14 | from os.path import dirname
15 | from .. import __file__ as package_init_file_path
16 | from .. import __version__
17 |
18 |
def collect_version_info():
    """
    Collect the version and path of Varcode.

    Returns an OrderedDict mapping the program name to a
    (version, installation directory) tuple.

    TODO:
        add a `dependencies=False` option to also collect this info from
        major Python dependencies such as PyEnsembl
    """
    package_directory = dirname(package_init_file_path)
    return OrderedDict([("Varcode", (__version__, package_directory))])
30 |
31 |
def print_version_info(dependencies=False):
    """
    Print the name, version and path of every entry returned by
    `collect_version_info`.

    The `dependencies` argument is accepted but not used.
    """
    for program, version_and_path in collect_version_info().items():
        version, path = version_and_path
        print(program)
        print(" Version: %s" % version)
        print(" Path: %s" % path)
37 |
--------------------------------------------------------------------------------
/varcode/common.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | from collections import defaultdict
14 |
15 | from functools import wraps
16 |
17 |
def apply_groupby(records, fn, skip_none=False):
    """
    Group records into a dictionary keyed by the value `fn` returns for
    each of them.

    Parameters
    ----------
    records : list

    fn : function
        Called once per record; its return value is the group key.

    skip_none : bool
        If True, records whose key is None are dropped. If False, None
        can be a key of the returned dictionary.

    Returns dict mapping each key to the list of its records, in input order.
    """
    groups = {}
    for record in records:
        key = fn(record)
        if skip_none and key is None:
            continue
        # Start a new list the first time a key is seen.
        groups.setdefault(key, []).append(record)
    return groups
44 |
45 |
def groupby_field(records, field_name, skip_none=True):
    """
    Group records into a dictionary keyed by the unique values of the
    attribute `field_name`.

    Records whose attribute is None are dropped unless `skip_none` is False.
    """
    def read_field(obj):
        return getattr(obj, field_name)

    return apply_groupby(records, read_field, skip_none=skip_none)
55 |
56 |
def memoize(fn):
    """
    Simple memoization decorator for functions and methods.

    Results are cached per combination of positional and keyword
    arguments, so all arguments must be hashable and comparable.
    """
    cache = {}

    @wraps(fn)
    def wrapped_fn(*args, **kwargs):
        # Sort keyword arguments so the key doesn't depend on call order.
        keyword_part = tuple(sorted(kwargs.items())) if kwargs else ()
        key = (args, keyword_part)
        if key not in cache:
            cache[key] = fn(*args, **kwargs)
        return cache[key]

    return wrapped_fn
78 |
--------------------------------------------------------------------------------
/varcode/effects/__init__.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | from __future__ import print_function, division, absolute_import
14 |
15 | from .effect_collection import EffectCollection
16 | from .effect_ordering import (
17 | effect_priority,
18 | top_priority_effect,
19 | )
20 | from .effect_prediction import (
21 | predict_variant_effects,
22 | predict_variant_effect_on_transcript,
23 | predict_variant_effect_on_transcript_or_failure,
24 | )
25 | from .effect_classes import (
26 | MutationEffect,
27 | TranscriptMutationEffect,
28 | NonsilentCodingMutation,
29 | Failure,
30 | IncompleteTranscript,
31 | Intergenic,
32 | Intragenic,
33 | NoncodingTranscript,
34 | Intronic,
35 | ThreePrimeUTR,
36 | FivePrimeUTR,
37 | Silent,
38 | Substitution,
39 | Insertion,
40 | Deletion,
41 | ComplexSubstitution,
42 | AlternateStartCodon,
43 | IntronicSpliceSite,
44 | ExonicSpliceSite,
45 | StopLoss,
46 | SpliceDonor,
47 | SpliceAcceptor,
48 | PrematureStop,
49 | FrameShiftTruncation,
50 | StartLoss,
51 | FrameShift,
52 | ExonLoss,
53 | )
54 |
# Public API of the `varcode.effects` package. Every name listed here is
# imported at the top of this module; each name appears exactly once.
__all__ = [
    "EffectCollection",
    # effect ordering
    "effect_priority",
    "top_priority_effect",

    # prediction functions
    "predict_variant_effects",
    "predict_variant_effect_on_transcript",
    "predict_variant_effect_on_transcript_or_failure",

    # effect classes
    "MutationEffect",
    "TranscriptMutationEffect",
    "Failure",
    "IncompleteTranscript",
    "Intergenic",
    "Intragenic",
    "NoncodingTranscript",
    "ThreePrimeUTR",
    "FivePrimeUTR",
    "Intronic",
    "Silent",
    "NonsilentCodingMutation",
    "Substitution",
    "Insertion",
    "Deletion",
    "ComplexSubstitution",
    "AlternateStartCodon",
    "IntronicSpliceSite",
    "ExonicSpliceSite",
    "StopLoss",
    "SpliceDonor",
    "SpliceAcceptor",
    "PrematureStop",
    "FrameShiftTruncation",
    "StartLoss",
    "FrameShift",
    "ExonLoss",
]
96 |
--------------------------------------------------------------------------------
/varcode/effects/common.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | from Bio.Seq import Seq
14 |
15 |
16 |
def bio_seq_to_str(seq):
    """Return `seq` as a plain Python string.

    Objects whose type is exactly `str` are handed back untouched; anything
    else (for example a Bio.Seq sequence) is converted with `str()`.
    """
    return seq if type(seq) is str else str(seq)
22 |
--------------------------------------------------------------------------------
/varcode/effects/effect_helpers.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | """
14 | Helper functions for determining the effect annotation of a variant
15 | """
16 |
17 |
18 | from ..nucleotides import PURINE_NUCLEOTIDES, AMINO_NUCLEOTIDES
19 |
def variant_overlaps_interval(
        variant_start,
        n_ref_bases,
        interval_start,
        interval_end):
    """
    Does a variant overlap a given interval on the same chromosome?

    Parameters
    ----------
    variant_start : int
        Inclusive base-1 position of variant's starting location
        (or location before an insertion)

    n_ref_bases : int
        Number of reference bases affected by variant (used to compute
        end coordinate or determine whether variant is an insertion)

    interval_start : int
        Interval's inclusive base-1 start position

    interval_end : int
        Interval's inclusive base-1 end position

    Returns
    -------
    bool
        True if the variant shares at least one position with the interval.
    """
    if n_ref_bases == 0:
        # an insertion touches no reference bases, so it only overlaps an
        # interval which starts at or before the insertion point and ends
        # at or after it
        return interval_start <= variant_start <= interval_end

    # Inclusive position of the last reference base covered by the variant:
    # n bases starting at s span [s, s + n - 1]. Without the "- 1" an
    # interval beginning immediately after the variant would be reported
    # as overlapping.
    variant_end = variant_start + n_ref_bases - 1

    # overlap means the interval starts no later than the variant ends
    # and ends no earlier than the variant starts
    return interval_start <= variant_end and interval_end >= variant_start
59 |
60 |
def matches_exon_end_pattern(seq):
    """Check whether `seq` ends with the canonical splice signal found at
    the 3' end of an exon: "MAG", where M is either amino base.

    Sequences shorter than three nucleotides never match.
    """
    if len(seq) >= 3:
        amino_base, middle_base, final_base = seq[-3], seq[-2], seq[-1]
        return (
            amino_base in AMINO_NUCLEOTIDES and
            middle_base == "A" and
            final_base == "G")
    return False
68 |
def changes_exonic_splice_site(
        transcript_offset,
        transcript,
        transcript_ref,
        transcript_alt,
        exon_start_offset,
        exon_end_offset,
        exon_number):
    """Does the given exonic mutation of a particular transcript change a
    splice site?

    Parameters
    ----------
    transcript_offset : int
        Offset from start of transcript of first reference nucleotide
        (or the last nucleotide before an insertion)

    transcript : pyensembl.Transcript
        Only its `sequence` and `exons` attributes are used here.

    transcript_ref : str
        Reference nucleotides

    transcript_alt : str
        Alternate nucleotides

    exon_start_offset : int
        Start offset of exon relative to beginning of transcript

    exon_end_offset : int
        End offset of exon relative to beginning of transcript

    exon_number : int
        Which exon in the order they form the transcript

    Returns
    -------
    bool
        True if one of the two checks below fires, otherwise False.
    """
    # first we're going to make sure the variant doesn't disrupt the
    # splicing sequences we got from Divina et. al's
    #   Ab initio prediction of mutation-induced cryptic
    #   splice-site activation and exon skipping
    # (http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2947103/)
    #
    # 5' splice site: MAG|GURAGU consensus
    #   M is A or C; R is purine; | is the exon-intron boundary
    #
    # 3' splice site: YAG|R
    #
    if exon_number > 1 and transcript_offset == exon_start_offset:
        # if this is any exon past the first, check to see if it lost
        # the purine on its left side
        #
        # the 3' splice site sequence has just a single purine on
        # the exon side
        if len(transcript_ref) > 0 and transcript_ref[0] in PURINE_NUCLEOTIDES:
            if len(transcript_alt) > 0:
                if transcript_alt[0] not in PURINE_NUCLEOTIDES:
                    return True
            else:
                # if the mutation is a deletion, are there ref nucleotides
                # afterward?
                offset_after_deletion = transcript_offset + len(transcript_ref)
                if len(transcript.sequence) > offset_after_deletion:
                    next_base = transcript.sequence[offset_after_deletion]
                    if next_base not in PURINE_NUCLEOTIDES:
                        return True

    if exon_number < len(transcript.exons):
        # if the mutation affects an exon whose right end gets spliced
        # to a next exon, check if the variant alters the exon side of
        # 5' consensus splicing sequence
        #
        # splicing sequence:
        #   MAG|GURAGU
        # M is A or C; R is purine; | is the exon-intron boundary
        if variant_overlaps_interval(
                variant_start=transcript_offset,
                n_ref_bases=len(transcript_ref),
                interval_start=exon_end_offset - 2,
                interval_end=exon_end_offset):
            end_of_reference_exon = transcript.sequence[
                exon_end_offset - 2:exon_end_offset + 1]

            # If the last three reference nucleotides conform to the
            # consensus sequence, any variant overlapping them is treated
            # as an exonic splice site mutation.
            # NOTE(review): earlier comments suggest the intent was to
            # re-check the pattern on the *mutated* exon end; the mutated
            # sequence is never constructed here -- confirm whether that
            # refinement is still wanted.
            if matches_exon_end_pattern(end_of_reference_exon):
                return True

    # explicit negative answer instead of implicitly returning None
    return False
161 |
--------------------------------------------------------------------------------
/varcode/effects/effect_prediction_coding.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | from .effect_prediction_coding_frameshift import predict_frameshift_coding_effect
14 | from .effect_prediction_coding_in_frame import predict_in_frame_coding_effect
15 |
16 |
def predict_variant_coding_effect_on_transcript(
        variant,
        transcript,
        trimmed_cdna_ref,
        trimmed_cdna_alt,
        transcript_offset):
    """
    Determine how a minimal cDNA ref/alt substitution at a given offset of
    a transcript affects the translated protein.

    Parameters
    ----------
    variant : Variant

    transcript : Transcript

    trimmed_cdna_ref : str
        Reference nucleotides we expect to find in the transcript's CDS

    trimmed_cdna_alt : str
        Alternate nucleotides we're replacing the reference with

    transcript_offset : int
        Offset into the full transcript sequence of the ref->alt substitution

    Returns
    -------
    Whatever the in-frame or frameshift predictor returns for this variant.

    Raises
    ------
    ValueError
        If the transcript is incomplete or its coding sequence is shorter
        than one codon.
    AssertionError
        If the transcript's nucleotides disagree with `trimmed_cdna_ref` or
        the computed CDS offset is not smaller than the CDS length.
    """
    if not transcript.complete:
        raise ValueError(
            ("Can't annotate coding effect for %s"
             " on incomplete transcript %s" % (variant, transcript)))

    full_sequence = transcript.sequence
    ref_length = len(trimmed_cdna_ref)
    alt_length = len(trimmed_cdna_alt)

    # what the transcript actually contains where the variant claims its
    # reference nucleotides are
    observed_ref = str(
        full_sequence[transcript_offset:transcript_offset + ref_length])

    # the transcript must agree with the reference we were told to expect
    assert observed_ref == trimmed_cdna_ref, \
        "%s: expected ref '%s' at offset %d of %s, transcript has '%s'" % (
            variant,
            trimmed_cdna_ref,
            transcript_offset,
            transcript,
            observed_ref)

    cds_start = transcript.first_start_codon_spliced_offset
    cds_stop = transcript.last_stop_codon_spliced_offset
    cds_len = cds_stop - cds_start + 1

    if cds_len < 3:
        raise ValueError(
            "Coding sequence for %s is too short: '%s'" % (
                transcript,
                transcript.sequence[cds_start:cds_stop + 1]))

    cds_offset = transcript_offset - cds_start
    if ref_length == 0 and transcript.strand == "-":
        # Genomic insertions are placed *after* their base-1 chromosomal
        # position. Seen from a reverse-strand transcript the same
        # insertion lands *before* the nucleotide at the given offset.
        #
        # Example:
        #   chromosome sequence:
        #       TTT|GATCTCGTA|CCC
        #   transcript on reverse strand (CDS = ATGCTCTAG):
        #       CCC|ATGCTCTAG|TTT
        #   genomic insertion g.6insATT gives:
        #       TTT|GAT_ATT_CTCGTA|CCC
        #   which on the reverse strand reads:
        #       CCC|ATGCTC_TTA_TAG|TTT
        #
        # Shift by one so that the offset keeps meaning "base before the
        # insertion".
        cds_offset -= 1

    assert cds_offset < cds_len, \
        "Expected CDS offset (%d) < |CDS| (%d) for %s on %s" % (
            cds_offset, cds_len, variant, transcript)

    sequence_from_start_codon = str(full_sequence[cds_start:])

    # a length change divisible by three keeps the reading frame intact
    keeps_reading_frame = (ref_length - alt_length) % 3 == 0
    predictor = (
        predict_in_frame_coding_effect
        if keeps_reading_frame
        else predict_frameshift_coding_effect)
    return predictor(
        variant=variant,
        transcript=transcript,
        trimmed_cdna_ref=trimmed_cdna_ref,
        trimmed_cdna_alt=trimmed_cdna_alt,
        cds_offset=cds_offset,
        sequence_from_start_codon=sequence_from_start_codon)
127 |
--------------------------------------------------------------------------------
/varcode/effects/effect_prediction_coding_frameshift.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | """
14 | Effect annotation for variants which modify the coding sequence and change
15 | reading frame.
16 | """
17 |
18 | from ..string_helpers import trim_shared_prefix
19 |
20 | from .effect_classes import (
21 | FrameShift,
22 | FrameShiftTruncation,
23 | StartLoss,
24 | StopLoss,
25 | Silent
26 | )
27 | from .mutate import substitute
28 | from .translate import translate
29 |
30 |
def create_frameshift_effect(
        mutated_codon_index,
        sequence_from_mutated_codon,
        variant,
        transcript):
    """
    Determine frameshift effect within a coding sequence (possibly affecting
    either the start or stop codons, or anything in between)

    Parameters
    ----------
    mutated_codon_index : int
        Codon offset (starting from 0 = start codon) of first non-reference
        amino acid in the variant protein

    sequence_from_mutated_codon: Bio.Seq
        Sequence of mutated cDNA, starting from first mutated codon, until
        the end of the transcript

    variant : Variant

    transcript : transcript
        Must have a non-None `protein_sequence`.

    Returns
    -------
    One of StartLoss, Silent, StopLoss, FrameShiftTruncation or FrameShift.
    """

    assert transcript.protein_sequence is not None, \
        "Expect transcript %s to have protein sequence" % transcript

    original_protein_sequence = transcript.protein_sequence
    original_protein_length = len(original_protein_sequence)

    # translation is done before the start-codon check below, so any error
    # raised by it still propagates for start-codon variants
    mutant_protein_suffix = translate(
        nucleotide_sequence=sequence_from_mutated_codon,
        first_codon_is_start=False,
        to_stop=True,
        truncate=True)

    if mutated_codon_index == 0:
        # TODO: scan through sequence_from_mutated_codon for
        # Kozak sequence + start codon to choose the new start
        return StartLoss(variant=variant, transcript=transcript)

    # the frameshifted sequence may contain some amino acids which are
    # the same as the original protein! Judging from how the result is
    # unpacked, trim_shared_prefix yields the remaining alt sequence and
    # the shared prefix as its 2nd and 3rd elements.
    _, mutant_protein_suffix, unchanged_amino_acids = trim_shared_prefix(
        ref=original_protein_sequence[mutated_codon_index:],
        alt=mutant_protein_suffix)
    n_unchanged_amino_acids = len(unchanged_amino_acids)
    offset_to_first_different_amino_acid = (
        mutated_codon_index + n_unchanged_amino_acids)

    if offset_to_first_different_amino_acid >= original_protein_length:
        # frameshift is either extending the protein or leaving it unchanged;
        # the unchanged amino acids are then the tail of the original protein
        if n_unchanged_amino_acids == 0:
            aa_ref = ""
        else:
            aa_ref = original_protein_sequence[-n_unchanged_amino_acids:]

        if len(mutant_protein_suffix) == 0:
            # miraculously, this frameshift left the protein unchanged,
            # most likely by turning one stop codon into another stop codon
            return Silent(
                variant=variant,
                transcript=transcript,
                aa_pos=mutated_codon_index,
                aa_ref=aa_ref)

        # All amino acids up to the original end are the same but more
        # follow, which means we must have lost our stop codon.
        return StopLoss(
            variant=variant,
            transcript=transcript,
            aa_ref=aa_ref,
            aa_alt=mutant_protein_suffix)

    # TODO: what if all the shifted amino acids were the same and the protein
    # ended up the same length? Add a Silent case?
    if len(mutant_protein_suffix) == 0:
        # if a frameshift doesn't create any new amino acids, then
        # it must immediately have hit a stop codon
        return FrameShiftTruncation(
            variant=variant,
            transcript=transcript,
            stop_codon_offset=offset_to_first_different_amino_acid)
    return FrameShift(
        variant=variant,
        transcript=transcript,
        aa_mutation_start_offset=offset_to_first_different_amino_acid,
        shifted_sequence=str(mutant_protein_suffix))
120 |
def cdna_codon_sequence_after_insertion_frameshift(
        sequence_from_start_codon,
        cds_offset_before_insertion,
        inserted_nucleotides):
    """
    Apply an insertion to a coding sequence and report where the first
    affected codon is.

    Returns a pair: the index of the first mutated codon, and the mutated
    nucleotide sequence starting at that codon.
    """
    codon_index, position_in_codon = divmod(cds_offset_before_insertion, 3)

    if position_in_codon == 2:
        # inserting right after a complete codon: that codon is untouched,
        # so the first mutated codon is the following one and it starts
        # with the inserted nucleotides
        codon_index += 1
        kept_reference_prefix = ""
    elif position_in_codon == 1:
        # inserting after the 2nd nucleotide of a codon: the first two
        # reference nucleotides of that codon precede the insertion
        kept_reference_prefix = sequence_from_start_codon[
            cds_offset_before_insertion - 1:cds_offset_before_insertion + 1]
    elif position_in_codon == 0:
        # inserting after the 1st nucleotide of a codon: only that single
        # reference nucleotide precedes the insertion
        kept_reference_prefix = sequence_from_start_codon[
            cds_offset_before_insertion]

    # everything in the reference after the insertion point
    reference_tail = sequence_from_start_codon[cds_offset_before_insertion + 1:]

    mutated_sequence = (
        kept_reference_prefix + inserted_nucleotides + reference_tail)
    return codon_index, mutated_sequence
166 |
167 |
def cdna_codon_sequence_after_deletion_or_substitution_frameshift(
        sequence_from_start_codon,
        cds_offset,
        trimmed_cdna_ref,
        trimmed_cdna_alt):
    """
    Handle every frameshift that is not a pure insertion.

    Insertions are treated separately because the offset means something
    different for them:
        cds_offset = base before insertion
    Here, in contrast:
        cds_offset = first reference base affected by a variant

    Returns the index of the first modified codon together with the
    mutated sequence from that codon onward.
    """
    # codon containing the first affected base, and how far into that codon
    # (0, 1 or 2 nucleotides) the variant's reference bases begin
    mutated_codon_index, offset_into_mutated_codon = divmod(cds_offset, 3)

    # reference nucleotides from the start of that codon to the end of the
    # sequence we were given
    reference_from_codon = sequence_from_start_codon[3 * mutated_codon_index:]

    mutated_from_codon = substitute(
        sequence=reference_from_codon,
        offset=offset_into_mutated_codon,
        ref=trimmed_cdna_ref,
        alt=trimmed_cdna_alt)
    return mutated_codon_index, mutated_from_codon
201 |
202 |
def predict_frameshift_coding_effect(
        variant,
        transcript,
        trimmed_cdna_ref,
        trimmed_cdna_alt,
        cds_offset,
        sequence_from_start_codon):
    """
    Predict the coding effect of a mutation which shifts the reading frame.

    Parameters
    ----------
    variant : Variant

    transcript : Transcript

    trimmed_cdna_ref : nucleotide sequence
        Reference nucleotides in the coding sequence of the given transcript.

    trimmed_cdna_alt : nucleotide sequence
        Alternate nucleotides introduced by mutation

    cds_offset : int
        Offset into the CDS of first ref nucleotide. For insertions, this
        is the offset of the last ref nucleotide before the insertion.

    sequence_from_start_codon : nucleotide sequence
        Nucleotides of the coding sequence and 3' UTR

    Returns
    -------
    The effect object built by `create_frameshift_effect`.
    """
    is_pure_insertion = len(trimmed_cdna_ref) == 0
    if is_pure_insertion:
        # no reference bases are replaced; cds_offset is the base before
        # the inserted nucleotides
        codon_index, mutated_sequence = \
            cdna_codon_sequence_after_insertion_frameshift(
                sequence_from_start_codon=sequence_from_start_codon,
                cds_offset_before_insertion=cds_offset,
                inserted_nucleotides=trimmed_cdna_alt)
    else:
        codon_index, mutated_sequence = \
            cdna_codon_sequence_after_deletion_or_substitution_frameshift(
                sequence_from_start_codon=sequence_from_start_codon,
                cds_offset=cds_offset,
                trimmed_cdna_ref=trimmed_cdna_ref,
                trimmed_cdna_alt=trimmed_cdna_alt)

    return create_frameshift_effect(
        mutated_codon_index=codon_index,
        sequence_from_mutated_codon=mutated_sequence,
        variant=variant,
        transcript=transcript)
251 |
--------------------------------------------------------------------------------
/varcode/effects/mutate.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 |
def insert_before(sequence, offset, new_residues):
    """Return a copy of `sequence` with `new_residues` placed directly in
    front of the element at `offset`.

    Parameters
    ----------
    sequence : sequence
        String of amino acids or DNA bases

    offset : int
        Base 0 offset from start of sequence, before which `new_residues`
        are inserted. Must satisfy 0 < offset <= len(sequence).

    new_residues : sequence
    """
    sequence_length = len(sequence)
    assert 0 < offset <= sequence_length, \
        "Invalid position %d for sequence of length %d" % (
            offset, sequence_length)
    return sequence[:offset] + new_residues + sequence[offset:]
35 |
def insert_after(sequence, offset, new_residues):
    """Return a copy of `sequence` with `new_residues` placed directly
    behind the element at `offset`.

    Parameters
    ----------
    sequence : sequence
        String of amino acids or DNA bases

    offset : int
        Base 0 offset from start of sequence, after which `new_residues`
        are inserted. Must satisfy 0 <= offset < len(sequence).

    new_residues : sequence
    """
    sequence_length = len(sequence)
    assert 0 <= offset < sequence_length, \
        "Invalid position %d for sequence of length %d" % (
            offset, sequence_length)
    split_point = offset + 1
    return sequence[:split_point] + new_residues + sequence[split_point:]
57 |
def substitute(sequence, offset, ref, alt):
    """Return a copy of `sequence` in which the `ref` residues found at
    `offset` have been replaced by `alt`.

    Parameters
    ----------
    sequence : sequence
        String of amino acids or DNA bases

    offset : int
        Base 0 offset from start of `sequence`

    ref : sequence or str
        Residues expected at `offset`; checked with an assertion

    alt : sequence or str
        Alternate residues to put in their place
    """
    ref_end = offset + len(ref)
    found_ref = sequence[offset:ref_end]
    # compare as strings so that str and sequence objects can be mixed
    assert str(found_ref) == str(ref), \
        "Reference %s at offset %d != expected reference %s" % \
        (found_ref, offset, ref)
    return sequence[:offset] + alt + sequence[ref_end:]
84 |
--------------------------------------------------------------------------------
/varcode/effects/transcript_helpers.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 |
def interval_offset_on_transcript(start, end, transcript):
    """
    Map the interval [start:end] onto a transcript and return the smaller
    of the spliced offsets of its two (clipped) end points.

    The interval must overlap the transcript's start:end range; parts of
    it which stick out past either end of the transcript are clipped off
    before the offsets are looked up via `transcript.spliced_offset`.

    Raises ValueError if start > end or if the interval lies entirely
    before or after the transcript.
    """
    if start > end:
        raise ValueError(
            "start_pos %d shouldn't be greater than end_pos %d" % (
                start, end))

    transcript_start = transcript.start
    transcript_end = transcript.end
    description = (start, end, transcript, transcript_start, transcript_end)

    # the interval has to overlap the transcript
    if start > transcript_end:
        raise ValueError(
            "Range %d:%d starts after transcript %s (%d:%d)" % description)
    if end < transcript_start:
        raise ValueError(
            "Range %d:%d ends before transcript %s (%d:%d)" % description)

    # clip the interval to the extent of the transcript
    clipped_start = transcript_start if start < transcript_start else start
    clipped_end = transcript_end if end > transcript_end else end

    # either end may map to the earlier spliced offset, so take the minimum
    offset_of_start = transcript.spliced_offset(clipped_start)
    offset_of_end = transcript.spliced_offset(clipped_end)
    return min(offset_of_start, offset_of_end)
51 |
--------------------------------------------------------------------------------
/varcode/effects/translate.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | """Helpers for cDNA -> protein translation.
14 |
15 | TODO: generalize this to work with the mitochondrial codon table.
16 | """
17 |
18 | from Bio.Data import CodonTable
19 | from Bio.Seq import Seq
20 |
# Lookup from codon to amino acid, taken from the `forward_table` of
# Biopython's standard DNA codon table (used by `translate_codon` below).
DNA_CODON_TABLE = CodonTable.standard_dna_table.forward_table
# Start and stop codons of the same standard table, kept as sets for the
# membership tests done in this module.
START_CODONS = set(CodonTable.standard_dna_table.start_codons)
STOP_CODONS = set(CodonTable.standard_dna_table.stop_codons)
24 |
25 |
def translate_codon(codon, aa_pos):
    """Translate one codon into one amino acid letter, or '*' for a stop.

    Parameters
    ----------
    codon : str
        Expected to be of length 3
    aa_pos : int
        Codon/amino acid offset into the protein (starting from 0)
    """
    at_first_position = aa_pos == 0
    # any recognized start codon at the first position is read as
    # methionine (rare Leucine or Valine starts are not handled!)
    if at_first_position and codon in START_CODONS:
        return "M"
    if codon in STOP_CODONS:
        return "*"
    return DNA_CODON_TABLE[codon]
43 |
44 |
def translate(
        nucleotide_sequence,
        first_codon_is_start=True,
        to_stop=True,
        truncate=False):
    """Translates cDNA coding sequence into amino acid protein sequence.

    Should typically start with a start codon but allowing non-methionine
    first residues since the CDS we're translating might have been affected
    by a start loss mutation.

    The sequence may include the 3' UTR but will stop translation at the first
    encountered stop codon.

    Parameters
    ----------
    nucleotide_sequence : BioPython Seq
        cDNA sequence (anything else is wrapped in a `Seq` first)

    first_codon_is_start : bool
        Treat the beginning of nucleotide_sequence as a start codon, i.e.
        require (or force) the first residue to be methionine

    to_stop : bool
        Forwarded as the `to_stop` argument of `Seq.translate`

    truncate : bool
        Truncate sequence if it's not a multiple of 3 (default = False)

    Returns
    -------
    BioPython Seq of amino acids

    Raises
    ------
    AssertionError
        If the (possibly truncated) sequence length isn't a multiple of 3.
    ValueError
        If `first_codon_is_start` is set, the translation doesn't begin
        with "M" and the first codon isn't one of START_CODONS.
    """
    if not isinstance(nucleotide_sequence, Seq):
        nucleotide_sequence = Seq(nucleotide_sequence)

    if truncate:
        # if sequence isn't a multiple of 3, truncate it so BioPython
        # doesn't complain; floor division keeps this in exact integer
        # arithmetic
        n_nucleotides = len(nucleotide_sequence) // 3 * 3
        nucleotide_sequence = nucleotide_sequence[:n_nucleotides]
    else:
        n_nucleotides = len(nucleotide_sequence)

    assert n_nucleotides % 3 == 0, \
        ("Expected nucleotide sequence to be multiple of 3"
         " but got %s of length %d") % (
            nucleotide_sequence,
            n_nucleotides)

    # passing cds=False to translate since we may want to deal with premature
    # stop codons
    protein_sequence = nucleotide_sequence.translate(to_stop=to_stop, cds=False)

    if first_codon_is_start and (
            len(protein_sequence) == 0 or protein_sequence[0] != "M"):
        if nucleotide_sequence[:3] in START_CODONS:
            # TODO: figure out when these should be made into methionines
            # and when left as whatever amino acid they normally code for
            # e.g. Leucine start codons
            # See: DOI: 10.1371/journal.pbio.0020397
            return "M" + protein_sequence[1:]
        else:
            raise ValueError(
                ("Expected first codon of %s to be start codon"
                 " (one of %s) but got %s") % (
                    protein_sequence[:10],
                    START_CODONS,
                    nucleotide_sequence))

    return protein_sequence
109 |
110 |
def find_first_stop_codon(nucleotide_sequence):
    """
    Scan a sequence codon by codon (its length is expected to be a multiple
    of three; trailing nucleotides beyond the last full codon are ignored)
    and return the index of the first stop codon, or -1 if there is none.
    """
    n_full_codons = len(nucleotide_sequence) // 3
    codon_starts = range(0, 3 * n_full_codons, 3)
    return next(
        (
            codon_start // 3
            for codon_start in codon_starts
            if nucleotide_sequence[codon_start:codon_start + 3] in STOP_CODONS
        ),
        -1)
122 |
123 |
def translate_in_frame_mutation(
        transcript,
        ref_codon_start_offset,
        ref_codon_end_offset,
        mutant_codons):
    """
    Translate the mutated codons of an in-frame mutation, extending into the
    transcript's 3' UTR when the mutation removed the reference stop codon.

    Parameters
    ----------
    transcript : pyensembl.Transcript
        Reference transcript to which a cDNA mutation should be applied.

    ref_codon_start_offset : int
        Starting (base 0) integer offset into codons (character triplets) of
        the transcript's reference coding sequence.

    ref_codon_end_offset : int
        Final (base 0) integer offset into codons of the transcript's
        reference coding sequence.

    mutant_codons : str
        Nucleotide sequence to replace the reference codons with
        (expected to have length that is a multiple of three)

    Returns
    -------
    Tuple with three elements:
        - mutant amino acid sequence
        - codon offset of the first stop codon in the mutant sequence
          (or -1 if there was none)
        - boolean flag indicating whether any codons from the 3' UTR were used
    """
    stop_index = find_first_stop_codon(mutant_codons)
    used_utr = False

    if stop_index != -1:
        # the mutation itself introduces a stop codon: keep only the codons
        # which precede it
        mutant_codons = mutant_codons[:3 * stop_index]
    elif ref_codon_end_offset > len(transcript.protein_sequence):
        # no stop codon among the mutant codons and the mutation reaches past
        # the end of the reference protein, so translation may continue
        # into the 3' UTR
        utr = transcript.three_prime_utr_sequence
        # drop trailing nucleotides which don't form a complete codon
        in_frame_utr = utr[:len(utr) - len(utr) % 3]
        utr_stop_index = find_first_stop_codon(in_frame_utr)

        if utr_stop_index == -1:
            # the UTR never terminates translation, use all of it
            used_utr = True
            mutant_codons += in_frame_utr
        elif utr_stop_index > 0:
            # the UTR contributes codons up to (excluding) its first stop;
            # the stop offset must be computed before extending the codons
            used_utr = True
            stop_index = len(mutant_codons) // 3 + utr_stop_index
            mutant_codons += in_frame_utr[:3 * utr_stop_index]
        # when the very first UTR codon is a stop nothing is appended

    starts_at_first_codon = (ref_codon_start_offset == 0)
    amino_acids = translate(
        mutant_codons,
        first_codon_is_start=starts_at_first_codon)
    return amino_acids, stop_index, used_utr
190 |
--------------------------------------------------------------------------------
/varcode/maf.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | import logging
14 |
15 | import pandas
16 | from typechecks import require_string
17 | from pandas import isnull
18 |
19 | from .reference import infer_genome
20 | from .variant import Variant, variant_ascending_position_sort_key
21 | from .variant_collection import VariantCollection
22 |
# NOTE(review): presumably the number of leading characters of a TCGA barcode
# which identify the patient; not referenced by the loaders below - confirm.
TCGA_PATIENT_ID_LENGTH = 12

# Column names which load_maf_dataframe expects at the start of every MAF
# file. They are compared positionally against the file's header, so the
# order of this list matters.
MAF_COLUMN_NAMES = [
    'Hugo_Symbol',
    'Entrez_Gene_Id',
    'Center',
    'NCBI_Build',
    'Chromosome',
    'Start_Position',
    'End_Position',
    'Strand',
    'Variant_Classification',
    'Variant_Type',
    'Reference_Allele',
    'Tumor_Seq_Allele1',
    'Tumor_Seq_Allele2',
    'dbSNP_RS',
    'dbSNP_Val_Status',
    'Tumor_Sample_Barcode',
    'Matched_Norm_Sample_Barcode',
    'Match_Norm_Seq_Allele1',
    'Match_Norm_Seq_Allele2',
]
46 |
47 |
def load_maf_dataframe(path, nrows=None, raise_on_error=True, encoding=None):
    """
    Load the guaranteed columns of a TCGA MAF file into a DataFrame

    Columns whose names differ from the expected ones only by capitalization
    are renamed to the canonical names in MAF_COLUMN_NAMES (a renamed column
    is moved to the end of the DataFrame).

    Parameters
    ----------
    path : str
        Path to MAF file

    nrows : int
        Optional limit to number of rows loaded

    raise_on_error : bool
        Raise an exception upon encountering an error or log a warning

    encoding : str, optional
        Encoding to use for UTF when reading MAF file.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    ValueError
        If `raise_on_error` is True and the file has too few columns or a
        column name which doesn't match the expected MAF column.
    """
    require_string(path, "Path to MAF")

    n_basic_columns = len(MAF_COLUMN_NAMES)

    # pylint: disable=no-member
    # pylint gets confused by read_csv
    df = pandas.read_csv(
        path,
        comment="#",
        sep="\t",
        low_memory=False,
        skip_blank_lines=True,
        header=0,
        nrows=nrows,
        encoding=encoding)

    if len(df.columns) < n_basic_columns:
        error_message = (
            "Too few columns in MAF file %s, expected %d but got %d : %s" % (
                path, n_basic_columns, len(df.columns), df.columns))
        if raise_on_error:
            raise ValueError(error_message)
        else:
            # logging.warn is a deprecated alias of logging.warning
            logging.warning(error_message)

    # check each pair of expected/actual column names to make sure they match
    for expected, actual in zip(MAF_COLUMN_NAMES, df.columns):
        if expected == actual:
            continue
        # MAFs in the wild have capitalization differences in their
        # column names, normalize them to always use the names above
        if expected.lower() == actual.lower():
            # using DataFrame.rename in Python 2.7.x doesn't seem to
            # work for some files, possibly because Pandas treats
            # unicode vs. str columns as different?
            df[expected] = df[actual]
            del df[actual]
        else:
            error_message = (
                "Expected column %s but got %s" % (expected, actual))
            if raise_on_error:
                raise ValueError(error_message)
            else:
                logging.warning(error_message)

    return df
111 |
def load_maf(
        path,
        optional_cols=None,
        sort_key=variant_ascending_position_sort_key,
        distinct=True,
        raise_on_error=True,
        encoding=None,
        nrows=None):
    """
    Load the variants of a MAF file into a VariantCollection.

    Parameters
    ----------

    path : str
        Path to MAF (*.maf).

    optional_cols : list, optional
        A list of MAF columns to include as metadata if they are present in
        the MAF. Does not result in an error if those columns are not present.

    sort_key : fn
        Function which maps each element to a sorting criterion.
        Set to None to not to sort the variants.

    distinct : bool
        Don't keep repeated variants

    raise_on_error : bool
        Raise an exception upon encountering an error or just log a warning.

    encoding : str, optional
        Encoding to use for UTF when reading MAF file.

    nrows : int, optional
        Limit to number of rows loaded

    Returns
    -------
    VariantCollection
        Variants of the MAF file, with the per-variant MAF annotations stored
        as metadata under the key `path`.

    Raises
    ------
    ValueError
        If `raise_on_error` is True and the MAF is empty, a row is missing
        its contig, or both tumor alleles of a row match the reference.
    """
    # avoid a shared mutable default argument
    if optional_cols is None:
        optional_cols = []

    # pylint: disable=no-member
    # pylint gets confused by read_csv inside load_maf_dataframe
    maf_df = load_maf_dataframe(
        path,
        nrows=nrows,
        raise_on_error=raise_on_error,
        encoding=encoding)

    if len(maf_df) == 0 and raise_on_error:
        raise ValueError("Empty MAF file %s" % path)

    # cache of genome objects, keyed by the raw NCBI_Build value
    ensembl_objects = {}
    variants = []
    metadata = {}
    for _, row in maf_df.iterrows():
        contig = row.Chromosome
        if isnull(contig):
            error_message = "Invalid contig name: %s" % (contig,)
            if raise_on_error:
                raise ValueError(error_message)
            else:
                # logging.warn is a deprecated alias of logging.warning
                logging.warning(error_message)
                continue

        start_pos = row.Start_Position
        ref = row.Reference_Allele

        # it's possible in a MAF file to have multiple Ensembl releases
        # mixed in a single MAF file (the genome assembly is
        # specified by the NCBI_Build column)
        ncbi_build = row.NCBI_Build
        if ncbi_build in ensembl_objects:
            genome = ensembl_objects[ncbi_build]
        else:
            if isinstance(ncbi_build, int):
                reference_name = "B%d" % ncbi_build
            else:
                reference_name = str(ncbi_build)
            genome, _ = infer_genome(reference_name)
            ensembl_objects[ncbi_build] = genome

        # have to try both Tumor_Seq_Allele1 and Tumor_Seq_Allele2
        # to figure out which is different from the reference allele
        if row.Tumor_Seq_Allele1 != ref:
            alt = row.Tumor_Seq_Allele1
        else:
            if row.Tumor_Seq_Allele2 == ref:
                error_message = (
                    "Both tumor alleles agree with reference %s: %s" % (
                        ref, row,))
                if raise_on_error:
                    raise ValueError(error_message)
                else:
                    logging.warning(error_message)
                    continue
            alt = row.Tumor_Seq_Allele2

        variant = Variant(
            contig,
            start_pos,
            str(ref),
            str(alt),
            genome)

        # keep metadata about the variant and its TCGA annotation
        variant_metadata = {
            'Hugo_Symbol': row.Hugo_Symbol,
            'Center': row.Center,
            'Strand': row.Strand,
            'Variant_Classification': row.Variant_Classification,
            'Variant_Type': row.Variant_Type,
            'dbSNP_RS': row.dbSNP_RS,
            'dbSNP_Val_Status': row.dbSNP_Val_Status,
            'Tumor_Sample_Barcode': row.Tumor_Sample_Barcode,
            'Matched_Norm_Sample_Barcode': row.Matched_Norm_Sample_Barcode,
        }
        for optional_col in optional_cols:
            if optional_col in row:
                variant_metadata[optional_col] = row[optional_col]
        # a repeated variant overwrites the metadata of its earlier occurrence
        metadata[variant] = variant_metadata

        variants.append(variant)

    return VariantCollection(
        variants=variants,
        source_to_metadata_dict={path: metadata},
        sort_key=sort_key,
        distinct=distinct)
236 |
--------------------------------------------------------------------------------
/varcode/nucleotides.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | from __future__ import print_function, division, absolute_import
14 |
15 | import numpy as np
16 |
17 | from typechecks import require_string
18 |
# the four unambiguous DNA bases
STANDARD_NUCLEOTIDES = {'A', 'C', 'T', 'G'}

PURINE_NUCLEOTIDES = {'A', 'G'}

PYRIMIDINE_NUCLEOTIDES = {'C', 'T'}

AMINO_NUCLEOTIDES = {'A', 'C'}

KETO_NUCLEOTIDES = {'T', 'G'}

STRONG_NUCLEOTIDES = {'G', 'C'}

WEAK_NUCLEOTIDES = {'A', 'T'}

# standard bases plus all pseudonucleotides encoding uncertain bases
EXTENDED_NUCLEOTIDES = {
    'A', 'C', 'T', 'G',
    'Y',  # Pyrimidine (C or T)
    'R',  # Purine (A or G)
    'W',  # weak (A or T)
    'S',  # strong (G or C)
    'K',  # keto (T or G)
    'M',  # amino (C or A)
    'D',  # A, G, T (not C)
    'V',  # A, C, G (not T)
    'H',  # A, C, T (not G)
    'B',  # C, G, T (not A)
    'X',  # any base
    'N',  # any base
}
49 |
50 |
def is_purine(nucleotide, allow_extended_nucleotides=False):
    """
    Return True if the nucleotide is a purine (a member of
    PURINE_NUCLEOTIDES).

    Unless `allow_extended_nucleotides` is True, a nucleotide outside of
    STANDARD_NUCLEOTIDES raises a ValueError. When extended nucleotides are
    allowed, any character which isn't in PURINE_NUCLEOTIDES gives False.
    """
    if allow_extended_nucleotides or nucleotide in STANDARD_NUCLEOTIDES:
        return nucleotide in PURINE_NUCLEOTIDES
    message = "{} is a non-standard nucleotide, neither purine or pyrimidine".format(
        nucleotide)
    raise ValueError(message)
57 |
58 |
def all_standard_nucleotides(nucleotides):
    """
    Return True if every element of `nucleotides` is a member of
    STANDARD_NUCLEOTIDES (an empty input gives True).
    """
    for nucleotide in nucleotides:
        if nucleotide not in STANDARD_NUCLEOTIDES:
            return False
    return True
61 |
62 |
def normalize_nucleotide_string(
        nucleotides,
        allow_extended_nucleotides=False,
        empty_chars=".-",
        treat_nan_as_empty=True):
    """
    Normalizes a nucleotide string by converting various ways of encoding empty
    strings into "", making all letters upper case, and checking to make sure
    all letters in the string are actually nucleotides.

    Parameters
    ----------
    nucleotides : str
        Sequence of nucleotides, e.g. "ACCTG"

    allow_extended_nucleotides : bool
        Allow non-canonical nucleotide characters like 'X' for unknown base

    empty_chars : str
        Characters which encode empty strings, such as "." used in VCF format
        or "-" used in MAF format

    treat_nan_as_empty : bool
        Some MAF files represent deletions/insertions with NaN ref/alt values

    Returns
    -------
    str
        Upper case nucleotide string, or "" for an empty allele.

    Raises
    ------
    ValueError
        If the string contains characters which aren't valid nucleotides.
    """
    # The NaN check has to happen before any string operation: evaluating
    # `float in str` raises a TypeError, which previously made this branch
    # unreachable.
    if treat_nan_as_empty and isinstance(nucleotides, float) and np.isnan(nucleotides):
        return ""

    require_string(nucleotides, name="nucleotide string")

    # substring test, so the empty string is also treated as empty
    if nucleotides in empty_chars:
        return ""

    nucleotides = nucleotides.upper()

    if allow_extended_nucleotides:
        valid_nucleotides = EXTENDED_NUCLEOTIDES
    else:
        valid_nucleotides = STANDARD_NUCLEOTIDES

    invalid_characters = set(nucleotides) - valid_nucleotides
    if invalid_characters:
        # sort so that the error message is deterministic
        raise ValueError(
            "Invalid character(s) in nucleotide string: %s" % (
                ",".join(sorted(invalid_characters)),))

    return nucleotides
108 |
--------------------------------------------------------------------------------
/varcode/string_helpers.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | from __future__ import print_function, division, absolute_import
14 |
15 |
def trim_shared_prefix(ref, alt):
    """
    Split off the leading characters which `ref` and `alt` have in common.

    Mutations are sometimes written with a shared prefix between the
    reference and alternate strings, e.g. C>CT (nucleotides) or
    GYFP>G (amino acids).

    Returns a tuple of (ref without prefix, alt without prefix, prefix).
    """
    shared_length = 0
    # zip stops at the end of the shorter string
    for ref_char, alt_char in zip(ref, alt):
        if ref_char != alt_char:
            break
        shared_length += 1

    # both strings agree on the first `shared_length` characters, so the
    # prefix can be taken from either of them
    return ref[shared_length:], alt[shared_length:], ref[:shared_length]
38 |
39 |
def trim_shared_suffix(ref, alt):
    """
    Split off the trailing characters which `ref` and `alt` have in common.

    The two strings are compared from their last characters backwards until
    they disagree or the shorter string is exhausted.

    Returns a tuple of (ref without suffix, alt without suffix, suffix).
    For example, ref='ABC' and alt='BC' gives ('A', '', 'BC').
    """
    n_shared = 0
    for ref_char, alt_char in zip(reversed(ref), reversed(alt)):
        if ref_char != alt_char:
            break
        n_shared += 1

    # a slice like ref[:-0] would be empty, so handle the case of
    # no shared suffix separately
    if not n_shared:
        return (ref, alt, '')
    return (ref[:-n_shared], alt[:-n_shared], ref[-n_shared:])
65 |
66 |
def trim_shared_flanking_strings(ref, alt):
    """
    Strip the common prefix and then the common suffix from a pair of
    nucleotide or amino acid strings.

    Returns a tuple of (unique part of ref, unique part of alt, shared
    prefix, shared suffix). The prefix is removed first, so the suffix is
    computed from what remains.

    For example, ref = "SYFFQGR" and alt = "SYMLLFIFQGR" gives:
        ("F", "MLLFI", "SY", "FQGR")
    """
    ref_without_prefix, alt_without_prefix, prefix = trim_shared_prefix(ref, alt)
    ref_core, alt_core, suffix = trim_shared_suffix(
        ref_without_prefix,
        alt_without_prefix)
    return ref_core, alt_core, prefix, suffix
80 |
--------------------------------------------------------------------------------
/varcode/ucsc_reference_names.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | # NCBI builds and hg releases aren't identical
14 | # but the differences are all on chrM and unplaced contigs
15 | # Mapping between names copied from:
16 | # https://genome.ucsc.edu/FAQ/FAQreleases.html#release1
17 |
18 |
--------------------------------------------------------------------------------
/varcode/util.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | import random
14 |
15 | from Bio.Seq import reverse_complement
16 | from pyensembl import genome_for_reference_name
17 |
18 | from .nucleotides import STANDARD_NUCLEOTIDES
19 | from .variant import Variant
20 | from .variant_collection import VariantCollection
21 |
# cache lists of all transcript IDs for different Ensembl releases,
# keyed by the genome object they were loaded from
_transcript_ids_cache = {}
24 |
def random_variants(
        count,
        genome_name="GRCh38",
        deletions=True,
        insertions=True,
        random_seed=None):
    """
    Generate a VariantCollection with random variants that overlap
    at least one complete coding transcript.

    Parameters
    ----------
    count : int
        Number of variants to generate.

    genome_name : str
        Reference genome name passed to pyensembl.

    deletions : bool
        Allow the reference nucleotide to be deleted (empty alt).

    insertions : bool
        Allow the reference nucleotide to be replaced by two nucleotides.

    random_seed : int, optional
        Seed for the random number generator.

    Returns
    -------
    VariantCollection

    Raises
    ------
    ValueError
        If `count` variants couldn't be generated within count * 100 attempts.
    """
    rng = random.Random(random_seed)
    ensembl = genome_for_reference_name(genome_name)

    if ensembl in _transcript_ids_cache:
        transcript_ids = _transcript_ids_cache[ensembl]
    else:
        transcript_ids = ensembl.transcript_ids()
        _transcript_ids_cache[ensembl] = transcript_ids

    # Iterate over nucleotides in sorted order: the iteration order of a set
    # of strings changes between processes (hash randomization), which would
    # make the choices below irreproducible even with a fixed random_seed.
    nucleotides = sorted(STANDARD_NUCLEOTIDES)

    # alternate alleles which don't depend on the reference nucleotide,
    # built once instead of on every iteration
    extra_alts = []
    if insertions:
        extra_alts.extend(x + y for x in nucleotides for y in nucleotides)
    if deletions:
        extra_alts.append("")

    variants = []

    # we should finish way before this loop is over but just in case
    # something is wrong with PyEnsembl we want to avoid an infinite loop
    for _ in range(count * 100):
        if len(variants) >= count:
            break

        transcript_id = rng.choice(transcript_ids)
        transcript = ensembl.transcript_by_id(transcript_id)

        if not transcript.complete:
            continue

        exon = rng.choice(transcript.exons)
        base1_genomic_position = rng.randint(exon.start, exon.end)
        transcript_offset = transcript.spliced_offset(base1_genomic_position)
        seq = transcript.sequence

        ref = str(seq[transcript_offset])
        if transcript.on_backward_strand:
            ref = reverse_complement(ref)

        alt_nucleotides = [x for x in nucleotides if x != ref]
        alt_nucleotides.extend(extra_alts)
        alt = rng.choice(alt_nucleotides)
        variant = Variant(
            transcript.contig,
            base1_genomic_position,
            ref=ref,
            alt=alt,
            ensembl=ensembl)
        variants.append(variant)

    # checking after the loop also accepts the case where the last variant
    # was generated on the final attempt, and count == 0
    if len(variants) < count:
        raise ValueError(
            ("Unable to generate %d random variants, "
             "there may be a problem with PyEnsembl") % count)
    return VariantCollection(variants)
89 |
--------------------------------------------------------------------------------
/varcode/version.py:
--------------------------------------------------------------------------------
# version string of the varcode package
__version__ = "1.2.1"
2 |
--------------------------------------------------------------------------------