├── .coveragerc
├── .gitattributes
├── .github
    └── workflows
    │   └── tests.yml
├── .github_changelog_generator
├── .gitignore
├── CHANGELOG.md
├── CONTRIBUTING.md
├── LICENSE
├── MANIFEST.in
├── README.md
├── RELEASING.md
├── code-of-conduct.md
├── deploy.sh
├── examples
    └── varcode-quick_start.ipynb
├── lint-and-test.sh
├── lint.sh
├── pylintrc
├── requirements.txt
├── setup.py
├── test.sh
├── tests
    ├── __init__.py
    ├── benchmark_vcf_load.py
    ├── common.py
    ├── data.py
    ├── data
    │   ├── dbnsfp_validation_set.csv
    │   ├── different-samples.1.vcf
    │   ├── different-samples.2.vcf
    │   ├── duplicate-id.1.vcf
    │   ├── duplicate-id.2.vcf
    │   ├── duplicates.maf
    │   ├── duplicates.vcf
    │   ├── mouse_vcf_dbsnp_chr1_partial.vcf
    │   ├── multiallelic.vcf
    │   ├── mutect-example-headerless.vcf
    │   ├── mutect-example.vcf
    │   ├── ov.wustle.subset5.maf
    │   ├── same-samples.1.vcf
    │   ├── same-samples.2.vcf
    │   ├── simple.1.vcf
    │   ├── simple.2.vcf
    │   ├── somatic_hg19_14muts.space_in_sample_name.vcf
    │   ├── somatic_hg19_14muts.vcf
    │   ├── somatic_hg19_14muts.vcf.gz
    │   ├── strelka-example.vcf
    │   ├── tcga_ov.head.maf
    │   └── tcga_ov.head.xychr.maf
    ├── test_cli_effects.py
    ├── test_cli_genes.py
    ├── test_collection_filtering.py
    ├── test_common.py
    ├── test_cosmic_mutations.py
    ├── test_dbnsfp_validation.py
    ├── test_effect_annotation_errors.py
    ├── test_effect_classes.py
    ├── test_effect_collection.py
    ├── test_effect_collection_serialization.py
    ├── test_effects_from_mutagenix_variants.py
    ├── test_exonic_splice_site.py
    ├── test_frameshift_helpers.py
    ├── test_maf.py
    ├── test_mm10_klf6_frameshift.py
    ├── test_mouse.py
    ├── test_mutate.py
    ├── test_no_duplicate_variants.py
    ├── test_problematic_variants.py
    ├── test_reference.py
    ├── test_string_helpers.py
    ├── test_timings.py
    ├── test_variant.py
    ├── test_variant_collection.py
    ├── test_vcf.py
    └── test_vcf_output.py
└── varcode
    ├── __init__.py
    ├── cli
        ├── __init__.py
        ├── effects_script.py
        ├── genes_script.py
        ├── logging.conf
        ├── variant_args.py
        └── version_info.py
    ├── common.py
    ├── effects
        ├── __init__.py
        ├── common.py
        ├── effect_classes.py
        ├── effect_collection.py
        ├── effect_helpers.py
        ├── effect_ordering.py
        ├── effect_prediction.py
        ├── effect_prediction_coding.py
        ├── effect_prediction_coding_frameshift.py
        ├── effect_prediction_coding_in_frame.py
        ├── mutate.py
        ├── transcript_helpers.py
        └── translate.py
    ├── maf.py
    ├── nucleotides.py
    ├── reference.py
    ├── string_helpers.py
    ├── ucsc_reference_names.py
    ├── util.py
    ├── variant.py
    ├── variant_collection.py
    ├── vcf.py
    ├── vcf_output.py
    └── version.py


/.coveragerc:
--------------------------------------------------------------------------------
# .coveragerc to control coverage.py
[run]
# Exclude the test suite itself from coverage measurement.
# The test directory of this repository is named `tests/` (plural).
omit =
	tests/*
5 | 


--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | varcode/_version.py export-subst
2 | 


--------------------------------------------------------------------------------
/.github/workflows/tests.yml:
--------------------------------------------------------------------------------
 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions
 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
 3 | 
 4 | # TODO:
 5 | # - cache this directory $HOME/.cache/pyensembl/
 6 | # - update coveralls
 7 | # - get a badge for tests passing
 8 | # - download binary dependencies from conda
# Workflow name shown in the GitHub Actions UI.
name: Tests
# Triggered by every push and every pull request.
on: [push, pull_request]

jobs:
  build:
    runs-on: ubuntu-latest
    strategy:
      # Cancel the remaining matrix jobs as soon as one of them fails.
      fail-fast: true
      matrix:
        # One job is run per Python version listed here.
        python-version: ["3.9", "3.10", "3.11"]

    steps:
      - name: Checkout repository
        uses: actions/checkout@v3
      # Install the interpreter selected by the matrix, with pip caching enabled.
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v3
        with:
          python-version: ${{ matrix.python-version }}
          cache: "pip"
      # Lint/test tooling first, then the listed requirements, then the package itself.
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          python -m pip install flake8 pytest pytest-cov coveralls
          pip install -r requirements.txt
          pip install .
      # Two flake8 passes: the first can fail the build, the second only reports.
      - name: Lint with flake8
        run: |
          # stop the build if there are Python syntax errors or undefined names
          flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
          # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
          flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
      # Project lint script (see lint.sh at the repository root).
      - name: Run default linting script
        run: |
          ./lint.sh
      # Fetch Ensembl annotation data from the openvax/ensembl-data release mirrors;
      # `df -h` logs disk usage before and after the downloads.
      - name: Install Ensembl data
        run: |
          echo "Before installing Ensembl releases" && df -h
          pyensembl install --release 75 --species human --custom-mirror https://github.com/openvax/ensembl-data/releases/download/GRCh37.75/
          pyensembl install --release 81 --species human --custom-mirror https://github.com/openvax/ensembl-data/releases/download/GRCh38.81/
          pyensembl install --release 95 --species human --custom-mirror https://github.com/openvax/ensembl-data/releases/download/GRCh38.95/
          pyensembl install --release 95 --species mouse --custom-mirror https://github.com/openvax/ensembl-data/releases/download/GRCm38.95/
          echo "After installing Ensembl releases" && df -h
      # Project test script (see test.sh at the repository root).
      - name: Run unit tests
        run: |
          ./test.sh
      - name: Publish coverage to Coveralls
        uses: coverallsapp/github-action@v2.2.3
56 | 


--------------------------------------------------------------------------------
/.github_changelog_generator:
--------------------------------------------------------------------------------
1 | unreleased=false
2 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | 
 5 | # C extensions
 6 | *.so
 7 | 
 8 | # Distribution / packaging
 9 | .Python
10 | env/
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | lib/
17 | lib64/
18 | parts/
19 | sdist/
20 | var/
21 | *.egg-info/
22 | .installed.cfg
23 | *.egg
24 | 
25 | # PyInstaller
26 | #  Usually these files are written by a python script from a template
27 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
28 | *.manifest
29 | *.spec
30 | 
31 | # Installer logs
32 | pip-log.txt
33 | pip-delete-this-directory.txt
34 | 
35 | # Unit test / coverage reports
36 | htmlcov/
37 | .tox/
38 | .coverage
39 | .cache
40 | nosetests.xml
41 | coverage.xml
42 | 
43 | # Translations
44 | *.mo
45 | *.pot
46 | 
47 | # Django stuff:
48 | *.log
49 | 
50 | # Sphinx documentation
51 | docs/_build/
52 | 
53 | # PyBuilder
54 | target/
55 | 
56 | # PyCharm 
57 | .idea
58 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | Contributing to Varcode
 2 | ==========================
 3 | 
 4 | [Varcode](http://www.github.com/openvax/varcode) is open source software and
 5 | we welcome your contributions. This document should help you get started
 6 | contributing to Varcode.
 7 | 
 8 | Filing Issues
 9 | -------------
10 | If you find any bugs or problems while using Varcode or have any feature requests, please feel free to file an issue against the project. When doing so, please follow the guidelines below:
11 | 
12 | * To report any bugs, issues, or feature requests, please [open an issue](https://github.com/openvax/varcode/issues).
13 | * Please check the [current open issues](https://github.com/openvax/varcode/issues) to see if the request already exists.
14 | * If you are filing a bug report, please describe the version of Varcode, PyEnsembl, and Python being used. If your problem involves a particular genomic variant, please include that variant and its corresponding reference genome (e.g. "GRCh37 1:384747 AAC>T").
15 | 
16 | Coding Guidelines
17 | -----------------
18 | * Varcode is written in Python and adheres to the [PEP8](https://www.python.org/dev/peps/pep-0008/)
19 | style guidelines.
20 | * Contributions should come in the form of GitHub pull requests.
21 | * New features should start with a GitHub issue explaining their scope and rationale.
22 | * If the work is based on an existing issue, please reference the issue in the PR.
23 | * All new code should be accompanied by comprehensive unit tests.
24 | * If the PR fixes or implements an issue, please state "Closes #XYZ" or "Fixes #XYZ", where XYZ is the issue number.
25 | * Please ensure that your code works under Python >= 3.7.
26 | 
27 | Licensing
28 | ---------
29 | Varcode is licensed under the Apache 2.0 license. Your code is assumed to be as well.
30 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include versioneer.py
2 | include varcode/_version.py
3 | include README.md
4 | include LICENSE
5 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | [![Tests](https://github.com/openvax/varcode/actions/workflows/tests.yml/badge.svg)](https://github.com/openvax/varcode/actions/workflows/tests.yml)
  2 | <a href="https://coveralls.io/github/openvax/varcode?branch=master">
  3 | <img src="https://coveralls.io/repos/openvax/varcode/badge.svg?branch=master&service=github" alt="Coverage Status" />
  4 | </a>
  5 | <a href="https://pypi.python.org/pypi/varcode/">
  6 | <img src="https://img.shields.io/pypi/v/varcode.svg?maxAge=1000" alt="PyPI" />
  7 | </a>
  8 | [![PyPI downloads](https://img.shields.io/pypi/dm/varcode.svg)](https://pypistats.org/packages/varcode)
  9 | 
 10 | # Varcode
 11 | 
 12 | Varcode is a library for working with genomic variant data in Python and predicting the impact of those variants on protein sequences.
 13 | 
 14 | ## Installation
 15 | 
 16 | You can install varcode using [pip](https://pip.pypa.io/en/latest/quickstart.html):
 17 | 
 18 | ```bash
 19 | pip install varcode
 20 | ```
 21 | 
 22 | You can install required reference genome data through [PyEnsembl](https://github.com/openvax/pyensembl) as follows:
 23 | 
 24 | ```bash
 25 | # Downloads and installs the Ensembl releases (75 and 76)
 26 | pyensembl install --release 75 76
 27 | ```
 28 | 
 29 | ## Example
 30 | 
 31 | ```python
 32 | import varcode
 33 | 
 34 | # Load a TCGA MAF file containing ovarian cancer variants
 35 | variants = varcode.load_maf("tcga-ovarian-cancer-variants.maf")
 36 | 
 37 | print(variants)
 38 | ### <VariantCollection from 'tcga-ovarian-cancer-variants.maf' with 6428 elements>
 39 | ###  -- Variant(contig=1, start=69538, ref=G, alt=A, genome=GRCh37)
 40 | ###  -- Variant(contig=1, start=881892, ref=T, alt=G, genome=GRCh37)
 41 | ###  -- Variant(contig=1, start=3389714, ref=G, alt=A, genome=GRCh37)
 42 | ###  -- Variant(contig=1, start=3624325, ref=G, alt=T, genome=GRCh37)
 43 | ###  ...
 44 | 
 45 | # you can index into a VariantCollection and get back a Variant object
 46 | variant = variants[0]
 47 | 
 48 | # groupby_gene_name returns a dictionary whose keys are gene names
 49 | # and whose values are themselves VariantCollections
 50 | gene_groups = variants.groupby_gene_name()
 51 | 
 52 | # get variants which affect the TP53 gene
 53 | TP53_variants = gene_groups["TP53"]
 54 | 
 55 | # predict protein coding effect of every TP53 variant on
 56 | # each transcript of the TP53 gene
 57 | TP53_effects = TP53_variants.effects()
 58 | 
 59 | print(TP53_effects)
 60 | ### <EffectCollection with 789 elements>
 61 | ### -- PrematureStop(variant=chr17 g.7574003G>A, transcript_name=TP53-001, transcript_id=ENST00000269305, effect_description=p.R342*)
 62 | ### -- ThreePrimeUTR(variant=chr17 g.7574003G>A, transcript_name=TP53-005, transcript_id=ENST00000420246)
 63 | ### -- PrematureStop(variant=chr17 g.7574003G>A, transcript_name=TP53-002, transcript_id=ENST00000445888, effect_description=p.R342*)
 64 | ### -- FrameShift(variant=chr17 g.7574030_7574030delG, transcript_name=TP53-001, transcript_id=ENST00000269305, effect_description=p.R333fs)
 65 | ### ...
 66 | 
 67 | premature_stop_effect = TP53_effects[0]
 68 | 
 69 | print(str(premature_stop_effect.mutant_protein_sequence))
 70 | ### 'MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLSPDDIEQWFTEDPGPDEAPRMPEAAPPVAPAPAAPTPAAPAPAPSWPLSSSVPSQKTYQGSYGFRLGFLHSGTAKSVTCTYSPALNKMFCQLAKTCPVQLWVDSTPPPGTRVRAMAIYKQSQHMTEVVRRCPHHERCSDSDGLAPPQHLIRVEGNLRVEYLDDRNTFRHSVVVPYEPPEVGSDCTTIHYNYMCNSSCMGGMNRRPILTIITLEDSSGNLLGRNSFEVRVCACPGRDRRTEEENLRKKGEPHHELPPGSTKRALPNNTSSSPQPKKKPLDGEYFTLQIRGRERFEMF'
 71 | 
 72 | print(premature_stop_effect.aa_mutation_start_offset)
 73 | ### 341
 74 | 
 75 | print(premature_stop_effect.transcript)
 76 | ### Transcript(id=ENST00000269305, name=TP53-001, gene_name=TP53, biotype=protein_coding, location=17:7571720-7590856)
 77 | 
 78 | print(premature_stop_effect.gene.name)
 79 | ### 'TP53'
 80 | ```
 81 | 
 82 | If you are looking for a quick start guide, you can check out [this IPython notebook](./examples/varcode-quick_start.ipynb), which demonstrates simple use cases of Varcode.
 83 | 
 84 | ## Effect Types
 85 | 
 86 | |            Effect type | Description                                                                                                                                   |
 87 | | ---------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------- |
 88 | |  _AlternateStartCodon_ | Replace annotated start codon with alternative start codon (_e.g._ "ATG>CAG").                                                                |
 89 | |  _ComplexSubstitution_ | Insertion and deletion of multiple amino acids.                                                                                               |
 90 | |             _Deletion_ | Coding mutation which causes deletion of amino acid(s).                                                                                       |
 91 | |             _ExonLoss_ | Deletion of entire exon, significantly disrupts protein.                                                                                      |
 92 | |     _ExonicSpliceSite_ | Mutation at the beginning or end of an exon, may affect splicing.                                                                             |
 93 | |         _FivePrimeUTR_ | Variant affects 5' untranslated region before start codon.                                                                                    |
 94 | | _FrameShiftTruncation_ | A frameshift which leads immediately to a stop codon (no novel amino acids created).                                                          |
 95 | |           _FrameShift_ | Out-of-frame insertion or deletion of nucleotides, causes novel protein sequence and often premature stop codon.                              |
 96 | | _IncompleteTranscript_ | Can't determine effect since transcript annotation is incomplete (often missing either the start or stop codon).                              |
 97 | |            _Insertion_ | Coding mutation which causes insertion of amino acid(s).                                                                                      |
 98 | |           _Intergenic_ | Occurs outside of any annotated gene.                                                                                                         |
 99 | |           _Intragenic_ | Within the annotated boundaries of a gene but not in a region that's transcribed into pre-mRNA.                                               |
100 | |   _IntronicSpliceSite_ | Mutation near the beginning or end of an intron but less likely to affect splicing than donor/acceptor mutations.                             |
101 | |             _Intronic_ | Variant occurs between exons and is unlikely to affect splicing.                                                                              |
102 | |  _NoncodingTranscript_ | Transcript doesn't code for a protein.                                                                                                        |
103 | |        _PrematureStop_ | Insertion of stop codon, truncates protein.                                                                                                   |
104 | |               _Silent_ | Mutation in coding sequence which does not change the amino acid sequence of the translated protein.                                          |
105 | |       _SpliceAcceptor_ | Mutation in the last two nucleotides of an intron, likely to affect splicing.                                                                 |
106 | |          _SpliceDonor_ | Mutation in the first two nucleotides of an intron, likely to affect splicing.                                                                |
107 | |            _StartLoss_ | Mutation causes loss of start codon, likely result is that an alternate start codon will be used down-stream (possibly in a different frame). |
108 | |             _StopLoss_ | Loss of stop codon, causes extension of protein by translation of nucleotides from 3' UTR.                                                    |
109 | |         _Substitution_ | Coding mutation which causes simple substitution of one amino acid for another.                                                               |
110 | |        _ThreePrimeUTR_ | Variant affects 3' untranslated region after stop codon of mRNA.                                                                              |
111 | 
112 | ## Coordinate System
113 | 
114 | Varcode currently uses a "base counted, one start" genomic coordinate system, to match the Ensembl annotation database. We are planning to switch over to "space counted, zero start" (interbase) coordinates, since that system allows for more uniform logic (no special cases for insertions). To learn more about genomic coordinate systems, read this [blog post](http://alternateallele.blogspot.com/2012/03/genome-coordinate-conventions.html).
115 | 


--------------------------------------------------------------------------------
/RELEASING.md:
--------------------------------------------------------------------------------
 1 | # Releasing Varcode
 2 | 
 3 | This document explains what to do once your [Pull Request](https://www.atlassian.com/git/tutorials/making-a-pull-request/) has been reviewed and all final changes applied. Now you're ready to merge your branch into master and release it to the world:
 4 | 
 5 | 0. Assign a version to the release you are preparing and update `__version__` in `version.py` using [semantic versioning](https://semver.org/). 
 6 | 
 7 | 1. Merge your branch into master.
 8 | 
 9 | 2. Run `deploy.sh`.
10 | 
11 | 


--------------------------------------------------------------------------------
/code-of-conduct.md:
--------------------------------------------------------------------------------
 1 | # Contributor Covenant Code of Conduct
 2 | 
 3 | ## Our Pledge
 4 | 
 5 | In the interest of fostering an open and welcoming environment, we as
 6 | contributors and maintainers pledge to making participation in our project and
 7 | our community a harassment-free experience for everyone, regardless of age, body
 8 | size, disability, ethnicity, sex characteristics, gender identity and expression,
 9 | level of experience, education, socio-economic status, nationality, personal
10 | appearance, race, religion, or sexual identity and orientation.
11 | 
12 | ## Our Standards
13 | 
14 | Examples of behavior that contributes to creating a positive environment
15 | include:
16 | 
17 | * Using welcoming and inclusive language
18 | * Being respectful of differing viewpoints and experiences
19 | * Gracefully accepting constructive criticism
20 | * Focusing on what is best for the community
21 | * Showing empathy towards other community members
22 | 
23 | Examples of unacceptable behavior by participants include:
24 | 
25 | * The use of sexualized language or imagery and unwelcome sexual attention or
26 |   advances
27 | * Trolling, insulting/derogatory comments, and personal or political attacks
28 | * Public or private harassment
29 | * Publishing others' private information, such as a physical or electronic
30 |   address, without explicit permission
31 | * Other conduct which could reasonably be considered inappropriate in a
32 |   professional setting
33 | 
34 | ## Our Responsibilities
35 | 
36 | Project maintainers are responsible for clarifying the standards of acceptable
37 | behavior and are expected to take appropriate and fair corrective action in
38 | response to any instances of unacceptable behavior.
39 | 
40 | Project maintainers have the right and responsibility to remove, edit, or
41 | reject comments, commits, code, wiki edits, issues, and other contributions
42 | that are not aligned to this Code of Conduct, or to ban temporarily or
43 | permanently any contributor for other behaviors that they deem inappropriate,
44 | threatening, offensive, or harmful.
45 | 
46 | ## Scope
47 | 
48 | This Code of Conduct applies within all project spaces, and it also applies when
49 | an individual is representing the project or its community in public spaces.
50 | Examples of representing a project or community include using an official
51 | project e-mail address, posting via an official social media account, or acting
52 | as an appointed representative at an online or offline event. Representation of
53 | a project may be further defined and clarified by project maintainers.
54 | 
55 | ## Enforcement
56 | 
57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
58 | reported by contacting the project team at hello@openvax.org. All
59 | complaints will be reviewed and investigated and will result in a response that
60 | is deemed necessary and appropriate to the circumstances. The project team is
61 | obligated to maintain confidentiality with regard to the reporter of an incident.
62 | Further details of specific enforcement policies may be posted separately.
63 | 
64 | Project maintainers who do not follow or enforce the Code of Conduct in good
65 | faith may face temporary or permanent repercussions as determined by other
66 | members of the project's leadership.
67 | 
68 | ## Attribution
69 | 
70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
72 | 
73 | [homepage]: https://www.contributor-covenant.org
74 | 
75 | For answers to common questions about this code of conduct, see
76 | https://www.contributor-covenant.org/faq
77 | 


--------------------------------------------------------------------------------
/deploy.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Release helper: lint, test, rebuild the distributions from scratch and
# upload them with twine. Every step is chained with `&&`, so the first
# failing step stops the release before anything is uploaded.
./lint.sh && \
./test.sh && \
python3 -m pip install --upgrade build && \
python3 -m pip install --upgrade twine && \
rm -rf dist && \
python3 -m build && \
python3 -m twine upload dist/*
8 | 


--------------------------------------------------------------------------------
/lint-and-test.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Run the linter and, only if it passes, the test suite.
./lint.sh && ./test.sh
2 | 


--------------------------------------------------------------------------------
/lint.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | set -o errexit
 3 | 
 4 | # getting false positives due to this issue with pylint:
 5 | # https://bitbucket.org/logilab/pylint/issues/701/false-positives-with-not-an-iterable-and
 6 | 
 7 | find varcode tests -name '*.py' \
 8 |   | xargs pylint \
 9 |   --errors-only \
10 |   --disable=unsubscriptable-object,not-an-iterable
11 | 
12 | echo 'Passes pylint check'
13 | 


--------------------------------------------------------------------------------
/pylintrc:
--------------------------------------------------------------------------------
1 | [TYPECHECK]
2 | # Without ignoring this, we get errors like:
3 | # E:249,20: Module 'numpy' has no 'nan' member (no-member)
4 | ignored-modules = numpy
5 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | numpy>=1.7,<2.0
 2 | pandas>=2.0.0,<3.0.0
 3 | pyensembl>=1.8.1
 4 | biopython>=1.64
 5 | pyvcf3>=1.0.0
 6 | memoized_property>=1.0.2
 7 | pylint>=1.4.4
 8 | serializable>=0.2.1
 9 | sercol>=0.1.4
10 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Licensed under the Apache License, Version 2.0 (the "License");
 3 | # you may not use this file except in compliance with the License.
 4 | # You may obtain a copy of the License at
 5 | #
 6 | #     http://www.apache.org/licenses/LICENSE-2.0
 7 | #
 8 | # Unless required by applicable law or agreed to in writing, software
 9 | # distributed under the License is distributed on an "AS IS" BASIS,
10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | # See the License for the specific language governing permissions and
12 | # limitations under the License.
13 | 
14 | import os
15 | import re
16 | 
17 | from setuptools import setup, find_packages
18 | 
readme_filename = "README.md"
current_directory = os.path.dirname(__file__)
readme_path = os.path.join(current_directory, readme_filename)

# The README is used as the package's long description. Reading it is
# best-effort: a missing or undecodable README must not block the build, so
# fall back to an empty description and report the problem.
try:
    with open(readme_path, 'r', encoding='utf-8') as f:
        readme_markdown = f.read()
except (OSError, UnicodeDecodeError) as e:
    readme_markdown = ""
    print(e)
    print("Failed to open %s" % readme_path)

# Determine version number. The path is anchored at this file's directory
# (like the README path) so the result does not depend on the current
# working directory of whoever invokes setup.py.
version_path = os.path.join(current_directory, 'varcode', 'version.py')
with open(version_path, 'r', encoding='utf-8') as f:
    version_match = re.search(
        r'^__version__\s*=\s*[\'"]([^\'"]*)[\'"]',
        f.read(),
        re.MULTILINE)
if version_match is None:
    # Fail with an explicit message instead of an AttributeError on None.
    raise RuntimeError(
        "Unable to find __version__ in %s" % version_path)
version = version_match.group(1)
print("Version: %s" % version)

if __name__ == '__main__':
    setup(
        name='varcode',
        packages=find_packages(),
        # Non-Python file that must ship with the CLI sub-package.
        package_data={'varcode.cli': ['logging.conf']},
        version=version,
        description="Variant annotation in Python",
        long_description=readme_markdown,
        long_description_content_type='text/markdown',
        url="https://github.com/openvax/varcode",
        author="Alex Rubinsteyn",
        author_email="alex.rubinsteyn@unc.edu",
        license="http://www.apache.org/licenses/LICENSE-2.0.html",
        classifiers=[
            'Development Status :: 4 - Beta',
            'Environment :: Console',
            'Operating System :: OS Independent',
            'Intended Audience :: Science/Research',
            'License :: OSI Approved :: Apache Software License',
            'Programming Language :: Python',
            'Topic :: Scientific/Engineering :: Bio-Informatics',
        ],
        # Runtime dependencies; version bounds are kept in sync with
        # requirements.txt (which additionally lists pylint for linting).
        install_requires=[
            'numpy>=1.7, <2.0',
            'pandas>=2.0.0, <3.0.0',
            'pyensembl>=1.8.1',
            'biopython>=1.64',
            'pyvcf3>=1.0.0',
            'memoized_property>=1.0.2',
            'serializable>=0.2.1',
            'sercol>=0.1.4',
        ],
        # Command-line entry points installed with the package.
        entry_points={
            'console_scripts': [
                'varcode-genes = varcode.cli.genes_script:main',
                'varcode = varcode.cli.effects_script:main',
            ]
        })
77 | 


--------------------------------------------------------------------------------
/test.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Run the test suite and report coverage of the varcode package,
# listing the lines that were not executed.
pytest --cov=varcode/ --cov-report=term-missing tests
2 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
 1 | # Licensed under the Apache License, Version 2.0 (the "License");
 2 | # you may not use this file except in compliance with the License.
 3 | # You may obtain a copy of the License at
 4 | #
 5 | #     http://www.apache.org/licenses/LICENSE-2.0
 6 | #
 7 | # Unless required by applicable law or agreed to in writing, software
 8 | # distributed under the License is distributed on an "AS IS" BASIS,
 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 | 
13 | from __future__ import print_function, division, absolute_import
14 | 


--------------------------------------------------------------------------------
/tests/benchmark_vcf_load.py:
--------------------------------------------------------------------------------
 1 | # Licensed under the Apache License, Version 2.0 (the "License");
 2 | # you may not use this file except in compliance with the License.
 3 | # You may obtain a copy of the License at
 4 | #
 5 | #     http://www.apache.org/licenses/LICENSE-2.0
 6 | #
 7 | # Unless required by applicable law or agreed to in writing, software
 8 | # distributed under the License is distributed on an "AS IS" BASIS,
 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 | 
13 | """
14 | Time how long it takes to open a VCF.
15 | 
16 | Run as:
17 |     python -m profile -s cumtime %(prog)s
18 | 
19 | to get profiling output.
20 | 
21 | """
22 | import argparse
23 | import time
24 | 
25 | import varcode
26 | 
# Command-line interface of the benchmark; the module docstring is reused as
# the help description.
parser = argparse.ArgumentParser(description=__doc__)

# Positional: where to read the VCF from.
parser.add_argument("path", help="Path or URL to VCF")

# Boolean switches. `store_true` flags are False unless given, and
# `store_false` flags are True unless given.
parser.add_argument(
    "--profile",
    action="store_true",
    help="Run in a profiler.")
parser.add_argument(
    "--no-info-field",
    action="store_false",
    dest="info_field")
parser.add_argument(
    "--pyvcf",
    action="store_true",
    help="use pyvcf implementation")
48 | 
def run():
    """
    Parse the command line, load the VCF once, and print the elapsed time.

    The loader is chosen with `--pyvcf`: `varcode.load_vcf` when the flag is
    given, `varcode.load_vcf_fast` otherwise. `--no-info-field` is only
    forwarded (as `include_info=False`) to `load_vcf_fast`. The first five
    entries of the loaded collection are printed after the timing line.
    """
    args = parser.parse_args()

    # Leave the loader's own default in place unless INFO parsing was
    # explicitly switched off.
    loader_kwargs = {} if args.info_field else {"include_info": False}

    started_at = time.time()

    if not args.pyvcf:
        variants = varcode.load_vcf_fast(
            args.path,
            allow_extended_nucleotides=True,
            **loader_kwargs)
    else:
        variants = varcode.load_vcf(
            args.path,
            allow_extended_nucleotides=True)

    summary = "Loaded %d variants in %0.3f sec. " % (
        len(variants), time.time() - started_at)
    print(summary)
    print(variants.to_string(limit=5))

if __name__ == '__main__':
    run()
74 | 


--------------------------------------------------------------------------------
/tests/common.py:
--------------------------------------------------------------------------------
 1 | # Licensed under the Apache License, Version 2.0 (the "License");
 2 | # you may not use this file except in compliance with the License.
 3 | # You may obtain a copy of the License at
 4 | #
 5 | #         http://www.apache.org/licenses/LICENSE-2.0
 6 | #
 7 | # Unless required by applicable law or agreed to in writing, software
 8 | # distributed under the License is distributed on an "AS IS" BASIS,
 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 | 
13 | 
# Names of the attributes that check_effect_properties reads from every
# effect object it is given.
expected_effect_properties = [
    "gene", "gene_name", "gene_id",
    "transcript", "transcript_name", "transcript_id",
    "modifies_coding_sequence", "modifies_protein_sequence",
    "aa_mutation_start_offset", "aa_mutation_end_offset",
    "mutant_protein_sequence",
    "short_description",
]
28 | 
def check_effect_properties(effect):
    """
    Sanity-check the common interface of an effect object.

    Reads every attribute listed in `expected_effect_properties`, then
    asserts that `str()` and `repr()` are non-empty, that
    `short_description` is present and non-empty, and that the string form
    mentions the effect's class name.

    Raises AssertionError when one of those checks fails; an exception
    raised by an attribute access propagates unchanged.
    """
    assert effect is not None

    # Reading each attribute is the test: the value is discarded, we only
    # care that the access does not raise.
    for property_name in expected_effect_properties:
        getattr(effect, property_name)

    assert len(str(effect)) > 0
    assert len(repr(effect)) > 0

    assert effect.short_description is not None, (
        "Expected effect %s to have a `short_description` property" % (
            effect,))
    assert len(effect.short_description) > 0

    class_name = effect.__class__.__name__
    assert class_name in str(effect), (
        "Expected string representation of %s to include effect name %s" % (
            effect, effect.__class__.__name__))
42 | 
def expect_effect(
        variant,
        transcript_id=None,
        effect_class=None,
        protein_sequence=None,
        **kwargs):
    """
    Assert that a variant produces the expected effect.

    Parameters
    ----------
    variant
        Object providing `effects()`, `effect_on_transcript()` and an
        `ensembl` attribute with `transcript_by_id()`.

    transcript_id : optional
        When given, the effect of the variant on this transcript is checked;
        otherwise the top priority effect among all of the variant's effects
        is used.

    effect_class : optional
        Exact class the effect must have (compared with `is`, so subclasses
        do not match).

    protein_sequence : optional
        Expected value of the effect's `mutant_protein_sequence`.

    **kwargs
        Further attribute names mapped to the value each one must equal.

    Raises AssertionError on the first expectation that is not met.
    """
    if transcript_id is None:
        effects = variant.effects()
        effect = effects.top_priority_effect()
    else:
        transcript = variant.ensembl.transcript_by_id(transcript_id)
        effect = variant.effect_on_transcript(transcript)
    check_effect_properties(effect)
    if effect_class is not None:
        assert effect.__class__ is effect_class, \
            "Expected effect class %s but got %s" % (
                effect_class.__name__,
                effect.__class__.__name__)
    if protein_sequence is not None:
        assert effect.mutant_protein_sequence == protein_sequence, \
            "Expected protein sequence %s but got %s" % (
                protein_sequence,
                effect.mutant_protein_sequence)
    for field, expected_value in kwargs.items():
        actual_value = getattr(effect, field)
        if isinstance(expected_value, bool):
            # bool is a subclass of int, so it has to be tested first;
            # otherwise "%d" would print True/False as 1/0 in the message.
            format_string = "Expected %s=%s but got %s"
        elif isinstance(expected_value, int):
            format_string = "Expected %s=%d but got %s"
        elif isinstance(expected_value, float):
            format_string = "Expected %s=%f but got %s"
        else:
            format_string = "Expected %s='%s' but got '%s'"
        assert actual_value == expected_value, format_string % (
            field, expected_value, actual_value)
75 | 
def eq_(x, y, s=None):
    """
    Assert that `x == y`.

    `s` is used as the failure message when it is not None; otherwise the
    message is "<x> != <y>", built only if the comparison fails.
    """
    if s is not None:
        assert x == y, s
        return
    assert x == y, "%s != %s" % (x, y)


--------------------------------------------------------------------------------
/tests/data.py:
--------------------------------------------------------------------------------
 1 | # Licensed under the Apache License, Version 2.0 (the "License");
 2 | # you may not use this file except in compliance with the License.
 3 | # You may obtain a copy of the License at
 4 | #
 5 | #         http://www.apache.org/licenses/LICENSE-2.0
 6 | #
 7 | # Unless required by applicable law or agreed to in writing, software
 8 | # distributed under the License is distributed on an "AS IS" BASIS,
 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 | 
13 | """
14 | Helper functions and shared datasets for tests
15 | """
16 | 
17 | import os
18 | from varcode import Variant, VariantCollection, load_maf
19 | import pandas as pd
20 | 
def data_path(name):
    """
    Return the path of a test data file.

    `name` is joined onto the `data` directory that sits next to this
    module, so it should be given relative to that directory.
    """
    module_dir = os.path.dirname(__file__)
    return os.path.join(module_dir, "data", name)
27 | 
# Shared datasets, read from the data directory once when this module is
# imported.
# NOTE: `dbnsp_validation_df` is spelled without the "f" of the file it reads
# (dbnsfp_validation_set.csv); the name is left as is since other modules may
# import it.
dbnsp_validation_df = pd.read_csv(data_path("dbnsfp_validation_set.csv"))
tcga_ov_variants = load_maf(data_path("tcga_ov.head.maf"))
ov_wustle_variants = load_maf(data_path("ov.wustle.subset5.maf"))

# Three single-nucleotide variants, each named after an rs identifier.
snp_rs4244285 = Variant(contig=10, start=94781859, ref="G", alt="A")
snp_rs1537415 = Variant(contig=9, start=135637876, ref="C", alt="G")
snp_rs3892097 = Variant(contig=22, start=42524947, ref="G", alt="A")

# The same three variants bundled into one collection.
db_snp_variants = VariantCollection(
    [snp_rs4244285, snp_rs1537415, snp_rs3892097])
53 | 


--------------------------------------------------------------------------------
/tests/data/dbnsfp_validation_set.csv:
--------------------------------------------------------------------------------
  1 | aa_alt,aa_pos,dna_alt,chrom,ensembl_transcript,dna_position,dna_ref
  2 | K,143,T,14,ENST00000379932,105675961,C
  3 | L,852,A,12,ENST00000261740,110221487,C
  4 | L,805,A,12,ENST00000392719,110221487,C
  5 | L,792,A,12,ENST00000346520,110221487,C
  6 | L,745,A,12,ENST00000544971,110221487,C
  7 | L,792,A,12,ENST00000537083,110221487,C
  8 | L,805,A,12,ENST00000541794,110221487,C
  9 | L,818,A,12,ENST00000536838,110221487,C
 10 | W,241,C,15,ENST00000288235,59516943,G
 11 | N,82,G,6,ENST00000377451,27279704,C
 12 | C,354,G,1,ENST00000546424,15820483,C
 13 | C,354,G,1,ENST00000333868,15820483,C
 14 | C,204,G,1,ENST00000348549,15820483,C
 15 | C,271,G,1,ENST00000375890,15820483,C
 16 | N,176,T,6,ENST00000521485,84368738,C
 17 | H,178,C,1,ENST00000368764,152882807,G
 18 | H,32,C,1,ENST00000392667,152882807,G
 19 | K,2885,T,1,ENST00000368346,155308045,C
 20 | K,2880,T,1,ENST00000392403,155308045,C
 21 | P,1534,G,22,ENST00000441493,18300827,T
 22 | L,32,A,7,ENST00000394507,91871355,G
 23 | K,84,T,4,ENST00000296522,175439195,C
 24 | Q,446,C,22,ENST00000536101,26165219,G
 25 | D,3878,C,2,ENST00000409009,73827899,G
 26 | T,10,T,16,ENST00000283025,10788703,C
 27 | T,610,T,14,ENST00000331968,30093435,C
 28 | G,185,C,20,ENST00000546004,5283287,G
 29 | M,170,T,17,ENST00000269051,30616025,C
 30 | M,162,T,17,ENST00000538145,30616025,C
 31 | M,72,T,17,ENST00000536287,30616025,C
 32 | M,1664,A,9,ENST00000313050,139355629,C
 33 | M,1486,A,9,ENST00000371706,139355629,C
 34 | M,1486,A,9,ENST00000290037,139355629,C
 35 | M,1486,A,9,ENST00000431893,139355629,C
 36 | A,666,C,4,ENST00000508776,128744730,G
 37 | A,697,C,4,ENST00000439123,128744730,G
 38 | A,666,C,4,ENST00000296464,128744730,G
 39 | A,640,C,4,ENST00000505726,128744730,G
 40 | L,38,T,10,ENST00000370196,102891411,C
 41 | K,270,A,1,ENST00000498508,214170686,G
 42 | H,110,C,16,ENST00000311620,21261217,G
 43 | L,947,A,2,ENST00000419748,88857312,G
 44 | L,1098,A,2,ENST00000303236,88857312,G
 45 | L,260,A,10,ENST00000372873,75407959,G
 46 | L,484,A,10,ENST00000394810,75407959,G
 47 | E,123,C,6,ENST00000531224,136599652,G
 48 | E,121,C,6,ENST00000353331,136599652,G
 49 | E,123,C,6,ENST00000527536,136599652,G
 50 | E,121,C,6,ENST00000392348,136599652,G
 51 | L,2419,T,5,ENST00000438447,32090810,C
 52 | L,2419,T,5,ENST00000282493,32090810,C
 53 | K,32,T,X,ENST00000375992,51239203,C
 54 | R,250,A,14,ENST00000306051,52735280,G
 55 | K,467,T,X,ENST00000396992,47483685,C
 56 | V,1462,G,5,ENST00000399503,56184179,T
 57 | K,123,A,16,ENST00000434417,30429101,G
 58 | Q,312,T,1,ENST00000427495,242271091,C
 59 | Q,282,T,1,ENST00000442594,242271091,C
 60 | Q,374,T,1,ENST00000536534,242271091,C
 61 | L,150,T,20,ENST00000244051,49575828,C
 62 | K,774,A,16,ENST00000301727,2285538,G
 63 | R,85,G,10,ENST00000520547,81272659,A
 64 | N,532,A,2,ENST00000393504,99013227,G
 65 | N,536,A,2,ENST00000409937,99013227,G
 66 | L,26,A,7,ENST00000394507,91871373,C
 67 | M,2116,G,19,ENST00000352632,41073580,C
 68 | M,859,G,19,ENST00000392025,41073580,C
 69 | H,161,G,17,ENST00000301037,26939700,C
 70 | K,170,A,20,ENST00000375994,30409276,G
 71 | F,679,A,15,ENST00000389039,45398436,G
 72 | N,342,T,8,ENST00000361421,59728265,C
 73 | K,167,A,13,ENST00000376958,95264638,G
 74 | K,1371,T,8,ENST00000320476,144874944,C
 75 | K,1290,T,8,ENST00000377533,144874944,C
 76 | K,1666,A,11,ENST00000321505,33680325,G
 77 | K,1672,A,11,ENST00000389726,33680325,G
 78 | A,1326,G,12,ENST00000267101,56495786,C
 79 | A,683,G,12,ENST00000450146,56495786,C
 80 | A,1267,G,12,ENST00000415288,56495786,C
 81 | K,635,T,1,ENST00000366508,247057966,C
 82 | K,609,T,1,ENST00000326225,247057966,C
 83 | H,379,C,12,ENST00000547057,94691119,G
 84 | H,71,C,12,ENST00000545312,94691119,G
 85 | K,545,A,3,ENST00000263967,178936091,G
 86 | K,402,A,16,ENST00000416441,29996834,G
 87 | K,278,A,16,ENST00000389398,22128096,G
 88 | Q,837,C,11,ENST00000529051,124908424,G
 89 | R,407,A,12,ENST00000257963,52380684,G
 90 | R,448,A,12,ENST00000541224,52380684,G
 91 | R,355,A,12,ENST00000542485,52380684,G
 92 | Q,10,T,6,ENST00000011619,13711709,G
 93 | W,763,A,2,ENST00000281405,20136107,G
 94 | C,7,G,11,ENST00000398534,71249121,C
 95 | Q,119,G,12,ENST00000204726,133393177,C
 96 | I,745,T,15,ENST00000356865,25958932,C
 97 | K,569,A,1,ENST00000369130,150116967,G
 98 | K,526,A,18,ENST00000342988,48604754,G
 99 | N,91,T,11,ENST00000528117,8974698,C
100 | L,444,A,5,ENST00000507386,147020337,G
101 | L,444,A,5,ENST00000265272,147020337,G
102 | L,402,A,5,ENST00000333010,147020337,G
103 | E,190,T,15,ENST00000324324,48451958,C
104 | T,108,A,19,ENST00000392518,50203981,G
105 | P,1241,C,2,ENST00000401884,242011122,A
106 | M,951,T,3,ENST00000474889,62253472,C
107 | M,922,T,3,ENST00000295874,62253472,C
108 | E,268,A,9,ENST00000380607,17793439,G
109 | E,221,A,9,ENST00000537391,17793439,G
110 | A,508,G,12,ENST00000228437,108136084,C
111 | N,87,A,10,ENST00000373910,60124591,G
112 | 


--------------------------------------------------------------------------------
/tests/data/different-samples.1.vcf:
--------------------------------------------------------------------------------
 1 | ##fileformat=VCFv4.1
 2 | ##reference=file:///projects/ngs/resources/gatk/2.3/ucsc.hg19.fasta
 3 | ##INFO=<ID=GE,Number=.,Type=String,Description="HGNC Gene Symbol (could be more than one)">
 4 | ##INFO=<ID=EG,Number=.,Type=String,Description="Entrez Gene ID (could be more than one)">
 5 | ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
 6 | ##contig=<ID=chrM,length=16571>
 7 | ##contig=<ID=chr1,length=249250621>
 8 | ##contig=<ID=chr10,length=135534747>
 9 | ##contig=<ID=chr11,length=135006516>
10 | ##contig=<ID=chr12,length=133851895>
11 | ##contig=<ID=chr14,length=107349540>
12 | ##contig=<ID=chr15,length=102531392>
13 | ##contig=<ID=chr16,length=90354753>
14 | ##contig=<ID=chr17,length=81195210>
15 | #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	metastasis
16 | chr1	53513530	.	A	C	.	.	GE=SCP2;EG=6342	GT	0/1
17 | chr17	36731197	.	C	AAT	.	.	GE=SRCIN1;EG=80725	GT	0/1
18 | 


--------------------------------------------------------------------------------
/tests/data/different-samples.2.vcf:
--------------------------------------------------------------------------------
 1 | ##fileformat=VCFv4.1
 2 | ##reference=file:///projects/ngs/resources/gatk/2.3/ucsc.hg19.fasta
 3 | ##INFO=<ID=GE,Number=.,Type=String,Description="HGNC Gene Symbol (could be more than one)">
 4 | ##INFO=<ID=EG,Number=.,Type=String,Description="Entrez Gene ID (could be more than one)">
 5 | ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
 6 | ##contig=<ID=chrM,length=16571>
 7 | ##contig=<ID=chr1,length=249250621>
 8 | ##contig=<ID=chr10,length=135534747>
 9 | ##contig=<ID=chr11,length=135006516>
10 | ##contig=<ID=chr12,length=133851895>
11 | ##contig=<ID=chr14,length=107349540>
12 | ##contig=<ID=chr15,length=102531392>
13 | ##contig=<ID=chr16,length=90354753>
14 | ##contig=<ID=chr17,length=81195210>
15 | #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	normal
16 | chr2	5	.	A	C	.	.	GE=SCP2;EG=6342	GT	0/1
17 | chr7	18	.	C	AAT	.	.	GE=SRCIN1;EG=80725	GT	0/1
18 | 


--------------------------------------------------------------------------------
/tests/data/duplicate-id.1.vcf:
--------------------------------------------------------------------------------
1 | ##fileformat=VCFv4.2
2 | ##reference=file:///projects/ngs/resources/gatk/2.3/ucsc.hg19.fasta
3 | #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
4 | chr1	13281	1	C	G	.	PASS	VT=SNP;SOMATIC
5 | 


--------------------------------------------------------------------------------
/tests/data/duplicate-id.2.vcf:
--------------------------------------------------------------------------------
1 | ##fileformat=VCFv4.2
2 | ##reference=file:///projects/ngs/resources/gatk/2.3/ucsc.hg19.fasta
3 | #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
4 | chr1	13281	1	C	G,T	.	PASS	VT=SNP;SOMATIC
5 | 


--------------------------------------------------------------------------------
/tests/data/duplicates.maf:
--------------------------------------------------------------------------------
1 | Hugo_Symbol	Entrez_Gene_Id	Center	NCBI_Build	Chromosome	Start_position	End_position	Strand	Variant_Classification	Variant_Type	Reference_Allele	Tumor_Seq_Allele1	Tumor_Seq_Allele2	dbSNP_RS	dbSNP_Val_Status	Tumor_Sample_Barcode	Matched_Norm_Sample_Barcode	Match_Norm_Seq_Allele1	Match_Norm_Seq_Allele2
2 | A1CF	29974	.	37	10	52573692	52573692	+	Missense_Mutation	SNP	G	G	T	.	.	.	.	.	.
3 | A1CF	29974	.	37	10	52573692	52573692	+	Missense_Mutation	SNP	G	G	T	.	.	.	.	.	.
4 | A1CF	29974	.	37	10	52573692	52573692	+	Missense_Mutation	SNP	G	G	T	.	.	.	.	.	.


--------------------------------------------------------------------------------
/tests/data/duplicates.vcf:
--------------------------------------------------------------------------------
1 | #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
2 | chr17	7675088	.	G	A	0	PASS	.	
3 | chr17	7675088	.	G	A	0	PASS	.	
4 | chr17	7675088	.	G	A	0	PASS	.	


--------------------------------------------------------------------------------
/tests/data/multiallelic.vcf:
--------------------------------------------------------------------------------
 1 | ##fileformat=VCFv4.1
 2 | ##reference=file:///projects/ngs/resources/gatk/2.3/ucsc.hg19.fasta
 3 | ##INFO=<ID=GE,Number=.,Type=String,Description="HGNC Gene Symbol (could be more than one)">
 4 | ##INFO=<ID=DP,Number=.,Type=Integer,Description="Depth">
 5 | ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
 6 | ##contig=<ID=chrM,length=16571>
 7 | ##contig=<ID=chr1,length=249250621>
 8 | ##contig=<ID=chr10,length=135534747>
 9 | ##contig=<ID=chr11,length=135006516>
10 | ##contig=<ID=chr12,length=133851895>
11 | ##contig=<ID=chr14,length=107349540>
12 | ##contig=<ID=chr15,length=102531392>
13 | ##contig=<ID=chr16,length=90354753>
14 | ##contig=<ID=chr17,length=81195210>
15 | #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	metastasis
16 | chr1	1431105	rs199599542	A	C,G	593.69	PASS	DP=17;GE=Wuzzle	GT	0/1
17 | 


--------------------------------------------------------------------------------
/tests/data/ov.wustle.subset5.maf:
--------------------------------------------------------------------------------
1 | #version 2.4
2 | Hugo_Symbol	Entrez_Gene_Id	Center	NCBI_Build	Chromosome	Start_position	End_position	Strand	Variant_Classification	Variant_Type	Reference_Allele	Tumor_Seq_Allele1	Tumor_Seq_Allele2	dbSNP_RS	dbSNP_Val_Status	Tumor_Sample_Barcode	Matched_Norm_Sample_Barcode	Match_Norm_Seq_Allele1	Match_Norm_Seq_Allele2	Tumor_Validation_Allele1	Tumor_Validation_Allele2	Match_Norm_Validation_Allele1	Match_Norm_Validation_Allele2	Verification_Status	Validation_Status	Mutation_Status	Sequencing_Phase	Sequence_Source	Validation_Method	Score	BAM_file	Sequencer	Tumor_Sample_UUID	Matched_Norm_Sample_UUID	chromosome_name	start	stop	reference	variant	type	gene_name	transcript_name	transcript_species	transcript_source	transcript_version	strand	transcript_status	trv_type	c_position	amino_acid_change	ucsc_cons	domain	all_domains	deletion_substructures	transcript_error
3 | AGL	178	genome.wustl.edu	37	1	100349684	100349684	+	Missense_Mutation	SNP	G	G	A			TCGA-13-1405-01A-01W-0494-09	TCGA-13-1405-10A-01W-0495-09	G	G	G	A	G	G	Unknown	Valid	Somatic	4	WXS	454_PCR_WGA	1	dbGAP	Illumina GAIIx	c0d1de72-4cce-4d74-93f0-29c462dc1426	89f04056-0478-4305-b1ce-486ae469b4dd	1	100122272	100122272	G	A	SNP	AGL	NM_000028	human	genbank	54_36p	1	validated	missense	c.2317	p.E773K	1	NULL	superfamily_Six-hairpin glycosidases;HMMPfam_GDE_C;superfamily_(Trans)glycosidases	-	no_errors
4 | SASS6	163786	genome.wustl.edu	37	1	100573197	100573197	+	Missense_Mutation	SNP	G	G	A			TCGA-04-1542-01A-01W-0553-09	TCGA-04-1542-10A-01W-0553-09	G	G	G	A	G	G	Unknown	Valid	Somatic	4	WXS	454_PCR_WGA	1	dbGAP	Illumina GAIIx	317a63af-e862-43df-8ef5-7c555b2cb678	b94052a8-c3d2-4e47-81e2-62242bc0841a	1	100345785	100345785	G	A	SNP	SASS6	NM_194292	human	genbank	54_36p	-1	validated	missense	c.1133	p.A378V	1	NULL	-	-	no_errors
5 | LRRC39	127495	genome.wustl.edu	37	1	100618068	100618068	+	Silent	SNP	G	G	A			TCGA-23-1022-01A-02W-0488-09	TCGA-23-1022-10A-01W-0488-09	G	G	G	A	G	G	Unknown	Valid	Somatic	4	WXS	454_PCR_WGA	1	dbGAP	Illumina GAIIx	160a0e7d-315e-4de3-a7d4-928412fd909c	6bd506d5-4f1a-4f51-a71f-e453196b245a	1	100390656	100390656	G	A	SNP	LRRC39	NM_144620	human	genbank	54_36p	-1	provisional	silent	c.825	p.F275	1	NULL	-	-	no_errors
6 | UBE4B	10277	genome.wustl.edu	37	1	10238758	10238758	+	Silent	SNP	G	G	C			TCGA-13-0920-01A-01W-0421-09	TCGA-13-0920-10A-01W-0421-09	G	G	G	C	G	G	Unknown	Valid	Somatic	4	WXS	454_PCR_WGA	1	dbGAP	Illumina GAIIx	2e28969b-c9a9-41ec-80bf-f583197b7f92	83a80d56-e463-4096-8c17-a44000f80f66	1	10161345	10161345	G	C	SNP	UBE4B	NM_001105562	human	genbank	54_36p	1	reviewed	silent	c.3582	p.G1194	0.97	NULL	-	-	no_errors
7 | COL11A1	1301	genome.wustl.edu	37	1	103491420	103491420	+	Missense_Mutation	SNP	T	T	A			TCGA-13-0893-01B-01W-0494-09	TCGA-13-0893-10A-01W-0494-09	T	T	T	A	T	T	Unknown	Valid	Somatic	4	WXS	454_PCR_WGA	1	dbGAP	Illumina GAIIx	a335ab49-84b7-4d3b-a03d-9c3931904ca5	23f57381-b679-41b8-8197-aed711f71db4	1	103264008	103264008	T	A	SNP	COL11A1	NM_080629	human	genbank	54_36p	-1	reviewed	missense	c.869	p.E290V	1	NULL	HMMPfam_COLFI;HMMPfam_Collagen;superfamily_Concanavalin A-like lectins/glucanases;HMMPfam_Laminin_G_2;superfamily_Fibrinogen C-terminal domain-like	-	no_errors
8 | 


--------------------------------------------------------------------------------
/tests/data/same-samples.1.vcf:
--------------------------------------------------------------------------------
 1 | ##fileformat=VCFv4.1
 2 | ##reference=file:///projects/ngs/resources/gatk/2.3/ucsc.hg19.fasta
 3 | ##INFO=<ID=GE,Number=.,Type=String,Description="HGNC Gene Symbol (could be more than one)">
 4 | ##INFO=<ID=EG,Number=.,Type=String,Description="Entrez Gene ID (could be more than one)">
 5 | ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
 6 | ##contig=<ID=chrM,length=16571>
 7 | ##contig=<ID=chr1,length=249250621>
 8 | ##contig=<ID=chr10,length=135534747>
 9 | ##contig=<ID=chr11,length=135006516>
10 | ##contig=<ID=chr12,length=133851895>
11 | ##contig=<ID=chr14,length=107349540>
12 | ##contig=<ID=chr15,length=102531392>
13 | ##contig=<ID=chr16,length=90354753>
14 | ##contig=<ID=chr17,length=81195210>
15 | #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	metastasis
16 | chr1	53513530	.	A	C	.	.	GE=SCP2;EG=6342	GT	0/1
17 | chr17	36731197	.	C	AAT	.	.	GE=SRCIN1;EG=80725	GT	0/1
18 | 


--------------------------------------------------------------------------------
/tests/data/same-samples.2.vcf:
--------------------------------------------------------------------------------
 1 | ##fileformat=VCFv4.1
 2 | ##reference=file:///projects/ngs/resources/gatk/2.3/ucsc.hg19.fasta
 3 | ##INFO=<ID=GE,Number=.,Type=String,Description="HGNC Gene Symbol (could be more than one)">
 4 | ##INFO=<ID=EG,Number=.,Type=String,Description="Entrez Gene ID (could be more than one)">
 5 | ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
 6 | ##contig=<ID=chrM,length=16571>
 7 | ##contig=<ID=chr1,length=249250621>
 8 | ##contig=<ID=chr10,length=135534747>
 9 | ##contig=<ID=chr11,length=135006516>
10 | ##contig=<ID=chr12,length=133851895>
11 | ##contig=<ID=chr14,length=107349540>
12 | ##contig=<ID=chr15,length=102531392>
13 | ##contig=<ID=chr16,length=90354753>
14 | ##contig=<ID=chr17,length=81195210>
15 | #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	metastasis
16 | chr1	53513530	.	A	C	.	.	GE=SCP2;EG=6342	GT	0/1
17 | chr17	36731197	.	C	AAT	.	.	GE=SRCIN1;EG=80725	GT	0/1
18 | 


--------------------------------------------------------------------------------
/tests/data/simple.1.vcf:
--------------------------------------------------------------------------------
1 | ##fileformat=VCFv4.2
2 | ##reference=file:///projects/ngs/resources/gatk/2.3/ucsc.hg19.fasta
3 | #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
4 | chr1	13281	.	C	G	.	PASS	VT=SNP;SOMATIC
5 | 


--------------------------------------------------------------------------------
/tests/data/simple.2.vcf:
--------------------------------------------------------------------------------
1 | ##fileformat=VCFv4.2
2 | ##reference=file:///projects/ngs/resources/gatk/2.3/ucsc.hg19.fasta
3 | #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
4 | chr1	13289	.	T	C	.	PASS	VT=SNP;SOMATIC
5 | chr2	13289	.	A	G	.	PASS	VT=SNP;SOMATIC
6 | 


--------------------------------------------------------------------------------
/tests/data/somatic_hg19_14muts.space_in_sample_name.vcf:
--------------------------------------------------------------------------------
 1 | ##fileformat=VCFv4.1
 2 | ##reference=file:///projects/ngs/resources/gatk/2.3/ucsc.hg19.fasta
 3 | ##INFO=<ID=GE,Number=.,Type=String,Description="HGNC Gene Symbol (could be more than one)">
 4 | ##INFO=<ID=EG,Number=.,Type=String,Description="Entrez Gene ID (could be more than one)">
 5 | ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
 6 | ##contig=<ID=chrM,length=16571>
 7 | ##contig=<ID=chr1,length=249250621>
 8 | ##contig=<ID=chr10,length=135534747>
 9 | ##contig=<ID=chr11,length=135006516>
10 | ##contig=<ID=chr12,length=133851895>
11 | ##contig=<ID=chr14,length=107349540>
12 | ##contig=<ID=chr15,length=102531392>
13 | ##contig=<ID=chr16,length=90354753>
14 | ##contig=<ID=chr17,length=81195210>
15 | #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	metastasis foo
16 | chr1	53513530	.	A	C	.	.	GE=SCP2;EG=6342	GT	0/1
17 | chr1	228295398	.	G	T	.	.	GE=MRPL55;EG=128308	GT	0/1
18 | chr10	49658590	.	T	C	.	.	GE=ARHGAP22;EG=58504	GT	0/1
19 | chr10	51585166	.	G	T	.	.	GE=NCOA4;EG=8031	GT	0/1
20 | chr10	96709040	.	A	C	.	.	GE=CYP2C9;EG=1559	GT	0/1
21 | chr10	119134281	.	G	T	.	.	GE=PDZD8;EG=118987	GT	0/1
22 | chr11	118244286	.	G	G	.	.	GE=UBE4A;EG=9354	GT	0/1
23 | chr12	14794076	.	C	A	.	.	GE=GUCY2C;EG=2984	GT	0/1
24 | chr12	25398284	.	C	G	.	.	GE=KRAS;EG=3845	GT	0/1
25 | chr12	42778752	.	T	A	.	.	GE=PPHLN1;EG=51535	GT	0/1
26 | chr14	31144202	.	A	C	.	.	GE=SCFD1;EG=23256	GT	0/1
27 | chr16	25704209	.	G	A	.	.	GE=HS3ST4;EG=9951	GT	0/1
28 | chr17	7577548	.	C	CA	.	.	GE=TP53;EG=7157	GT	0/1
29 | chr17	36731197	.	C	AAT	.	.	GE=SRCIN1;EG=80725	GT	0/1
30 | 


--------------------------------------------------------------------------------
/tests/data/somatic_hg19_14muts.vcf:
--------------------------------------------------------------------------------
 1 | ##fileformat=VCFv4.1
 2 | ##reference=file:///projects/ngs/resources/gatk/2.3/ucsc.hg19.fasta
 3 | ##INFO=<ID=GE,Number=.,Type=String,Description="HGNC Gene Symbol (could be more than one)">
 4 | ##INFO=<ID=EG,Number=.,Type=String,Description="Entrez Gene ID (could be more than one)">
 5 | ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
 6 | ##contig=<ID=chrM,length=16571>
 7 | ##contig=<ID=chr1,length=249250621>
 8 | ##contig=<ID=chr10,length=135534747>
 9 | ##contig=<ID=chr11,length=135006516>
10 | ##contig=<ID=chr12,length=133851895>
11 | ##contig=<ID=chr14,length=107349540>
12 | ##contig=<ID=chr15,length=102531392>
13 | ##contig=<ID=chr16,length=90354753>
14 | ##contig=<ID=chr17,length=81195210>
15 | #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	metastasis
16 | chr1	53513530	.	A	C	.	.	GE=SCP2;EG=6342	GT	0/1
17 | chr1	228295398	.	G	T	.	.	GE=MRPL55;EG=128308	GT	0/1
18 | chr10	49658590	.	T	C	.	.	GE=ARHGAP22;EG=58504	GT	0/1
19 | chr10	51585166	.	G	T	.	.	GE=NCOA4;EG=8031	GT	0/1
20 | chr10	96709040	.	A	C	.	.	GE=CYP2C9;EG=1559	GT	0/1
21 | chr10	119134281	.	G	T	.	.	GE=PDZD8;EG=118987	GT	0/1
22 | chr11	118244286	.	G	G	.	.	GE=UBE4A;EG=9354	GT	0/1
23 | chr12	14794076	.	C	A	.	.	GE=GUCY2C;EG=2984	GT	0/1
24 | chr12	25398284	.	C	G	.	.	GE=KRAS;EG=3845	GT	0/1
25 | chr12	42778752	.	T	A	.	.	GE=PPHLN1;EG=51535	GT	0/1
26 | chr14	31144202	.	A	C	.	.	GE=SCFD1;EG=23256	GT	0/1
27 | chr16	25704209	.	G	A	.	.	GE=HS3ST4;EG=9951	GT	0/1
28 | chr17	7577548	.	C	CA	.	.	GE=TP53;EG=7157	GT	0/1
29 | chr17	36731197	.	C	AAT	.	.	GE=SRCIN1;EG=80725	GT	0/1
30 | 


--------------------------------------------------------------------------------
/tests/data/somatic_hg19_14muts.vcf.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openvax/varcode/2c1c96e0564d7ad5f66b26e33fc0a027353640f4/tests/data/somatic_hg19_14muts.vcf.gz


--------------------------------------------------------------------------------
/tests/data/tcga_ov.head.maf:
--------------------------------------------------------------------------------
1 | Hugo_Symbol	Entrez_Gene_Id	Center	NCBI_Build	Chromosome	Start_position	End_position	Strand	Variant_Classification	Variant_Type	Reference_Allele	Tumor_Seq_Allele1	Tumor_Seq_Allele2	dbSNP_RS	dbSNP_Val_Status	Tumor_Sample_Barcode	Matched_Norm_Sample_Barcode	Match_Norm_Seq_Allele1	Match_Norm_Seq_Allele2	Tumor_Validation_Allele1	Tumor_Validation_Allele2	Match_Norm_Validation_Allele1	Match_Norm_Validation_Allele2	Verification_Status	Validation_Status	Mutation_Status	Sequencing_Phase	Sequence_Source	Validation_Method	Score	BAM_file	Sequencer	Tumor_Sample_UUID	Matched_Norm_Sample_UUID	chromosome_name	start	stop	reference	variant	type	gene_name	transcript_name	transcript_species	transcript_source	transcript_version	strand	transcript_status	trv_type	c_position	amino_acid_change	ucsc_cons	domain	all_domains	deletion_substructures	transcript_error
2 | CDK11A	0	-	37	1	1650797	1650797	+	Missense_Mutation	SNP	A	A	G			TCGA-04-1337-01A-01W-0484-10	TCGA-04-1337-11A-01W-0485-10	A	A					Unknown	Unknown	Somatic	Phase_IV	Capture		1	dbGAP	-	1	1650797	1650797	A	G	SNP	CDK11A	ENST00000404249	human	ensembl	69_37n	-1	known	missense	c.325	p.C109R	0.971	NULL	pfam_Prot_kinase_cat_dom,pfam_Ser-Thr/Tyr_kinase_cat_dom,superfamily_Kinase-like_dom,smart_Ser/Thr_dual-sp_kinase_dom,smart_Tyr_kinase_cat_dom,pfscan_Prot_kinase_cat_dom	-	no_errors
3 | GNPAT	0	-	37	1	231401797	231401797	+	Missense_Mutation	SNP	A	A	C			TCGA-04-1337-01A-01W-0484-10	TCGA-04-1337-11A-01W-0485-10	A	A					Unknown	Unknown	Somatic	Phase_IV	Capture		1	dbGAP	-	1	231401797	231401797	A	C	SNP	GNPAT	ENST00000366647	human	ensembl	69_37n	+1	known	missense	c.810	p.R270S	0.997	pfam_Acyltransferase,smart_Acyltransferase	pfam_Acyltransferase,smart_Acyltransferase	-	no_errors
4 | E2F2	0	-	37	1	23836447	23836447	+	Silent	SNP	C	C	A			TCGA-04-1337-01A-01W-0484-10	TCGA-04-1337-11A-01W-0485-10	C	C					Unknown	Unknown	Somatic	Phase_IV	Capture		1	dbGAP	-	1	23836447	23836447	C	A	SNP	E2F2	ENST00000361729	human	ensembl	69_37n	-1	known	silent	c.1239	p.L413	0.999	NULL	pfam_E2F_TDP	-	no_errors
5 | VSIG2	0	-	37	11	124617502	124617502	+	Missense_Mutation	SNP	C	C	G			TCGA-04-1337-01A-01W-0484-10	TCGA-04-1337-11A-01W-0485-10	C	C					Unknown	Unknown	Somatic	Phase_IV	Capture		1	dbGAP	-	11	124617502	124617502	C	G	SNP	VSIG2	ENST00000326621	human	ensembl	69_37n	-1	known	missense	c.913	p.G305R	0.813	NULL	pfam_Ig_V-set,pfam_Ig_I-set,pfam_Immunoglobulin,smart_Ig_sub,smart_Ig_sub2,smart_Ig_V-set_subgr,pfscan_Ig-like	-	no_errors
6 | 


--------------------------------------------------------------------------------
/tests/data/tcga_ov.head.xychr.maf:
--------------------------------------------------------------------------------
1 | Hugo_Symbol	Entrez_Gene_Id	Center	NCBI_Build	Chromosome	Start_position	End_position	Strand	Variant_Classification	Variant_Type	Reference_Allele	Tumor_Seq_Allele1	Tumor_Seq_Allele2	dbSNP_RS	dbSNP_Val_Status	Tumor_Sample_Barcode	Matched_Norm_Sample_Barcode	Match_Norm_Seq_Allele1	Match_Norm_Seq_Allele2	Tumor_Validation_Allele1	Tumor_Validation_Allele2	Match_Norm_Validation_Allele1	Match_Norm_Validation_Allele2	Verification_Status	Validation_Status	Mutation_Status	Sequencing_Phase	Sequence_Source	Validation_Method	Score	BAM_file	Sequencer	Tumor_Sample_UUID	Matched_Norm_Sample_UUID	chromosome_name	start	stop	reference	variant	type	gene_name	transcript_name	transcript_species	transcript_source	transcript_version	strand	transcript_status	trv_type	c_position	amino_acid_change	ucsc_cons	domain	all_domains	deletion_substructures	transcript_error
2 | CDK11A	0	-	37	X	1650797	1650797	+	Missense_Mutation	SNP	A	A	G			TCGA-04-1337-01A-01W-0484-10	TCGA-04-1337-11A-01W-0485-10	A	A					Unknown	Unknown	Somatic	Phase_IV	Capture		1	dbGAP	-	1	1650797	1650797	A	G	SNP	CDK11A	ENST00000404249	human	ensembl	69_37n	-1	known	missense	c.325	p.C109R	0.971	NULL	pfam_Prot_kinase_cat_dom,pfam_Ser-Thr/Tyr_kinase_cat_dom,superfamily_Kinase-like_dom,smart_Ser/Thr_dual-sp_kinase_dom,smart_Tyr_kinase_cat_dom,pfscan_Prot_kinase_cat_dom	-	no_errors
3 | GNPAT	0	-	37	Y	231401797	231401797	+	Missense_Mutation	SNP	A	A	C			TCGA-04-1337-01A-01W-0484-10	TCGA-04-1337-11A-01W-0485-10	A	A					Unknown	Unknown	Somatic	Phase_IV	Capture		1	dbGAP	-	1	231401797	231401797	A	C	SNP	GNPAT	ENST00000366647	human	ensembl	69_37n	+1	known	missense	c.810	p.R270S	0.997	pfam_Acyltransferase,smart_Acyltransferase	pfam_Acyltransferase,smart_Acyltransferase	-	no_errors
4 | E2F2	0	-	37	1	23836447	23836447	+	Silent	SNP	C	C	A			TCGA-04-1337-01A-01W-0484-10	TCGA-04-1337-11A-01W-0485-10	C	C					Unknown	Unknown	Somatic	Phase_IV	Capture		1	dbGAP	-	1	23836447	23836447	C	A	SNP	E2F2	ENST00000361729	human	ensembl	69_37n	-1	known	silent	c.1239	p.L413	0.999	NULL	pfam_E2F_TDP	-	no_errors
5 | VSIG2	0	-	37	11	124617502	124617502	+	Missense_Mutation	SNP	C	C	G			TCGA-04-1337-01A-01W-0484-10	TCGA-04-1337-11A-01W-0485-10	C	C					Unknown	Unknown	Somatic	Phase_IV	Capture		1	dbGAP	-	11	124617502	124617502	C	G	SNP	VSIG2	ENST00000326621	human	ensembl	69_37n	-1	known	missense	c.913	p.G305R	0.813	NULL	pfam_Ig_V-set,pfam_Ig_I-set,pfam_Immunoglobulin,smart_Ig_sub,smart_Ig_sub2,smart_Ig_V-set_subgr,pfscan_Ig-like	-	no_errors
6 | 


--------------------------------------------------------------------------------
/tests/test_cli_effects.py:
--------------------------------------------------------------------------------
 1 | # Licensed under the Apache License, Version 2.0 (the "License");
 2 | # you may not use this file except in compliance with the License.
 3 | # You may obtain a copy of the License at
 4 | #
 5 | #     http://www.apache.org/licenses/LICENSE-2.0
 6 | #
 7 | # Unless required by applicable law or agreed to in writing, software
 8 | # distributed under the License is distributed on an "AS IS" BASIS,
 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 | 
13 | from tempfile import NamedTemporaryFile
14 | import pandas as pd
15 | 
16 | from varcode.cli.effects_script import main as run_script
17 | from varcode import Variant
18 | 
19 | from .common import eq_
20 | def test_varcode_effects_script_kras_g12d_top_effect():
21 |     """
22 |     Run the effects script on a single KRAS variant (12:25398284 C>T) and
23 |     check that the one effect written to the output CSV is KRAS p.G12D
24 |     """
25 |     kras_g12d_variant = Variant(
26 |         12,
27 |         25398284,
28 |         "C",
29 |         "T",
30 |         "GRCh37")
31 |     commandline_args = ["--genome", "grch37", "--only-coding", "--one-per-variant"]
32 |     commandline_args.append("--variant")
33 |     commandline_args.append(str(kras_g12d_variant.contig))
34 |     commandline_args.append(str(kras_g12d_variant.start))
35 |     commandline_args.append(str(kras_g12d_variant.original_ref))
36 |     commandline_args.append(str(kras_g12d_variant.original_alt))
37 |     with NamedTemporaryFile(mode="r+", delete=True) as f:
38 |         commandline_args.extend(["--output-csv", f.name])
39 |         run_script(commandline_args)
40 |         f.flush()
41 |         df = pd.read_csv(f.name)
42 |     eq_(len(df), 1)
43 |     eq_(df.loc[0].gene_name, "KRAS")
44 |     eq_(df.iloc[0].effect, "p.G12D")
45 | 
46 | 


--------------------------------------------------------------------------------
/tests/test_cli_genes.py:
--------------------------------------------------------------------------------
 1 | # Licensed under the Apache License, Version 2.0 (the "License");
 2 | # you may not use this file except in compliance with the License.
 3 | # You may obtain a copy of the License at
 4 | #
 5 | #     http://www.apache.org/licenses/LICENSE-2.0
 6 | #
 7 | # Unless required by applicable law or agreed to in writing, software
 8 | # distributed under the License is distributed on an "AS IS" BASIS,
 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 | 
13 | from varcode.cli.genes_script import main as run_script
14 | from .data import ov_wustle_variants, db_snp_variants
15 | 
16 | from tempfile import NamedTemporaryFile
17 | import pandas as pd
18 | 
19 | 
20 | def test_varcode_effects_script():
21 |     """
22 |     Run the genes script on a collection which combines the ovarian cancer
23 |     test MAF with a small number of variants from dbSNP
24 |     """
25 |     commandline_args = ["--genome", "grch37"]
26 |     commandline_args.extend(["--maf", ov_wustle_variants.path])
27 |     for variant in db_snp_variants:
28 |         commandline_args.append("--variant")
29 |         commandline_args.append(str(variant.contig))
30 |         commandline_args.append(str(variant.start))
31 |         commandline_args.append(str(variant.original_ref))
32 |         commandline_args.append(str(variant.original_alt))
33 |     with NamedTemporaryFile(mode="r+", delete=True) as f:
34 |         commandline_args.extend(["--output-csv", f.name])
35 |         run_script(commandline_args)
36 |         f.flush()
37 |         combined_variants = pd.read_csv(f.name)
38 |         assert len(combined_variants) == (len(ov_wustle_variants) + len(db_snp_variants))
39 | 


--------------------------------------------------------------------------------
/tests/test_collection_filtering.py:
--------------------------------------------------------------------------------
 1 | # Licensed under the Apache License, Version 2.0 (the "License");
 2 | # you may not use this file except in compliance with the License.
 3 | # You may obtain a copy of the License at
 4 | #
 5 | #     http://www.apache.org/licenses/LICENSE-2.0
 6 | #
 7 | # Unless required by applicable law or agreed to in writing, software
 8 | # distributed under the License is distributed on an "AS IS" BASIS,
 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 | 
13 | 
14 | from varcode import VariantCollection
15 | from .common import eq_
16 | from .data import (
17 |     snp_rs4244285,
18 |     snp_rs1537415
19 | )
20 | 
21 | variants = VariantCollection([
22 |     # gene ids: ['ENSG00000165841', 'ENSG00000276490']
23 |     # transcript_ids : ['ENST00000371321', 'ENST00000464755']
24 |     snp_rs4244285,
25 |     # gene ids: ['ENSG00000204007']
26 |     # transcript ids:  ['ENST00000371763', 'ENST00000613244']
27 |     snp_rs1537415,
28 | ])
29 | 
30 | gene_fpkm_dict = {
31 |     "ENSG00000165841": 10.0,
32 |     "ENSG00000204007": 20.0,
33 |     "ENSG00000276490": 30.0,
34 | }
35 | 
36 | transcript_fpkm_dict = {
37 |     "ENST00000371321": 10.0,
38 |     "ENST00000464755": 20.0,
39 |     "ENST00000371763": 30.0,
40 |     "ENST00000613244": 40.0,
41 |     "ENST00000645461": 5.0,
42 | }
43 | 
44 | effects = variants.effects()
45 | 
46 | empty_variants = VariantCollection([])
47 | empty_effects = empty_variants.effects()
48 | 
49 | 
50 | def test_filter_variants():
51 |     eq_(variants.filter(lambda _: True), variants)
52 |     eq_(variants.filter(lambda _: False), empty_variants)
53 | 
54 | 
55 | def test_filter_effects():
56 |     eq_(effects.filter(lambda _: True), effects)
57 |     eq_(effects.filter(lambda _: False), empty_effects)
58 | 
59 | 
60 | def test_filter_variants_by_gene_expression():
61 |     eq_(variants.filter_by_gene_expression(
62 |         gene_fpkm_dict, 0.0), variants)
63 |     eq_(variants.filter_by_gene_expression(
64 |         gene_fpkm_dict, 100.0), empty_variants)
65 | 
66 | 
67 | def test_filter_effects_by_gene_expression():
68 |     eq_(effects.filter_by_gene_expression(
69 |         gene_fpkm_dict, 0.0), effects)
70 |     eq_(effects.filter_by_gene_expression(
71 |         gene_fpkm_dict, 100.0), empty_effects)
72 | 
73 | 
74 | def test_filter_variants_by_transcript_expression():
75 |     expect_all = variants.filter_by_gene_expression(
76 |         gene_fpkm_dict, 0.0)
77 |     eq_(expect_all, variants)
78 |     expect_none = variants.filter_by_gene_expression(
79 |         gene_fpkm_dict, 100.0)
80 |     eq_(expect_none, empty_variants)
81 | 
82 | 
83 | def test_filter_effects_by_transcript_expression():
84 | 
85 |     expect_all = effects.filter_by_transcript_expression(
86 |         transcript_fpkm_dict, 0.0)
87 |     eq_(expect_all, effects)
88 |     expect_none = effects.filter_by_transcript_expression(
89 |         transcript_fpkm_dict, 100.0)
90 |     eq_(expect_none, empty_effects)
91 | 
92 | 
93 | def test_filter_silent_effects():
94 |     # all dbSNP entries in the collection are silent
95 |     assert len(effects.drop_silent_and_noncoding()) == 0
96 | 


--------------------------------------------------------------------------------
/tests/test_common.py:
--------------------------------------------------------------------------------
 1 | # Licensed under the Apache License, Version 2.0 (the "License");
 2 | # you may not use this file except in compliance with the License.
 3 | # You may obtain a copy of the License at
 4 | #
 5 | #         http://www.apache.org/licenses/LICENSE-2.0
 6 | #
 7 | # Unless required by applicable law or agreed to in writing, software
 8 | # distributed under the License is distributed on an "AS IS" BASIS,
 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 | 
13 | import varcode
14 | from .common import eq_
15 | 
16 | def test_memoize():
17 |     class State(object):
18 |         def __init__(self):
19 |             self.x = 0
20 | 
21 |         def incr(self):
22 |             self.x += 1
23 | 
24 |     state1 = State()
25 |     # call incr twice and expect state to increment twice
26 |     state1.incr()
27 |     state1.incr()
28 |     eq_(state1.x, 2)
29 | 
30 |     state2 = State()
31 |     memoized = varcode.common.memoize(state2.incr)
32 |     # call twice but should only increase once
33 |     memoized()
34 |     memoized()
35 |     eq_(state2.x, 1)
36 | 
37 | def test_groupby_field():
38 |     class Record(object):
39 |         def __init__(self, x, y):
40 |             self.x = x
41 |             self.y = y
42 | 
43 |         def __eq__(self, other):
44 |             return self.x == other.x and self.y == other.y
45 | 
46 |         def __str__(self):
47 |             return "Record(%s, %s)" % (self.x, self.y)
48 | 
49 |         def __repr__(self):
50 |             return str(self)
51 | 
52 |     r1_2 = Record(1, 2)
53 |     r10_20 = Record(10, 20)
54 |     r1_3 = Record(1, 3)
55 |     data = [r1_2, r10_20, r1_3]
56 |     grouped_dict = varcode.common.groupby_field(data, 'x')
57 |     eq_(tuple(sorted(grouped_dict.keys())), (1, 10))
58 |     eq_(grouped_dict[1], [r1_2, r1_3])
59 |     eq_(grouped_dict[10], [r10_20])
60 | 


--------------------------------------------------------------------------------
/tests/test_cosmic_mutations.py:
--------------------------------------------------------------------------------
  1 | # Licensed under the Apache License, Version 2.0 (the "License");
  2 | # you may not use this file except in compliance with the License.
  3 | # You may obtain a copy of the License at
  4 | #
  5 | #     http://www.apache.org/licenses/LICENSE-2.0
  6 | #
  7 | # Unless required by applicable law or agreed to in writing, software
  8 | # distributed under the License is distributed on an "AS IS" BASIS,
  9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 10 | # See the License for the specific language governing permissions and
 11 | # limitations under the License.
 12 | 
 13 | from pyensembl import ensembl_grch37 as ensembl
 14 | from varcode import Variant
 15 | from varcode.effects import (
 16 |     Substitution,
 17 |     Deletion,
 18 |     Insertion,
 19 |     FrameShift,
 20 |     Silent,
 21 |     ExonicSpliceSite,
 22 | )
 23 | 
 24 | def _get_effect(chrom, pos, dna_ref, dna_alt, transcript_id):
 25 |     variant = Variant(chrom, pos, dna_ref, dna_alt, ensembl=ensembl)
 26 |     effects = variant.effects()
 27 |     transcript_dict = effects.top_priority_effect_per_transcript_id()
 28 |     assert transcript_id in transcript_dict, \
 29 |         "Expected transcript ID %s for variant %s not found in %s" % (
 30 |             transcript_id, variant, transcript_dict)
 31 |     effect = transcript_dict[transcript_id]
 32 | 
 33 |     # COSMIC seems to ignore exonic splice sites
 34 |     if isinstance(effect, ExonicSpliceSite):
 35 |         return effect.alternate_effect
 36 |     else:
 37 |         return effect
 38 | 
 39 | def _substitution(chrom, pos, dna_ref, dna_alt, transcript_id, aa_ref, aa_alt):
 40 |     effect = _get_effect(chrom, pos, dna_ref, dna_alt, transcript_id)
 41 |     assert isinstance(effect, Substitution), \
 42 |         "Expected effect to be substitution, got %s" % (effect,)
 43 | 
 44 |     assert effect.aa_ref == aa_ref, \
 45 |         "Expected aa_ref='%s' : %s but got %s : %s from %s" % (
 46 |             aa_ref, type(aa_ref),
 47 |             effect.aa_ref, type(effect.aa_ref),
 48 |             effect)
 49 |     assert effect.aa_alt == aa_alt, \
 50 |         "Expected aa_alt='%s' but got %s" % (
 51 |             aa_alt, effect)
 52 | 
 53 | def _silent(chrom, pos, dna_ref, dna_alt, transcript_id, aa_ref):
 54 |     effect = _get_effect(chrom, pos, dna_ref, dna_alt, transcript_id)
 55 |     assert isinstance(effect, Silent), \
 56 |         "Expected effect to be silent, got %s" % (effect,)
 57 |     assert effect.aa_ref == aa_ref, "Expected aa_ref='%s', got '%s'" % (
 58 |         aa_ref, effect.aa_ref)
 59 | 
 60 | def _deletion(chrom, pos, dna_ref, dna_alt, transcript_id, deleted):
 61 |     effect = _get_effect(chrom, pos, dna_ref, dna_alt, transcript_id)
 62 |     assert isinstance(effect, Deletion), \
 63 |         "Expected deletion, got %s" % (effect,)
 64 |     assert effect.aa_ref == deleted, \
 65 |         "Expected deletion of '%s' but got deletion of '%s' for %s:%d%s>%s" % (
 66 |             deleted, effect.aa_ref, chrom, pos, dna_ref, dna_alt)
 67 | 
 68 | def _insertion(chrom, pos, dna_ref, dna_alt, transcript_id, inserted):
 69 |     effect = _get_effect(chrom, pos, dna_ref, dna_alt, transcript_id)
 70 |     assert isinstance(effect, Insertion), \
 71 |         "Expected insertion, got %s" % (effect,)
 72 |     assert effect.aa_alt == inserted, \
 73 |         "Expected insertion of '%s' but got %s for %s:%d%s>%s" % (
 74 |             inserted,
 75 |             effect.short_description(),
 76 |             chrom,
 77 |             pos,
 78 |             dna_ref,
 79 |             dna_alt)
 80 | 
 81 | def _frameshift(
 82 |         chrom,
 83 |         pos,
 84 |         dna_ref,
 85 |         dna_alt,
 86 |         transcript_id,
 87 |         aa_pos,
 88 |         aa_ref):
 89 |     effect = _get_effect(chrom, pos, dna_ref, dna_alt, transcript_id)
 90 |     assert isinstance(effect, FrameShift), \
 91 |         "Expected frameshift, got %s" % (effect,)
 92 |     effect_aa_pos = effect.aa_mutation_start_offset
 93 |     assert effect.aa_ref[0] == aa_ref and effect_aa_pos + 1 == aa_pos, \
 94 |         ("Expected frameshift to replace p.%d%s but instead got %s" % (
 95 |             aa_pos, aa_ref, effect))
 96 | 
 97 | def test_COSM3939556_silent():
 98 |     # 22  19222059    COSM3939556 G>T
 99 |     # GENE=CLTCL1_ENST00000427926
100 |     # STRAND=-
101 |     # CDS=c.1140C>A
102 |     # AA=p.A380A
103 |     _silent("22", 19222059, "G", "T", "ENST00000427926", "A")
104 | 
105 | def test_COSM3747785_NBPF10_Q363L():
106 |     # 1   145311839   COSM3747785 A>T
107 |     # GENE=NBPF10_ENST00000369338
108 |     # STRAND=+
109 |     # CDS=c.1088A>T
110 |     # AA=p.Q363L
111 |     _substitution("1", 145311839, "A", "T", "ENST00000369338", "Q", "L")
112 | 
113 | def test_COSM3368867_SMUG1_Q133L():
114 |     # 12  54576295    COSM3368867 T>A
115 |     # GENE=SMUG1_ENST00000513838
116 |     # STRAND=-
117 |     # CDS=c.398A>T
118 |     # AA=p.Q133L
119 |     _substitution("12", 54576295, "T", "A", "ENST00000513838", "Q", "L")
120 | 
121 | def test_COSM3508871_FBRS_K224N():
122 |     # 16  30676364    COSM3508871 A>T
123 |     # GENE=FBRS_ENST00000356166
124 |     # STRAND=+
125 |     # CDS=c.1572A>T
126 |     # AA=p.K524N
127 |     _substitution("16", 30676364, "A", "T", "ENST00000356166", "K", "N")
128 | 
129 | def test_COSM1616161_L1724R():
130 |     # 21  46932218    COSM1616161 T>G
131 |     # GENE=COL18A1_ENST00000359759
132 |     # STRAND=+
133 |     # CDS=c.5171T>G
134 |     # AA=p.L1724R
135 |     _substitution("21", 46932218, "T", "G", "ENST00000359759", "L", "R")
136 | 
137 | def test_COSM1651074_IL9R_D148Y():
138 |     # X   155234091   COSM1651074 TGG>TCT
139 |     # GENE=IL9R_ENST00000244174
140 |     # STRAND=+
141 |     # CDS=c.441_442GG>CT
142 |     # AA=p.D148Y
143 |     _substitution("X", 155234091, "TGG", "TCT", "ENST00000244174", "D", "Y")
144 | 
145 | def test_COSM3682816_RBMY1D_V193A():
146 |     # Y   24030663    COSM3682816 A>G
147 |     # GENE=RBMY1D_ENST00000382680
148 |     # STRAND=-
149 |     # CDS=c.578T>C
150 |     # AA=p.V193A
151 |     _substitution("Y", 24030663, "A", "G", "ENST00000382680", "V", "A")
152 | 
153 | def test_COSM1333672_BCL9_Q1150delQ():
154 |     """
155 |     test_COSM1333672_BCL9_Q1150delQ : in-frame deletion of 3 nucleotides
156 |     """
157 |     # 1   147095923   COSM1333672 ACAG> A
158 |     # GENE=BCL9_ENST00000234739
159 |     # STRAND=+
160 |     # CDS=c.3445_3447delCAG
161 |     # AA=p.Q1150delQ
162 |     _deletion("1", 147095923, "ACAG", "A", "ENST00000234739", "Q")
163 | 
164 | def test_COSM1190996_FBX011_P57insQQQ():
165 |     """
166 |     test_COSM1190996_FBX011_P57insQQQ : in-frame insertion of 9 nucleotides
167 |     """
168 |     # 2   48132713    COSM1190996 C>CTGCTGCTGC
169 |     # GENE=FBXO11_ENST00000403359
170 |     # STRAND=-
171 |     # CDS=c.146_147insGCAGCAGCA
172 |     # AA=p.Q56_P57insQQQ;CNT=1
173 |     _insertion("2", 48132713, "C", "CTGCTGCTGC", "ENST00000403359", "QQQ")
174 | 
175 | def test_COSM1732848_CCDC109B_F264fs():
176 |     """
177 |     test_COSM1732848_CCDC109B_F264fs : frame shift from nucleotide deletion
178 |     """
179 |     # 4   110605772   COSM1732848 CT>C
180 |     # GENE=CCDC109B_ENST00000394650
181 |     # STRAND=+
182 |     # CDS=c.787delT
183 |     # AA=p.F264fs*5
184 |     _frameshift(
185 |         "4", 110605772, "CT", "C", "ENST00000394650",
186 |         aa_pos=264,
187 |         aa_ref="F")
188 | 
189 | def test_COSM87531_SYNE1_E4738fs():
190 |     """
191 |     test_COSM87531_SYNE1_E4738fs : frame shift from nucleotide insertion
192 |     """
193 |     # The given genomic mutation is:
194 |     #    6   152651608   COSM87531   C>CA
195 |     # but through some painful manual checking I realized that
196 |     # the nucleotides here are *not* the correct ones for the
197 |     # forward strand (SYNE1 is on the negative strand) and instead
198 |     # it should be:
199 |     #    6   152651608   COSM87531   C>CT
200 |     # GENE=SYNE1_ENST00000265368
201 |     # STRAND=-
202 |     # CDS=c.14211_14212insA
203 |     # AA=p.E4738fs*34
204 |     _frameshift(
205 |         "6", 152651608, "C", "GT", "ENST00000265368",
206 |         aa_pos=4738,
207 |         aa_ref="E")
208 | 
209 | def test_COSM27279_CTNNB1_Q4H():
210 |     """
211 |     test_COSM27279_CTNNB1_Q4H : Apply Cosmic mutation COSM27279
212 |     transcript = 'ENST00000405570'
213 |     pos: 41265571,
214 |     ref : A, alt : T
215 |     amino acids = Q -> H  @ pos 4 (mutation = Q4H)
216 |     """
217 |     _substitution("3", 41265571, "A", "T", "ENST00000405570", "Q", "H")
218 | 


--------------------------------------------------------------------------------
/tests/test_dbnsfp_validation.py:
--------------------------------------------------------------------------------
  1 | # Licensed under the Apache License, Version 2.0 (the "License");
  2 | # you may not use this file except in compliance with the License.
  3 | # You may obtain a copy of the License at
  4 | #
  5 | #         http://www.apache.org/licenses/LICENSE-2.0
  6 | #
  7 | # Unless required by applicable law or agreed to in writing, software
  8 | # distributed under the License is distributed on an "AS IS" BASIS,
  9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 10 | # See the License for the specific language governing permissions and
 11 | # limitations under the License.
 12 | 
 13 | import pytest
 14 | from pyensembl import ensembl_grch37
 15 | from varcode import Variant
 16 | from varcode.effects import (
 17 |     ExonicSpliceSite,
 18 |     Substitution,
 19 |     TranscriptMutationEffect
 20 | )
 21 | import pandas as pd
 22 | 
 23 | from .data import data_path
 24 | 
 25 | 
 26 | def generate_dbnsfp_validation_set():
 27 |     # check that amino acid substitution gives
 28 |     # same answer as subset of dbNSFP entries (using Ensembl 75)
 29 | 
 30 |     # columns for validation dataset:
 31 |     # - aa_pos : base-1 position within protein
 32 |     # - aa_alt : mutant amino acid expected at aa_pos
 33 |     # - dna_alt : non-reference DNA nucleotide
 34 |     # - chrom : chromosome
 35 |     # - ensembl_transcript : transcript ID
 36 |     # - dna_position : base-1 position within chromosome
 37 |     # - dna_ref : reference DNA nucleotide
 38 |     # pylint: disable=no-member
 39 |     # pylint gets confused by read_csv
 40 |     validation_set = pd.read_csv(data_path('dbnsfp_validation_set.csv'))
 41 |     for _, row in validation_set.iterrows():
 42 |         args = (
 43 |             row['ensembl_transcript'],
 44 |             row['chrom'],
 45 |             row['dna_position'],
 46 |             row['dna_ref'],
 47 |             row['dna_alt'],
 48 |             row['aa_pos'],
 49 |             row['aa_alt']
 50 |         )
 51 |         # yielding one tuple per row so every row shows up as its
 52 |         # own parametrized test case in pytest
 53 |         yield args
 54 | 
 55 | 
 56 | 
 57 | @pytest.mark.parametrize([
 58 |         'ensembl_transcript_id', 
 59 |         'chrom',
 60 |         'dna_position',
 61 |         'dna_ref',
 62 |         'dna_alt',
 63 |         'aa_pos',
 64 |         'aa_alt'], generate_dbnsfp_validation_set())
 65 | def test_dbnsfp_validation_set_transcript_mutation(
 66 |         ensembl_transcript_id,
 67 |         chrom,
 68 |         dna_position,
 69 |         dna_ref,
 70 |         dna_alt,
 71 |         aa_pos,
 72 |         aa_alt):
 73 |     variant = Variant(chrom, dna_position, dna_ref, dna_alt, ensembl_grch37)
 74 |     effects = variant.effects()
 75 |     transcript_id_dict = {
 76 |         effect.transcript.id: effect
 77 |         for effect in effects
 78 |         if isinstance(effect, TranscriptMutationEffect)
 79 |     }
 80 |     assert ensembl_transcript_id in transcript_id_dict, \
 81 |         "%s not found in %s" % (ensembl_transcript_id, transcript_id_dict)
 82 |     effect = transcript_id_dict[ensembl_transcript_id]
 83 | 
 84 |     if isinstance(effect, ExonicSpliceSite):
 85 |         # exonic splice site mutations carry with them an alternate effect
 86 |         # which is what we check against dbNSFP (since that database seemed
 87 |         # to ignore exonic splicing mutations)
 88 |         effect = effect.alternate_effect
 89 | 
 90 |     assert isinstance(effect, Substitution), \
 91 |         "Expected substitution (aa_pos=%d, aa_alt=%s) but got %s" % (
 92 |             aa_pos, aa_alt, effect)
 93 |     effect_aa_pos = effect.aa_mutation_start_offset
 94 |     effect_aa_alt = effect.mutant_protein_sequence[effect_aa_pos]
 95 |     assert (
 96 |         effect_aa_pos + 1 == aa_pos and
 97 |         effect_aa_alt == aa_alt), \
 98 |         "Mutant amino acid %s not found at %d for chr%s:%s %s>%s : %s" % (
 99 |             aa_alt,
100 |             aa_pos,
101 |             chrom,
102 |             dna_position,
103 |             dna_ref,
104 |             dna_alt,
105 |             effect)
106 | 


--------------------------------------------------------------------------------
/tests/test_effect_collection.py:
--------------------------------------------------------------------------------
 1 | # Licensed under the Apache License, Version 2.0 (the "License");
 2 | # you may not use this file except in compliance with the License.
 3 | # You may obtain a copy of the License at
 4 | #
 5 | #     http://www.apache.org/licenses/LICENSE-2.0
 6 | #
 7 | # Unless required by applicable law or agreed to in writing, software
 8 | # distributed under the License is distributed on an "AS IS" BASIS,
 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 | 
13 | """
14 | Test properties of EffectCollection
15 | """
16 | 
17 | 
18 | from varcode.effects import IncompleteTranscript, Substitution
19 | from .common import eq_
20 | from .data import tcga_ov_variants, ov_wustle_variants
21 | tcga_ov_effects = tcga_ov_variants.effects()
22 | ov_wustle_effects = ov_wustle_variants.effects()
23 | 
24 | def test_to_dataframe():
25 |     df = tcga_ov_effects.to_dataframe()
26 |     eq_(len(tcga_ov_effects), len(df))
27 | 
28 | def test_effect_collection_gene_counts():
29 |     # every gene in this collection should have more than one effect
30 |     for gene, count in ov_wustle_effects.gene_counts().items():
31 |         assert count > 1, \
32 |             "Expected more than 1 effect for %s (got %d)" % (gene, count)
33 | 
34 | def test_effect_collection_groupby_gene():
35 |     genes = ov_wustle_effects.groupby_gene().keys()
36 |     # make sure that the IDs attached to Gene objects are the same as IDs
37 |     # of groupby_gene_id
38 |     gene_ids = set(ov_wustle_effects.groupby_gene_id().keys())
39 |     eq_({gene.id for gene in genes}, gene_ids)
40 | 
41 | def test_effect_collection_groupby_gene_id():
42 |     gene_ids = set(ov_wustle_effects.groupby_gene_id().keys())
43 |     eq_(gene_ids, {
44 |         'ENSG00000060718',
45 |         'ENSG00000156876',
46 |         'ENSG00000130939',
47 |         'ENSG00000122477',
48 |         'ENSG00000162688'
49 |     })
50 | 
51 | def test_effect_collection_groupby_gene_name():
52 |     gene_names = set(ov_wustle_effects.groupby_gene_name().keys())
53 |     eq_(gene_names, {"AGL", "SASS6", "LRRC39", "UBE4B", "COL11A1"})
54 | 
55 | def test_effect_collection_groupby_variant():
56 |     variants = set(ov_wustle_effects.groupby_variant().keys())
57 |     # make sure that all the original variants are still present
58 |     # in the group keys
59 |     eq_(variants, set(ov_wustle_variants))
60 | 
61 | def test_effect_collection_filter_by_effect_priority():
62 |     # every effect should be at least the same priority as "incomplete"
63 |     eq_(
64 |         tcga_ov_effects,
65 |         tcga_ov_effects.filter_by_effect_priority(IncompleteTranscript))
66 |     assert len(tcga_ov_effects) > len(
67 |         tcga_ov_effects.filter_by_effect_priority(Substitution))
68 | 
69 | def test_effect_collection_drop_silent_and_noncoding():
70 |     # some of the predicted effects are non-coding so should get dropped
71 |     assert len(tcga_ov_effects) > len(tcga_ov_effects.drop_silent_and_noncoding())
72 | 


--------------------------------------------------------------------------------
/tests/test_effect_collection_serialization.py:
--------------------------------------------------------------------------------
 1 | 
 2 | # Licensed under the Apache License, Version 2.0 (the "License");
 3 | # you may not use this file except in compliance with the License.
 4 | # You may obtain a copy of the License at
 5 | #
 6 | #     http://www.apache.org/licenses/LICENSE-2.0
 7 | #
 8 | # Unless required by applicable law or agreed to in writing, software
 9 | # distributed under the License is distributed on an "AS IS" BASIS,
10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | # See the License for the specific language governing permissions and
12 | # limitations under the License.
13 | 
14 | import pickle
15 | 
16 | from varcode import EffectCollection
17 | 
18 | from .common import eq_
19 | from .data import tcga_ov_variants, ov_wustle_variants
20 | 
21 | tcga_ov_effects = tcga_ov_variants.effects()
22 | ov_wustle_effects = ov_wustle_variants.effects()
23 | 
24 | def test_tcga_effect_collection_to_dict():
25 |     eq_(
26 |         tcga_ov_effects,
27 |         EffectCollection.from_dict(tcga_ov_effects.to_dict()))
28 | 
29 | def test_wustle_effect_collection_to_dict():
30 |     eq_(
31 |         ov_wustle_effects,
32 |         EffectCollection.from_dict(ov_wustle_effects.to_dict()))
33 | 
34 | def test_tcga_effect_collection_to_json():
35 |     eq_(tcga_ov_effects, EffectCollection.from_json(tcga_ov_effects.to_json()))
36 | 
37 | def test_wustle_effect_collection_to_json():
38 |     eq_(
39 |         ov_wustle_effects,
40 |         EffectCollection.from_json(ov_wustle_effects.to_json()))
41 | 
42 | def test_tcga_effect_collection_pickling():
43 |     reconstructed = pickle.loads(pickle.dumps(tcga_ov_effects))
44 |     eq_(tcga_ov_effects, reconstructed)
45 | 
46 | def test_wustle_effect_collection_pickling():
47 |     reconstructed = pickle.loads(pickle.dumps(ov_wustle_effects))
48 |     eq_(ov_wustle_effects, reconstructed)


--------------------------------------------------------------------------------
/tests/test_effects_from_mutagenix_variants.py:
--------------------------------------------------------------------------------
 1 | # Licensed under the Apache License, Version 2.0 (the "License");
 2 | # you may not use this file except in compliance with the License.
 3 | # You may obtain a copy of the License at
 4 | #
 5 | #     http://www.apache.org/licenses/LICENSE-2.0
 6 | #
 7 | # Unless required by applicable law or agreed to in writing, software
 8 | # distributed under the License is distributed on an "AS IS" BASIS,
 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 | 
13 | """
14 | List of variants copied from:
15 |     https://mutagenetix.utsouthwestern.edu
16 |     /incidental/incidental_rec.cfm?
17 |     mid=&so=rb&ac=1&r0=0&nr=100&rn=29&rl=1&scd=IGL01779&mid=153891
18 | """
19 | 
20 | from varcode import Variant
21 | from varcode.effects import Substitution
22 | 
23 | from .common import expect_effect
24 | 
25 | def test_substitution_Akt1_chr12_112657169_C_T_G286R():
26 |     expect_effect(
27 |         variant=Variant("chr12", 112657169, "C", "T", "mm10"),
28 |         effect_class=Substitution,
29 |         aa_mutation_start_offset=285,
30 |         aa_ref="G",
31 |         aa_alt="R")
32 | 
33 | def test_substitution_Apof_chr10_128269477_A_G_I167V():
34 |     expect_effect(
35 |         variant=Variant("chr10", 128269477, "A", "G", "mm10"),
36 |         effect_class=Substitution,
37 |         aa_mutation_start_offset=166,
38 |         aa_ref="I",
39 |         aa_alt="V")
40 | 
41 | def test_substitution_Csmd3_chr15_47857894_A_T_V1551D():
42 |     expect_effect(
43 |         variant=Variant("chr15", 47857894, "A", "T", "mm10"),
44 |         effect_class=Substitution,
45 |         aa_mutation_start_offset=1550,
46 |         aa_ref="V",
47 |         aa_alt="D")
48 | 
49 | def test_substitution_Pprc1_chr19_46062202_T_A_I130N():
50 |     expect_effect(
51 |         variant=Variant("chr19", 46062202, "T", "A", "mm10"),
52 |         effect_class=Substitution,
53 |         aa_mutation_start_offset=129,
54 |         aa_ref="I",
55 |         aa_alt="N")
56 | 
57 | def test_substitution_Vipr1_chr9_121664630_T_C_F249S():
58 |     expect_effect(
59 |         variant=Variant("chr9", 121664630, "T", "C", "mm10"),
60 |         effect_class=Substitution,
61 |         aa_mutation_start_offset=248,
62 |         aa_ref="F",
63 |         aa_alt="S")
64 | 


--------------------------------------------------------------------------------
/tests/test_exonic_splice_site.py:
--------------------------------------------------------------------------------
 1 | # Licensed under the Apache License, Version 2.0 (the "License");
 2 | # you may not use this file except in compliance with the License.
 3 | # You may obtain a copy of the License at
 4 | #
 5 | #         http://www.apache.org/licenses/LICENSE-2.0
 6 | #
 7 | # Unless required by applicable law or agreed to in writing, software
 8 | # distributed under the License is distributed on an "AS IS" BASIS,
 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 | 
13 | from varcode import Variant
14 | from varcode.effects import ExonicSpliceSite, PrematureStop
15 | 
16 | 
17 | def test_STAT1_stop_gain_at_exon_boundary():
18 |     # top priority effect for this variant should be PrematureStop,
19 |     # even though it's also ExonicSpliceSite
20 |     stat1_variant = Variant("2", "191872291", "G", "A", "GRCh37")
21 |     effects = stat1_variant.effects()
22 |     print(effects)
23 |     assert any([e.__class__ is ExonicSpliceSite for e in effects])
24 |     top_effect = effects.top_priority_effect()
25 |     print(top_effect)
26 |     assert top_effect.__class__ is PrematureStop
27 | 


--------------------------------------------------------------------------------
/tests/test_frameshift_helpers.py:
--------------------------------------------------------------------------------
 1 | # Licensed under the Apache License, Version 2.0 (the "License");
 2 | # you may not use this file except in compliance with the License.
 3 | # You may obtain a copy of the License at
 4 | #
 5 | #     http://www.apache.org/licenses/LICENSE-2.0
 6 | #
 7 | # Unless required by applicable law or agreed to in writing, software
 8 | # distributed under the License is distributed on an "AS IS" BASIS,
 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 | 
13 | from varcode.effects.effect_prediction_coding_frameshift import (
14 |     cdna_codon_sequence_after_insertion_frameshift,
15 |     cdna_codon_sequence_after_deletion_or_substitution_frameshift,
16 | )
17 | 
18 | from .common import eq_
19 | 
def test_cdna_codon_sequence_after_insertion_frameshift_before_codon():
    """Insertion of "T" ahead of the first codon: T_ATGCCCTAG."""
    call_args = {
        "sequence_from_start_codon": "ATGCCCTAG",
        "cds_offset_before_insertion": -1,
        "inserted_nucleotides": "T",
    }
    codon_index, shifted_cdna = \
        cdna_codon_sequence_after_insertion_frameshift(**call_args)
    eq_(codon_index, 0)
    eq_(shifted_cdna, "TATGCCCTAG")
28 | 
def test_cdna_codon_sequence_after_insertion_frameshift_in_middle_of_codon():
    """Insertion of "T" after the first base of the first codon: A_T_TGCCCTAG."""
    call_args = {
        "sequence_from_start_codon": "ATGCCCTAG",
        "cds_offset_before_insertion": 0,
        "inserted_nucleotides": "T",
    }
    codon_index, shifted_cdna = \
        cdna_codon_sequence_after_insertion_frameshift(**call_args)
    eq_(codon_index, 0)
    eq_(shifted_cdna, "ATTGCCCTAG")
37 | 
def test_cdna_codon_sequence_after_insertion_frameshift_at_end_of_codon():
    """Insertion of "T" after the second base of the first codon: AT_T_GCCCTAG."""
    call_args = {
        "sequence_from_start_codon": "ATGCCCTAG",
        "cds_offset_before_insertion": 1,
        "inserted_nucleotides": "T",
    }
    codon_index, shifted_cdna = \
        cdna_codon_sequence_after_insertion_frameshift(**call_args)
    eq_(codon_index, 0)
    eq_(shifted_cdna, "ATTGCCCTAG")
46 | 
def test_cdna_codon_sequence_after_insertion_frameshift_after_codon():
    """Insertion of "T" right after the first codon: ATG_T_CCCTAG."""
    call_args = {
        "sequence_from_start_codon": "ATGCCCTAG",
        "cds_offset_before_insertion": 2,
        "inserted_nucleotides": "T",
    }
    codon_index, shifted_cdna = \
        cdna_codon_sequence_after_insertion_frameshift(**call_args)
    # the first codon is untouched, so the returned index is 1 and the
    # returned sequence no longer includes ATG
    eq_(codon_index, 1)
    eq_(shifted_cdna, "TCCCTAG")
55 | 
def test_cdna_codon_sequence_after_deletion_or_substitution_frameshift_delA():
    """Deleting the leading "A" of ATGCCCTAG leaves TGCCCTAG, starting at codon 0."""
    call_args = {
        "sequence_from_start_codon": "ATGCCCTAG",
        "cds_offset": 0,
        "trimmed_cdna_ref": "A",
        "trimmed_cdna_alt": "",
    }
    codon_index, shifted_cdna = \
        cdna_codon_sequence_after_deletion_or_substitution_frameshift(
            **call_args)
    eq_(codon_index, 0)
    eq_(shifted_cdna, "TGCCCTAG")
64 | 
65 | 
def test_cdna_codon_sequence_after_deletion_or_substitution_frameshift_AT_to_C():
    """Replacing the leading "AT" of ATGCCCTAG with "C" gives CGCCCTAG at codon 0."""
    call_args = {
        "sequence_from_start_codon": "ATGCCCTAG",
        "cds_offset": 0,
        "trimmed_cdna_ref": "AT",
        "trimmed_cdna_alt": "C",
    }
    codon_index, shifted_cdna = \
        cdna_codon_sequence_after_deletion_or_substitution_frameshift(
            **call_args)
    eq_(codon_index, 0)
    eq_(shifted_cdna, "CGCCCTAG")
74 | 


--------------------------------------------------------------------------------
/tests/test_maf.py:
--------------------------------------------------------------------------------
  1 | # Licensed under the Apache License, Version 2.0 (the "License");
  2 | # you may not use this file except in compliance with the License.
  3 | # You may obtain a copy of the License at
  4 | #
  5 | #     http://www.apache.org/licenses/LICENSE-2.0
  6 | #
  7 | # Unless required by applicable law or agreed to in writing, software
  8 | # distributed under the License is distributed on an "AS IS" BASIS,
  9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 10 | # See the License for the specific language governing permissions and
 11 | # limitations under the License.
 12 | 
 13 | import pytest 
 14 | 
 15 | from pyensembl import ensembl_grch37 as ensembl
 16 | 
 17 | from varcode import Variant, load_maf, load_maf_dataframe
 18 | 
 19 | import pandas as pd
 20 | 
 21 | from .common import eq_
 22 | from .data import tcga_ov_variants, ov_wustle_variants, data_path
 23 | 
 24 | def test_maf():
 25 |     expected_tcga_ov_variants = [
 26 |         Variant(1, 1650797, "A", "G", ensembl),
 27 |         Variant(1, 23836447, "C", "A", ensembl),
 28 |         Variant(1, 231401797, "A", "C", ensembl),
 29 |         Variant(11, 124617502, "C", "G", ensembl),
 30 |     ]
 31 |     eq_(len(tcga_ov_variants), len(expected_tcga_ov_variants))
 32 |     for v_expect, v_maf in zip(expected_tcga_ov_variants, tcga_ov_variants):
 33 |         eq_(v_expect, v_maf)
 34 |         gene_name = tcga_ov_variants.metadata[v_maf]['Hugo_Symbol']
 35 |         assert any(gene.name == gene_name for gene in v_maf.genes), \
 36 |             "Expected gene name %s but got %s" % (gene_name, v_maf.genes)
 37 | 
 38 | 
 39 | def generate_maf_aa_changes():
 40 |     # Parse a MAF file and make sure we're annotating the protein amino acid
 41 |     # changes in the same way.
 42 |     #
 43 |     # The data file used also contains spaces, which is good to test the parser
 44 |     # on.
 45 |     assert len(ov_wustle_variants) == 5
 46 | 
 47 |     expected_changes = {}
 48 |     # pylint: disable=no-member
 49 |     # pylint gets confused by read_csv
 50 |     maf_fields = pd.read_csv(
 51 |         ov_wustle_variants.path,
 52 |         sep="\t",
 53 |         comment="#")
 54 |     for _, row in maf_fields.iterrows():
 55 |         key = (str(row.Chromosome), row.Start_position)
 56 |         change = row.amino_acid_change
 57 |         # silent mutations just specificy which amino acid they affect via
 58 |         # e.g. "p.G384"
 59 |         if change[-1].isdigit():
 60 |             expected_changes[key] = "silent"
 61 |         else:
 62 |             expected_changes[key] = change
 63 | 
 64 |     for variant in ov_wustle_variants:
 65 |         key = (variant.contig, variant.start)
 66 |         expected = expected_changes[key]
 67 |         yield (variant, expected)
 68 | 
 69 | @pytest.mark.parametrize(['variant', 'expected_aa_change'], generate_maf_aa_changes())
 70 | def test_maf_aa_changes(variant, expected_aa_change):
 71 |     effect = variant.effects().top_priority_effect()
 72 |     change = effect.short_description
 73 |     eq_(
 74 |         change,
 75 |         expected_aa_change,
 76 |         "MAF file had annotation %s but Varcode gave %s" % (
 77 |             expected_aa_change, change))
 78 | 
 79 | 
 80 | def test_maf_number_entries_duplicates():
 81 |     # There are 3 duplicated mutations listed in the MAF
 82 |     path_to_maf_with_duplicates = data_path("duplicates.maf")
 83 |     variants = load_maf(path_to_maf_with_duplicates, distinct=True)
 84 |     assert len(variants) == 1
 85 |     variants = load_maf(path_to_maf_with_duplicates, distinct=False)
 86 |     assert len(variants) == 3
 87 | 
 88 | def test_load_maf():
 89 |     for raise_on_error in [True, False]:
 90 |         variants = load_maf(
 91 |             data_path("ov.wustle.subset5.maf"), raise_on_error=raise_on_error)
 92 |         eq_(len(variants), 5)
 93 | 
 94 | 
 95 | def test_load_maf_dataframe():
 96 |     for raise_on_error in [True, False]:
 97 |         variants_df = load_maf_dataframe(
 98 |             data_path("ov.wustle.subset5.maf"), raise_on_error=raise_on_error)
 99 |         eq_(len(variants_df), 5)
100 | 
101 | 
def test_xy_contigs():
    """
    Test MAFs with X and Y chromosomes rather than just numerical chromosomes.

    The file is loaded once per ``raise_on_error`` setting and must yield
    4 variants each time.
    """
    for raise_on_error in [True, False]:
        variants = load_maf(
            data_path("tcga_ov.head.xychr.maf"),
            # pass the loop value through: a hard-coded True would leave the
            # raise_on_error=False path untested
            raise_on_error=raise_on_error)
        eq_(len(variants), 4)
110 | 
111 | 
def test_load_utf8():
    """
    Test MAFs loaded with utf-8 encoding.

    The file is loaded once per ``raise_on_error`` setting; each load must
    yield 5 variants whose effects can be computed.
    """
    for raise_on_error in [True, False]:
        variants = load_maf(
            data_path("ov.wustle.subset5.maf"),
            # pass the loop value through: a hard-coded True would leave the
            # raise_on_error=False path untested
            raise_on_error=raise_on_error,
            encoding="utf-8")
        eq_(len(variants), 5)
        # Make sure we avoid "TypeError: character mapping must return integer, None or unicode"
        # from Bio.Seq.
        _ = variants.effects()
123 | 


--------------------------------------------------------------------------------
/tests/test_mm10_klf6_frameshift.py:
--------------------------------------------------------------------------------
 1 | # Licensed under the Apache License, Version 2.0 (the "License");
 2 | # you may not use this file except in compliance with the License.
 3 | # You may obtain a copy of the License at
 4 | #
 5 | #     http://www.apache.org/licenses/LICENSE-2.0
 6 | #
 7 | # Unless required by applicable law or agreed to in writing, software
 8 | # distributed under the License is distributed on an "AS IS" BASIS,
 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 | 
13 | from varcode import Variant
14 | from varcode.effects import FrameShift
15 | from varcode.effects.effect_prediction_coding_frameshift import (
16 |     predict_frameshift_coding_effect,
17 |     cdna_codon_sequence_after_insertion_frameshift,
18 | )
19 | 
20 | from .common import eq_
21 | 
22 | 
def validate_effect_values(effect):
    """
    Check that `effect` is the expected Klf6-201 FrameShift: exact class,
    transcript name, spliced offset of the variant position, and the
    shifted amino acid sequence.
    """
    eq_(effect.__class__, FrameShift)
    klf6_transcript = effect.transcript
    eq_(klf6_transcript.name, "Klf6-201")
    offset_in_transcript = klf6_transcript.spliced_offset(5864876)
    eq_(offset_in_transcript, 469)
    eq_(effect.shifted_sequence, "GEEGGIRTEDFF")
29 | 
30 | 
def test_mm10_Klf6_frameshift():
    """Inserting "G" at chr13:5864876 (mm10) gives exactly one coding, non-silent effect."""
    insertion = Variant("chr13", 5864876, "", "G", "mm10")
    coding_effects = insertion.effects().drop_silent_and_noncoding()
    eq_(len(coding_effects), 1)
    only_effect = coding_effects[0]
    validate_effect_values(only_effect)
36 | 
37 | 
def test_mm10_Klf6_frameshift_coding_effect_fn():
    """Call predict_frameshift_coding_effect directly for the Klf6 insertion."""
    # first start codon offset is 157
    start_codon_offset = 157
    # mutation occurs after offset 469
    insertion_offset = 469

    variant = Variant("chr13", 5864876, "", "G", "mm10")
    coding_transcripts = [
        transcript for transcript in variant.transcripts
        if transcript.biotype == "protein_coding"
    ]
    eq_(len(coding_transcripts), 1)
    klf6 = coding_transcripts[0]
    eq_(klf6.name, "Klf6-201")

    predicted = predict_frameshift_coding_effect(
        trimmed_cdna_ref="",
        trimmed_cdna_alt="G",
        cds_offset=insertion_offset - start_codon_offset,
        sequence_from_start_codon=klf6.sequence[start_codon_offset:],
        variant=variant,
        transcript=klf6)
    validate_effect_values(predicted)
58 | 
59 | 
def test_mm10_Klf6_frameshift_cdna_codon_sequence():
    """Check the codon index and shifted cDNA returned for the Klf6 insertion."""
    # transcript offsets of the start codon and of the base preceding the
    # insertion (same constants as the other Klf6 tests in this module)
    start_codon_offset = 157
    insertion_offset = 469

    variant = Variant("chr13", 5864876, "", "G", "mm10")
    coding_transcripts = [
        transcript for transcript in variant.transcripts
        if transcript.biotype == "protein_coding"
    ]
    eq_(len(coding_transcripts), 1)
    klf6 = coding_transcripts[0]
    eq_(klf6.name, "Klf6-201")

    frameshift_result = cdna_codon_sequence_after_insertion_frameshift(
        sequence_from_start_codon=klf6.sequence[start_codon_offset:],
        cds_offset_before_insertion=insertion_offset - start_codon_offset,
        inserted_nucleotides="G")
    mutant_codon_index, seq_after_mutated_codon = frameshift_result
    eq_(mutant_codon_index, 104)

    # base before the insertion, then the inserted "G", then the rest
    expected_sequence = (
        klf6.sequence[insertion_offset]
        + "G"
        + klf6.sequence[insertion_offset + 1:])
    eq_(seq_after_mutated_codon, expected_sequence)
78 | 


--------------------------------------------------------------------------------
/tests/test_mouse.py:
--------------------------------------------------------------------------------
  1 | # Licensed under the Apache License, Version 2.0 (the "License");
  2 | # you may not use this file except in compliance with the License.
  3 | # You may obtain a copy of the License at
  4 | #
  5 | #     http://www.apache.org/licenses/LICENSE-2.0
  6 | #
  7 | # Unless required by applicable law or agreed to in writing, software
  8 | # distributed under the License is distributed on an "AS IS" BASIS,
  9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 10 | # See the License for the specific language governing permissions and
 11 | # limitations under the License.
 12 | 
 13 | from .common import eq_
 14 | 
 15 | from varcode import load_vcf, load_vcf_fast, Variant
 16 | from varcode.effects import Substitution
 17 | from pyensembl import Genome, EnsemblRelease
 18 | from .data import data_path
 19 | 
 20 | MOUSE_ENSEMBL_RELEASE = 95
 21 | SERVER = "ftp://ftp.ensembl.org"
 22 | MOUSE_GTF_PATH = \
 23 |     SERVER + "/pub/release-%d/gtf/mus_musculus/Mus_musculus.GRCm38.%d.gtf.gz" % (
 24 |         MOUSE_ENSEMBL_RELEASE, MOUSE_ENSEMBL_RELEASE)
 25 | MOUSE_TRANSCRIPT_FASTA_PATH = \
 26 |     SERVER + "/pub/release-%d/fasta/mus_musculus/cdna/Mus_musculus.GRCm38.cdna.all.fa.gz"
 27 | MOUSE_PROTEIN_FASTA_PATH = \
 28 |     SERVER + "/pub/release-%d/fasta/mus_musculus/pep/Mus_musculus.GRCm38.pep.all.fa.gz" % (
 29 |         MOUSE_ENSEMBL_RELEASE)
 30 | 
 31 | MOUSE_VCF = data_path("mouse_vcf_dbsnp_chr1_partial.vcf")
 32 | 
 33 | explicit_url_genome = Genome(
 34 |     reference_name="GRCm38",
 35 |     annotation_name="ensembl",
 36 |     annotation_version=MOUSE_ENSEMBL_RELEASE,
 37 |     gtf_path_or_url=MOUSE_GTF_PATH,
 38 |     transcript_fasta_paths_or_urls=[MOUSE_TRANSCRIPT_FASTA_PATH],
 39 |     protein_fasta_paths_or_urls=[MOUSE_PROTEIN_FASTA_PATH])
 40 | 
 41 | ensembl_mouse_genome = EnsemblRelease(MOUSE_ENSEMBL_RELEASE, species="mouse")
 42 | 
 43 | def test_load_vcf_mouse_with_explicit_urls():
 44 |     variants = load_vcf(MOUSE_VCF, genome=explicit_url_genome)
 45 |     eq_(len(variants), 217)
 46 | 
 47 | def test_load_vcf_mouse_with_ensembl_release():
 48 |     variants = load_vcf(MOUSE_VCF, genome=ensembl_mouse_genome)
 49 |     eq_(len(variants), 217)
 50 | 
 51 | def test_load_vcf_mouse_with_inferred_genome():
 52 |     variants = load_vcf(MOUSE_VCF)
 53 |     eq_(len(variants), 217)
 54 | 
 55 | def test_specific_variant_mouse_with_explicit_urls():
 56 |     # Exon #2 at http://useast.ensembl.org/Mus_musculus/Transcript/Exons?
 57 |     # db=core;g=ENSMUSG00000017167;r=11:101170523-101190724;t=ENSMUST00000103109
 58 |     variant = Variant(
 59 |         contig=11,
 60 |         start=101177240,
 61 |         ref="G",
 62 |         alt="T",
 63 |         ensembl=explicit_url_genome)
 64 |     effects = variant.effects()
 65 |     eq_(len(effects), 2)
 66 |     substitution_effects = [
 67 |         effect
 68 |         for effect in effects
 69 |         if isinstance(effect, Substitution)
 70 |     ]
 71 |     eq_(len(substitution_effects), 1)
 72 |     substitution_effect = substitution_effects[0]
 73 |     # The coding sequence through the sub:
 74 |     # ATGATGAGTCTCCGGCTCTTCAGCATCCTGCTCGCCACG
 75 |     # GTGGTCTCTGGAGCTTGGGGCTGGGGCTACTACGGTTGC
 76 |     # (The final G is the sub: the 77th nucleotide)
 77 |     # TGC (C) -> TTC (F)
 78 |     # 78 / 3 = 26
 79 |     # 0-base = 25
 80 |     eq_(substitution_effect.mutant_protein_sequence[25], "F")
 81 |     eq_(substitution_effect.original_protein_sequence[25], "C")
 82 | 
 83 | 
 84 | def test_specific_variant_mouse_with_ensembl_genome():
 85 |     # Exon #2 at http://useast.ensembl.org/Mus_musculus/Transcript/Exons?
 86 |     # db=core;g=ENSMUSG00000017167;r=11:101170523-101190724;t=ENSMUST00000103109
 87 |     variant = Variant(
 88 |         contig=11,
 89 |         start=101177240,
 90 |         ref="G",
 91 |         alt="T",
 92 |         ensembl=ensembl_mouse_genome)
 93 |     effects = variant.effects()
 94 |     eq_(len(effects), 2)
 95 |     substitution_effects = [
 96 |         effect
 97 |         for effect in effects
 98 |         if isinstance(effect, Substitution)
 99 |     ]
100 |     eq_(len(substitution_effects), 1)
101 |     substitution_effect = substitution_effects[0]
102 |     # The coding sequence through the sub:
103 |     # ATGATGAGTCTCCGGCTCTTCAGCATCCTGCTCGCCACG
104 |     # GTGGTCTCTGGAGCTTGGGGCTGGGGCTACTACGGTTGC
105 |     # (The final G is the sub: the 77th nucleotide)
106 |     # TGC (C) -> TTC (F)
107 |     # 78 / 3 = 26
108 |     # 0-base = 25
109 |     eq_(substitution_effect.mutant_protein_sequence[25], "F")
110 |     eq_(substitution_effect.original_protein_sequence[25], "C")
111 | 


--------------------------------------------------------------------------------
/tests/test_mutate.py:
--------------------------------------------------------------------------------
 1 | # Licensed under the Apache License, Version 2.0 (the "License");
 2 | # you may not use this file except in compliance with the License.
 3 | # You may obtain a copy of the License at
 4 | #
 5 | #     http://www.apache.org/licenses/LICENSE-2.0
 6 | #
 7 | # Unless required by applicable law or agreed to in writing, software
 8 | # distributed under the License is distributed on an "AS IS" BASIS,
 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 | 
13 | from varcode.effects import mutate
14 | from .common import eq_
15 | 
def test_snp_mutation():
    """Substituting "A" with "G" at index 1 of "AACCTT" gives "AGCCTT"."""
    original = "AACCTT"
    result = mutate.substitute(original, 1, "A", "G")
    expected = "AGCCTT"
    eq_(result, expected)
20 | 
def test_deletion_mutation():
    """Substituting "ACT" with "T" at index 1 of "AACT" gives "AT"."""
    original = "AACT"
    result = mutate.substitute(original, 1, "ACT", "T")
    expected = "AT"
    eq_(result, expected)
25 | 
def test_insert_before():
    """insert_before("AACT", 1, "GG") is expected to give "AGGACT"."""
    result = mutate.insert_before("AACT", 1, "GG")
    expected = "AGGACT"
    eq_(result, expected)
29 | 
def test_insert_after():
    """insert_after("AACT", 1, "GG") is expected to give "AAGGCT"."""
    result = mutate.insert_after("AACT", 1, "GG")
    expected = "AAGGCT"
    eq_(result, expected)
33 | 


--------------------------------------------------------------------------------
/tests/test_no_duplicate_variants.py:
--------------------------------------------------------------------------------
 1 | # Licensed under the Apache License, Version 2.0 (the "License");
 2 | # you may not use this file except in compliance with the License.
 3 | # You may obtain a copy of the License at
 4 | #
 5 | #     http://www.apache.org/licenses/LICENSE-2.0
 6 | #
 7 | # Unless required by applicable law or agreed to in writing, software
 8 | # distributed under the License is distributed on an "AS IS" BASIS,
 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 | 
13 | from pyensembl import EnsemblRelease
14 | from varcode import Variant, VariantCollection
15 | 
def test_drop_duplicates():
    """A VariantCollection built from repeated and equal variants keeps 2 of them."""
    release = EnsemblRelease(78)
    first = Variant("1", 3000, "A", "G", ensembl=release)
    # separate object with the same fields as `first`
    first_again = Variant("1", 3000, "A", "G", ensembl=release)
    second = Variant("2", 10, "G", "T", ensembl=release)
    redundant_inputs = [first, first, first_again, second]
    collection_without_duplicates = VariantCollection(variants=redundant_inputs)
    assert len(collection_without_duplicates) == 2
24 | 


--------------------------------------------------------------------------------
/tests/test_problematic_variants.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Any variants which are encountered in the wild and either cause Varcode
  3 | to crash or return an incorrect annotation should be added to this
  4 | test module.
  5 | """
  6 | 
  7 | import pytest
  8 | from varcode import Variant
  9 | 
 10 | from .common import check_effect_properties
 11 | 
 12 | # variants which have previously resulted in raised exceptions
 13 | # during effect annotation
 14 | should_not_crash_variants = [
 15 |     # error message:
 16 |     # "Couldn't find position 92979124 on any exon of ENST00000540033"
 17 |     Variant(
 18 |         contig=1,
 19 |         start=92979092,
 20 |         ref="ATATATATATATATATATATATATATATATATG",
 21 |         alt="A",
 22 |         genome="GRCh37"),
 23 |     # error message:
 24 |     # "Expect non-silent stop-loss variant to cause longer variant protein"
 25 |     # "" but got len(original) = 653, len(variant) = 653"
 26 |     Variant(
 27 |         contig=1,
 28 |         start=167385324,
 29 |         ref="TAA",
 30 |         alt="T",
 31 |         genome="GRCh37"),
 32 |     # error message:
 33 |     # "Variant which span 5' UTR and CDS not supported"
 34 |     Variant(
 35 |         contig=19,
 36 |         start=44351166,
 37 |         ref="GGGAGAT",
 38 |         alt="G",
 39 |         genome="GRCh37"),
 40 |     # error message:
 41 |     # "Can't have ref = '' and alt = 'E' at aa_pos = 445, cds_pos = 1335"
 42 |     Variant(
 43 |         contig=1,
 44 |         start=1684347,
 45 |         ref="",
 46 |         alt="CCT",
 47 |         genome="GRCh37"),
 48 |     Variant(
 49 |         contig=11,
 50 |         start=47640416,
 51 |         ref="",
 52 |         alt="TCTTT",
 53 |         genome="GRCh37"),
 54 |     Variant(
 55 |         contig=12,
 56 |         start=98880902,
 57 |         ref="A",
 58 |         alt="",
 59 |         genome="GRCh37"),
 60 |     Variant(
 61 |         contig=19,
 62 |         start=52803670,
 63 |         ref="TG",
 64 |         alt="",
 65 |         genome="GRCh37"),
 66 |     Variant(
 67 |         contig=1,
 68 |         start=109792735,
 69 |         ref="",
 70 |         alt="CGC",
 71 |         genome="GRCh37"),
 72 |     # error message:
 73 |     # "expected ref 'GATGTCGG' at offset 1412 of ENST00000297524...CDS has 'G'"
 74 |     Variant(
 75 |         contig=8,
 76 |         start=87226635,
 77 |         ref="CCGACATC",
 78 |         alt="",
 79 |         genome="GRCh37"),
 80 |     # error message: "Can't have empty aa_ref and aa_alt"
 81 |     Variant(
 82 |         contig=8,
 83 |         start=141488566,
 84 |         ref="T",
 85 |         alt="C",
 86 |         genome="GRCh38"),
 87 |     # error message: "len(aa_alt) = 0"
 88 |     Variant(
 89 |         contig=11,
 90 |         start=57741870,
 91 |         ref="G",
 92 |         alt="C",
 93 |         genome="GRCh38"),
 94 |     # error message: "IndexError: string index out of range"
 95 |     Variant(
 96 |         contig=11,
 97 |         start=63676705,
 98 |         ref="T", alt="",
 99 |         genome="GRCh37"),
100 |     # AssertionError: aa_ref and aa_alt can't both be empty string
101 |     Variant(
102 |         contig=1,
103 |         start=56962223,
104 |         ref='C',
105 |         alt='T',
106 |         genome="GRCh37"),
107 |     # AssertionError: aa_ref and aa_alt can't both be empty string
108 |     Variant(
109 |         contig=1,
110 |         start=56962223,
111 |         ref="C",
112 |         alt="T",
113 |         genome="GRCh37"),
114 |     # AssertionError: aa_ref and aa_alt can't both be empty string
115 |     Variant(
116 |         contig=1,
117 |         start=151314663,
118 |         ref="C",
119 |         alt="T",
120 |         genome="GRCh37"),
121 |     # AssertionError: aa_ref and aa_alt can't both be empty string
122 |     Variant(
123 |         contig=1,
124 |         start=153409535,
125 |         ref="C",
126 |         alt="T",
127 |         genome="GRCh37"),
128 |     # AssertionError: aa_ref and aa_alt can't both be empty string
129 |     Variant(
130 |         contig=10,
131 |         start=105791994,
132 |         ref="C",
133 |         alt="T",
134 |         genome="GRCh37"),
135 |     # Expected frameshift_insertion to be before stop codon
136 |     # for Variant(contig=1, start=109925189, ref=., alt=A, genome=GRCh38)
137 |     # on transcript_id=ENST00000329608
138 |     # len(protein) = 554, aa_pos = 554
139 |     Variant(
140 |         contig=1,
141 |         start=109925189,
142 |         ref="",
143 |         alt="A",
144 |         genome="GRCh38"),
145 |     Variant(
146 |         contig=7,
147 |         start=117120188,
148 |         ref="A",
149 |         alt="AAGT",
150 |         genome="GRCh37"),
151 |     # had problems with end coordinate loading this one from a MAF but also
152 |     # want to make sure it doesn't cause other trouble
153 |     Variant(
154 |         contig=1,
155 |         start=109461324,
156 |         ref="GG",
157 |         alt="TT",
158 |         genome="GRCh37")
159 | ]
160 | 
161 | 
@pytest.mark.parametrize(['variant'], [(v,) for v in should_not_crash_variants])
def test_crashing_variants(variant):
    """Annotating a previously-crashing variant must give a well-formed top effect."""
    all_effects = variant.effects()
    top_effect = all_effects.top_priority_effect()
    check_effect_properties(top_effect)


--------------------------------------------------------------------------------
/tests/test_reference.py:
--------------------------------------------------------------------------------
 1 | # Licensed under the Apache License, Version 2.0 (the "License");
 2 | # you may not use this file except in compliance with the License.
 3 | # You may obtain a copy of the License at
 4 | #
 5 | #     http://www.apache.org/licenses/LICENSE-2.0
 6 | #
 7 | # Unless required by applicable law or agreed to in writing, software
 8 | # distributed under the License is distributed on an "AS IS" BASIS,
 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 | 
13 | 
14 | import warnings
15 | 
16 | import pytest 
17 | 
18 | from varcode.reference import infer_reference_name, ensembl_reference_aliases, most_recent_assembly_name
19 | from .common import eq_
20 | 
# Test cases are given as
#   expected reference name -> list of input strings
# (FASTA file names or VCF "##reference=" header lines).
reference_test_cases = {
    'NCBI36': [
        'ncbi36p2.fasta',
        'b36.fasta',
        '##reference=file:///var/lib/cwl/ncbi36/homo_sapiens.d1.vd1.fa',
    ],
    'GRCh38': [
        'grch38p2.fasta',
        '##reference=file:///var/lib/cwl/job367935311_index_001zdr/GRCh38.d1.vd1.fa',
        '##reference=file:///var/lib/cwl/job367935311_index_001zdr/GRCh38.job36.d1.vd1.fa',
    ],
}
34 | 
def test_most_recent_assembly():
    """most_recent_assembly_name picks the expected name from each candidate list."""
    cases = [
        (['ncbi36', 'grch38'], 'grch38'),
        (['ncbi36', 'grch38', '37mm'], 'grch38'),
        (['ncbi36'], 'ncbi36'),
        (['ncbi36', '35'], 'ncbi36'),
    ]
    for candidate_names, expected_name in cases:
        eq_(most_recent_assembly_name(candidate_names), expected_name)
def generate_reference_name_aliases():
    """
    Yield (candidate, assembly_name) pairs: every assembly name in
    ensembl_reference_aliases paired with itself, then each of its aliases
    paired with that assembly name.
    """
    # warnings raised while this block is active are recorded, not shown
    with warnings.catch_warnings(record=True):
        for assembly_name, aliases in ensembl_reference_aliases.items():
            candidates = [assembly_name]
            candidates.extend(aliases)
            for candidate in candidates:
                yield (candidate, assembly_name)
49 | 
@pytest.mark.parametrize(['candidate', 'assembly_name'], generate_reference_name_aliases())
def test_infer_reference_name_aliases(candidate, assembly_name):
    """Every known alias must be inferred as its canonical assembly name."""
    inferred_name = infer_reference_name(candidate)
    eq_(inferred_name, assembly_name)
53 |     
def generate_reference_name_fasta_filenames():
    """
    Yield (candidate, assembly_name) pairs: every key of reference_test_cases
    paired with itself, then each of its listed inputs paired with that key.
    """
    # warnings raised while this block is active are recorded, not shown
    with warnings.catch_warnings(record=True):
        for assembly_name, inputs in reference_test_cases.items():
            candidates = [assembly_name]
            candidates.extend(inputs)
            for candidate in candidates:
                yield (candidate, assembly_name)
63 | 
@pytest.mark.parametrize(['candidate', 'assembly_name'], generate_reference_name_fasta_filenames())
def test_reference_name_fasta_filenames(candidate, assembly_name):
    """File names and reference header lines must map to the expected assembly."""
    inferred_name = infer_reference_name(candidate)
    eq_(inferred_name, assembly_name)
67 | 
68 | 


--------------------------------------------------------------------------------
/tests/test_string_helpers.py:
--------------------------------------------------------------------------------
 1 | # Licensed under the Apache License, Version 2.0 (the "License");
 2 | # you may not use this file except in compliance with the License.
 3 | # You may obtain a copy of the License at
 4 | #
 5 | #     http://www.apache.org/licenses/LICENSE-2.0
 6 | #
 7 | # Unless required by applicable law or agreed to in writing, software
 8 | # distributed under the License is distributed on an "AS IS" BASIS,
 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 | 
13 | from .common import eq_
14 | 
15 | from varcode.string_helpers import trim_shared_flanking_strings
16 | 
def test_trim_shared_string_endings():
    """
    Table-driven check of trim_shared_flanking_strings. Each row is
    (first string, second string, expected result tuple).
    """
    cases = [
        # empty strings
        ("", "A", ("", "A", "", "")),
        ("A", "", ("A", "", "", "")),

        # string pairs with shared prefixes
        ("AA", "AA", ("", "", "AA", "")),
        ("AB", "AA", ("B", "A", "A", "")),
        ("AA", "AB", ("A", "B", "A", "")),
        ("AB", "A", ("B", "", "A", "")),
        # NOTE(review): repeats the previous row; kept as-is
        ("AB", "A", ("B", "", "A", "")),
        ("A", "AB", ("", "B", "A", "")),

        # string pairs with shared suffixes
        ("CCAT", "GT", ("CCA", "G", "", "T")),
        # NOTE(review): repeats the previous row; kept as-is
        ("CCAT", "GT", ("CCA", "G", "", "T")),

        # string pairs with shared prefixes+suffixes
        ("AATG", "AACG", ("T", "C", "AA", "G")),
        ("ABG", "AG", ("B", "", "A", "G")),
        ("AG", "ABG", ("", "B", "A", "G")),
    ]
    for first, second, expected in cases:
        eq_(trim_shared_flanking_strings(first, second), expected)
43 | 


--------------------------------------------------------------------------------
/tests/test_timings.py:
--------------------------------------------------------------------------------
 1 | # Licensed under the Apache License, Version 2.0 (the "License");
 2 | # you may not use this file except in compliance with the License.
 3 | # You may obtain a copy of the License at
 4 | #
 5 | #     http://www.apache.org/licenses/LICENSE-2.0
 6 | #
 7 | # Unless required by applicable law or agreed to in writing, software
 8 | # distributed under the License is distributed on an "AS IS" BASIS,
 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 | 
13 | from __future__ import print_function, division, absolute_import
14 | import time
15 | 
16 | from varcode.util import random_variants
17 | 
def _time_variant_annotation(variant_collection):
    """
    Return the seconds (by time.time()) spent in variant_collection.effects().

    Also asserts that grouping the effects by variant gives as many groups
    as there are variants in the collection.
    """
    began = time.time()
    effects = variant_collection.effects()
    finished = time.time()
    n_annotated_variants = len(effects.groupby_variant())
    assert n_annotated_variants == len(variant_collection)
    return finished - began
25 | 
26 | 
def test_effect_timing(
        n_variants=100,
        random_seed=0,
        n_warmup_variants=5):
    """
    Annotating `n_variants` random variants must average under 0.1 seconds
    per variant.

    A small, unseeded batch of `n_warmup_variants` is annotated first and
    left out of the measurement (presumably to warm caches before timing).
    """
    random_variants(n_warmup_variants, random_seed=None).effects()

    timed_collection = random_variants(n_variants, random_seed=random_seed)
    elapsed_t = _time_variant_annotation(timed_collection)
    print("Elapsed: %0.4f for %d variants" % (elapsed_t, n_variants))
    seconds_per_variant = elapsed_t / n_variants
    assert seconds_per_variant < 0.1, \
        "Should be faster than 100ms / variant!"
43 | 
44 | if __name__ == "__main__":
45 |     test_effect_timing()  # run the timing check with its default arguments
46 | 


--------------------------------------------------------------------------------
/tests/test_variant.py:
--------------------------------------------------------------------------------
  1 | # Licensed under the Apache License, Version 2.0 (the "License");
  2 | # you may not use this file except in compliance with the License.
  3 | # You may obtain a copy of the License at
  4 | #
  5 | #         http://www.apache.org/licenses/LICENSE-2.0
  6 | #
  7 | # Unless required by applicable law or agreed to in writing, software
  8 | # distributed under the License is distributed on an "AS IS" BASIS,
  9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 10 | # See the License for the specific language governing permissions and
 11 | # limitations under the License.
 12 | 
 13 | """
 14 | Test simple properties of Variant objects, such as their trimming
 15 | of shared prefix/suffix strings from ref/alt fields.
 16 | """
 17 | 
 18 | import pickle
 19 | from pyensembl import ensembl_grch38
 20 | 
 21 | from varcode import Variant
 22 | from .common import eq_
 23 | 
 24 | def test_insertion_shared_prefix():
 25 |     variant = Variant(1, start=10, ref="AA", alt="AAT")
 26 |     eq_(variant.contig, "1")
 27 |     eq_(variant.original_ref, "AA")
 28 |     eq_(variant.original_alt, "AAT")
 29 |     eq_(variant.original_start, 10)
 30 |     # since this variant is just an insertion of a "T", get rid of
 31 |     # the prefix context
 32 |     eq_(variant.ref, "")
 33 |     eq_(variant.alt, "T")
 34 |     # the [start,end] interval for an insertion is just the base we're
 35 |     # inserting after, which in this case is the 11th position
 36 |     eq_(variant.start, 11)
 37 |     eq_(variant.end, 11)
 38 |     eq_(variant.short_description, "chr1 g.11_12insT")
 39 |     assert variant.is_indel
 40 |     assert variant.is_insertion
 41 |     assert not variant.is_deletion
 42 | 
 43 | def test_insertion_no_prefix():
 44 |     variant = Variant(1, start=11, ref="", alt="T")
 45 |     eq_(variant.contig, "1")
 46 |     eq_(variant.original_ref, "")
 47 |     eq_(variant.original_alt, "T")
 48 |     eq_(variant.original_start, 11)
 49 |     eq_(variant.ref, "")
 50 |     eq_(variant.alt, "T")
 51 |     eq_(variant.start, 11)
 52 |     eq_(variant.end, 11)
 53 |     eq_(variant.short_description, "chr1 g.11_12insT")
 54 |     assert variant.is_indel
 55 |     assert variant.is_insertion
 56 |     assert not variant.is_deletion
 57 | 
 58 | def test_substitution_no_prefix():
 59 |     variant = Variant(1, start=11, ref="A", alt="T")
 60 |     eq_(variant.contig, "1")
 61 |     eq_(variant.original_ref, "A")
 62 |     eq_(variant.original_alt, "T")
 63 |     eq_(variant.original_start, 11)
 64 |     eq_(variant.ref, "A")
 65 |     eq_(variant.alt, "T")
 66 |     eq_(variant.start, 11)
 67 |     eq_(variant.end, 11)
 68 |     eq_(variant.short_description, "chr1 g.11A>T")
 69 |     assert not variant.is_indel
 70 |     assert not variant.is_insertion
 71 |     assert not variant.is_deletion
 72 | 
 73 | def test_substitution_shared_prefix():
 74 |     variant = Variant(1, start=10, ref="AA", alt="AT")
 75 |     eq_(variant.contig, "1")
 76 |     eq_(variant.original_ref, "AA")
 77 |     eq_(variant.original_alt, "AT")
 78 |     eq_(variant.original_start, 10)
 79 |     eq_(variant.ref, "A")
 80 |     eq_(variant.alt, "T")
 81 |     eq_(variant.start, 11)
 82 |     eq_(variant.end, 11)
 83 |     eq_(variant.short_description, "chr1 g.11A>T")
 84 |     assert not variant.is_indel
 85 |     assert not variant.is_insertion
 86 |     assert not variant.is_deletion
 87 | 
 88 | def test_deletion_shared_suffix():
 89 |     variant = Variant(1, start=10, ref="AAC", alt="C")
 90 |     eq_(variant.contig, "1")
 91 |     eq_(variant.original_ref, "AAC")
 92 |     eq_(variant.original_alt, "C")
 93 |     eq_(variant.original_start, 10)
 94 |     eq_(variant.ref, "AA")
 95 |     eq_(variant.alt, "")
 96 |     eq_(variant.start, 10)
 97 |     eq_(variant.end, 11)
 98 |     eq_(variant.short_description, "chr1 g.10_11delAA")
 99 |     assert variant.is_indel
100 |     assert not variant.is_insertion
101 |     assert variant.is_deletion
102 | 
103 | def test_deletion_no_suffix():
104 |     variant = Variant(1, start=10, ref="AA", alt="")
105 |     eq_(variant.contig, "1")
106 |     eq_(variant.original_ref, "AA")
107 |     eq_(variant.original_alt, "")
108 |     eq_(variant.original_start, 10)
109 |     eq_(variant.ref, "AA")
110 |     eq_(variant.alt, "")
111 |     eq_(variant.start, 10)
112 |     eq_(variant.end, 11)
113 |     eq_(variant.short_description, "chr1 g.10_11delAA")
114 |     assert variant.is_indel
115 |     assert not variant.is_insertion
116 |     assert variant.is_deletion
117 | 
118 | def test_serialization():
119 |     variants = [
120 |         Variant(
121 |             1, start=10, ref="AA", alt="AAT", genome=ensembl_grch38),
122 |         Variant(10, start=15, ref="A", alt="G"),
123 |         Variant(20, start=150, ref="", alt="G"),
124 |     ]
125 |     for original in variants:
126 |         # This causes the variant's ensembl object to make a SQL connection,
127 |         # which makes the ensembl object non-serializable. By calling this
128 |         # method, we are checking that we don't attempt to directly serialize
129 |         # the ensembl object.
130 |         original.effects()
131 | 
132 |         # Test pickling.
133 |         serialized = pickle.dumps(original)
134 |         reconstituted = pickle.loads(serialized)
135 |         eq_(original, reconstituted)
136 | 
137 |         eq_(original.contig, reconstituted.contig)
138 |         eq_(original.ref, reconstituted.ref)
139 |         eq_(original.alt, reconstituted.alt)
140 |         eq_(original.start, reconstituted.start)
141 |         eq_(original.end, reconstituted.end)
142 |         eq_(original.original_ref, reconstituted.original_ref)
143 |         eq_(original.original_alt, reconstituted.original_alt)
144 |         eq_(original.original_start, reconstituted.original_start)
145 | 
146 |         # Test json.
147 |         serialized = original.to_json()
148 |         reconstituted = Variant.from_json(serialized)
149 |         eq_(original, reconstituted)
150 | 
151 | def test_deserialization_old_keywords():
152 |     old_variant_representation_json = """
153 |     {
154 |         "ref": "T",
155 |         "contig": "22",
156 |         "start": 23230319,
157 |         "__class__": {
158 |             "__name__": "Variant",
159 |             "__module__": "varcode.variant"
160 |         },
161 |         "normalize_contig_name": true,
162 |         "alt": "G",
163 |         "allow_extended_nucleotides": false,
164 |         "ensembl": {
165 |             "__class__": {
166 |                 "__name__": "EnsemblRelease",
167 |                 "__module__": "pyensembl.ensembl_release"
168 |             },
169 |             "release": 75,
170 |             "server": "ftp://ftp.ensembl.org",
171 |             "species": {
172 |                 "__class__": {
173 |                     "__name__": "Species",
174 |                     "__module__": "pyensembl.species"
175 |                 },
176 |                 "latin_name": "homo_sapiens"
177 |             }
178 |         }
179 |     }
180 |     """
181 |     variant = Variant.from_json(old_variant_representation_json)
182 |     eq_(variant.contig, "22")
183 |     eq_(variant.ref, "T")
184 |     eq_(variant.alt, "G")
185 |     eq_(variant.reference_name, "GRCh37")
186 |     eq_(variant.normalize_contig_names, True)
187 |     eq_(variant.allow_extended_nucleotides, False)
188 | 
189 | def test_hg19_chromosome_names():
190 |     # trimming of mithochondrial name
191 |     eq_(Variant("M", 1, "A", "G", "hg19", convert_ucsc_contig_names=True).contig, "MT")
192 |     eq_(Variant("M", 1, "A", "G", "hg19", convert_ucsc_contig_names=False).contig, "M")
193 | 
194 |     eq_(Variant("chrM", 1, "A", "G", "hg19", convert_ucsc_contig_names=True).contig, "MT")
195 |     eq_(Variant("chrM", 1, "A", "G", "hg19", convert_ucsc_contig_names=False).contig, "chrM")
196 | 
197 |     # uppercase
198 |     eq_(Variant("chrm", 1, "A", "G", "hg19", convert_ucsc_contig_names=True).contig, "MT")
199 |     eq_(Variant("chrm", 1, "A", "G", "hg19", convert_ucsc_contig_names=False).contig, "chrM")
200 | 
201 |     # trimming of 'chr' prefix from hg19
202 |     eq_(Variant("chr1", 1, "A", "G", "hg19", convert_ucsc_contig_names=True).contig, "1")
203 |     eq_(Variant("chr1", 1, "A", "G", "hg19", convert_ucsc_contig_names=False).contig, "chr1")
204 | 
205 | def test_contig_name_normalization():
206 |     eq_(Variant(1, 1, "A", "G", normalize_contig_names=True).contig, "1")
207 |     eq_(Variant(1, 1, "A", "G", normalize_contig_names=False).contig, 1)
208 | 
209 |     # uppercase
210 |     eq_(Variant(
211 |         "chrm", 1, "A", "G", normalize_contig_names=True, convert_ucsc_contig_names=False).contig, "chrM")
212 |     eq_(Variant(
213 |         "chrm", 1, "A", "G", normalize_contig_names=False, convert_ucsc_contig_names=False).contig, "chrm")
214 | 
215 | 
216 | def test_snv_transition_transversion():
217 |     ref_variant = Variant(1, start=100, ref="C", alt="C")
218 |     assert not ref_variant.is_snv
219 | 
220 |     variant = Variant(1, start=100, ref="C", alt="T")
221 |     assert variant.is_snv
222 |     assert variant.is_transition
223 |     assert not variant.is_transversion
224 | 
225 |     transversion = Variant(1, start=100, ref="C", alt="A")
226 |     assert transversion.is_snv
227 |     assert not transversion.is_transition
228 |     assert transversion.is_transversion
229 | 


--------------------------------------------------------------------------------
/tests/test_variant_collection.py:
--------------------------------------------------------------------------------
  1 | # Licensed under the Apache License, Version 2.0 (the "License");
  2 | # you may not use this file except in compliance with the License.
  3 | # You may obtain a copy of the License at
  4 | #
  5 | #     http://www.apache.org/licenses/LICENSE-2.0
  6 | #
  7 | # Unless required by applicable law or agreed to in writing, software
  8 | # distributed under the License is distributed on an "AS IS" BASIS,
  9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 10 | # See the License for the specific language governing permissions and
 11 | # limitations under the License.
 12 | 
 13 | """
 14 | Test properties of VariantCollection objects other than effect annotations
 15 | """
 16 | from collections import Counter
 17 | import pickle
 18 | 
 19 | from .common import eq_ 
 20 | from .data import ov_wustle_variants, tcga_ov_variants
 21 | 
 22 | from varcode import VariantCollection, Variant
 23 | 
 24 | def test_variant_collection_union():
 25 |     combined = ov_wustle_variants.union(tcga_ov_variants)
 26 |     eq_(set(combined.sources), {ov_wustle_variants.source, tcga_ov_variants.source})
 27 |     eq_(len(combined), len(ov_wustle_variants) + len(tcga_ov_variants))
 28 | 
 29 | def test_variant_collection_intersection():
 30 |     combined = ov_wustle_variants.intersection(tcga_ov_variants)
 31 |     eq_(set(combined.sources), {ov_wustle_variants.source, tcga_ov_variants.source})
 32 |     eq_(len(combined), 0)
 33 | 
 34 | def test_variant_collection_gene_counts():
 35 |     gene_counts = ov_wustle_variants.gene_counts()
 36 |     # test that each gene is counted just once
 37 |     eq_(list(gene_counts.values()), [1] * len(gene_counts))
 38 | 
 39 | def test_variant_collection_groupby_gene():
 40 |     genes = ov_wustle_variants.groupby_gene().keys()
 41 |     # make sure that the IDs attached to Gene objects are the same as IDs
 42 |     # of groupby_gene_id
 43 |     gene_ids = set(ov_wustle_variants.groupby_gene_id().keys())
 44 |     eq_({gene.id for gene in genes}, gene_ids)
 45 | 
 46 | def test_variant_collection_groupby_gene_id():
 47 |     gene_ids = set(ov_wustle_variants.groupby_gene_id().keys())
 48 |     eq_(gene_ids, {
 49 |         'ENSG00000060718',
 50 |         'ENSG00000156876',
 51 |         'ENSG00000130939',
 52 |         'ENSG00000122477',
 53 |         'ENSG00000162688'
 54 |     })
 55 | 
 56 | def test_variant_collection_groupby_gene_name():
 57 |     gene_names = set(ov_wustle_variants.groupby_gene_name().keys())
 58 |     eq_(gene_names, {"AGL", "SASS6", "LRRC39", "UBE4B", "COL11A1"})
 59 | 
 60 | def test_reference_names():
 61 |     eq_(ov_wustle_variants.reference_names(), {"GRCh37"})  # exactly one reference name is reported
 62 | 
 63 | def test_to_string():
 64 |     string_repr = str(ov_wustle_variants)
 65 |     assert "start=10238758, ref='G', alt='C'" in string_repr, \
 66 |         "Expected variant g.10238758 G>C in __str__:\n%s" % (
 67 |             string_repr,)
 68 | 
 69 | def test_detailed_string():
 70 |     detailed_string = ov_wustle_variants.detailed_string()
 71 |     # expect one of the gene names from the MAF to be in the summary string
 72 |     assert "UBE4B" in detailed_string, \
 73 |         "Expected gene name UBE4B in detailed_string():\n%s" % detailed_string
 74 |     assert "start=10238758, ref='G', alt='C'" in detailed_string, \
 75 |         "Expected variant g.10238758 G>C in detailed_string():\n%s" % (
 76 |             detailed_string,)
 77 | 
 78 | def test_gene_counts():
 79 |     expected_coding_gene_counts = Counter()
 80 |     expected_coding_gene_counts["CDK11A"] = 1
 81 |     expected_coding_gene_counts["GNPAT"] = 1
 82 |     expected_coding_gene_counts["E2F2"] = 1
 83 |     expected_coding_gene_counts["VSIG2"] = 1
 84 |     all_gene_counts = tcga_ov_variants.gene_counts()
 85 |     assert len(all_gene_counts) > len(expected_coding_gene_counts), \
 86 |         ("Gene counts for all genes must contain more elements than"
 87 |          " gene counts for only coding genes.")
 88 |     for (gene_name, count) in expected_coding_gene_counts.items():
 89 |         eq_(count, all_gene_counts[gene_name])
 90 | 
 91 |     # TODO: add `only_coding` parameter to gene_counts and then test
 92 |     # for exact equality between `coding_gene_counts` and
 93 |     # `expected_counts`
 94 |     #
 95 |     # coding_gene_counts = variants.gene_counts(only_coding=True)
 96 |     # eq_(coding_gene_counts, expected_counts)
 97 | 
 98 | def test_variant_collection_serialization():
 99 |     variant_list = [
100 |         Variant(
101 |             1, start=10, ref="AA", alt="AAT"),
102 |         Variant(10, start=15, ref="A", alt="G"),
103 |         Variant(20, start=150, ref="", alt="G"),
104 |     ]
105 |     original = VariantCollection(
106 |         variant_list,
107 |         source_to_metadata_dict={
108 |             "test_data":
109 |                 {variant: {"a": "b", "bar": 2} for variant in variant_list}})
110 | 
111 |     # This causes the variants' ensembl objects to make a SQL connection,
112 |     # which makes the ensembl object non-serializable. By calling this
113 |     # method, we are checking that we don't attempt to directly serialize
114 |     # the ensembl object.
115 |     original.effects()
116 | 
117 |     original_first_variant = original[0]
118 |     original_metadata = original.metadata
119 | 
120 |     # Test pickling
121 |     reconstructed = pickle.loads(pickle.dumps(original))
122 |     eq_(original, reconstructed)
123 |     eq_(reconstructed[0], original_first_variant)
124 |     eq_(reconstructed.metadata[original_first_variant],
125 |         original_metadata[original_first_variant])
126 | 
127 |     merged = original.intersection(original)
128 |     merged_reconstructed = pickle.loads(pickle.dumps(merged))
129 |     eq_(merged, merged_reconstructed)
130 | 
131 |     # Test JSON serialization
132 |     variants_from_json = VariantCollection.from_json(original.to_json())
133 |     eq_(original, variants_from_json)
134 | 
135 |     eq_(variants_from_json[0], original_first_variant)
136 | 
137 |     # pylint: disable=no-member
138 |     eq_(variants_from_json.metadata[original_first_variant],
139 |         original_metadata[original_first_variant])
140 | 
141 | def test_merged_variant_collection_serialization():
142 |     intersection = ov_wustle_variants.intersection(tcga_ov_variants)
143 |     eq_(intersection, pickle.loads(pickle.dumps(intersection)))
144 | 
145 |     union = ov_wustle_variants.union(tcga_ov_variants)
146 |     eq_(union, pickle.loads(pickle.dumps(union)))
147 | 


--------------------------------------------------------------------------------
/tests/test_vcf.py:
--------------------------------------------------------------------------------
  1 | # Licensed under the Apache License, Version 2.0 (the "License");
  2 | # you may not use this file except in compliance with the License.
  3 | # You may obtain a copy of the License at
  4 | #
  5 | #     http://www.apache.org/licenses/LICENSE-2.0
  6 | #
  7 | # Unless required by applicable law or agreed to in writing, software
  8 | # distributed under the License is distributed on an "AS IS" BASIS,
  9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 10 | # See the License for the specific language governing permissions and
 11 | # limitations under the License.
 12 | 
 13 | import os
 14 | 
 15 | import pytest 
 16 | 
 17 | from pyensembl import cached_release
 18 | from varcode import load_vcf, Variant
 19 | 
 20 | from .common import eq_
 21 | from .data import data_path
 22 | 
 23 | 
 24 | 
 25 | # Opt-in switch for network tests: set the environment variable to 1 to enable, 0 to disable.
 26 | # TODO: consider running in an in-process HTTP server instead for these tests.
 27 | RUN_TESTS_REQUIRING_INTERNET = bool(int(
 28 |     os.environ.get("RUN_TESTS_REQUIRING_INTERNET", 0)))
 29 | 
 30 | HG19_VCF_FILENAME = data_path("somatic_hg19_14muts.vcf")
 31 | HG19_VCF_EXTERNAL_URL = (  # NOTE(review): URL path says test/data, the repo tree has tests/data; confirm
 32 |     "https://raw.githubusercontent.com/hammerlab/varcode/master/test/data/somatic_hg19_14muts.vcf")
 33 | 
 34 | # To load from the branch that introduced these changes
 35 | # (needed before this gets merged to master, can be removed after):
 36 | # VCF_EXTERNAL_URL = (
 37 | #   "https://raw.githubusercontent.com/hammerlab/varcode/faster-vcf-parsing/test/data/somatic_hg19_14muts.vcf")
 38 | 
 39 | def test_load_vcf_local():
 40 |     variants = load_vcf(HG19_VCF_FILENAME)
 41 |     assert variants.reference_names() == {"GRCh37"}
 42 |     assert len(variants) == 14
 43 | 
 44 |     variants = load_vcf(HG19_VCF_FILENAME + ".gz")
 45 |     assert variants.reference_names() == {"GRCh37"}
 46 |     assert len(variants) == 14
 47 | 
 48 |     variants = load_vcf("file://%s" % HG19_VCF_FILENAME)
 49 |     assert variants.reference_names() == {"GRCh37"}
 50 |     assert len(variants) == 14
 51 | 
 52 |     variants = load_vcf("file://%s.gz" % HG19_VCF_FILENAME)
 53 |     assert variants.reference_names() == {"GRCh37"}
 54 |     assert len(variants) == 14
 55 | 
 56 |     # An extra slashe before an absolute path can confuse URL parsing.
 57 |     # Test that it can still be opened:
 58 |     variants = load_vcf("/%s" % HG19_VCF_FILENAME)
 59 |     assert variants.reference_names() == {"GRCh37"}
 60 |     assert len(variants) == 14
 61 | 
 62 | if RUN_TESTS_REQUIRING_INTERNET:
 63 |     def test_load_vcf_external():
 64 |         variants = load_vcf(HG19_VCF_FILENAME)
 65 |         eq_(variants.reference_names(), {"GRCh37"})
 66 |         eq_(variants.original_reference_names(), {"hg19"})
 67 |         eq_(len(variants), 14)
 68 | 
 69 |         variants = load_vcf(HG19_VCF_FILENAME + ".gz")
 70 |         eq_(variants.reference_names(), {"GRCh37"})
 71 |         eq_(len(variants), 14)
 72 | 
 73 | def test_vcf_reference_name():
 74 |     variants = load_vcf(HG19_VCF_FILENAME)
 75 | 
 76 |     # after normalization, hg19 should be remapped to GRCh37
 77 |     assert variants.reference_names() == {"GRCh37"}
 78 | 
 79 | def test_genome_arg_to_load_vcf_hg19():
 80 |     eq_(load_vcf(HG19_VCF_FILENAME),
 81 |         load_vcf(HG19_VCF_FILENAME, genome="hg19"))
 82 | 
 83 | def test_genome_arg_to_load_vcf_int_75():
 84 |     # if we use Ensembl 75 -- which is backed by GRCh37 -- then the two variant
 85 |     # collections will be the same as long as we also convert the contig names
 86 |     eq_(load_vcf(HG19_VCF_FILENAME),
 87 |         load_vcf(HG19_VCF_FILENAME, genome=75, convert_ucsc_contig_names=True))
 88 | 
 89 |     assert load_vcf(HG19_VCF_FILENAME) != load_vcf(
 90 |         HG19_VCF_FILENAME,
 91 |         genome=75,
 92 |         convert_ucsc_contig_names=False)
 93 | 
 94 | def test_genome_arg_to_load_vcf_cached_75():
 95 |     eq_(load_vcf(HG19_VCF_FILENAME),
 96 |         load_vcf(HG19_VCF_FILENAME,
 97 |                  genome=cached_release(75), convert_ucsc_contig_names=True))
 98 |     assert load_vcf(HG19_VCF_FILENAME) != load_vcf(
 99 |         HG19_VCF_FILENAME,
100 |         genome=cached_release(75),
101 |         convert_ucsc_contig_names=False)
102 | 
103 | def test_genome_arg_to_load_vcf_grch37():
104 |     eq_(load_vcf(HG19_VCF_FILENAME),
105 |         load_vcf(
106 |             HG19_VCF_FILENAME,
107 |             genome="grch37",
108 |             convert_ucsc_contig_names=True))
109 |     eq_(load_vcf(HG19_VCF_FILENAME), load_vcf(
110 |         HG19_VCF_FILENAME,
111 |         genome="GRCh37",
112 |         convert_ucsc_contig_names=True))
113 | 
114 |     assert load_vcf(HG19_VCF_FILENAME) != load_vcf(
115 |         HG19_VCF_FILENAME,
116 |         genome="grch37",
117 |         convert_ucsc_contig_names=False)
118 | 
119 | def test_genome_arg_to_load_vcf_b37():
120 |     eq_(load_vcf(HG19_VCF_FILENAME),
121 |         load_vcf(HG19_VCF_FILENAME, genome="b37", convert_ucsc_contig_names=True))
122 | 
123 | def test_vcf_number_entries():
124 |     # there are 14 mutations listed in the VCF, make sure they are all parsed
125 |     variants = load_vcf(HG19_VCF_FILENAME)
126 |     assert len(variants) == 14, \
127 |         "Expected 14 mutations, got %d" % (len(variants),)
128 | 
129 | def test_vcf_number_entries_duplicates():
130 |     # There are 3 duplicated mutations listed in the VCF
131 |     path_to_vcf_with_duplicates = data_path("duplicates.vcf")
132 |     variants = load_vcf(
133 |         path_to_vcf_with_duplicates,
134 |         genome='hg38',
135 |         distinct=True)
136 |     assert len(variants) == 1
137 |     variants = load_vcf(
138 |         path_to_vcf_with_duplicates,
139 |         genome='hg38',
140 |         distinct=False)
141 |     assert len(variants) == 3
142 | 
143 | def generate_vcf_gene_names():
144 |     variants = load_vcf(HG19_VCF_FILENAME)
145 |     for variant in variants:
146 |         yield (variants, variant)
147 | 
148 | @pytest.mark.parametrize(['collection', 'variant'], generate_vcf_gene_names())
149 | def test_vcf_gene_names(collection, variant):
150 |     expected_gene_names = collection.metadata[variant]['info']['GE']
151 |     assert variant.gene_names == expected_gene_names, \
152 |         "Expected gene name %s for variant %s, got %s" % (
153 |             expected_gene_names, variant, variant.gene_names)
154 | 
155 | 
156 | def test_multiple_alleles_per_line():
157 |     variants = load_vcf(data_path("multiallelic.vcf"))
158 |     assert len(variants) == 2, "Expected 2 variants but got %s" % variants
159 |     variant_list = list(variants)
160 |     expected_variants = [
161 |         Variant(1, 1431105, "A", "C", genome="GRCh37"),
162 |         Variant(1, 1431105, "A", "G", genome="GRCh37"),
163 |     ]
164 |     eq_(set(variant_list), set(expected_variants))
165 | 
166 | def test_sample_info_genotype():
167 |     variants = load_vcf(data_path("multiallelic.vcf"))
168 |     assert len(variants) == 2, "Expected 2 variants but got %s" % variants
169 |     eq_(variants.metadata[variants[0]]['sample_info']['metastasis']['GT'],
170 |         '0/1')
171 |     eq_(variants.metadata[variants[1]]['sample_info']['metastasis']['GT'],
172 |         '0/1')
173 | 


--------------------------------------------------------------------------------
/tests/test_vcf_output.py:
--------------------------------------------------------------------------------
  1 | # Licensed under the Apache License, Version 2.0 (the "License");
  2 | # you may not use this file except in compliance with the License.
  3 | # You may obtain a copy of the License at
  4 | #
  5 | #     http://www.apache.org/licenses/LICENSE-2.0
  6 | #
  7 | # Unless required by applicable law or agreed to in writing, software
  8 | # distributed under the License is distributed on an "AS IS" BASIS,
  9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 10 | # See the License for the specific language governing permissions and
 11 | # limitations under the License.
 12 | 
 13 | import tempfile
 14 | 
 15 | import pytest 
 16 | 
 17 | from varcode import load_vcf, load_maf
 18 | from varcode.vcf_output import variants_to_vcf
 19 | 
 20 | from .data import data_path
 21 | 
 22 | 
 23 | TEST_FILENAMES_HUMAN = [
 24 |     'duplicates.maf',
 25 |     'multiallelic.vcf',
 26 |     'mutect-example.vcf',
 27 |     'ov.wustle.subset5.maf',
 28 |     'somatic_hg19_14muts.space_in_sample_name.vcf',
 29 |     'somatic_hg19_14muts.vcf',
 30 |     'strelka-example.vcf',
 31 |     'tcga_ov.head.maf',
 32 |     'tcga_ov.head.xychr.maf',
 33 |     # 'dbnsfp_validation_set.csv',      # excluded: csv
 34 |     # 'duplicates.vcf',                 # excluded: no ref genome header
 35 |     # 'mutect-example-headerless.vcf',  # excluded: no ref genome header
 36 |     # 'somatic_hg19_14muts.vcf.gz',     # excluded: gzip
 37 | ]
 38 | # Mouse files are listed separately; the mixed-reference test uses only the human list.
 39 | TEST_FILENAMES_MOUSE = [
 40 |     'mouse_vcf_dbsnp_chr1_partial.vcf',
 41 | ]
 42 | # Every file in the combined list gets its own single-file round-trip test.
 43 | TEST_FILENAMES = TEST_FILENAMES_HUMAN + TEST_FILENAMES_MOUSE
 44 | 
 45 | 
 46 | def _merge_metadata_naive(variants):
 47 |     return {
 48 |         k: v
 49 |         for d in variants.source_to_metadata_dict.values()
 50 |         for k, v in d.items()
 51 |     }
 52 | 
 53 | 
 54 | 
 55 | def _do_roundtrip_test(filenames, convert_ucsc_to_grch37=False):
 56 | 
 57 |     def load_fn(filename):
 58 |         return {
 59 |             'vcf': load_vcf,
 60 |             'maf': load_maf
 61 |         }[filename.split('.')[-1]]
 62 | 
 63 |     def load_variants():
 64 |         variant_collections = []
 65 |         for filename in filenames:
 66 |             variant_collections.append(load_fn(filename)(data_path(filename)))
 67 |         return variant_collections[0].union(*variant_collections[1:])
 68 | 
 69 |     variants = load_variants()
 70 |     if convert_ucsc_to_grch37:
 71 |         variants = variants.clone_without_ucsc_data()
 72 | 
 73 |     with tempfile.NamedTemporaryFile(mode='w', delete=False) as f:
 74 |         metadata = _merge_metadata_naive(variants)
 75 |         variants_to_vcf(variants, metadata, out=f)
 76 |         tmp_name = f.name
 77 |     reparsed_variants = load_vcf(tmp_name)
 78 | 
 79 |     # `==` checks the reference genome, which won't necessarily match.
 80 |     for (v1, v2) in zip(variants, reparsed_variants):
 81 |         assert (
 82 |             v1.contig == v2.contig and
 83 |             v1.start == v2.start and
 84 |             v1.ref == v2.ref and
 85 |             v1.start == v2.start), (v1, v2)
 86 | 
 87 |     return (variants, reparsed_variants)
 88 | 
 89 |     # TODO:
 90 |     #   There is definitely more opportunity here to compare metadata
 91 |     #   fields, with caveats.
 92 |     #   ---
 93 |     #   First, any variants from non-VCF sources (e.g., MAF files) will inevitably
 94 |     #   lose some information through the change in representation (more importantly,
 95 |     #   even if there is no loss in data, that data will be in a different format in
 96 |     #   the new metadata dictionary). Thus, we should either ignore such variants
 97 |     #   or only check certain fields.
 98 |     #   ---
 99 |     #   Second, without the original metadata headers in the VCF file, all metadata
100 |     #   information will be parsed as strings. Thus, for a simple comparison between
101 |     #   metadata (without the need to individually convert fields), we'd need to add
102 |     #   these headers to the output VCF file. See `vcf_output.py` for more info.
103 | 
104 | 
105 | @pytest.mark.parametrize(['filename'], [(f,) for f in TEST_FILENAMES])
106 | def test_roundtrip_serialization_single_file(filename):
107 |     _do_roundtrip_test([filename])  # the helper takes a list of file names
108 | 
109 | FILENAME_PAIRS = (
110 |     ['simple.1.vcf', 'simple.2.vcf'],  # basic multi-file VCF test
111 |     ['duplicates.maf', 'ov.wustle.subset5.maf'],  # multiple MAF files
112 |     ['duplicate-id.1.vcf', 'duplicate-id.2.vcf'],  # presumably the same variant ID in both files; TODO confirm
113 | )
114 | 
115 | @pytest.mark.parametrize(['file_group'], [(f,) for f in FILENAME_PAIRS])
116 | def test_multiple_file_roundtrip_conversion(file_group):
117 |     _do_roundtrip_test(file_group)  # the files of one group are unioned, then round-tripped
118 | 
119 | def test_multiple_file_roundtrip_conversion_mixed_references():
120 |     # testing roundtrip serialization of hg19 VCF files
121 |     # converted to GRCh37 combined with b37 MAFs
122 |     _do_roundtrip_test(TEST_FILENAMES_HUMAN, convert_ucsc_to_grch37=True)
123 | 
124 | def test_same_samples_produce_samples():
125 |     """test_same_samples_produce_samples
126 | 
127 |     Ensures that, if a set of variants have the same samples, the reparsed
128 |     collection will output these samples.
129 |     """
130 |     (variants, reparsed_variants) = _do_roundtrip_test(
131 |         ['same-samples.1.vcf', 'same-samples.2.vcf'])
132 | 
133 |     original_metadata = _merge_metadata_naive(variants)
134 |     reparsed_metadata = _merge_metadata_naive(reparsed_variants)
135 | 
136 |     sample_names = set(list(original_metadata.values())[0]['sample_info'].keys())
137 |     assert all(
138 |         set(d.get('sample_info', {}).keys()) == sample_names
139 |         for d in reparsed_metadata.values())
140 | 
141 | 
142 | def test_different_samples_produce_no_samples():
143 |     """test_different_samples_produce_no_samples
144 | 
145 |     Ensures that, if a set of variants have different samples, the reparsed
146 |     collection will not output any samples.
147 | 
148 |     See `vcf_output.py` for details as to why this is the way it's done for now.
149 |     """
150 |     (_, reparsed_variants) = _do_roundtrip_test(
151 |         ['different-samples.1.vcf', 'different-samples.2.vcf'])
152 | 
153 |     metadata = _merge_metadata_naive(reparsed_variants)
154 |     assert all(d.get('sample_info') is None for d in metadata.values())
155 | 


--------------------------------------------------------------------------------
/varcode/__init__.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Licensed under the Apache License, Version 2.0 (the "License");
 3 | # you may not use this file except in compliance with the License.
 4 | # You may obtain a copy of the License at
 5 | #
 6 | #     http://www.apache.org/licenses/LICENSE-2.0
 7 | #
 8 | # Unless required by applicable law or agreed to in writing, software
 9 | # distributed under the License is distributed on an "AS IS" BASIS,
10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | # See the License for the specific language governing permissions and
12 | # limitations under the License.
13 | 
14 | from .variant import Variant
15 | from .variant_collection import VariantCollection
16 | from .maf import load_maf, load_maf_dataframe
17 | from .vcf import load_vcf, load_vcf_fast
18 | from .effects import (
19 |     effect_priority,
20 |     top_priority_effect,
21 |     EffectCollection,
22 |     MutationEffect,
23 |     NonsilentCodingMutation,
24 | )
25 | from .version import __version__ 
26 | 
# Names exported by `from varcode import *`; every entry is imported at the
# top of this module.
__all__ = [
    "__version__", 

    # basic classes
    "Variant",
    "EffectCollection",
    "VariantCollection",

    # effects
    "effect_priority",
    "top_priority_effect",
    "MutationEffect",
    "NonsilentCodingMutation",

    # file loading
    "load_maf",
    "load_maf_dataframe",
    "load_vcf",
    "load_vcf_fast",
]
47 | 


--------------------------------------------------------------------------------
/varcode/cli/__init__.py:
--------------------------------------------------------------------------------
 1 | # Licensed under the Apache License, Version 2.0 (the "License");
 2 | # you may not use this file except in compliance with the License.
 3 | # You may obtain a copy of the License at
 4 | #
 5 | #     http://www.apache.org/licenses/LICENSE-2.0
 6 | #
 7 | # Unless required by applicable law or agreed to in writing, software
 8 | # distributed under the License is distributed on an "AS IS" BASIS,
 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 | 
13 | from .variant_args import (
14 |     add_variant_args,
15 |     make_variants_parser,
16 |     variant_collection_from_args,
17 | )
18 | 
# Names exported by `from varcode.cli import *`; all three come from
# the `variant_args` module imported above.
__all__ = [
    "add_variant_args",
    "make_variants_parser",
    "variant_collection_from_args",
]
24 | 


--------------------------------------------------------------------------------
/varcode/cli/effects_script.py:
--------------------------------------------------------------------------------
 1 | # Licensed under the Apache License, Version 2.0 (the "License");
 2 | # you may not use this file except in compliance with the License.
 3 | # You may obtain a copy of the License at
 4 | #
 5 | #     http://www.apache.org/licenses/LICENSE-2.0
 6 | #
 7 | # Unless required by applicable law or agreed to in writing, software
 8 | # distributed under the License is distributed on an "AS IS" BASIS,
 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 | 
13 | import logging.config
14 | import pkg_resources
15 | import sys
16 | 
17 | from .version_info import print_version_info
18 | from .variant_args import make_variants_parser, variant_collection_from_args
19 | 
20 | 
# Logging is configured as an import-time side effect, using the
# 'logging.conf' resource located relative to this module's name.
logging.config.fileConfig(pkg_resources.resource_filename(__name__, 'logging.conf'))
logger = logging.getLogger(__name__)

# Module-level parser: starts with the shared variant-loading options and is
# extended below with the options specific to this script.
arg_parser = make_variants_parser(
    description="Annotate variants with overlapping gene names and predicted coding effects")

# Optional destination for the effects table; nothing is written when omitted.
arg_parser.add_argument("--output-csv", help="Output path to CSV")

# Flag: keep only the top priority effect for each variant.
arg_parser.add_argument(
    "--one-per-variant",
    default=False,
    action="store_true",
    help=(
        "Only return highest priority effect overlapping a variant, "
        "otherwise all overlapping transcripts are returned."))

# Flag: drop silent and non-coding effects before reporting.
arg_parser.add_argument(
    "--only-coding",
    default=False,
    action="store_true",
    help="Filter silent and non-coding effects")
42 | 
def main(args_list=None):
    """
    Script which loads variants and annotates them with overlapping genes
    and predicted coding effects.

    The version banner is printed first. The effects are then logged and,
    when --output-csv is given, written to that path as a CSV file.

    Example usage (one option per line for readability):
        varcode
            --vcf mutect.vcf
            --vcf strelka.vcf
            --maf tcga_brca.maf
            --variant chr1 498584 C G
            --json-variants more_variants.json

    Parameters
    ----------
    args_list : list of str, optional
        Commandline arguments to parse; when None, sys.argv[1:] is used.
    """
    print_version_info()
    cli_arguments = sys.argv[1:] if args_list is None else args_list
    args = arg_parser.parse_args(cli_arguments)

    effects = variant_collection_from_args(args).effects()
    if args.only_coding:
        effects = effects.drop_silent_and_noncoding()
    if args.one_per_variant:
        # collapse to a single (top priority) effect for each variant
        top_effects = list(effects.top_priority_effect_per_variant().values())
        effects = effects.clone_with_new_elements(top_effects)

    effects_dataframe = effects.to_dataframe()
    logger.info('\n%s', effects)
    if args.output_csv:
        effects_dataframe.to_csv(args.output_csv, index=False)
73 | 


--------------------------------------------------------------------------------
/varcode/cli/genes_script.py:
--------------------------------------------------------------------------------
 1 | # Licensed under the Apache License, Version 2.0 (the "License");
 2 | # you may not use this file except in compliance with the License.
 3 | # You may obtain a copy of the License at
 4 | #
 5 | #     http://www.apache.org/licenses/LICENSE-2.0
 6 | #
 7 | # Unless required by applicable law or agreed to in writing, software
 8 | # distributed under the License is distributed on an "AS IS" BASIS,
 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 | 
13 | import logging
14 | import logging.config
15 | import pkg_resources
16 | import sys
17 | 
18 | from .version_info import print_version_info
19 | from .variant_args import make_variants_parser, variant_collection_from_args
20 | 
21 | 
# Logging is configured as an import-time side effect, using the
# 'logging.conf' resource located relative to this module's name.
logging.config.fileConfig(pkg_resources.resource_filename(__name__, 'logging.conf'))
logger = logging.getLogger(__name__)

# Module-level parser: the shared variant-loading options plus an optional
# CSV output path.
arg_parser = make_variants_parser(
    description="Annotate variants with overlapping gene names")
arg_parser.add_argument("--output-csv", help="Output path to CSV")
28 | 
def main(args_list=None):
    """
    Script which loads variants and annotates them with overlapping genes.

    The version banner is printed first. The variants are then converted to
    a dataframe, which is logged and, when --output-csv is given, written to
    that path as a CSV file.

    Example usage (one option per line for readability):
        varcode-genes
            --vcf mutect.vcf
            --vcf strelka.vcf
            --maf tcga_brca.maf
            --variant chr1 498584 C G
            --json-variants more_variants.json

    Parameters
    ----------
    args_list : list of str, optional
        Commandline arguments to parse; when None, sys.argv[1:] is used.
    """
    print_version_info()
    cli_arguments = sys.argv[1:] if args_list is None else args_list
    args = arg_parser.parse_args(cli_arguments)

    variants_dataframe = variant_collection_from_args(args).to_dataframe()
    logger.info('\n%s', variants_dataframe)

    if args.output_csv:
        variants_dataframe.to_csv(args.output_csv, index=False)
50 | 


--------------------------------------------------------------------------------
/varcode/cli/logging.conf:
--------------------------------------------------------------------------------
 1 | [loggers]
 2 | keys=root,varcode,pyensembl,datacache
 3 | 
 4 | [formatters]
 5 | keys=simpleFormatter
 6 | 
 7 | [handlers]
 8 | keys=consoleHandler,consoleHandlerCritical
 9 | 
10 | [logger_root]
11 | level=INFO
12 | handlers=consoleHandlerCritical
13 | 
14 | [handler_consoleHandler]
15 | class=StreamHandler
16 | level=INFO
17 | formatter=simpleFormatter
18 | args=(sys.stdout,)
19 | 
20 | [handler_consoleHandlerCritical]  # only for root logger: essentially silent
21 | class=StreamHandler
22 | level=CRITICAL
23 | formatter=simpleFormatter
24 | args=(sys.stdout,)
25 | 
26 | [formatter_simpleFormatter]
27 | format=%(asctime)s - %(name)s - %(levelname)s - %(message)s
28 | datefmt=
29 | 
30 | # varcode
31 | 
32 | [logger_varcode]
33 | level=DEBUG
34 | qualname=varcode
35 | handlers=consoleHandler
36 | 
37 | # pyensembl
38 | 
39 | [logger_pyensembl]
40 | level=DEBUG
41 | qualname=pyensembl
42 | handlers=consoleHandler
43 | 
44 | # datacache
45 | 
46 | [logger_datacache]
47 | level=DEBUG
48 | qualname=datacache
49 | handlers=consoleHandler
50 | 


--------------------------------------------------------------------------------
/varcode/cli/variant_args.py:
--------------------------------------------------------------------------------
  1 | # Licensed under the Apache License, Version 2.0 (the "License");
  2 | # you may not use this file except in compliance with the License.
  3 | # You may obtain a copy of the License at
  4 | #
  5 | #     http://www.apache.org/licenses/LICENSE-2.0
  6 | #
  7 | # Unless required by applicable law or agreed to in writing, software
  8 | # distributed under the License is distributed on an "AS IS" BASIS,
  9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 10 | # See the License for the specific language governing permissions and
 11 | # limitations under the License.
 12 | 
 13 | from argparse import ArgumentParser
 14 | 
 15 | from ..vcf import load_vcf
 16 | from ..maf import load_maf
 17 | from ..variant_collection import VariantCollection
 18 | from ..variant import Variant
 19 | 
 20 | 
 21 | def add_variant_args(arg_parser):
 22 |     """
 23 |     Extends an ArgumentParser instance with the following commandline arguments:
 24 |         --vcf
 25 |         --genome
 26 |         --maf
 27 |         --variant
 28 |         --json-variants
 29 |     """
 30 |     variant_arg_group = arg_parser.add_argument_group(
 31 |         title="Variants",
 32 |         description="Genomic variant files")
 33 | 
 34 |     variant_arg_group.add_argument(
 35 |         "--vcf",
 36 |         default=[],
 37 |         action="append",
 38 |         help="Genomic variants in VCF format")
 39 | 
 40 |     variant_arg_group.add_argument(
 41 |         "--maf",
 42 |         default=[],
 43 |         action="append",
 44 |         help="Genomic variants in TCGA's MAF format",)
 45 | 
 46 |     variant_arg_group.add_argument(
 47 |         "--variant",
 48 |         default=[],
 49 |         action="append",
 50 |         nargs=4,
 51 |         metavar=("CHR", "POS", "REF", "ALT"),
 52 |         help=(
 53 |             "Individual variant as 4 arguments giving chromsome, position, ref,"
 54 |             " and alt. Example: chr1 3848 C G. Use '.' to indicate empty alleles"
 55 |             " for insertions or deletions."))
 56 | 
 57 |     variant_arg_group.add_argument(
 58 |         "--genome",
 59 |         type=str,
 60 |         help=(
 61 |             "What reference assembly your variant coordinates are using. "
 62 |             "Examples: 'hg19', 'GRCh38', or 'mm9'. "
 63 |             "This argument is ignored for MAF files, since each row includes "
 64 |             "the reference. "
 65 |             "For VCF files, this is used if specified, and otherwise is guessed from "
 66 |             "the header. For variants specfied on the commandline with --variant, "
 67 |             "this option is required."))
 68 | 
 69 |     variant_arg_group.add_argument(
 70 |         "--download-reference-genome-data",
 71 |         action="store_true",
 72 |         default=False,
 73 |         help=(
 74 |             ("Automatically download genome reference data required for "
 75 |              "annotation using PyEnsembl. Otherwise you must first run "
 76 |              "'pyensembl install' for the release/species corresponding "
 77 |              "to the genome used in your VCF.")))
 78 | 
 79 |     variant_arg_group.add_argument(
 80 |         "--json-variants",
 81 |         default=[],
 82 |         action="append",
 83 |         help="Path to Varcode.VariantCollection object serialized as a JSON file.")
 84 | 
 85 |     return variant_arg_group
 86 | 
 87 | 
 88 | def make_variants_parser(**kwargs):
 89 |     """
 90 |     Parameters
 91 |     ----------
 92 |     **kwargs : dict
 93 |         Passed directly to argparse.ArgumentParser
 94 | 
 95 |     Creates argparse.ArgumentParser instance with options needed for loading
 96 |     variants from VCF, MAF, or JSON files.
 97 |     """
 98 |     parser = ArgumentParser(**kwargs)
 99 |     add_variant_args(parser)
100 |     return parser
101 | 
102 | 
def download_and_install_reference_data(variant_collections):
    """
    Download and index reference data for every distinct genome used by the
    given variant collections.

    Parameters
    ----------
    variant_collections : iterable of iterables of variants
        Each variant's `ensembl` attribute is taken as its genome object.

    Genomes whose required local files already exist are left untouched.
    Returns None.
    """
    # deduplicate genomes across all collections
    genomes = set()
    for collection in variant_collections:
        genomes.update(variant.ensembl for variant in collection)

    for genome in genomes:
        if genome.required_local_files_exist():
            continue
        genome.download()
        genome.index()
113 | 
114 | 
def variant_collection_from_args(args, required=True):
    """
    Load every variant source named in parsed commandline arguments and
    combine them into a single VariantCollection.

    Parameters
    ----------
    args : argparse.Namespace
        Parsed arguments providing the attributes `vcf`, `maf`, `variant`,
        `genome`, `json_variants` and `download_reference_genome_data`
        (the options registered by `add_variant_args`).

    required : bool
        If True, raise an error when no variant source was given. If False,
        an empty VariantCollection is returned in that case.

    Returns a VariantCollection which is the union of all loaded collections.

    Raises
    ------
    ValueError
        If --variant is used without --genome, or if `required` is True and
        no variants were loaded.
    """
    variant_collections = []

    for vcf_path in args.vcf:
        variant_collections.append(
            load_vcf(vcf_path, genome=args.genome))

    # no genome is passed for MAF files
    for maf_path in args.maf:
        variant_collections.append(load_maf(maf_path))

    if args.variant:
        if not args.genome:
            raise ValueError(
                "--genome must be specified when using --variant")

        # each entry is a [chromosome, position, ref, alt] list of strings
        variants = [
            Variant(
                chromosome,
                start=position,
                ref=ref,
                alt=alt,
                genome=args.genome)
            for (chromosome, position, ref, alt)
            in args.variant
        ]
        variant_collection = VariantCollection(variants)
        variant_collections.append(variant_collection)

    for json_path in args.json_variants:
        with open(json_path, 'r') as f:
            variant_collections.append(
                VariantCollection.from_json(f.read()))

    if len(variant_collections) == 0:
        if required:
            raise ValueError(
                "No variants loaded (use --maf, --vcf, --variant, or --json-variants options)")
        # Nothing to combine: `union` is invoked below through the class
        # with the first collection taking the place of `self`, so it cannot
        # be called with zero collections.
        return VariantCollection([])

    if args.download_reference_genome_data:
        download_and_install_reference_data(variant_collections)

    # pylint: disable=no-value-for-parameter
    return VariantCollection.union(*variant_collections)
157 | 


--------------------------------------------------------------------------------
/varcode/cli/version_info.py:
--------------------------------------------------------------------------------
 1 | # Licensed under the Apache License, Version 2.0 (the "License");
 2 | # you may not use this file except in compliance with the License.
 3 | # You may obtain a copy of the License at
 4 | #
 5 | #     http://www.apache.org/licenses/LICENSE-2.0
 6 | #
 7 | # Unless required by applicable law or agreed to in writing, software
 8 | # distributed under the License is distributed on an "AS IS" BASIS,
 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 | 
13 | from collections import OrderedDict
14 | from os.path import dirname
15 | from .. import __file__ as package_init_file_path
16 | from .. import __version__
17 | 
18 | 
def collect_version_info():
    """
    Collect the version and path of Varcode.

    Returns an OrderedDict mapping the program name ("Varcode") to a
    (version, path) tuple, where path is the directory containing the
    package's __init__ file.

    TODO:
        add a `dependencies=False` option to also collect this info from
        major Python dependencies such as PyEnsembl
    """
    package_directory = dirname(package_init_file_path)
    return OrderedDict([("Varcode", (__version__, package_directory))])
30 | 
31 | 
def print_version_info(dependencies=False):
    """
    Print the name, version and path of every program reported by
    collect_version_info().

    Parameters
    ----------
    dependencies : bool
        Accepted but not used by this function.
    """
    for program, details in collect_version_info().items():
        version, path = details
        print(program)
        print("  Version: %s" % version)
        print("  Path: %s" % path)
37 | 


--------------------------------------------------------------------------------
/varcode/common.py:
--------------------------------------------------------------------------------
 1 | # Licensed under the Apache License, Version 2.0 (the "License");
 2 | # you may not use this file except in compliance with the License.
 3 | # You may obtain a copy of the License at
 4 | #
 5 | #     http://www.apache.org/licenses/LICENSE-2.0
 6 | #
 7 | # Unless required by applicable law or agreed to in writing, software
 8 | # distributed under the License is distributed on an "AS IS" BASIS,
 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 | 
13 | from collections import defaultdict
14 | 
15 | from functools import wraps
16 | 
17 | 
def apply_groupby(records, fn, skip_none=False):
    """
    Group records into a dictionary keyed by the result of calling
    fn on each record.

    Parameters
    ----------
    records : list
        Objects to group

    fn : function
        Called once per record; its return value is the record's group key

    skip_none : bool
        If True, records whose key is None are left out of the result;
        otherwise None may appear as a key.

    Returns dict mapping each key to the list of records which produced it,
    in their original order.
    """
    grouped = {}
    for item in records:
        key = fn(item)
        if skip_none and key is None:
            continue
        # start a new list the first time a key is seen
        grouped.setdefault(key, []).append(item)
    return grouped
44 | 
45 | 
def groupby_field(records, field_name, skip_none=True):
    """
    Group records into a dictionary keyed by the value of the attribute
    called field_name on each record.

    Records whose attribute value is None are skipped unless skip_none
    is False.
    """
    def field_value(obj):
        return getattr(obj, field_name)

    return apply_groupby(records, field_value, skip_none=skip_none)
55 | 
56 | 
def memoize(fn):
    """
    Memoization decorator for functions and methods: each distinct
    combination of arguments is evaluated once and the result reused.

    All positional and keyword arguments must be hashable, and keyword
    arguments must be sortable by name. The cache lives as long as the
    decorated function and is never cleared.
    """
    cache = {}

    @wraps(fn)
    def wrapped_fn(*args, **kwargs):
        # sorting makes the key independent of keyword order; without
        # keyword arguments this is simply (args, ())
        key = (args, tuple(sorted(kwargs.items())))
        if key not in cache:
            cache[key] = fn(*args, **kwargs)
        return cache[key]

    return wrapped_fn
78 | 


--------------------------------------------------------------------------------
/varcode/effects/__init__.py:
--------------------------------------------------------------------------------
 1 | # Licensed under the Apache License, Version 2.0 (the "License");
 2 | # you may not use this file except in compliance with the License.
 3 | # You may obtain a copy of the License at
 4 | #
 5 | #     http://www.apache.org/licenses/LICENSE-2.0
 6 | #
 7 | # Unless required by applicable law or agreed to in writing, software
 8 | # distributed under the License is distributed on an "AS IS" BASIS,
 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 | 
13 | from __future__ import print_function, division, absolute_import
14 | 
15 | from .effect_collection import EffectCollection
16 | from .effect_ordering import (
17 |     effect_priority,
18 |     top_priority_effect,
19 | )
20 | from .effect_prediction import (
21 |     predict_variant_effects,
22 |     predict_variant_effect_on_transcript,
23 |     predict_variant_effect_on_transcript_or_failure,
24 | )
25 | from .effect_classes import (
26 |     MutationEffect,
27 |     TranscriptMutationEffect,
28 |     NonsilentCodingMutation,
29 |     Failure,
30 |     IncompleteTranscript,
31 |     Intergenic,
32 |     Intragenic,
33 |     NoncodingTranscript,
34 |     Intronic,
35 |     ThreePrimeUTR,
36 |     FivePrimeUTR,
37 |     Silent,
38 |     Substitution,
39 |     Insertion,
40 |     Deletion,
41 |     ComplexSubstitution,
42 |     AlternateStartCodon,
43 |     IntronicSpliceSite,
44 |     ExonicSpliceSite,
45 |     StopLoss,
46 |     SpliceDonor,
47 |     SpliceAcceptor,
48 |     PrematureStop,
49 |     FrameShiftTruncation,
50 |     StartLoss,
51 |     FrameShift,
52 |     ExonLoss,
53 | )
54 | 
# Names exported by `from varcode.effects import *`; every entry is imported
# at the top of this module and appears here exactly once.
__all__ = [
    "EffectCollection",
    # effect ordering
    "effect_priority",
    "top_priority_effect",

    # prediction functions
    "predict_variant_effects",
    "predict_variant_effect_on_transcript",
    "predict_variant_effect_on_transcript_or_failure",

    # effect classes
    "MutationEffect",
    "TranscriptMutationEffect",
    "Failure",
    "IncompleteTranscript",
    "Intergenic",
    "Intragenic",
    "NoncodingTranscript",
    "ThreePrimeUTR",
    "FivePrimeUTR",
    "Intronic",
    "Silent",
    "NonsilentCodingMutation",
    "Substitution",
    "Insertion",
    "Deletion",
    "ComplexSubstitution",
    "AlternateStartCodon",
    "IntronicSpliceSite",
    "ExonicSpliceSite",
    "StopLoss",
    "SpliceDonor",
    "SpliceAcceptor",
    "PrematureStop",
    "FrameShiftTruncation",
    "StartLoss",
    "FrameShift",
    "ExonLoss",
]
96 | 


--------------------------------------------------------------------------------
/varcode/effects/common.py:
--------------------------------------------------------------------------------
 1 | # Licensed under the Apache License, Version 2.0 (the "License");
 2 | # you may not use this file except in compliance with the License.
 3 | # You may obtain a copy of the License at
 4 | #
 5 | #     http://www.apache.org/licenses/LICENSE-2.0
 6 | #
 7 | # Unless required by applicable law or agreed to in writing, software
 8 | # distributed under the License is distributed on an "AS IS" BASIS,
 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 | 
13 | from Bio.Seq import Seq
14 | 
15 | 
16 | 
def bio_seq_to_str(seq):
    """
    Return seq as a plain str.

    A value which is already exactly a str comes back unchanged; anything
    else is converted with str().
    """
    # str() applied to an exact str hands back that same string, so a
    # single conversion covers both cases
    return str(seq)
22 |     


--------------------------------------------------------------------------------
/varcode/effects/effect_helpers.py:
--------------------------------------------------------------------------------
  1 | # Licensed under the Apache License, Version 2.0 (the "License");
  2 | # you may not use this file except in compliance with the License.
  3 | # You may obtain a copy of the License at
  4 | #
  5 | #     http://www.apache.org/licenses/LICENSE-2.0
  6 | #
  7 | # Unless required by applicable law or agreed to in writing, software
  8 | # distributed under the License is distributed on an "AS IS" BASIS,
  9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 10 | # See the License for the specific language governing permissions and
 11 | # limitations under the License.
 12 | 
 13 | """
Helper functions for determining the effect annotation of a variant
 15 | """
 16 | 
 17 | 
 18 | from ..nucleotides import PURINE_NUCLEOTIDES, AMINO_NUCLEOTIDES
 19 | 
 20 | def variant_overlaps_interval(
 21 |         variant_start,
 22 |         n_ref_bases,
 23 |         interval_start,
 24 |         interval_end):
 25 |     """
 26 |     Does a variant overlap a given interval on the same chromosome?
 27 | 
 28 |     Parameters
 29 |     ----------
 30 |     variant_start : int
 31 |         Inclusive base-1 position of variant's starting location
 32 |         (or location before an insertion)
 33 | 
 34 |     n_ref_bases : int
 35 |         Number of reference bases affect by variant (used to compute
 36 |         end coordinate or determine whether variant is an insertion)
 37 | 
 38 |     interval_start : int
 39 |         Interval's inclusive base-1 start position
 40 | 
 41 |     interval_end : int
 42 |         Interval's inclusive base-1 end position
 43 |     """
 44 | 
 45 |     if n_ref_bases == 0:
 46 |         # insertions only overlap intervals which start before and
 47 |         # end after the insertion point, they must be fully contained
 48 |         # by the other interval
 49 |         return interval_start <= variant_start and interval_end >= variant_start
 50 |     variant_end = variant_start + n_ref_bases
 51 |     """
 52 |     if self._changes_exonic_splice_site(
 53 |             strand_ref,
 54 |             strand_alt,)
 55 |     """
 56 |     # overlap means other interval starts before this variant ends
 57 |     # and the interval ends after this variant starts
 58 |     return interval_start <= variant_end and interval_end >= variant_start
 59 | 
 60 | 
 61 | def matches_exon_end_pattern(seq):
 62 |     """Does the end of the nucleotide string `seq` match the canonical splice
 63 |     signal for the 3' end of an exon: "MAG", where M is either amino base.
 64 |     """
 65 |     if len(seq) < 3:
 66 |         return False
 67 |     return seq[-3] in AMINO_NUCLEOTIDES and seq[-2] == "A" and seq[-1] == "G"
 68 | 
 69 | def changes_exonic_splice_site(
 70 |         transcript_offset,
 71 |         transcript,
 72 |         transcript_ref,
 73 |         transcript_alt,
 74 |         exon_start_offset,
 75 |         exon_end_offset,
 76 |         exon_number):
 77 |     """Does the given exonic mutation of a particular transcript change a
 78 |     splice site?
 79 | 
 80 |     Parameters
 81 |     ----------
 82 |     transcript_offset : int
 83 |         Offset from start of transcript of first reference nucleotide
 84 |         (or the last nucleotide before an insertion)
 85 | 
 86 |     transcript : pyensembl.Transcript
 87 | 
 88 |     transcript_ref : str
 89 |         Reference nucleotides
 90 | 
 91 |     transcript_alt : alt
 92 |         Alternate nucleotides
 93 | 
 94 |     exon_start_offset : int
 95 |         Start offset of exon relative to beginning of transcript
 96 | 
 97 |     exon_end_offset : int
 98 |         End offset of exon relative to beginning of transcript
 99 | 
100 |     exon_number : int
101 |         Which exon in the order they form the transcript
102 |     """
103 |     # first we're going to make sure the variant doesn't disrupt the
104 |     # splicing sequences we got from Divina et. al's
105 |     #   Ab initio prediction of mutation-induced cryptic
106 |     #   splice-site activation and exon skipping
107 |     # (http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2947103/)
108 |     #
109 |     # 5' splice site: MAG|GURAGU consensus
110 |     #   M is A or C; R is purine; | is the exon-intron boundary
111 |     #
112 |     # 3' splice site: YAG|R
113 |     #
114 |     if exon_number > 1 and transcript_offset == exon_start_offset:
115 |         # if this is any exon past the first, check to see if it lost
116 |         # the purine on its left side
117 |         #
118 |         # the 3' splice site sequence has just a single purine on
119 |         # the exon side
120 |         if len(transcript_ref) > 0 and transcript_ref[0] in PURINE_NUCLEOTIDES:
121 |             if len(transcript_alt) > 0:
122 |                 if transcript_alt[0] not in PURINE_NUCLEOTIDES:
123 |                     return True
124 |             else:
125 |                 # if the mutation is a deletion, are there ref nucleotides
126 |                 # afterward?
127 |                 offset_after_deletion = transcript_offset + len(transcript_ref)
128 |                 if len(transcript.sequence) > offset_after_deletion:
129 |                     next_base = transcript.sequence[offset_after_deletion]
130 |                     if next_base not in PURINE_NUCLEOTIDES:
131 |                         return True
132 | 
133 |     if exon_number < len(transcript.exons):
134 |         # if the mutation affects an exon whose right end gets spliced
135 |         # to a next exon, check if the variant alters the exon side of
136 |         # 5' consensus splicing sequence
137 |         #
138 |         # splicing sequence:
139 |         #   MAG|GURAGU
140 |         # M is A or C; R is purine; | is the exon-intron boundary
141 |         #
142 |         # TODO: check for overlap of two intervals instead of just
143 |         # seeing if the mutation starts inside the exonic splice site
144 |         if variant_overlaps_interval(
145 |                 variant_start=transcript_offset,
146 |                 n_ref_bases=len(transcript_ref),
147 |                 interval_start=exon_end_offset - 2,
148 |                 interval_end=exon_end_offset):
149 |             end_of_reference_exon = transcript.sequence[
150 |                 exon_end_offset - 2:exon_end_offset + 1]
151 | 
152 |             if matches_exon_end_pattern(end_of_reference_exon):
153 |                 # if the last three nucleotides conform to the consensus
154 |                 # sequence then treat any deviation as an ExonicSpliceSite
155 |                 # mutation
156 |                 end_of_variant_exon = end_of_reference_exon
157 |                 if matches_exon_end_pattern(end_of_variant_exon):
158 |                     # end of exon matches splicing signal, check if it still
159 |                     # does after the mutation
160 |                     return True
161 | 


--------------------------------------------------------------------------------
/varcode/effects/effect_prediction_coding.py:
--------------------------------------------------------------------------------
  1 | # Licensed under the Apache License, Version 2.0 (the "License");
  2 | # you may not use this file except in compliance with the License.
  3 | # You may obtain a copy of the License at
  4 | #
  5 | #     http://www.apache.org/licenses/LICENSE-2.0
  6 | #
  7 | # Unless required by applicable law or agreed to in writing, software
  8 | # distributed under the License is distributed on an "AS IS" BASIS,
  9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 10 | # See the License for the specific language governing permissions and
 11 | # limitations under the License.
 12 | 
 13 | from .effect_prediction_coding_frameshift import predict_frameshift_coding_effect
 14 | from .effect_prediction_coding_in_frame import predict_in_frame_coding_effect
 15 | 
 16 | 
 17 | def predict_variant_coding_effect_on_transcript(
 18 |         variant,
 19 |         transcript,
 20 |         trimmed_cdna_ref,
 21 |         trimmed_cdna_alt,
 22 |         transcript_offset):
 23 |     """
 24 |     Given a minimal cDNA ref/alt nucleotide string pair and an offset into a
 25 |     given transcript, determine the coding effect of this nucleotide substitution
 26 |     onto the translated protein.
 27 | 
 28 |     Parameters
 29 |     ----------
 30 |     variant : Variant
 31 | 
 32 |     transcript : Transcript
 33 | 
 34 |     trimmed_cdna_ref : str
 35 |         Reference nucleotides we expect to find in the transcript's CDS
 36 | 
 37 |     trimmed_cdna_alt : str
 38 |         Alternate nucleotides we're replacing the reference with
 39 | 
 40 |     transcript_offset : int
 41 |         Offset into the full transcript sequence of the ref->alt substitution
 42 |     """
 43 |     if not transcript.complete:
 44 |         raise ValueError(
 45 |             ("Can't annotate coding effect for %s"
 46 |              " on incomplete transcript %s" % (variant, transcript)))
 47 | 
 48 |     sequence = transcript.sequence
 49 | 
 50 |     n_ref = len(trimmed_cdna_ref)
 51 |     n_alt = len(trimmed_cdna_alt)
 52 | 
 53 |     # reference nucleotides found on the transcript, if these don't match
 54 |     # what we were told to expect from the variant then raise an exception
 55 |     ref_nucleotides_from_transcript = str(
 56 |         sequence[transcript_offset:transcript_offset + n_ref])
 57 | 
 58 |     # Make sure that the reference sequence agrees with what we expected
 59 |     # from the VCF
 60 |     assert ref_nucleotides_from_transcript == trimmed_cdna_ref, \
 61 |         "%s: expected ref '%s' at offset %d of %s, transcript has '%s'" % (
 62 |             variant,
 63 |             trimmed_cdna_ref,
 64 |             transcript_offset,
 65 |             transcript,
 66 |             ref_nucleotides_from_transcript)
 67 | 
 68 |     start_codon_offset = transcript.first_start_codon_spliced_offset
 69 |     stop_codon_offset = transcript.last_stop_codon_spliced_offset
 70 | 
 71 |     cds_len = stop_codon_offset - start_codon_offset + 1
 72 | 
 73 |     if cds_len < 3:
 74 |         raise ValueError(
 75 |             "Coding sequence for %s is too short: '%s'" % (
 76 |                 transcript,
 77 |                 transcript.sequence[start_codon_offset:stop_codon_offset + 1]))
 78 | 
 79 |     if n_ref == 0 and transcript.strand == "-":
 80 |         # By convention, genomic insertions happen *after* their base 1 position on
 81 |         # a chromosome. On the reverse strand, however, an insertion has to go
 82 |         # before the nucleotide at some transcript offset.
 83 |         # Example:
 84 |         #    chromosome sequence:
 85 |         #        TTT|GATCTCGTA|CCC
 86 |         #    transcript on reverse strand:
 87 |         #        CCC|ATGCTCTAG|TTT
 88 |         #    where the CDS is emphasized:
 89 |         #            ATGCTCTAG
 90 |         # If we have a genomic insertion g.6insATT
 91 |         # the genomic sequence becomes:
 92 |         #       TTT|GAT_ATT_CTCGTA|CCC
 93 |         # (insert the "ATT" after the "T" at position 6)
 94 |         # On the reverse strand this becomes:
 95 |         #       CCC|ATGCTC_TTA_TAG|TTT
 96 |         # (insert the "ATT" *before* the "T" at position 10)
 97 |         #
 98 |         # To preserve the interpretation of the start offset as the base
 99 |         # before the insertion, need to subtract one
100 |         cds_offset = transcript_offset - start_codon_offset - 1
101 |     else:
102 |         cds_offset = transcript_offset - start_codon_offset
103 | 
104 |     assert cds_offset < cds_len, \
105 |         "Expected CDS offset (%d) < |CDS| (%d) for %s on %s" % (
106 |             cds_offset, cds_len, variant, transcript)
107 | 
108 |     sequence_from_start_codon = str(sequence[start_codon_offset:])
109 | 
110 |     # is this an in-frame mutations?
111 |     if (n_ref - n_alt) % 3 == 0:
112 |         return predict_in_frame_coding_effect(
113 |             variant=variant,
114 |             transcript=transcript,
115 |             trimmed_cdna_ref=trimmed_cdna_ref,
116 |             trimmed_cdna_alt=trimmed_cdna_alt,
117 |             cds_offset=cds_offset,
118 |             sequence_from_start_codon=sequence_from_start_codon)
119 |     else:
120 |         return predict_frameshift_coding_effect(
121 |             variant=variant,
122 |             transcript=transcript,
123 |             trimmed_cdna_ref=trimmed_cdna_ref,
124 |             trimmed_cdna_alt=trimmed_cdna_alt,
125 |             cds_offset=cds_offset,
126 |             sequence_from_start_codon=sequence_from_start_codon)
127 | 


--------------------------------------------------------------------------------
/varcode/effects/effect_prediction_coding_frameshift.py:
--------------------------------------------------------------------------------
  1 | # Licensed under the Apache License, Version 2.0 (the "License");
  2 | # you may not use this file except in compliance with the License.
  3 | # You may obtain a copy of the License at
  4 | #
  5 | #     http://www.apache.org/licenses/LICENSE-2.0
  6 | #
  7 | # Unless required by applicable law or agreed to in writing, software
  8 | # distributed under the License is distributed on an "AS IS" BASIS,
  9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 10 | # See the License for the specific language governing permissions and
 11 | # limitations under the License.
 12 | 
 13 | """
 14 | Effect annotation for variants which modify the coding sequence and change
 15 | reading frame.
 16 | """
 17 | 
 18 | from ..string_helpers import trim_shared_prefix
 19 | 
 20 | from .effect_classes import (
 21 |     FrameShift,
 22 |     FrameShiftTruncation,
 23 |     StartLoss,
 24 |     StopLoss,
 25 |     Silent
 26 | )
 27 | from .mutate import substitute
 28 | from .translate import translate
 29 | 
 30 | 
 31 | def create_frameshift_effect(
 32 |         mutated_codon_index,
 33 |         sequence_from_mutated_codon,
 34 |         variant,
 35 |         transcript):
 36 |     """
 37 |     Determine frameshift effect within a coding sequence (possibly affecting
 38 |     either the start or stop codons, or anythign in between)
 39 | 
 40 |     Parameters
 41 |     ----------
 42 |     mutated_codon_index : int
 43 |         Codon offset (starting from 0 = start codon) of first non-reference
 44 |         amino acid in the variant protein
 45 | 
 46 |     sequence_from_mutated_codon: Bio.Seq
 47 |         Sequence of mutated cDNA, starting from first mutated codon, until
 48 |         the end of the transcript
 49 | 
 50 |     variant : Variant
 51 | 
 52 |     transcript : transcript
 53 |     """
 54 | 
 55 |     assert transcript.protein_sequence is not None, \
 56 |         "Expect transcript %s to have protein sequence" % transcript
 57 | 
 58 |     original_protein_sequence = transcript.protein_sequence
 59 |     original_protein_length = len(original_protein_sequence)
 60 | 
 61 |     mutant_protein_suffix = translate(
 62 |         nucleotide_sequence=sequence_from_mutated_codon,
 63 |         first_codon_is_start=False,
 64 |         to_stop=True,
 65 |         truncate=True)
 66 | 
 67 |     if mutated_codon_index == 0:
 68 |         # TODO: scan through sequence_from_mutated_codon for
 69 |         # Kozak sequence + start codon to choose the new start
 70 |         return StartLoss(variant=variant, transcript=transcript)
 71 | 
 72 |     # the frameshifted sequence may contain some amino acids which are
 73 |     # the same as the original protein!
 74 |     _, mutant_protein_suffix, unchanged_amino_acids = trim_shared_prefix(
 75 |         ref=original_protein_sequence[mutated_codon_index:],
 76 |         alt=mutant_protein_suffix)
 77 |     n_unchanged_amino_acids = len(unchanged_amino_acids)
 78 |     offset_to_first_different_amino_acid = mutated_codon_index + n_unchanged_amino_acids
 79 |     # miraculously, this frameshift left the protein unchanged,
 80 |     # most likely by turning one stop codon into another stop codon
 81 |     if n_unchanged_amino_acids == 0:
 82 |         aa_ref = ""
 83 |     else:
 84 |         aa_ref = original_protein_sequence[-n_unchanged_amino_acids:]
 85 |     if offset_to_first_different_amino_acid >= original_protein_length:
 86 |         # frameshift is either extending the protein or leaving it unchanged
 87 |         if len(mutant_protein_suffix) == 0:
 88 | 
 89 |             return Silent(
 90 |                 variant=variant,
 91 |                 transcript=transcript,
 92 |                 aa_pos=mutated_codon_index,
 93 |                 aa_ref=aa_ref)
 94 |         else:
 95 |             # When all the amino acids are the same as the original, we either
 96 |             # have the original protein or we've extended it.
 97 |             # If we've extended it, it means we must have lost our stop codon.
 98 |             return StopLoss(
 99 |                 variant=variant,
100 |                 transcript=transcript,
101 |                 aa_ref=aa_ref,
102 |                 aa_alt=mutant_protein_suffix)
103 |     # original amino acid at the mutated codon before the frameshift occurred
104 |     aa_ref = original_protein_sequence[offset_to_first_different_amino_acid]
105 | 
106 |     # TODO: what if all the shifted amino acids were the same and the protein
107 |     # ended up the same length? Add a Silent case?
108 |     if len(mutant_protein_suffix) == 0:
109 |         # if a frameshift doesn't create any new amino acids, then
110 |         # it must immediately have hit a stop codon
111 |         return FrameShiftTruncation(
112 |             variant=variant,
113 |             transcript=transcript,
114 |             stop_codon_offset=offset_to_first_different_amino_acid)
115 |     return FrameShift(
116 |         variant=variant,
117 |         transcript=transcript,
118 |         aa_mutation_start_offset=offset_to_first_different_amino_acid,
119 |         shifted_sequence=str(mutant_protein_suffix))
120 | 
def cdna_codon_sequence_after_insertion_frameshift(
        sequence_from_start_codon,
        cds_offset_before_insertion,
        inserted_nucleotides):
    """
    Locate the first codon changed by an insertion and rebuild the cDNA from
    that codon onward.

    Parameters
    ----------
    sequence_from_start_codon : nucleotide sequence
        Reference sequence beginning at the start codon

    cds_offset_before_insertion : int
        Offset (relative to the start codon) of the last reference nucleotide
        before the inserted ones

    inserted_nucleotides : nucleotide sequence

    Returns a pair of:
        - index of the first mutated codon
        - mutated nucleotide sequence starting at that codon
    """
    mutated_codon_index, position_in_codon = divmod(
        cds_offset_before_insertion, 3)

    if position_in_codon == 2:
        # The insertion follows the last nucleotide of a codon, so that codon
        # is left intact and the next one is the first to change.
        mutated_codon_index += 1
        kept_reference_prefix = ""
    elif position_in_codon == 1:
        # The insertion follows the 2nd nucleotide of a codon: the mutated
        # codon starts with two reference nucleotides.
        kept_reference_prefix = sequence_from_start_codon[
            cds_offset_before_insertion - 1:cds_offset_before_insertion + 1]
    else:
        # The insertion follows the 1st nucleotide of a codon: the mutated
        # codon starts with one reference nucleotide.
        kept_reference_prefix = sequence_from_start_codon[
            cds_offset_before_insertion]

    # everything in the reference after the insertion point
    reference_suffix = \
        sequence_from_start_codon[cds_offset_before_insertion + 1:]

    return (
        mutated_codon_index,
        kept_reference_prefix + inserted_nucleotides + reference_suffix)
166 | 
167 | 
def cdna_codon_sequence_after_deletion_or_substitution_frameshift(
        sequence_from_start_codon,
        cds_offset,
        trimmed_cdna_ref,
        trimmed_cdna_alt):
    """
    Locate the first codon changed by a frameshift which isn't a pure
    insertion and rebuild the cDNA from that codon onward.

    Insertions are handled by a separate function because the offset means
    something different for them:
        insertion:      cds_offset = base before the inserted nucleotides
        anything else:  cds_offset = first reference base affected

    Returns a pair of:
        - index of the first modified codon
        - mutated nucleotide sequence starting at that codon
    """
    # the variant begins 0, 1, or 2 nucleotides into its codon
    mutated_codon_index, offset_into_mutated_codon = divmod(cds_offset, 3)

    # reference sequence from the start of the first modified codon to the
    # end of the transcript
    reference_from_codon = sequence_from_start_codon[mutated_codon_index * 3:]

    mutated_from_codon = substitute(
        sequence=reference_from_codon,
        offset=offset_into_mutated_codon,
        ref=trimmed_cdna_ref,
        alt=trimmed_cdna_alt)
    return mutated_codon_index, mutated_from_codon
201 | 
202 | 
def predict_frameshift_coding_effect(
        variant,
        transcript,
        trimmed_cdna_ref,
        trimmed_cdna_alt,
        cds_offset,
        sequence_from_start_codon):
    """
    Predict the coding effect of a variant which changes the reading frame.

    Parameters
    ----------
    variant : Variant

    transcript : Transcript

    trimmed_cdna_ref : nucleotide sequence
        Reference nucleotides in the coding sequence of the given transcript
        (empty for a pure insertion).

    trimmed_cdna_alt : nucleotide sequence
        Alternate nucleotides introduced by the mutation

    cds_offset : int
        Offset into the CDS of the first ref nucleotide. For insertions, this
        is the offset of the last ref nucleotide before the insertion.

    sequence_from_start_codon : nucleotide sequence
        Nucleotides of the coding sequence and 3' UTR

    Returns whatever effect `create_frameshift_effect` builds for the mutated
    sequence.
    """
    is_pure_insertion = len(trimmed_cdna_ref) == 0
    if is_pure_insertion:
        codon_index_and_sequence = \
            cdna_codon_sequence_after_insertion_frameshift(
                sequence_from_start_codon=sequence_from_start_codon,
                cds_offset_before_insertion=cds_offset,
                inserted_nucleotides=trimmed_cdna_alt)
    else:
        codon_index_and_sequence = \
            cdna_codon_sequence_after_deletion_or_substitution_frameshift(
                sequence_from_start_codon=sequence_from_start_codon,
                cds_offset=cds_offset,
                trimmed_cdna_ref=trimmed_cdna_ref,
                trimmed_cdna_alt=trimmed_cdna_alt)

    first_mutated_codon, mutated_sequence = codon_index_and_sequence
    return create_frameshift_effect(
        mutated_codon_index=first_mutated_codon,
        sequence_from_mutated_codon=mutated_sequence,
        variant=variant,
        transcript=transcript)
251 | 


--------------------------------------------------------------------------------
/varcode/effects/mutate.py:
--------------------------------------------------------------------------------
 1 | # Licensed under the Apache License, Version 2.0 (the "License");
 2 | # you may not use this file except in compliance with the License.
 3 | # You may obtain a copy of the License at
 4 | #
 5 | #     http://www.apache.org/licenses/LICENSE-2.0
 6 | #
 7 | # Unless required by applicable law or agreed to in writing, software
 8 | # distributed under the License is distributed on an "AS IS" BASIS,
 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 | 
13 | 
def insert_before(sequence, offset, new_residues):
    """Return a copy of `sequence` with `new_residues` placed immediately
    before the element at `offset`.

    Parameters
    ----------
    sequence : sequence
        String of amino acids or DNA bases

    offset : int
        Base 0 offset from start of sequence, before which `new_residues`
        is inserted. Must satisfy 0 < offset <= len(sequence), so
        `offset == len(sequence)` appends while an offset of 0 is rejected.

    new_residues : sequence

    Raises AssertionError if `offset` is outside the accepted range.
    """
    sequence_length = len(sequence)
    assert 0 < offset <= sequence_length, \
        "Invalid position %d for sequence of length %d" % (
            offset, sequence_length)
    return sequence[:offset] + new_residues + sequence[offset:]
35 | 
def insert_after(sequence, offset, new_residues):
    """Return a copy of `sequence` with `new_residues` placed immediately
    after the element at `offset`.

    Parameters
    ----------
    sequence : sequence
        String of amino acids or DNA bases

    offset : int
        Base 0 offset from start of sequence, after which `new_residues`
        is inserted. Must satisfy 0 <= offset < len(sequence).

    new_residues : sequence

    Raises AssertionError if `offset` is outside the accepted range.
    """
    sequence_length = len(sequence)
    assert 0 <= offset < sequence_length, \
        "Invalid position %d for sequence of length %d" % (
            offset, sequence_length)
    # first index that ends up to the right of the inserted residues
    split_point = offset + 1
    return sequence[:split_point] + new_residues + sequence[split_point:]
57 | 
def substitute(sequence, offset, ref, alt):
    """Return a copy of `sequence` in which the `ref` residues found at
    `offset` are replaced by `alt`.

    Parameters
    ----------
    sequence : sequence
        String of amino acids or DNA bases

    offset : int
        Base 0 offset from start of `sequence`

    ref : sequence or str
        Residues expected at `offset`; may be empty, in which case `alt` is
        simply inserted at `offset`

    alt : sequence or str
        Residues which take the place of `ref`

    Raises AssertionError if `sequence` doesn't contain `ref` at `offset`.
    """
    ref_end = offset + len(ref)
    found = sequence[offset:ref_end]
    assert str(found) == str(ref), \
        "Reference %s at offset %d != expected reference %s" % \
        (found, offset, ref)
    return sequence[:offset] + alt + sequence[ref_end:]
84 | 


--------------------------------------------------------------------------------
/varcode/effects/transcript_helpers.py:
--------------------------------------------------------------------------------
 1 | # Licensed under the Apache License, Version 2.0 (the "License");
 2 | # you may not use this file except in compliance with the License.
 3 | # You may obtain a copy of the License at
 4 | #
 5 | #     http://www.apache.org/licenses/LICENSE-2.0
 6 | #
 7 | # Unless required by applicable law or agreed to in writing, software
 8 | # distributed under the License is distributed on an "AS IS" BASIS,
 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 | 
13 | 
def interval_offset_on_transcript(start, end, transcript):
    """
    Map the interval [start:end] onto `transcript` and return the smallest
    spliced offset of its two (clamped) endpoints.

    The interval has to overlap the span `transcript.start` to
    `transcript.end`; any part outside that span is clipped before the
    endpoints are converted with `transcript.spliced_offset`.

    Raises ValueError if `start > end` or if the interval lies entirely
    before or after the transcript.
    """
    if start > end:
        raise ValueError(
            "start_pos %d shouldn't be greater than end_pos %d" % (
                start, end))

    # values shared by both "no overlap" error messages
    range_and_transcript = (
        start, end, transcript, transcript.start, transcript.end)
    if start > transcript.end:
        raise ValueError(
            "Range %d:%d starts after transcript %s (%d:%d)" %
            range_and_transcript)
    if end < transcript.start:
        raise ValueError(
            "Range %d:%d ends before transcript %s (%d:%d)" %
            range_and_transcript)

    # clip both endpoints to the extent of the transcript
    clipped_start = max(start, transcript.start)
    clipped_end = min(end, transcript.end)

    # either endpoint may have the smaller spliced offset, so take the
    # minimum of the two
    endpoint_offsets = (
        transcript.spliced_offset(clipped_start),
        transcript.spliced_offset(clipped_end))
    return min(endpoint_offsets)
51 | 


--------------------------------------------------------------------------------
/varcode/effects/translate.py:
--------------------------------------------------------------------------------
  1 | # Licensed under the Apache License, Version 2.0 (the "License");
  2 | # you may not use this file except in compliance with the License.
  3 | # You may obtain a copy of the License at
  4 | #
  5 | #     http://www.apache.org/licenses/LICENSE-2.0
  6 | #
  7 | # Unless required by applicable law or agreed to in writing, software
  8 | # distributed under the License is distributed on an "AS IS" BASIS,
  9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 10 | # See the License for the specific language governing permissions and
 11 | # limitations under the License.
 12 | 
 13 | """Helpers for cDNA -> protein translation.
 14 | 
 15 | TODO: generalize this to work with the mitochondrial codon table.
 16 | """
 17 | 
 18 | from Bio.Data import CodonTable
 19 | from Bio.Seq import Seq
 20 | 
# Codon -> amino acid lookup taken from Biopython's standard DNA codon table;
# translate_codon checks STOP_CODONS before consulting it.
DNA_CODON_TABLE = CodonTable.standard_dna_table.forward_table
# Start and stop codons of the same standard table, kept as sets because the
# functions below only use them for membership tests.
START_CODONS = set(CodonTable.standard_dna_table.start_codons)
STOP_CODONS = set(CodonTable.standard_dna_table.stop_codons)
 24 | 
 25 | 
 26 | def translate_codon(codon, aa_pos):
 27 |     """Translate a single codon into a single amino acid or stop '*'
 28 | 
 29 |     Parameters
 30 |     ----------
 31 |     codon : str
 32 |         Expected to be of length 3
 33 |     aa_pos : int
 34 |         Codon/amino acid offset into the protein (starting from 0)
 35 |     """
 36 |     # not handling rare Leucine or Valine starts!
 37 |     if aa_pos == 0 and codon in START_CODONS:
 38 |         return "M"
 39 |     elif codon in STOP_CODONS:
 40 |         return "*"
 41 |     else:
 42 |         return DNA_CODON_TABLE[codon]
 43 | 
 44 | 
 45 | def translate(
 46 |         nucleotide_sequence,
 47 |         first_codon_is_start=True,
 48 |         to_stop=True,
 49 |         truncate=False):
 50 |     """Translates cDNA coding sequence into amino acid protein sequence.
 51 | 
 52 |     Should typically start with a start codon but allowing non-methionine
 53 |     first residues since the CDS we're translating might have been affected
 54 |     by a start loss mutation.
 55 | 
 56 |     The sequence may include the 3' UTR but will stop translation at the first
 57 |     encountered stop codon.
 58 | 
 59 |     Parameters
 60 |     ----------
 61 |     nucleotide_sequence : BioPython Seq
 62 |         cDNA sequence
 63 | 
 64 |     first_codon_is_start : bool
 65 |         Treat the beginning of nucleotide_sequence (translates methionin)
 66 | 
 67 |     truncate : bool
 68 |         Truncate sequence if it's not a multiple of 3 (default = False)
 69 |     Returns BioPython Seq of amino acids
 70 |     """
 71 |     if not isinstance(nucleotide_sequence, Seq):
 72 |         nucleotide_sequence = Seq(nucleotide_sequence)
 73 | 
 74 |     if truncate:
 75 |         # if sequence isn't a multiple of 3, truncate it so BioPython
 76 |         # doesn't complain
 77 |         n_nucleotides = int(len(nucleotide_sequence) / 3) * 3
 78 |         nucleotide_sequence = nucleotide_sequence[:n_nucleotides]
 79 |     else:
 80 |         n_nucleotides = len(nucleotide_sequence)
 81 | 
 82 |     assert n_nucleotides % 3 == 0, \
 83 |         ("Expected nucleotide sequence to be multiple of 3"
 84 |          " but got %s of length %d") % (
 85 |             nucleotide_sequence,
 86 |             n_nucleotides)
 87 | 
 88 |     # passing cds=False to translate since we may want to deal with premature
 89 |     # stop codons
 90 |     protein_sequence = nucleotide_sequence.translate(to_stop=to_stop, cds=False)
 91 | 
 92 |     if first_codon_is_start and (
 93 |             len(protein_sequence) == 0 or protein_sequence[0] != "M"):
 94 |         if nucleotide_sequence[:3] in START_CODONS:
 95 |             # TODO: figure out when these should be made into methionines
 96 |             # and when left as whatever amino acid they normally code for
 97 |             # e.g. Leucine start codons
 98 |             # See: DOI: 10.1371/journal.pbio.0020397
 99 |             return "M" + protein_sequence[1:]
100 |         else:
101 |             raise ValueError(
102 |                 ("Expected first codon of %s to be start codon"
103 |                  " (one of %s) but got %s") % (
104 |                     protein_sequence[:10],
105 |                     START_CODONS,
106 |                     nucleotide_sequence))
107 | 
108 |     return protein_sequence
109 | 
110 | 
def find_first_stop_codon(nucleotide_sequence):
    """
    Scan a nucleotide sequence codon by codon (trailing nucleotides which
    don't form a full codon are ignored) and return the codon index of the
    first member of STOP_CODONS, or -1 if there is none.
    """
    n_whole_codons = len(nucleotide_sequence) // 3
    codon_starts = range(0, 3 * n_whole_codons, 3)
    return next(
        (codon_start // 3
         for codon_start in codon_starts
         if nucleotide_sequence[codon_start:codon_start + 3] in STOP_CODONS),
        -1)
122 | 
123 | 
def translate_in_frame_mutation(
        transcript,
        ref_codon_start_offset,
        ref_codon_end_offset,
        mutant_codons):
    """
    Translate the codons produced by an in-frame mutation, continuing into
    the transcript's 3' UTR when the mutation reaches past the end of the
    reference protein without introducing a stop codon.

    Returns:
        - mutant amino acid sequence
        - codon index of the first stop codon relative to the start of
          `mutant_codons` (or -1 if none was recorded)
        - boolean flag indicating whether any codons from the 3' UTR were used

    If the very first 3' UTR codon is a stop, nothing is appended, the stop
    index stays -1 and the flag stays False.

    Parameters
    ----------
    transcript : pyensembl.Transcript
        Reference transcript to which a cDNA mutation should be applied.

    ref_codon_start_offset : int
        Starting (base 0) integer offset into codons (character triplets) of
        the transcript's reference coding sequence. An offset of 0 makes the
        translation treat the first mutant codon as a start codon.

    ref_codon_end_offset : int
        Final (base 0) integer offset into codons of the transcript's
        reference coding sequence.

    mutant_codons : str
        Nucleotide sequence to replace the reference codons with
        (expected to have length that is a multiple of three)
    """
    stop_index = find_first_stop_codon(mutant_codons)
    reads_into_utr = False

    if stop_index != -1:
        # the mutation itself contains a stop: translate only what precedes it
        mutant_codons = mutant_codons[:3 * stop_index]
    elif ref_codon_end_offset > len(transcript.protein_sequence):
        # No stop among the mutant codons and the mutation extends past the
        # reference protein, so translation may run on into the 3' UTR.
        utr = transcript.three_prime_utr_sequence
        # keep only whole codons of the UTR
        utr_whole_codons = utr[:(len(utr) // 3) * 3]
        utr_stop_index = find_first_stop_codon(utr_whole_codons)

        if utr_stop_index > 0:
            # a stop exists in the UTR after at least one readable codon:
            # append the UTR codons before it and record where the stop falls
            # relative to the start of the mutant codons
            reads_into_utr = True
            stop_index = len(mutant_codons) // 3 + utr_stop_index
            mutant_codons += utr_whole_codons[:utr_stop_index * 3]
        elif utr_stop_index == -1:
            # no stop anywhere in the UTR: read through all of its codons
            reads_into_utr = True
            mutant_codons += utr_whole_codons

    translated = translate(
        mutant_codons,
        first_codon_is_start=(ref_codon_start_offset == 0))

    return translated, stop_index, reads_into_utr
190 | 


--------------------------------------------------------------------------------
/varcode/maf.py:
--------------------------------------------------------------------------------
  1 | # Licensed under the Apache License, Version 2.0 (the "License");
  2 | # you may not use this file except in compliance with the License.
  3 | # You may obtain a copy of the License at
  4 | #
  5 | #     http://www.apache.org/licenses/LICENSE-2.0
  6 | #
  7 | # Unless required by applicable law or agreed to in writing, software
  8 | # distributed under the License is distributed on an "AS IS" BASIS,
  9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 10 | # See the License for the specific language governing permissions and
 11 | # limitations under the License.
 12 | 
 13 | import logging
 14 | 
 15 | import pandas
 16 | from typechecks import require_string
 17 | from pandas import isnull
 18 | 
 19 | from .reference import infer_genome
 20 | from .variant import Variant, variant_ascending_position_sort_key
 21 | from .variant_collection import VariantCollection
 22 | 
# NOTE(review): presumably the number of leading characters of a TCGA sample
# barcode which identify the patient; not used in the visible part of this
# module -- confirm against callers.
TCGA_PATIENT_ID_LENGTH = 12

# Column names which load_maf_dataframe expects, in this order, as the
# leading columns of a MAF file (compared case-insensitively and normalized
# to the spelling used here).
MAF_COLUMN_NAMES = [
    'Hugo_Symbol',
    'Entrez_Gene_Id',
    'Center',
    'NCBI_Build',
    'Chromosome',
    'Start_Position',
    'End_Position',
    'Strand',
    'Variant_Classification',
    'Variant_Type',
    'Reference_Allele',
    'Tumor_Seq_Allele1',
    'Tumor_Seq_Allele2',
    'dbSNP_RS',
    'dbSNP_Val_Status',
    'Tumor_Sample_Barcode',
    'Matched_Norm_Sample_Barcode',
    'Match_Norm_Seq_Allele1',
    'Match_Norm_Seq_Allele2',
]
 46 | 
 47 | 
 48 | def load_maf_dataframe(path, nrows=None, raise_on_error=True, encoding=None):
 49 |     """
 50 |     Load the guaranteed columns of a TCGA MAF file into a DataFrame
 51 | 
 52 |     Parameters
 53 |     ----------
 54 |     path : str
 55 |         Path to MAF file
 56 | 
 57 |     nrows : int
 58 |         Optional limit to number of rows loaded
 59 | 
 60 |     raise_on_error : bool
 61 |         Raise an exception upon encountering an error or log an error
 62 | 
 63 |     encoding : str, optional
 64 |         Encoding to use for UTF when reading MAF file.
 65 |     """
 66 |     require_string(path, "Path to MAF")
 67 | 
 68 |     n_basic_columns = len(MAF_COLUMN_NAMES)
 69 | 
 70 |     # pylint: disable=no-member
 71 |     # pylint gets confused by read_csv
 72 |     df = pandas.read_csv(
 73 |         path,
 74 |         comment="#",
 75 |         sep="\t",
 76 |         low_memory=False,
 77 |         skip_blank_lines=True,
 78 |         header=0,
 79 |         nrows=nrows,
 80 |         encoding=encoding)
 81 | 
 82 |     if len(df.columns) < n_basic_columns:
 83 |         error_message = (
 84 |             "Too few columns in MAF file %s, expected %d but got  %d : %s" % (
 85 |                 path, n_basic_columns, len(df.columns), df.columns))
 86 |         if raise_on_error:
 87 |             raise ValueError(error_message)
 88 |         else:
 89 |             logging.warn(error_message)
 90 | 
 91 |     # check each pair of expected/actual column names to make sure they match
 92 |     for expected, actual in zip(MAF_COLUMN_NAMES, df.columns):
 93 |         if expected != actual:
 94 |             # MAFs in the wild have capitalization differences in their
 95 |             # column names, normalize them to always use the names above
 96 |             if expected.lower() == actual.lower():
 97 |                 # using DataFrame.rename in Python 2.7.x doesn't seem to
 98 |                 # work for some files, possibly because Pandas treats
 99 |                 # unicode vs. str columns as different?
100 |                 df[expected] = df[actual]
101 |                 del df[actual]
102 |             else:
103 |                 error_message = (
104 |                     "Expected column %s but got %s" % (expected, actual))
105 |                 if raise_on_error:
106 |                     raise ValueError(error_message)
107 |                 else:
108 |                     logging.warn(error_message)
109 | 
110 |     return df
111 | 
def load_maf(
        path,
        optional_cols=None,
        sort_key=variant_ascending_position_sort_key,
        distinct=True,
        raise_on_error=True,
        encoding=None,
        nrows=None):
    """
    Load reference name and Variant objects from MAF filename.

    Parameters
    ----------

    path : str
        Path to MAF (*.maf).

    optional_cols : list, optional
        A list of MAF columns to include as metadata if they are present in the MAF.
        Does not result in an error if those columns are not present.
        Defaults to no extra columns.

    sort_key : fn
        Function which maps each element to a sorting criterion.
        Set to None to not to sort the variants.

    distinct : bool
        Don't keep repeated variants

    raise_on_error : bool
        Raise an exception upon encountering an error or just log a warning.

    encoding : str, optional
        Encoding to use for UTF when reading MAF file.

    nrows : int, optional
        Limit to number of rows loaded

    Returns
    -------
    VariantCollection
        One Variant per accepted MAF row, with per-variant metadata stored
        under `path` in the collection's source-to-metadata dictionary.

    Raises
    ------
    ValueError
        If `raise_on_error` is true and the MAF is empty, a row is missing
        its contig, or both tumor alleles of a row equal the reference.
    """
    # None instead of a mutable list literal as the default argument
    if optional_cols is None:
        optional_cols = []

    # pylint: disable=no-member
    # pylint gets confused by read_csv inside load_maf_dataframe
    maf_df = load_maf_dataframe(
        path,
        nrows=nrows,
        raise_on_error=raise_on_error,
        encoding=encoding)

    if len(maf_df) == 0 and raise_on_error:
        raise ValueError("Empty MAF file %s" % path)

    # genome objects already inferred, keyed by raw NCBI_Build value
    ensembl_objects = {}
    variants = []
    metadata = {}
    for _, x in maf_df.iterrows():
        contig = x.Chromosome
        if isnull(contig):
            error_message = "Invalid contig name: %s" % (contig,)
            if raise_on_error:
                raise ValueError(error_message)
            else:
                # logging.warn is a deprecated alias of logging.warning
                logging.warning(error_message)
                continue

        start_pos = x.Start_Position
        ref = x.Reference_Allele

        # it's possible in a MAF file to have multiple Ensembl releases
        # mixed in a single MAF file (the genome assembly is
        # specified by the NCBI_Build column)
        ncbi_build = x.NCBI_Build
        if ncbi_build in ensembl_objects:
            genome = ensembl_objects[ncbi_build]
        else:
            # integer builds such as 37 are turned into names like "B37"
            if isinstance(ncbi_build, int):
                reference_name = "B%d" % ncbi_build
            else:
                reference_name = str(ncbi_build)
            genome, _ = infer_genome(reference_name)
            ensembl_objects[ncbi_build] = genome

        # have to try both Tumor_Seq_Allele1 and Tumor_Seq_Allele2
        # to figure out which is different from the reference allele
        if x.Tumor_Seq_Allele1 != ref:
            alt = x.Tumor_Seq_Allele1
        else:
            if x.Tumor_Seq_Allele2 == ref:
                error_message = (
                    "Both tumor alleles agree with reference %s: %s" % (
                        ref, x,))
                if raise_on_error:
                    raise ValueError(error_message)
                else:
                    logging.warning(error_message)
                    continue
            alt = x.Tumor_Seq_Allele2

        variant = Variant(
            contig,
            start_pos,
            str(ref),
            str(alt),
            genome)

        # keep metadata about the variant and its TCGA annotation
        metadata[variant] = {
            'Hugo_Symbol': x.Hugo_Symbol,
            'Center': x.Center,
            'Strand': x.Strand,
            'Variant_Classification': x.Variant_Classification,
            'Variant_Type': x.Variant_Type,
            'dbSNP_RS': x.dbSNP_RS,
            'dbSNP_Val_Status': x.dbSNP_Val_Status,
            'Tumor_Sample_Barcode': x.Tumor_Sample_Barcode,
            'Matched_Norm_Sample_Barcode': x.Matched_Norm_Sample_Barcode,
        }
        # optional columns are copied only when the row actually has them
        for optional_col in optional_cols:
            if optional_col in x:
                metadata[variant][optional_col] = x[optional_col]

        variants.append(variant)

    return VariantCollection(
        variants=variants,
        source_to_metadata_dict={path: metadata},
        sort_key=sort_key,
        distinct=distinct)
236 | 


--------------------------------------------------------------------------------
/varcode/nucleotides.py:
--------------------------------------------------------------------------------
  1 | # Licensed under the Apache License, Version 2.0 (the "License");
  2 | # you may not use this file except in compliance with the License.
  3 | # You may obtain a copy of the License at
  4 | #
  5 | #     http://www.apache.org/licenses/LICENSE-2.0
  6 | #
  7 | # Unless required by applicable law or agreed to in writing, software
  8 | # distributed under the License is distributed on an "AS IS" BASIS,
  9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 10 | # See the License for the specific language governing permissions and
 11 | # limitations under the License.
 12 | 
 13 | from __future__ import print_function, division, absolute_import
 14 | 
 15 | import numpy as np
 16 | 
 17 | from typechecks import require_string
 18 | 
 19 | # include all pseudonucleotides encoding repeats and uncertain bases
 20 | STANDARD_NUCLEOTIDES = {'A', 'C', 'T', 'G'}
 21 | 
 22 | PURINE_NUCLEOTIDES = {'A', 'G'}
 23 | 
 24 | PYRIMIDINE_NUCLEOTIDES = {'C', 'T'}
 25 | 
 26 | AMINO_NUCLEOTIDES = {'A', 'C'}
 27 | 
 28 | KETO_NUCLEOTIDES = {'T', 'G'}
 29 | 
 30 | STRONG_NUCLEOTIDES = {'G', 'C'}
 31 | 
 32 | WEAK_NUCLEOTIDES = {'A', 'T'}
 33 | 
 34 | EXTENDED_NUCLEOTIDES = {
 35 |     'A', 'C', 'T', 'G',
 36 |     'Y',  # Pyrimidine (C or T)
 37 |     'R',  # Purine (A or G)
 38 |     'W',  # weak (A or T)
 39 |     'S',  # strong (G or C)
 40 |     'K',  # keto (T or G)
 41 |     'M',  # amino (C or A)
 42 |     'D',  # A, G, T (not C)
 43 |     'V',  # A, C, G (not T)
 44 |     'H',  # A, C, T (not G)
 45 |     'B',  # C, G, T (not A)
 46 |     'X',  # any base
 47 |     'N',  # any base
 48 | }
 49 | 
 50 | 
 51 | def is_purine(nucleotide, allow_extended_nucleotides=False):
 52 |     """Is the nucleotide a purine"""
 53 |     if not allow_extended_nucleotides and nucleotide not in STANDARD_NUCLEOTIDES:
 54 |         raise ValueError(
 55 |             "{} is a non-standard nucleotide, neither purine or pyrimidine".format(nucleotide))
 56 |     return nucleotide in PURINE_NUCLEOTIDES
 57 | 
 58 | 
 59 | def all_standard_nucleotides(nucleotides):
 60 |     return all(base in STANDARD_NUCLEOTIDES for base in nucleotides)
 61 | 
 62 | 
 63 | def normalize_nucleotide_string(
 64 |         nucleotides,
 65 |         allow_extended_nucleotides=False,
 66 |         empty_chars=".-",
 67 |         treat_nan_as_empty=True):
 68 |     """
 69 |     Normalizes a nucleotide string by converting various ways of encoding empty
 70 |     strings into "", making all letters upper case, and checking to make sure
 71 |     all letters in the string are actually nucleotides.
 72 | 
 73 |     Parameters
 74 |     ----------
 75 |     nucleotides : str
 76 |         Sequence of nucleotides, e.g. "ACCTG"
 77 | 
 78 |     extended_nucleotides : bool
 79 |         Allow non-canonical nucleotide characters like 'X' for unknown base
 80 | 
 81 |     empty_chars : str
 82 |         Characters which encode empty strings, such as "." used in VCF format
 83 |         or "-" used in MAF format
 84 | 
 85 |     treat_nan_as_empty : bool
 86 |         Some MAF files represent deletions/insertions with NaN ref/alt values
 87 |     """
 88 |     if nucleotides in empty_chars:
 89 |         return ""
 90 |     elif treat_nan_as_empty and isinstance(nucleotides, float) and np.isnan(nucleotides):
 91 |         return ""
 92 | 
 93 |     require_string(nucleotides, name="nucleotide string")
 94 | 
 95 |     nucleotides = nucleotides.upper()
 96 | 
 97 |     if allow_extended_nucleotides:
 98 |         valid_nucleotides = EXTENDED_NUCLEOTIDES
 99 |     else:
100 |         valid_nucleotides = STANDARD_NUCLEOTIDES
101 | 
102 |     if not set(nucleotides) <= valid_nucleotides:
103 |         raise ValueError(
104 |             "Invalid character(s) in nucleotide string: %s" % (
105 |                 ",".join(set(nucleotides) - valid_nucleotides),))
106 | 
107 |     return nucleotides
108 | 


--------------------------------------------------------------------------------
/varcode/string_helpers.py:
--------------------------------------------------------------------------------
 1 | # Licensed under the Apache License, Version 2.0 (the "License");
 2 | # you may not use this file except in compliance with the License.
 3 | # You may obtain a copy of the License at
 4 | #
 5 | #     http://www.apache.org/licenses/LICENSE-2.0
 6 | #
 7 | # Unless required by applicable law or agreed to in writing, software
 8 | # distributed under the License is distributed on an "AS IS" BASIS,
 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 | 
13 | from __future__ import print_function, division, absolute_import
14 | 
15 | 
def trim_shared_prefix(ref, alt):
    """
    Strip the leading characters which `ref` and `alt` have in common.

    Mutations are sometimes written with a shared prefix between the
    reference and alternate strings, e.g. C>CT (nucleotides) or
    GYFP>G (amino acids).

    Returns a tuple of (ref without prefix, alt without prefix, prefix).
    """
    shared_length = 0
    # walk both strings in lockstep until they disagree or one runs out
    for ref_char, alt_char in zip(ref, alt):
        if ref_char != alt_char:
            break
        shared_length += 1

    # both strings agree on the first `shared_length` characters, so the
    # prefix can be sliced out of either one
    return ref[shared_length:], alt[shared_length:], ref[:shared_length]
38 | 
39 | 
def trim_shared_suffix(ref, alt):
    """
    Strip the trailing characters which `ref` and `alt` have in common.

    The two strings are compared from their last character backwards
    until they disagree or the shorter one is exhausted.

    Given ref='ABC' and alt='BC' the shared suffix is 'BC', so the
    result is:
        ('A', '', 'BC')

    Returns a tuple of (ref without suffix, alt without suffix, suffix).
    """
    suffix_length = 0
    for ref_char, alt_char in zip(reversed(ref), reversed(alt)):
        if ref_char != alt_char:
            break
        suffix_length += 1

    if suffix_length == 0:
        # nothing in common at the end, inputs are returned unchanged
        return (ref, alt, '')

    ref_cut = len(ref) - suffix_length
    alt_cut = len(alt) - suffix_length
    return (ref[:ref_cut], alt[:alt_cut], ref[ref_cut:])
65 | 
66 | 
def trim_shared_flanking_strings(ref, alt):
    """
    Split two nucleotide or amino acid strings into the parts unique to
    each one and the prefix and suffix they share. The shared prefix is
    removed first, then the shared suffix of whatever remains.

    For example, if the input ref = "SYFFQGR" and alt = "SYMLLFIFQGR"
    then the result will be:
        ("F", "MLLFI", "SY", "FQGR")

    Returns a tuple of (unique ref, unique alt, prefix, suffix).
    """
    ref_tail, alt_tail, prefix = trim_shared_prefix(ref, alt)
    unique_ref, unique_alt, suffix = trim_shared_suffix(ref_tail, alt_tail)
    return unique_ref, unique_alt, prefix, suffix
80 | 


--------------------------------------------------------------------------------
/varcode/ucsc_reference_names.py:
--------------------------------------------------------------------------------
 1 | # Licensed under the Apache License, Version 2.0 (the "License");
 2 | # you may not use this file except in compliance with the License.
 3 | # You may obtain a copy of the License at
 4 | #
 5 | #     http://www.apache.org/licenses/LICENSE-2.0
 6 | #
 7 | # Unless required by applicable law or agreed to in writing, software
 8 | # distributed under the License is distributed on an "AS IS" BASIS,
 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 | 
13 | # NCBI builds and hg releases aren't identical
14 | # but the differences are all on chrM and unplaced contigs
15 | # Mapping between names copied from:
16 | # https://genome.ucsc.edu/FAQ/FAQreleases.html#release1
17 | 
18 | 


--------------------------------------------------------------------------------
/varcode/util.py:
--------------------------------------------------------------------------------
 1 | # Licensed under the Apache License, Version 2.0 (the "License");
 2 | # you may not use this file except in compliance with the License.
 3 | # You may obtain a copy of the License at
 4 | #
 5 | #     http://www.apache.org/licenses/LICENSE-2.0
 6 | #
 7 | # Unless required by applicable law or agreed to in writing, software
 8 | # distributed under the License is distributed on an "AS IS" BASIS,
 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 | 
13 | import random
14 | 
15 | from Bio.Seq import reverse_complement
16 | from pyensembl import genome_for_reference_name
17 | 
18 | from .nucleotides import STANDARD_NUCLEOTIDES
19 | from .variant import Variant
20 | from .variant_collection import VariantCollection
21 | 
# cache lists of all transcript IDs for different Ensembl releases,
# keyed by the genome object which random_variants looks up
_transcript_ids_cache = {}
24 | 
def random_variants(
        count,
        genome_name="GRCh38",
        deletions=True,
        insertions=True,
        random_seed=None):
    """
    Generate a VariantCollection with random variants that overlap
    at least one complete coding transcript.

    Parameters
    ----------
    count : int
        Number of variants to generate. A count of zero or less gives an
        empty collection.

    genome_name : str
        Reference name passed to pyensembl's genome_for_reference_name

    deletions : bool
        Allow the empty string as an alternate allele

    insertions : bool
        Allow two-nucleotide alternate alleles

    random_seed : int, optional
        Seed for the random number generator used to pick transcripts,
        positions and alternate alleles

    Returns
    -------
    VariantCollection

    Raises
    ------
    ValueError
        If fewer than `count` variants were generated after
        `count * 100` attempts.
    """
    rng = random.Random(random_seed)
    ensembl = genome_for_reference_name(genome_name)

    if ensembl in _transcript_ids_cache:
        transcript_ids = _transcript_ids_cache[ensembl]
    else:
        transcript_ids = ensembl.transcript_ids()
        _transcript_ids_cache[ensembl] = transcript_ids

    # Iteration order of a set of strings depends on string hashing, which
    # is randomized per interpreter run. Sorting keeps the candidate alleles
    # in a fixed order so that the same random_seed makes the same choices.
    nucleotides = sorted(STANDARD_NUCLEOTIDES)

    # candidate insertions don't depend on the chosen position
    nucleotide_pairs = [x + y for x in nucleotides for y in nucleotides]

    variants = []

    # we should finish way before this loop is over but just in case
    # something is wrong with PyEnsembl we want to avoid an infinite loop
    for _ in range(count * 100):
        if len(variants) >= count:
            break

        transcript_id = rng.choice(transcript_ids)
        transcript = ensembl.transcript_by_id(transcript_id)

        if not transcript.complete:
            continue

        exon = rng.choice(transcript.exons)
        base1_genomic_position = rng.randint(exon.start, exon.end)
        transcript_offset = transcript.spliced_offset(base1_genomic_position)
        seq = transcript.sequence

        ref = str(seq[transcript_offset])
        if transcript.on_backward_strand:
            ref = reverse_complement(ref)

        alt_nucleotides = [x for x in nucleotides if x != ref]
        if insertions:
            alt_nucleotides.extend(nucleotide_pairs)
        if deletions:
            alt_nucleotides.append("")
        alt = rng.choice(alt_nucleotides)
        variant = Variant(
            transcript.contig,
            base1_genomic_position,
            ref=ref,
            alt=alt,
            ensembl=ensembl)
        variants.append(variant)

    # checked after the loop so that a variant generated on the very last
    # attempt still counts as success
    if len(variants) < count:
        raise ValueError(
            ("Unable to generate %d random variants, "
             "there may be a problem with PyEnsembl") % count)
    return VariantCollection(variants)
89 | 


--------------------------------------------------------------------------------
/varcode/version.py:
--------------------------------------------------------------------------------
# version string of the varcode package
__version__ = "1.2.1"
2 | 


--------------------------------------------------------------------------------