├── .github └── workflows │ └── tests.yml ├── .gitignore ├── CONTRIBUTING.md ├── LICENSE ├── MANIFEST.in ├── README.md ├── RELEASING.md ├── deploy.sh ├── develop.sh ├── docs ├── Makefile ├── conf.py ├── index.rst ├── make.bat ├── modules.rst └── pyensembl.rst ├── lint-and-test.sh ├── lint.sh ├── pyensembl ├── __init__.py ├── common.py ├── database.py ├── download_cache.py ├── ensembl_release.py ├── ensembl_url_templates.py ├── ensembl_versions.py ├── exon.py ├── fasta.py ├── gene.py ├── genome.py ├── locus.py ├── locus_with_genome.py ├── logging.conf ├── normalization.py ├── reference_name.py ├── search.py ├── sequence_data.py ├── shell.py ├── species.py ├── transcript.py └── version.py ├── pylintrc ├── requirements.txt ├── setup.py ├── test.sh └── tests ├── __init__.py ├── common.py ├── data.py ├── data ├── gencode.ucsc.small.gtf ├── mouse.ensembl.81.partial.ENSMUSG00000017167.fa ├── mouse.ensembl.81.partial.ENSMUSG00000017167.gtf ├── mouse.ensembl.81.partial.ENSMUSG00000017167.pep ├── mouse.ensembl.81.partial.ncrna.ENSMUSG00000017167.fa └── refseq.ucsc.small.gtf ├── test_contigs.py ├── test_download_cache.py ├── test_ensembl_gtf.py ├── test_ensembl_object_properties.py ├── test_exon_id.py ├── test_exon_object.py ├── test_gene_ids.py ├── test_gene_names.py ├── test_gene_objects.py ├── test_id_length.py ├── test_locus.py ├── test_missing_genome_sources.py ├── test_mouse.py ├── test_release_versions.py ├── test_search.py ├── test_sequence_data.py ├── test_serialization.py ├── test_shell.py ├── test_string_representation.py ├── test_timings.py ├── test_transcript_ids.py ├── test_transcript_objects.py ├── test_transcript_sequences.py ├── test_transcript_support_level.py └── test_ucsc_gtf.py /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: 
https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python 3 | 4 | # TODO: 5 | # - cache this directory $HOME/.cache/pyensembl/ 6 | # - update coveralls 7 | # - get a badge for tests passing 8 | # - download binary dependencies from conda 9 | name: Tests 10 | on: [push, pull_request] 11 | 12 | jobs: 13 | build: 14 | runs-on: ubuntu-latest 15 | strategy: 16 | fail-fast: true 17 | matrix: 18 | python-version: ["3.9", "3.10", "3.11"] 19 | 20 | steps: 21 | - name: Checkout repository 22 | uses: actions/checkout@v3 23 | - name: Set up Python ${{ matrix.python-version }} 24 | uses: actions/setup-python@v3 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | cache: "pip" 28 | - name: Install dependencies 29 | run: | 30 | python -m pip install --upgrade pip 31 | python -m pip install flake8 pytest pytest-cov coveralls 32 | pip install -r requirements.txt 33 | pip install . 34 | - name: Lint with flake8 35 | run: | 36 | # stop the build if there are Python syntax errors or undefined names 37 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 38 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 39 | flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 40 | - name: Run default linting script 41 | run: | 42 | ./lint.sh 43 | - name: Install Ensembl data 44 | run: | 45 | echo "Before installing Ensembl releases" && df -h 46 | pyensembl install --release 75 --species human --custom-mirror https://github.com/openvax/ensembl-data/releases/download/GRCh37.75/ 47 | pyensembl install --release 77 --species human --custom-mirror https://github.com/openvax/ensembl-data/releases/download/GRCh38.77/ 48 | pyensembl install --release 93 --species human --custom-mirror https://github.com/openvax/ensembl-data/releases/download/GRCh38.93/ 49 | pyensembl install --release 93 --species mouse --custom-mirror https://github.com/openvax/ensembl-data/releases/download/GRCm38.93/ 50 | - name: Run unit tests 51 | run: | 52 | ./test.sh 53 | - name: Publish coverage to Coveralls 54 | uses: coverallsapp/github-action@v2.2.3 55 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | bin/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | 25 | # Installer logs 26 | pip-log.txt 27 | pip-delete-this-directory.txt 28 | 29 | # Unit test / coverage reports 30 | htmlcov/ 31 | .tox/ 32 | .coverage 33 | .cache 34 | nosetests.xml 35 | coverage.xml 36 | 37 | # Translations 38 | *.mo 39 | 40 | # Mr Developer 41 | .mr.developer.cfg 42 | .project 43 | .pydevproject 44 | 45 | # Rope 46 | .ropeproject 47 | 48 | # Django stuff: 49 | *.log 50 | *.pot 51 | 52 | # Sphinx documentation 53 | docs/_build/ 54 | 55 | -------------------------------------------------------------------------------- 
/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to PyEnsembl 2 | 3 | [PyEnsembl](http://www.github.com/hammerlab/pyensembl) is open source software and 4 | we welcome your contributions. This document should help you get started 5 | contributing to PyEnsembl. 6 | 7 | ## Filing Issues 8 | 9 | If you find any bugs or problems while using PyEnsembl or have any feature requests, please feel free to file an issue against the project. When doing so, please follow the guidelines below: 10 | 11 | To report any bugs, issues, or feature requests, please [open an issue](https://github.com/hammerlab/pyensembl/issues) 12 | Please check the [current open issues](https://github.com/hammerlab/pyensembl/issues) to see if the request already exists 13 | If you are filing a bug report, please describe the version of PyEnsembl and Python you are using. If your problem involves a particular gene, transcript, or genomic locus, please include that information (e.g. "Missing transcript sequence for BRCA1-002 for Ensembl release 74"). 14 | 15 | ## Coding Guidelines 16 | 17 | - PyEnsembl is written in Python and adheres to the [PEP8](https://www.python.org/dev/peps/pep-0008/) 18 | style guidelines. 19 | - Contributions should come in the form of GitHub pull requests. 20 | - New features should start with a GitHub issue explaining their scope and rationale. 21 | - If the work is based on an existing issue, please reference the issue in the PR. 22 | - All new code should be accompanied by comprehensive unit tests. 23 | - If the PR fixes or implements an issue, please state "Closes #XYZ" or "Fixes #XYZ", where XYZ is the issue number. 24 | - Please ensure that your code works under Python >= 3.7. 25 | 26 | ## Licensing 27 | 28 | PyEnsembl is licensed under the Apache 2.0 license. Your code is assumed to be as well. 
29 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include LICENSE 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Tests](https://github.com/openvax/pyensembl/actions/workflows/tests.yml/badge.svg)](https://github.com/openvax/pyensembl/actions/workflows/tests.yml) 2 | [![Coverage Status](https://coveralls.io/repos/github/openvax/pyensembl/badge.svg?branch=main)](https://coveralls.io/github/openvax/pyensembl?branch=main) 3 | 4 | PyPI 5 | 6 | 7 | # PyEnsembl 8 | 9 | PyEnsembl is a Python interface to [Ensembl](http://www.ensembl.org) reference genome metadata such as exons and transcripts. PyEnsembl downloads [GTF](https://en.wikipedia.org/wiki/Gene_transfer_format) and [FASTA](https://en.wikipedia.org/wiki/FASTA_format) files from the [Ensembl FTP server](ftp://ftp.ensembl.org) and loads them into a local database. 
PyEnsembl can also work with custom reference data specified using user-supplied GTF and FASTA files. 10 | 11 | # Example Usage 12 | 13 | ```python 14 | from pyensembl import EnsemblRelease 15 | 16 | # release 77 uses human reference genome GRCh38 17 | data = EnsemblRelease(77) 18 | 19 | # will return ['HLA-A'] 20 | gene_names = data.gene_names_at_locus(contig=6, position=29945884) 21 | 22 | # get all exons associated with HLA-A 23 | exon_ids = data.exon_ids_of_gene_name('HLA-A') 24 | ``` 25 | 26 | # Installation 27 | 28 | You can install PyEnsembl using [pip](https://pip.pypa.io/en/latest/quickstart.html): 29 | 30 | ```sh 31 | pip install pyensembl 32 | ``` 33 | 34 | This should also install any required packages such as [datacache](https://github.com/openvax/datacache). 35 | 36 | Before using PyEnsembl, run the following command to download and install 37 | Ensembl data: 38 | 39 | ``` 40 | pyensembl install --release <list of Ensembl release numbers> --species <species-name> 41 | ``` 42 | 43 | For example, `pyensembl install --release 75 76 --species human` will download and install all 44 | human reference data from Ensembl releases 75 and 76. 45 | 46 | Alternatively, you can create the `EnsemblRelease` object from inside a Python 47 | process and call `ensembl_object.download()` followed by `ensembl_object.index()`. 48 | 49 | ## Cache Location 50 | 51 | By default, PyEnsembl uses the platform-specific `Cache` folder 52 | and caches the files into the `pyensembl` sub-directory. 53 | You can override this default by setting the environment variable `PYENSEMBL_CACHE_DIR` 54 | as your preferred location for caching: 55 | 56 | ```sh 57 | export PYENSEMBL_CACHE_DIR=/custom/cache/dir 58 | ``` 59 | 60 | or 61 | 62 | ```python 63 | import os 64 | 65 | os.environ['PYENSEMBL_CACHE_DIR'] = '/custom/cache/dir' 66 | # ... 
PyEnsembl API usage 67 | ``` 68 | 69 | # Usage tips 70 | 71 | ## List installed genomes 72 | 73 | To see the genomes for which PyEnsembl has already downloaded and indexed metadata you can run: 74 | 75 | ```sh 76 | pyensembl list 77 | ``` 78 | 79 | Or equivalently do this in Python: 80 | 81 | ```python 82 | from pyensembl.shell import collect_all_installed_ensembl_releases 83 | collect_all_installed_ensembl_releases() 84 | ``` 85 | 86 | ## Load genome in Python 87 | 88 | Here's an example Python snippet that loads fly genome data from Ensembl release v100: 89 | 90 | ```python 91 | from pyensembl import EnsemblRelease 92 | data = EnsemblRelease(release=100, species='drosophila_melanogaster') 93 | ``` 94 | 95 | ## Data structures 96 | 97 | ### Gene 98 | 99 | ```python 100 | gene = genome.gene_by_id(gene_id='FBgn0011747') 101 | ``` 102 | 103 | ### Transcript 104 | 105 | ```python 106 | transcript = gene.transcripts[0] 107 | ``` 108 | 109 | ### Protein information 110 | 111 | ```python 112 | transcript.protein_id 113 | transcript.protein_sequence 114 | ``` 115 | 116 | # Non-Ensembl Data 117 | 118 | PyEnsembl also allows arbitrary genomes via the specification 119 | of local file paths or remote URLs to both Ensembl and non-Ensembl GTF 120 | and FASTA files. (Warning: GTF formats can vary, and handling of 121 | non-Ensembl data is still very much in development.) 
122 | 123 | For example: 124 | 125 | ```python 126 | from pyensembl import Genome 127 | data = Genome( 128 | reference_name='GRCh38', 129 | annotation_name='my_genome_features', 130 | # annotation_version=None, 131 | gtf_path_or_url='/My/local/gtf/path_to_my_genome_features.gtf', # Path or URL of GTF file 132 | # transcript_fasta_paths_or_urls=None, # List of paths or URLs of FASTA files containing transcript sequences 133 | # protein_fasta_paths_or_urls=None, # List of paths or URLs of FASTA files containing protein sequences 134 | # cache_directory_path=None, # Where to place downloaded and cached files for this genome 135 | ) 136 | # parse GTF and construct database of genomic features 137 | data.index() 138 | gene_names = data.gene_names_at_locus(contig=6, position=29945884) 139 | ``` 140 | 141 | # API 142 | 143 | The `EnsemblRelease` object has methods to let you access all possible 144 | combinations of the annotation features _gene_name_, _gene_id_, 145 | _transcript_name_, _transcript_id_, _exon_id_ as well as the location of 146 | these genomic elements (contig, start position, end position, strand). 147 | 148 | ## Genes 149 | 150 |
151 |
genes(contig=None, strand=None)
152 |
Returns a list of Gene objects, optionally restricted to a particular contig 153 | or strand.
154 | 155 |
genes_at_locus(contig, position, end=None, strand=None)
156 |
Returns a list of Gene objects overlapping a particular position on a contig, 157 | optionally extend into a range with the end parameter and restrict to 158 | forward or backward strand by passing strand='+' or strand='-'.
159 | 160 |
gene_by_id(gene_id)
161 |
Return a Gene object for given Ensembl gene ID (e.g. "ENSG00000068793").
162 | 163 |
gene_names(contig=None, strand=None)
164 |
Returns all gene names in the annotation database, optionally restricted 165 | to a particular contig or strand.
166 | 167 |
genes_by_name(gene_name)
168 |
Get all the unique genes with the given name (there might be multiple 169 | due to copies in the genome), return a list containing a Gene object for each 170 | distinct ID.
171 | 172 |
gene_by_protein_id(protein_id)
173 |
Find Gene associated with the given Ensembl protein ID (e.g. "ENSP00000350283")
174 | 175 |
gene_names_at_locus(contig, position, end=None, strand=None) 176 |
177 |
Names of genes overlapping with the given locus, optionally restricted by strand. 178 | (returns a list to account for overlapping genes)
179 | 180 |
gene_name_of_gene_id(gene_id) 181 |
182 |
Returns name of gene with given gene ID.
183 | 184 |
gene_name_of_transcript_id(transcript_id) 185 |
Returns name of gene associated with given transcript ID.
186 | 187 |
gene_name_of_transcript_name(transcript_name) 188 |
189 |
Returns name of gene associated with given transcript name.
190 | 191 |
gene_name_of_exon_id(exon_id) 192 |
Returns name of gene associated with given exon ID.
193 | 194 |
gene_ids(contig=None, strand=None) 195 |
196 |
Return all gene IDs in the annotation database, optionally restricted by 197 | chromosome name or strand.
198 | 199 |
gene_ids_of_gene_name(gene_name) 200 |
201 |
Returns all Ensembl gene IDs with the given name.
202 | 203 |
204 | 205 | ## Transcripts 206 | 207 |
208 |
transcripts(contig=None, strand=None)
209 |
Returns a list of Transcript objects for all transcript entries in the 210 | Ensembl database, optionally restricted to a particular contig or strand.
211 | 212 |
transcript_by_id(transcript_id)
213 |
Construct a Transcript object for given Ensembl transcript ID (e.g. "ENST00000369985")
214 | 215 |
transcripts_by_name(transcript_name)
216 |
Returns a list of Transcript objects for every transcript matching the given name.
217 | 218 |
transcript_names(contig=None, strand=None)
219 |
Returns all transcript names in the annotation database.
220 | 221 |
transcript_ids(contig=None, strand=None)
222 |
Returns all transcript IDs in the annotation database.
223 | 224 |
transcript_ids_of_gene_id(gene_id)
225 |
Return IDs of all transcripts associated with given gene ID.
226 | 227 |
transcript_ids_of_gene_name(gene_name)
228 |
Return IDs of all transcripts associated with given gene name.
229 | 230 |
transcript_ids_of_transcript_name(transcript_name)
231 |
Find all Ensembl transcript IDs with the given name.
232 | 233 |
transcript_ids_of_exon_id(exon_id)
234 |
Return IDs of all transcripts associated with given exon ID.
235 |
236 | 237 | ## Exons 238 | 239 |
240 |
exon_ids(contig=None, strand=None)
241 |
Returns a list of exon IDs in the annotation database, optionally restricted 242 | by the given chromosome and strand.
243 | 244 |
exon_by_id(exon_id)
245 |
Construct an Exon object for given Ensembl exon ID (e.g. "ENSE00001209410")
246 | 247 |
exon_ids_of_gene_id(gene_id)
248 |
Returns a list of exon IDs associated with a given gene ID.
249 | 250 |
exon_ids_of_gene_name(gene_name)
251 |
Returns a list of exon IDs associated with a given gene name.
252 | 253 |
exon_ids_of_transcript_id(transcript_id)
254 |
Returns a list of exon IDs associated with a given transcript ID.
255 | 256 |
exon_ids_of_transcript_name(transcript_name)
257 |
Returns a list of exon IDs associated with a given transcript name.
258 |
259 | -------------------------------------------------------------------------------- /RELEASING.md: -------------------------------------------------------------------------------- 1 | # Releasing Pyensembl 2 | 3 | This document explains what to do once your [Pull Request](https://www.atlassian.com/git/tutorials/making-a-pull-request/) has been reviewed and all final changes applied. Now you're ready to merge your branch into master and release it to the world: 4 | 5 | 1. Bump the [version](http://semver.org/) in `version.py`, as part of the PR you want to release. 6 | 2. Merge your branch into master. 7 | 3. Run `deploy.sh` 8 | -------------------------------------------------------------------------------- /deploy.sh: -------------------------------------------------------------------------------- 1 | ./lint.sh && \ 2 | ./test.sh && \ 3 | python3 -m pip install --upgrade build && \ 4 | python3 -m pip install --upgrade twine && \ 5 | rm -rf dist && \ 6 | python3 -m build && \ 7 | git --version && \ 8 | python3 -m twine upload dist/* && \ 9 | git tag "$(python3 pyensembl/version.py)" && \ 10 | git push --tags 11 | -------------------------------------------------------------------------------- /develop.sh: -------------------------------------------------------------------------------- 1 | set -e 2 | 3 | pip install -e . 4 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. 
Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 21 | 22 | .PHONY: help 23 | help: 24 | @echo "Please use \`make ' where is one of" 25 | @echo " html to make standalone HTML files" 26 | @echo " dirhtml to make HTML files named index.html in directories" 27 | @echo " singlehtml to make a single large HTML file" 28 | @echo " pickle to make pickle files" 29 | @echo " json to make JSON files" 30 | @echo " htmlhelp to make HTML files and a HTML help project" 31 | @echo " qthelp to make HTML files and a qthelp project" 32 | @echo " applehelp to make an Apple Help Book" 33 | @echo " devhelp to make HTML files and a Devhelp project" 34 | @echo " epub to make an epub" 35 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 36 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 37 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 38 | @echo " text to make text files" 39 | @echo " man to make manual pages" 40 | @echo " texinfo to make Texinfo files" 41 | @echo " info to make Texinfo files and run them through makeinfo" 42 | @echo " gettext to make PO message catalogs" 43 | @echo " changes to make an overview of all changed/added/deprecated items" 44 | @echo " xml to make Docutils-native XML files" 45 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 46 | @echo " linkcheck to check all external links 
for integrity" 47 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 48 | @echo " coverage to run coverage check of the documentation (if enabled)" 49 | 50 | .PHONY: clean 51 | clean: 52 | rm -rf $(BUILDDIR)/* 53 | 54 | .PHONY: html 55 | html: 56 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 57 | @echo 58 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 59 | 60 | .PHONY: dirhtml 61 | dirhtml: 62 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 63 | @echo 64 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 65 | 66 | .PHONY: singlehtml 67 | singlehtml: 68 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 69 | @echo 70 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 71 | 72 | .PHONY: pickle 73 | pickle: 74 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 75 | @echo 76 | @echo "Build finished; now you can process the pickle files." 77 | 78 | .PHONY: json 79 | json: 80 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 81 | @echo 82 | @echo "Build finished; now you can process the JSON files." 83 | 84 | .PHONY: htmlhelp 85 | htmlhelp: 86 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 87 | @echo 88 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 89 | ".hhp project file in $(BUILDDIR)/htmlhelp." 
90 | 91 | .PHONY: qthelp 92 | qthelp: 93 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 94 | @echo 95 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 96 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 97 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/pyensembl.qhcp" 98 | @echo "To view the help file:" 99 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/pyensembl.qhc" 100 | 101 | .PHONY: applehelp 102 | applehelp: 103 | $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp 104 | @echo 105 | @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." 106 | @echo "N.B. You won't be able to view it unless you put it in" \ 107 | "~/Library/Documentation/Help or install it in your application" \ 108 | "bundle." 109 | 110 | .PHONY: devhelp 111 | devhelp: 112 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 113 | @echo 114 | @echo "Build finished." 115 | @echo "To view the help file:" 116 | @echo "# mkdir -p $$HOME/.local/share/devhelp/pyensembl" 117 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/pyensembl" 118 | @echo "# devhelp" 119 | 120 | .PHONY: epub 121 | epub: 122 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 123 | @echo 124 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 125 | 126 | .PHONY: latex 127 | latex: 128 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 129 | @echo 130 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 131 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 132 | "(use \`make latexpdf' here to do that automatically)." 133 | 134 | .PHONY: latexpdf 135 | latexpdf: 136 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 137 | @echo "Running LaTeX files through pdflatex..." 138 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 139 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 
140 | 141 | .PHONY: latexpdfja 142 | latexpdfja: 143 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 144 | @echo "Running LaTeX files through platex and dvipdfmx..." 145 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 146 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 147 | 148 | .PHONY: text 149 | text: 150 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 151 | @echo 152 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 153 | 154 | .PHONY: man 155 | man: 156 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 157 | @echo 158 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 159 | 160 | .PHONY: texinfo 161 | texinfo: 162 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 163 | @echo 164 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 165 | @echo "Run \`make' in that directory to run these through makeinfo" \ 166 | "(use \`make info' here to do that automatically)." 167 | 168 | .PHONY: info 169 | info: 170 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 171 | @echo "Running Texinfo files through makeinfo..." 172 | make -C $(BUILDDIR)/texinfo info 173 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 174 | 175 | .PHONY: gettext 176 | gettext: 177 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 178 | @echo 179 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 180 | 181 | .PHONY: changes 182 | changes: 183 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 184 | @echo 185 | @echo "The overview file is in $(BUILDDIR)/changes." 186 | 187 | .PHONY: linkcheck 188 | linkcheck: 189 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 190 | @echo 191 | @echo "Link check complete; look for any errors in the above output " \ 192 | "or in $(BUILDDIR)/linkcheck/output.txt." 
193 | 194 | .PHONY: doctest 195 | doctest: 196 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 197 | @echo "Testing of doctests in the sources finished, look at the " \ 198 | "results in $(BUILDDIR)/doctest/output.txt." 199 | 200 | .PHONY: coverage 201 | coverage: 202 | $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage 203 | @echo "Testing of coverage in the sources finished, look at the " \ 204 | "results in $(BUILDDIR)/coverage/python.txt." 205 | 206 | .PHONY: xml 207 | xml: 208 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 209 | @echo 210 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 211 | 212 | .PHONY: pseudoxml 213 | pseudoxml: 214 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 215 | @echo 216 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 217 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # pyensembl documentation build configuration file, created by 4 | # sphinx-quickstart on Sat Mar 26 22:47:25 2016. 5 | # 6 | # This file is execfile()d with the current directory set to its 7 | # containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 14 | 15 | import sys 16 | import os 17 | 18 | # If extensions (or modules to document with autodoc) are in another directory, 19 | # add these directories to sys.path here. If the directory is relative to the 20 | # documentation root, use os.path.abspath to make it absolute, like shown here. 
21 | sys.path.insert(0, os.path.abspath('..')) 22 | 23 | # -- General configuration ------------------------------------------------ 24 | 25 | # If your documentation needs a minimal Sphinx version, state it here. 26 | #needs_sphinx = '1.0' 27 | 28 | # Add any Sphinx extension module names here, as strings. They can be 29 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 30 | # ones. 31 | extensions = [ 32 | 'sphinx.ext.autodoc', 33 | ] 34 | 35 | # Add any paths that contain templates here, relative to this directory. 36 | templates_path = ['_templates'] 37 | 38 | # The suffix(es) of source filenames. 39 | # You can specify multiple suffix as a list of string: 40 | # source_suffix = ['.rst', '.md'] 41 | source_suffix = '.rst' 42 | 43 | # The encoding of source files. 44 | #source_encoding = 'utf-8-sig' 45 | 46 | # The master toctree document. 47 | master_doc = 'index' 48 | 49 | # General information about the project. 50 | project = u'pyensembl' 51 | copyright = u'2016, Hammer Lab' 52 | author = u'Hammer Lab' 53 | 54 | # The version info for the project you're documenting, acts as replacement for 55 | # |version| and |release|, also used in various other places throughout the 56 | # built documents. 57 | # 58 | # The short X.Y version. 59 | version = u'0.8.10' 60 | # The full version, including alpha/beta/rc tags. 61 | release = u'0.8.10' 62 | 63 | # The language for content autogenerated by Sphinx. Refer to documentation 64 | # for a list of supported languages. 65 | # 66 | # This is also used if you do content translation via gettext catalogs. 67 | # Usually you set "language" from the command line for these cases. 68 | language = None 69 | 70 | # There are two options for replacing |today|: either, you set today to some 71 | # non-false value, then it is used: 72 | #today = '' 73 | # Else, today_fmt is used as the format for a strftime call. 
74 | #today_fmt = '%B %d, %Y' 75 | 76 | # List of patterns, relative to source directory, that match files and 77 | # directories to ignore when looking for source files. 78 | exclude_patterns = ['_build'] 79 | 80 | # The reST default role (used for this markup: `text`) to use for all 81 | # documents. 82 | #default_role = None 83 | 84 | # If true, '()' will be appended to :func: etc. cross-reference text. 85 | #add_function_parentheses = True 86 | 87 | # If true, the current module name will be prepended to all description 88 | # unit titles (such as .. function::). 89 | #add_module_names = True 90 | 91 | # If true, sectionauthor and moduleauthor directives will be shown in the 92 | # output. They are ignored by default. 93 | #show_authors = False 94 | 95 | # The name of the Pygments (syntax highlighting) style to use. 96 | pygments_style = 'sphinx' 97 | 98 | # A list of ignored prefixes for module index sorting. 99 | #modindex_common_prefix = [] 100 | 101 | # If true, keep warnings as "system message" paragraphs in the built documents. 102 | #keep_warnings = False 103 | 104 | # If true, `todo` and `todoList` produce output, else they produce nothing. 105 | todo_include_todos = False 106 | 107 | 108 | # -- Options for HTML output ---------------------------------------------- 109 | 110 | # The theme to use for HTML and HTML Help pages. See the documentation for 111 | # a list of builtin themes. 112 | html_theme = 'alabaster' 113 | 114 | # Theme options are theme-specific and customize the look and feel of a theme 115 | # further. For a list of options available for each theme, see the 116 | # documentation. 117 | #html_theme_options = {} 118 | 119 | # Add any paths that contain custom themes here, relative to this directory. 120 | #html_theme_path = [] 121 | 122 | # The name for this set of Sphinx documents. If None, it defaults to 123 | # "<project> v<release> documentation". 124 | #html_title = None 125 | 126 | # A shorter title for the navigation bar.
Default is the same as html_title. 127 | #html_short_title = None 128 | 129 | # The name of an image file (relative to this directory) to place at the top 130 | # of the sidebar. 131 | #html_logo = None 132 | 133 | # The name of an image file (relative to this directory) to use as a favicon of 134 | # the docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 135 | # pixels large. 136 | #html_favicon = None 137 | 138 | # Add any paths that contain custom static files (such as style sheets) here, 139 | # relative to this directory. They are copied after the builtin static files, 140 | # so a file named "default.css" will overwrite the builtin "default.css". 141 | html_static_path = ['_static'] 142 | 143 | # Add any extra paths that contain custom files (such as robots.txt or 144 | # .htaccess) here, relative to this directory. These files are copied 145 | # directly to the root of the documentation. 146 | #html_extra_path = [] 147 | 148 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 149 | # using the given strftime format. 150 | #html_last_updated_fmt = '%b %d, %Y' 151 | 152 | # If true, SmartyPants will be used to convert quotes and dashes to 153 | # typographically correct entities. 154 | #html_use_smartypants = True 155 | 156 | # Custom sidebar templates, maps document names to template names. 157 | #html_sidebars = {} 158 | 159 | # Additional templates that should be rendered to pages, maps page names to 160 | # template names. 161 | #html_additional_pages = {} 162 | 163 | # If false, no module index is generated. 164 | #html_domain_indices = True 165 | 166 | # If false, no index is generated. 167 | #html_use_index = True 168 | 169 | # If true, the index is split into individual pages for each letter. 170 | #html_split_index = False 171 | 172 | # If true, links to the reST sources are added to the pages. 
173 | #html_show_sourcelink = True 174 | 175 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 176 | #html_show_sphinx = True 177 | 178 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 179 | #html_show_copyright = True 180 | 181 | # If true, an OpenSearch description file will be output, and all pages will 182 | # contain a <link> tag referring to it. The value of this option must be the 183 | # base URL from which the finished HTML is served. 184 | #html_use_opensearch = '' 185 | 186 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 187 | #html_file_suffix = None 188 | 189 | # Language to be used for generating the HTML full-text search index. 190 | # Sphinx supports the following languages: 191 | # 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja' 192 | # 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr' 193 | #html_search_language = 'en' 194 | 195 | # A dictionary with options for the search language support, empty by default. 196 | # Now only 'ja' uses this config value 197 | #html_search_options = {'type': 'default'} 198 | 199 | # The name of a javascript file (relative to the configuration directory) that 200 | # implements a search results scorer. If empty, the default will be used. 201 | #html_search_scorer = 'scorer.js' 202 | 203 | # Output file base name for HTML help builder. 204 | htmlhelp_basename = 'pyensembldoc' 205 | 206 | # -- Options for LaTeX output --------------------------------------------- 207 | 208 | latex_elements = { 209 | # The paper size ('letterpaper' or 'a4paper'). 210 | #'papersize': 'letterpaper', 211 | 212 | # The font size ('10pt', '11pt' or '12pt'). 213 | #'pointsize': '10pt', 214 | 215 | # Additional stuff for the LaTeX preamble. 216 | #'preamble': '', 217 | 218 | # Latex figure (float) alignment 219 | #'figure_align': 'htbp', 220 | } 221 | 222 | # Grouping the document tree into LaTeX files.
List of tuples 223 | # (source start file, target name, title, 224 | # author, documentclass [howto, manual, or own class]). 225 | latex_documents = [ 226 | (master_doc, 'pyensembl.tex', u'pyensembl Documentation', 227 | u'Hammer Lab', 'manual'), 228 | ] 229 | 230 | # The name of an image file (relative to this directory) to place at the top of 231 | # the title page. 232 | #latex_logo = None 233 | 234 | # For "manual" documents, if this is true, then toplevel headings are parts, 235 | # not chapters. 236 | #latex_use_parts = False 237 | 238 | # If true, show page references after internal links. 239 | #latex_show_pagerefs = False 240 | 241 | # If true, show URL addresses after external links. 242 | #latex_show_urls = False 243 | 244 | # Documents to append as an appendix to all manuals. 245 | #latex_appendices = [] 246 | 247 | # If false, no module index is generated. 248 | #latex_domain_indices = True 249 | 250 | 251 | # -- Options for manual page output --------------------------------------- 252 | 253 | # One entry per manual page. List of tuples 254 | # (source start file, name, description, authors, manual section). 255 | man_pages = [ 256 | (master_doc, 'pyensembl', u'pyensembl Documentation', 257 | [author], 1) 258 | ] 259 | 260 | # If true, show URL addresses after external links. 261 | #man_show_urls = False 262 | 263 | 264 | # -- Options for Texinfo output ------------------------------------------- 265 | 266 | # Grouping the document tree into Texinfo files. List of tuples 267 | # (source start file, target name, title, author, 268 | # dir menu entry, description, category) 269 | texinfo_documents = [ 270 | (master_doc, 'pyensembl', u'pyensembl Documentation', 271 | author, 'pyensembl', 'One line description of project.', 272 | 'Miscellaneous'), 273 | ] 274 | 275 | # Documents to append as an appendix to all manuals. 276 | #texinfo_appendices = [] 277 | 278 | # If false, no module index is generated. 
279 | #texinfo_domain_indices = True 280 | 281 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 282 | #texinfo_show_urls = 'footnote' 283 | 284 | # If true, do not generate a @detailmenu in the "Top" node's menu. 285 | #texinfo_no_detailmenu = False 286 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. pyensembl documentation master file, created by 2 | sphinx-quickstart on Sat Mar 26 22:47:25 2016. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to pyensembl's documentation! 7 | ===================================== 8 | 9 | Contents: 10 | 11 | .. toctree:: 12 | :maxdepth: 2 13 | 14 | modules 15 | 16 | Indices and tables 17 | ================== 18 | 19 | * :ref:`genindex` 20 | * :ref:`modindex` 21 | * :ref:`search` 22 | 23 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=_build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . 10 | set I18NSPHINXOPTS=%SPHINXOPTS% . 11 | if NOT "%PAPER%" == "" ( 12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 14 | ) 15 | 16 | if "%1" == "" goto help 17 | 18 | if "%1" == "help" ( 19 | :help 20 | echo.Please use `make ^` where ^ is one of 21 | echo. html to make standalone HTML files 22 | echo. dirhtml to make HTML files named index.html in directories 23 | echo. singlehtml to make a single large HTML file 24 | echo. pickle to make pickle files 25 | echo. json to make JSON files 26 | echo. 
htmlhelp to make HTML files and a HTML help project 27 | echo. qthelp to make HTML files and a qthelp project 28 | echo. devhelp to make HTML files and a Devhelp project 29 | echo. epub to make an epub 30 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 31 | echo. text to make text files 32 | echo. man to make manual pages 33 | echo. texinfo to make Texinfo files 34 | echo. gettext to make PO message catalogs 35 | echo. changes to make an overview over all changed/added/deprecated items 36 | echo. xml to make Docutils-native XML files 37 | echo. pseudoxml to make pseudoxml-XML files for display purposes 38 | echo. linkcheck to check all external links for integrity 39 | echo. doctest to run all doctests embedded in the documentation if enabled 40 | echo. coverage to run coverage check of the documentation if enabled 41 | goto end 42 | ) 43 | 44 | if "%1" == "clean" ( 45 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 46 | del /q /s %BUILDDIR%\* 47 | goto end 48 | ) 49 | 50 | 51 | REM Check if sphinx-build is available and fallback to Python version if any 52 | %SPHINXBUILD% 1>NUL 2>NUL 53 | if errorlevel 9009 goto sphinx_python 54 | goto sphinx_ok 55 | 56 | :sphinx_python 57 | 58 | set SPHINXBUILD=python -m sphinx.__init__ 59 | %SPHINXBUILD% 2> nul 60 | if errorlevel 9009 ( 61 | echo. 62 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 63 | echo.installed, then set the SPHINXBUILD environment variable to point 64 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 65 | echo.may add the Sphinx directory to PATH. 66 | echo. 67 | echo.If you don't have Sphinx installed, grab it from 68 | echo.http://sphinx-doc.org/ 69 | exit /b 1 70 | ) 71 | 72 | :sphinx_ok 73 | 74 | 75 | if "%1" == "html" ( 76 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 77 | if errorlevel 1 exit /b 1 78 | echo. 79 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 
80 | goto end 81 | ) 82 | 83 | if "%1" == "dirhtml" ( 84 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 85 | if errorlevel 1 exit /b 1 86 | echo. 87 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 88 | goto end 89 | ) 90 | 91 | if "%1" == "singlehtml" ( 92 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 93 | if errorlevel 1 exit /b 1 94 | echo. 95 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 96 | goto end 97 | ) 98 | 99 | if "%1" == "pickle" ( 100 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 101 | if errorlevel 1 exit /b 1 102 | echo. 103 | echo.Build finished; now you can process the pickle files. 104 | goto end 105 | ) 106 | 107 | if "%1" == "json" ( 108 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 109 | if errorlevel 1 exit /b 1 110 | echo. 111 | echo.Build finished; now you can process the JSON files. 112 | goto end 113 | ) 114 | 115 | if "%1" == "htmlhelp" ( 116 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 117 | if errorlevel 1 exit /b 1 118 | echo. 119 | echo.Build finished; now you can run HTML Help Workshop with the ^ 120 | .hhp project file in %BUILDDIR%/htmlhelp. 121 | goto end 122 | ) 123 | 124 | if "%1" == "qthelp" ( 125 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 126 | if errorlevel 1 exit /b 1 127 | echo. 128 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 129 | .qhcp project file in %BUILDDIR%/qthelp, like this: 130 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\pyensembl.qhcp 131 | echo.To view the help file: 132 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\pyensembl.ghc 133 | goto end 134 | ) 135 | 136 | if "%1" == "devhelp" ( 137 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 138 | if errorlevel 1 exit /b 1 139 | echo. 140 | echo.Build finished. 
141 | goto end 142 | ) 143 | 144 | if "%1" == "epub" ( 145 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 146 | if errorlevel 1 exit /b 1 147 | echo. 148 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 149 | goto end 150 | ) 151 | 152 | if "%1" == "latex" ( 153 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 154 | if errorlevel 1 exit /b 1 155 | echo. 156 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 157 | goto end 158 | ) 159 | 160 | if "%1" == "latexpdf" ( 161 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 162 | cd %BUILDDIR%/latex 163 | make all-pdf 164 | cd %~dp0 165 | echo. 166 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 167 | goto end 168 | ) 169 | 170 | if "%1" == "latexpdfja" ( 171 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 172 | cd %BUILDDIR%/latex 173 | make all-pdf-ja 174 | cd %~dp0 175 | echo. 176 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 177 | goto end 178 | ) 179 | 180 | if "%1" == "text" ( 181 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 182 | if errorlevel 1 exit /b 1 183 | echo. 184 | echo.Build finished. The text files are in %BUILDDIR%/text. 185 | goto end 186 | ) 187 | 188 | if "%1" == "man" ( 189 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 190 | if errorlevel 1 exit /b 1 191 | echo. 192 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 193 | goto end 194 | ) 195 | 196 | if "%1" == "texinfo" ( 197 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 198 | if errorlevel 1 exit /b 1 199 | echo. 200 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 201 | goto end 202 | ) 203 | 204 | if "%1" == "gettext" ( 205 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 206 | if errorlevel 1 exit /b 1 207 | echo. 208 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 
209 | goto end 210 | ) 211 | 212 | if "%1" == "changes" ( 213 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 214 | if errorlevel 1 exit /b 1 215 | echo. 216 | echo.The overview file is in %BUILDDIR%/changes. 217 | goto end 218 | ) 219 | 220 | if "%1" == "linkcheck" ( 221 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 222 | if errorlevel 1 exit /b 1 223 | echo. 224 | echo.Link check complete; look for any errors in the above output ^ 225 | or in %BUILDDIR%/linkcheck/output.txt. 226 | goto end 227 | ) 228 | 229 | if "%1" == "doctest" ( 230 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 231 | if errorlevel 1 exit /b 1 232 | echo. 233 | echo.Testing of doctests in the sources finished, look at the ^ 234 | results in %BUILDDIR%/doctest/output.txt. 235 | goto end 236 | ) 237 | 238 | if "%1" == "coverage" ( 239 | %SPHINXBUILD% -b coverage %ALLSPHINXOPTS% %BUILDDIR%/coverage 240 | if errorlevel 1 exit /b 1 241 | echo. 242 | echo.Testing of coverage in the sources finished, look at the ^ 243 | results in %BUILDDIR%/coverage/python.txt. 244 | goto end 245 | ) 246 | 247 | if "%1" == "xml" ( 248 | %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml 249 | if errorlevel 1 exit /b 1 250 | echo. 251 | echo.Build finished. The XML files are in %BUILDDIR%/xml. 252 | goto end 253 | ) 254 | 255 | if "%1" == "pseudoxml" ( 256 | %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml 257 | if errorlevel 1 exit /b 1 258 | echo. 259 | echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. 260 | goto end 261 | ) 262 | 263 | :end 264 | -------------------------------------------------------------------------------- /docs/modules.rst: -------------------------------------------------------------------------------- 1 | pyensembl 2 | ========= 3 | 4 | .. 
toctree:: 5 | :maxdepth: 4 6 | 7 | pyensembl 8 | -------------------------------------------------------------------------------- /docs/pyensembl.rst: -------------------------------------------------------------------------------- 1 | pyensembl package 2 | ================= 3 | 4 | Submodules 5 | ---------- 6 | 7 | pyensembl.biotypes module 8 | ------------------------- 9 | 10 | .. automodule:: pyensembl.biotypes 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | pyensembl.common module 16 | ----------------------- 17 | 18 | .. automodule:: pyensembl.common 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | pyensembl.database module 24 | ------------------------- 25 | 26 | .. automodule:: pyensembl.database 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | pyensembl.download_cache module 32 | ------------------------------- 33 | 34 | .. automodule:: pyensembl.download_cache 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | pyensembl.ensembl_release module 40 | -------------------------------- 41 | 42 | .. automodule:: pyensembl.ensembl_release 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | 47 | pyensembl.ensembl_versions module 48 | ----------------------------------------- 49 | 50 | .. automodule:: pyensembl.ensembl_versions 51 | :members: 52 | :undoc-members: 53 | :show-inheritance: 54 | 55 | pyensembl.ensembl_url_templates module 56 | -------------------------------------- 57 | 58 | .. automodule:: pyensembl.ensembl_url_templates 59 | :members: 60 | :undoc-members: 61 | :show-inheritance: 62 | 63 | pyensembl.exon module 64 | --------------------- 65 | 66 | .. automodule:: pyensembl.exon 67 | :members: 68 | :undoc-members: 69 | :show-inheritance: 70 | 71 | pyensembl.gene module 72 | --------------------- 73 | 74 | .. 
automodule:: pyensembl.gene 75 | :members: 76 | :undoc-members: 77 | :show-inheritance: 78 | 79 | pyensembl.genome module 80 | ----------------------- 81 | 82 | .. automodule:: pyensembl.genome 83 | :members: 84 | :undoc-members: 85 | :show-inheritance: 86 | 87 | pyensembl.gtf module 88 | -------------------- 89 | 90 | .. automodule:: pyensembl.gtf 91 | :members: 92 | :undoc-members: 93 | :show-inheritance: 94 | 95 | pyensembl.locus module 96 | ---------------------- 97 | 98 | .. automodule:: pyensembl.locus 99 | :members: 100 | :undoc-members: 101 | :show-inheritance: 102 | 103 | pyensembl.memory_cache module 104 | ----------------------------- 105 | 106 | .. automodule:: pyensembl.memory_cache 107 | :members: 108 | :undoc-members: 109 | :show-inheritance: 110 | 111 | pyensembl.search module 112 | ----------------------- 113 | 114 | .. automodule:: pyensembl.search 115 | :members: 116 | :undoc-members: 117 | :show-inheritance: 118 | 119 | pyensembl.sequence_data module 120 | ------------------------------ 121 | 122 | .. automodule:: pyensembl.sequence_data 123 | :members: 124 | :undoc-members: 125 | :show-inheritance: 126 | 127 | pyensembl.shell module 128 | ---------------------- 129 | 130 | .. automodule:: pyensembl.shell 131 | :members: 132 | :undoc-members: 133 | :show-inheritance: 134 | 135 | pyensembl.species module 136 | ------------------------ 137 | 138 | .. automodule:: pyensembl.species 139 | :members: 140 | :undoc-members: 141 | :show-inheritance: 142 | 143 | pyensembl.transcript module 144 | --------------------------- 145 | 146 | .. automodule:: pyensembl.transcript 147 | :members: 148 | :undoc-members: 149 | :show-inheritance: 150 | 151 | 152 | Module contents 153 | --------------- 154 | 155 | .. 
automodule:: pyensembl 156 | :members: 157 | :undoc-members: 158 | :show-inheritance: 159 | -------------------------------------------------------------------------------- /lint-and-test.sh: -------------------------------------------------------------------------------- 1 | ./lint.sh && ./test.sh 2 | -------------------------------------------------------------------------------- /lint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -o errexit 3 | 4 | 5 | # disabling several categories of errors due to false positives in pylint, 6 | # see these issues: 7 | # - https://bitbucket.org/logilab/pylint/issues/701/false-positives-with-not-an-iterable-and 8 | # - https://bitbucket.org/logilab/pylint/issues/58 9 | 10 | find pyensembl -name '*.py' \ 11 | | xargs pylint \ 12 | --errors-only \ 13 | --disable=unsubscriptable-object,not-an-iterable,no-member 14 | 15 | echo 'Passes pylint check' 16 | -------------------------------------------------------------------------------- /pyensembl/__init__.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 
12 | 13 | from .database import Database 14 | from .download_cache import DownloadCache 15 | from .ensembl_release import EnsemblRelease, cached_release 16 | from .ensembl_versions import MAX_ENSEMBL_RELEASE 17 | from .exon import Exon 18 | from .genome import Genome 19 | from .gene import Gene 20 | from .locus import Locus 21 | from .reference_name import ( 22 | ensembl_grch36, 23 | ensembl_grch37, 24 | ensembl_grch38, 25 | normalize_reference_name, 26 | find_species_by_reference, 27 | which_reference, 28 | genome_for_reference_name, 29 | ) 30 | 31 | from .search import find_nearest_locus 32 | from .sequence_data import SequenceData 33 | from .species import find_species_by_name, check_species_object, normalize_species_name 34 | from .transcript import Transcript 35 | from .version import __version__ 36 | 37 | __all__ = [ 38 | "__version__", 39 | "DownloadCache", 40 | "Database", 41 | "EnsemblRelease", 42 | "cached_release", 43 | "MAX_ENSEMBL_RELEASE", 44 | "Gene", 45 | "Transcript", 46 | "Exon", 47 | "SequenceData", 48 | "find_nearest_locus", 49 | "find_species_by_name", 50 | "find_species_by_reference", 51 | "genome_for_reference_name", 52 | "which_reference", 53 | "check_species_object", 54 | "normalize_reference_name", 55 | "normalize_species_name", 56 | "Genome", 57 | "Locus", 58 | "Exon", 59 | "ensembl_grch36", 60 | "ensembl_grch37", 61 | "ensembl_grch38", 62 | ] 63 | -------------------------------------------------------------------------------- /pyensembl/common.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pickle

from functools import wraps


def dump_pickle(obj, filepath):
    """Pickle `obj` into the file at `filepath`, overwriting it.

    The file is opened in binary write mode and closed on exit, even if
    pickling raises.
    """
    with open(filepath, "wb") as f:
        # use lower protocol for compatibility between Python 2 and Python 3
        pickle.dump(obj, file=f, protocol=2)


def load_pickle(filepath):
    """Return the object stored in the pickle file at `filepath`.

    NOTE(review): unpickling can execute arbitrary code, so this should
    only be pointed at files the application wrote itself.
    """
    with open(filepath, "rb") as f:
        return pickle.load(f)


def _hashable_cache_value(value):
    """Return a hashable stand-in for `value` to be used inside a cache key.

    Hashable values are returned unchanged. Lists become tuples of their
    elements. Anything that is still unhashable (dict, set, a list holding
    unhashable items, ...) is represented by ``(type(value), repr(value))``.
    """
    if isinstance(value, list):
        # lists are the common unhashable argument; a tuple keeps the
        # elements themselves in the key
        value = tuple(value)
    try:
        hash(value)
    except TypeError:
        return (type(value), repr(value))
    return value


def _memoize_cache_key(args, kwargs):
    """Turn args tuple and kwargs dictionary into a hashable key.

    Expects that all arguments to a memoized function are either hashable
    or can be uniquely identified from type(arg) and repr(arg).

    The key is a flat tuple: the positional values in call order followed
    by ``(name, value)`` pairs for the keyword arguments sorted by name.

    NOTE: because positional values and keyword pairs share one flat tuple,
    a positional argument equal to ``("a", 1)`` and the keyword argument
    ``a=1`` produce the same key. The layout is left as it was so keys
    built for already-supported inputs do not change.
    """
    positional = [_hashable_cache_value(arg) for arg in args]
    keyword = [
        (name, _hashable_cache_value(value))
        for name, value in sorted(kwargs.items())
    ]
    return tuple(positional + keyword)


def memoize(fn):
    """Simple reset-able memoization decorator for functions and methods.

    Results are stored in a dict keyed by `_memoize_cache_key(args, kwargs)`;
    the dict is never trimmed except through `clear_cache`.

    The returned wrapper carries three extra attributes:
      - ``clear_cache()``: empty the cache
      - ``cache``: the underlying dict of key -> result
      - ``make_cache_key(args, kwargs)``: build the key the wrapper would
        use for a call, so callers can test membership in ``cache``
    """
    cache = {}

    @wraps(fn)
    def wrapped_fn(*args, **kwargs):
        cache_key = _memoize_cache_key(args, kwargs)
        try:
            return cache[cache_key]
        except KeyError:
            value = fn(*args, **kwargs)
            cache[cache_key] = value
            return value

    def clear_cache():
        cache.clear()

    # Needed to ensure that EnsemblRelease.clear_cache
    # is able to clear memoized values from each of its methods
    wrapped_fn.clear_cache = clear_cache
    # expose the cache so we can check if an item has already been computed
    wrapped_fn.cache = cache
    # if we want to check whether an item is in the cache, first need
    # to construct the same cache key as used by wrapped_fn
    wrapped_fn.make_cache_key = _memoize_cache_key
    return wrapped_fn


# Remainder of the original dump line (start of the next file), kept verbatim:
# 82 | -------------------------------------------------------------------------------- /pyensembl/ensembl_release.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | """ 14 | Contains the EnsemblRelease class, which extends the Genome class 15 | to be specific to a particular release of Ensembl.
"""
Contains the EnsemblRelease class, which extends the Genome class
to be specific to a particular release of Ensembl.
"""
from weakref import WeakValueDictionary

from .genome import Genome
from .ensembl_versions import check_release_number, MAX_ENSEMBL_RELEASE
from .species import check_species_object, human

from .ensembl_url_templates import ENSEMBL_FTP_SERVER, make_gtf_url, make_fasta_url


class EnsemblRelease(Genome):
    """
    Bundles together the genomic annotation and sequence data associated with
    a particular release of the Ensembl database.
    """

    @classmethod
    def normalize_init_values(cls, release, species, server):
        """
        Normalizes the arguments which uniquely specify an EnsemblRelease
        genome.

        Returns a ``(release, species, server)`` tuple in which the release
        has been passed through check_release_number and the species through
        check_species_object; the server is returned unchanged.
        """
        release = check_release_number(release)
        species = check_species_object(species)
        return (release, species, server)

    # Using a WeakValueDictionary instead of an ordinary dict to prevent a
    # memory leak in cases where we test many different releases in sequence.
    # When all the references to a particular EnsemblRelease die then that
    # genome should also be removed from this cache.
    _genome_cache = WeakValueDictionary()

    @classmethod
    def cached(
        cls, release=MAX_ENSEMBL_RELEASE, species=human, server=ENSEMBL_FTP_SERVER
    ):
        """
        Construct EnsemblRelease if it's never been made before, otherwise
        return an old instance.
        """
        init_args_tuple = cls.normalize_init_values(release, species, server)
        # Single lookup instead of "key in cache" followed by "cache[key]":
        # entries of a WeakValueDictionary can vanish between the two steps
        # when the last strong reference to the genome is dropped.
        genome = cls._genome_cache.get(init_args_tuple)
        if genome is None:
            genome = cls(*init_args_tuple)
            cls._genome_cache[init_args_tuple] = genome
        return genome

    def __init__(
        self, release=MAX_ENSEMBL_RELEASE, species=human, server=ENSEMBL_FTP_SERVER
    ):
        """
        Parameters
        ----------
        release : int or str
            Ensembl release number, validated by check_release_number.

        species : Species or str
            Species object or name, resolved by check_species_object.

        server : str
            Base URL of the server from which GTF/FASTA files are fetched.
        """
        self.release, self.species, self.server = self.normalize_init_values(
            release=release, species=species, server=server
        )

        self.gtf_url = make_gtf_url(
            ensembl_release=self.release, species=self.species, server=self.server
        )

        def fasta_url(sequence_type):
            # every FASTA URL is built from the *normalized* init values
            return make_fasta_url(
                ensembl_release=self.release,
                species=self.species.latin_name,
                sequence_type=sequence_type,
                server=self.server,
                is_plant=self.species.is_plant,
            )

        self.transcript_fasta_urls = [fasta_url("cdna"), fasta_url("ncrna")]
        self.protein_fasta_urls = [fasta_url("pep")]

        self.reference_name = self.species.which_reference(self.release)

        Genome.__init__(
            self,
            reference_name=self.reference_name,
            annotation_name="ensembl",
            annotation_version=self.release,
            gtf_path_or_url=self.gtf_url,
            transcript_fasta_paths_or_urls=self.transcript_fasta_urls,
            protein_fasta_paths_or_urls=self.protein_fasta_urls,
        )

    def install_string(self):
        """Return the shell command which installs the data of this release."""
        return "pyensembl install --release %d --species %s" % (
            self.release,
            self.species.latin_name,
        )

    def __str__(self):
        return "EnsemblRelease(release=%d, species='%s')" % (
            self.release,
            self.species.latin_name,
        )

    def __eq__(self, other):
        # the server is deliberately not part of the identity of a release
        return (
            other.__class__ is EnsemblRelease
            and self.release == other.release
            and self.species == other.species
        )

    def __hash__(self):
        return hash((self.release, self.species))

    def to_dict(self):
        """Return the keyword arguments needed to re-create this object."""
        return {"release": self.release, "species": self.species, "server": self.server}

    @classmethod
    def from_dict(cls, state_dict):
        """
        Deserialize EnsemblRelease without creating duplicate instances.
        """
        return cls.cached(**state_dict)


def cached_release(release, species="human"):
    """
    Create an EnsemblRelease instance only if it hasn't already been made,
    otherwise return the old instance.
    Keeping this function for backwards compatibility but this functionality
    has been moved into the cached method of EnsemblRelease.
    """
    return EnsemblRelease.cached(release=release, species=species)


# --------------------------------------------------------------------------------
# /pyensembl/ensembl_url_templates.py:
# --------------------------------------------------------------------------------
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Templates for URLs and paths to specific release, species, and file type
on the Ensembl ftp server.

For example, the human chromosomal DNA sequences for release 78 are in:

    https://ftp.ensembl.org/pub/release-78/fasta/homo_sapiens/dna/

"""

from .species import Species, find_species_by_name
from .ensembl_versions import check_release_number

ENSEMBL_FTP_SERVER = "https://ftp.ensembl.org"
ENSEMBL_PLANTS_FTP_SERVER = "https://ftp.ensemblgenomes.ebi.ac.uk/"

# Example directories
# FASTA files: /pub/release-78/fasta/homo_sapiens/
# GTF annotation files: /pub/release-78/gtf/homo_sapiens/
FASTA_SUBDIR_TEMPLATE = "/pub/release-%(release)d/fasta/%(species)s/%(type)s/"
PLANTS_FASTA_SUBDIR_TEMPLATE = "/pub/release-%(release)d/plants/fasta/%(species)s/%(type)s/"
GTF_SUBDIR_TEMPLATE = "/pub/release-%(release)d/gtf/%(species)s/"
PLANTS_GTF_SUBDIR_TEMPLATE = "/pub/release-%(release)d/plants/gtf/%(species)s/"

# Names of the plant species for which custom (Ensembl Plants) URLs are built.
# Nothing in this module reads this tuple.
lPlants = ("arabidopsis_thaliana", "arabidopsis")


def normalize_release_properties(ensembl_release, species):
    """
    Make sure a given release is valid, normalize it to be an integer,
    normalize the species name, and get its associated reference.

    Returns a tuple ``(release, latin_name, reference_name)``.
    """
    ensembl_release = check_release_number(ensembl_release)
    if not isinstance(species, Species):
        species = find_species_by_name(species)
    reference_name = species.which_reference(ensembl_release)
    return ensembl_release, species.latin_name, reference_name


def _join_url(server, subdir, filename):
    """Concatenate server, directory and filename without producing '//'."""
    if server.endswith("/") and subdir.startswith("/"):
        server = server.rstrip("/")
    return server + subdir + filename


# GTF annotation file example: Homo_sapiens.GRCh38.78.gtf.gz
GTF_FILENAME_TEMPLATE = "%(Species)s.%(reference)s.%(release)d.gtf.gz"


def make_gtf_filename(ensembl_release, species):
    """
    Return the GTF filename expected on the Ensembl FTP server for a specific
    species/release combination
    """
    ensembl_release, species, reference_name = normalize_release_properties(
        ensembl_release, species
    )
    return GTF_FILENAME_TEMPLATE % {
        "Species": species.capitalize(),
        "reference": reference_name,
        "release": ensembl_release,
    }


def make_gtf_url(ensembl_release, species, server=ENSEMBL_FTP_SERVER, gtf_subdir=GTF_SUBDIR_TEMPLATE):
    """
    Return the full URL of the GTF annotation file of a species/release.

    Parameters
    ----------
    ensembl_release : int or str
        Release number, validated by check_release_number.

    species : Species or str
        Species object or any name accepted by find_species_by_name.

    server : str
        Base URL of the server; replaced by ENSEMBL_PLANTS_FTP_SERVER when
        the species is a plant.

    gtf_subdir : str
        %-template of the directory; replaced by PLANTS_GTF_SUBDIR_TEMPLATE
        when the species is a plant.
    """
    # resolve names to Species objects first, a plain string has no is_plant
    if not isinstance(species, Species):
        species = find_species_by_name(species)

    if species.is_plant:
        server = ENSEMBL_PLANTS_FTP_SERVER
        gtf_subdir = PLANTS_GTF_SUBDIR_TEMPLATE

    ensembl_release, species_name, _ = normalize_release_properties(
        ensembl_release, species
    )
    subdir = gtf_subdir % {"release": ensembl_release, "species": species_name}
    filename = make_gtf_filename(ensembl_release=ensembl_release, species=species_name)
    return _join_url(server, subdir, filename)


# cDNA & protein FASTA file for releases before (and including) Ensembl 75
# example: Homo_sapiens.NCBI36.54.cdna.all.fa.gz
OLD_FASTA_FILENAME_TEMPLATE = (
    "%(Species)s.%(reference)s.%(release)d.%(sequence_type)s.all.fa.gz"
)

# ncRNA FASTA file for releases before (and including) Ensembl 75
# example: Homo_sapiens.NCBI36.54.ncrna.fa.gz

OLD_FASTA_FILENAME_TEMPLATE_NCRNA = "%(Species)s.%(reference)s.%(release)d.ncrna.fa.gz"

# cDNA & protein FASTA file for releases after Ensembl 75
# example: Homo_sapiens.GRCh37.cdna.all.fa.gz
NEW_FASTA_FILENAME_TEMPLATE = "%(Species)s.%(reference)s.%(sequence_type)s.all.fa.gz"

# ncRNA FASTA file for releases after Ensembl 75
# example: Homo_sapiens.GRCh37.ncrna.fa.gz
NEW_FASTA_FILENAME_TEMPLATE_NCRNA = "%(Species)s.%(reference)s.ncrna.fa.gz"


def make_fasta_filename(ensembl_release, species, sequence_type, is_plant=False):
    """
    Return the name of the FASTA file of a species/release/sequence type.

    Non-plant releases up to and including 75 carry the release number in
    the filename; later releases and all plant files do not. ncRNA files
    have no ".all" part in either scheme.
    """
    ensembl_release, species, reference_name = normalize_release_properties(
        ensembl_release, species
    )
    is_ncrna = sequence_type == "ncrna"
    if ensembl_release <= 75 and not is_plant:
        template = (
            OLD_FASTA_FILENAME_TEMPLATE_NCRNA if is_ncrna else OLD_FASTA_FILENAME_TEMPLATE
        )
    else:
        template = (
            NEW_FASTA_FILENAME_TEMPLATE_NCRNA if is_ncrna else NEW_FASTA_FILENAME_TEMPLATE
        )
    # %-formatting with a mapping ignores the keys a template does not use
    return template % {
        "Species": species.capitalize(),
        "reference": reference_name,
        "release": ensembl_release,
        "sequence_type": sequence_type,
    }


def make_fasta_url(ensembl_release, species, sequence_type, is_plant=False, server=ENSEMBL_FTP_SERVER, fasta_subdir=FASTA_SUBDIR_TEMPLATE):
    """Construct URL to FASTA file with cDNA transcript or protein sequences

    Parameter examples:
        ensembl_release = 75
        species = "Homo_sapiens"
        sequence_type = "cdna" (other option: "pep")
        is_plant = False (True switches to the Ensembl Plants server/paths)
    """
    ensembl_release, species, reference_name = normalize_release_properties(
        ensembl_release, species
    )

    if is_plant:
        server = ENSEMBL_PLANTS_FTP_SERVER
        fasta_subdir = PLANTS_FASTA_SUBDIR_TEMPLATE

    subdir = fasta_subdir % {
        "release": ensembl_release,
        "species": species,
        "type": sequence_type,
    }
    filename = make_fasta_filename(
        ensembl_release=ensembl_release,
        species=species,
        sequence_type=sequence_type,
        is_plant=is_plant,
    )
    return _join_url(server, subdir, filename)


# --------------------------------------------------------------------------------
# /pyensembl/ensembl_versions.py:
# --------------------------------------------------------------------------------
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

MIN_ENSEMBL_RELEASE = 40
MAX_ENSEMBL_RELEASE = 111
MAX_PLANTS_ENSEMBL_RELEASE = 58


def check_release_number(release):
    """
    Check to make sure a release is in the valid range of
    Ensembl releases.

    Parameters
    ----------
    release : int or str
        Anything ``int()`` can convert, e.g. ``75`` or ``"75"``.

    Returns the release as an int.

    Raises ValueError if the value cannot be converted to an integer or is
    smaller than MIN_ENSEMBL_RELEASE. No upper bound is enforced.
    """
    try:
        release = int(release)
    except (TypeError, ValueError, OverflowError):
        # wrap in a 1-tuple so that tuple inputs are formatted, not unpacked
        raise ValueError("Invalid Ensembl release: %s" % (release,)) from None

    if release < MIN_ENSEMBL_RELEASE:
        raise ValueError(
            "Invalid Ensembl release %d, must be at least %d"
            % (release, MIN_ENSEMBL_RELEASE)
        )
    return release


# --------------------------------------------------------------------------------
# /pyensembl/exon.py:
# --------------------------------------------------------------------------------
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from .locus import Locus


class Exon(Locus):
    """
    Locus of a single exon, annotated with the ID of the exon and with the
    name and ID of the gene it belongs to.
    """

    def __init__(self, exon_id, contig, start, end, strand, gene_name, gene_id):
        Locus.__init__(self, contig, start, end, strand)
        self.exon_id = exon_id
        self.gene_name = gene_name
        self.gene_id = gene_id

    @property
    def id(self):
        """
        Alias for exon_id necessary for backward compatibility.
        """
        return self.exon_id

    def __str__(self):
        return (
            "Exon(exon_id='%s',"
            " gene_id='%s',"
            " gene_name='%s',"
            " contig='%s',"
            " start=%d,"
            " end=%s,"
            " strand='%s')"
        ) % (
            self.exon_id,
            self.gene_id,
            self.gene_name,
            self.contig,
            self.start,
            self.end,
            self.strand,
        )

    def __eq__(self, other):
        """
        Two exons are equal when they share ID, contig, coordinates and
        strand. Raises TypeError when compared with a non-Exon object.
        """
        if not isinstance(other, Exon):
            # type(other) rather than a double-underscore attribute: inside
            # a class body "other.__class" is name-mangled and would fail
            # with AttributeError before the TypeError could be raised
            raise TypeError(
                "Cannot compare %s and %s"
                % (self.__class__.__name__, type(other).__name__)
            )
        return (
            self.contig == other.contig
            and self.start == other.start
            and self.end == other.end
            and self.strand == other.strand
            and self.id == other.id
        )

    def __hash__(self):
        return hash(self.id)

    def to_dict(self):
        """Return the Locus fields plus exon_id, gene_name and gene_id."""
        state_dict = Locus.to_dict(self)
        state_dict["exon_id"] = self.id
        state_dict["gene_name"] = self.gene_name
        state_dict["gene_id"] = self.gene_id
        return state_dict


# --------------------------------------------------------------------------------
# /pyensembl/fasta.py:
# --------------------------------------------------------------------------------
# Copyright (c) 2015-2016. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
The worst sin in bioinformatics is to write your own FASTA parser.
Unfortunately, small errors creep in to different FASTA files on the
Ensembl FTP server that no proper FASTA parser lets you skip over.
"""


from gzip import GzipFile
import logging


logger = logging.getLogger(__name__)


def _parse_header_id(line):
    """
    Pull the transcript or protein identifier from the header line
    which starts with '>'

    Parameters
    ----------
    line : bytes
        Complete header line including the leading '>'.

    Returns the identifier as a str. Raises TypeError if ``line`` is not
    bytes and ValueError if nothing follows the '>'.
    """
    if not isinstance(line, bytes):
        raise TypeError(
            "Expected header line to be of type %s but got %s" % (bytes, type(line))
        )

    if len(line) <= 1:
        raise ValueError("No identifier on FASTA line")

    # split line at first space to get the unique identifier for
    # this sequence
    space_index = line.find(b" ")
    if space_index >= 0:
        identifier = line[1:space_index]
    else:
        identifier = line[1:]

    # annoyingly Ensembl83 reformatted the transcript IDs of its
    # cDNA FASTA to include sequence version numbers
    # .e.g.
    # "ENST00000448914.1" instead of "ENST00000448914"
    # So now we have to parse out the identifier

    # only split name of ENSEMBL naming. In other database, such as TAIR,
    # the '.1' notation is the isoform not the version.
    if identifier.startswith(b"ENS"):
        dot_index = identifier.find(b".")
        if dot_index >= 0:
            identifier = identifier[:dot_index]

    return identifier.decode("ascii")


class FastaParser(object):
    """
    FastaParser object consumes lines of a FASTA file incrementally
    while building up a dictionary mapping sequence identifiers to sequences.
    """

    def __init__(self):
        self.current_id = None
        self.current_lines = []

    def _reset(self):
        """Forget the record which is currently being accumulated."""
        self.current_id = None
        self.current_lines = []

    def read_file(self, fasta_path):
        """
        Read the contents of a FASTA file into a dictionary
        """
        return dict(self.iterate_over_file(fasta_path))

    def iterate_over_file(self, fasta_path):
        """
        Generator that yields identifiers paired with sequences.
        """
        # Start from a clean slate: without this a parser which is used for
        # a second file would emit the last record of the first file again.
        self._reset()
        with self._open(fasta_path) as f:
            for line in f:
                line = line.rstrip()

                if not line:
                    continue

                # have to slice into a bytes object or else I get a single integer
                first_char = line[0:1]

                if first_char == b">":
                    id_and_seq = self._read_header(line)
                    if id_and_seq is not None:
                        yield id_and_seq

                elif first_char == b";":
                    # semicolon are comment characters
                    continue
                else:
                    self.current_lines.append(line)
        # the last sequence is still in the lines buffer after we're done with
        # the file so make sure to yield it
        id_and_seq = self._current_entry()
        self._reset()
        if id_and_seq is not None:
            yield id_and_seq

    def _open(self, fasta_path):
        """
        Open either a text file or compressed gzip file as a stream of bytes.
        """
        if fasta_path.endswith(("gz", "gzip")):
            return GzipFile(fasta_path, "rb")
        return open(fasta_path, "rb")

    def _current_entry(self):
        """
        Return the ``(identifier, sequence)`` pair accumulated so far, or
        None if there is no current identifier or it has no sequence lines.
        """
        # when we hit a new entry, if this isn't the first
        # entry of the file then put the last one in the dictionary
        if not self.current_id:
            return None
        if not self.current_lines:
            logger.warning("No sequence data for '%s'", self.current_id)
            return None
        sequence = b"".join(self.current_lines).decode("ascii")
        return self.current_id, sequence

    def _read_header(self, line):
        """
        Start a new record from a header line and return the record which
        was completed by it (None if there was none).
        """
        previous_entry = self._current_entry()

        self.current_id = _parse_header_id(line)

        if len(self.current_id) == 0:
            logger.warning("Unable to parse ID from header line: %s", line)

        self.current_lines = []
        return previous_entry


def parse_fasta_dictionary(fasta_path):
    """
    Given a path to a FASTA (or compressed FASTA) file, returns a dictionary
    mapping its sequence identifiers to sequences.

    Parameters
    ----------
    fasta_path : str
        Path to the FASTA file.

    Returns dictionary from string identifiers to string sequences.
    """
    parser = FastaParser()
    return parser.read_file(fasta_path)


# --------------------------------------------------------------------------------
# /pyensembl/gene.py:
# --------------------------------------------------------------------------------
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from memoized_property import memoized_property

from .locus_with_genome import LocusWithGenome


class Gene(LocusWithGenome):
    """
    Locus of a gene together with its ID, name, biotype and the genome it
    was annotated on.
    """

    def __init__(self, gene_id, gene_name, contig, start, end, strand, biotype, genome):
        LocusWithGenome.__init__(
            self,
            contig=contig,
            start=start,
            end=end,
            strand=strand,
            biotype=biotype,
            genome=genome,
        )
        self.gene_id = gene_id
        self.gene_name = gene_name

    @property
    def id(self):
        """
        Alias for gene_id necessary for backwards compatibility.
        """
        return self.gene_id

    @property
    def name(self):
        """
        Alias for gene_name necessary for backwards compatibility.
        """
        return self.gene_name

    def __str__(self):
        return (
            "Gene(gene_id='%s',"
            " gene_name='%s',"
            " biotype='%s',"
            " contig='%s',"
            " start=%d,"
            " end=%d, strand='%s', genome='%s')"
        ) % (
            self.gene_id,
            self.gene_name,
            self.biotype,
            self.contig,
            self.start,
            self.end,
            self.strand,
            self.genome.reference_name,
        )

    def __eq__(self, other):
        return (
            other.__class__ is Gene
            and self.id == other.id
            and self.genome == other.genome
        )

    def __hash__(self):
        return hash(self.id)

    def to_dict(self):
        """Return the LocusWithGenome fields plus gene_id and gene_name."""
        state_dict = LocusWithGenome.to_dict(self)
        state_dict["gene_id"] = self.gene_id
        state_dict["gene_name"] = self.gene_name
        return state_dict

    @memoized_property
    def transcripts(self):
        """
        Property which dynamically construct transcript objects for all
        transcript IDs associated with this gene.
        """
        transcript_id_results = self.db.query(
            select_column_names=["transcript_id"],
            filter_column="gene_id",
            filter_value=self.id,
            feature="transcript",
            distinct=False,
            required=False,
        )

        # We're doing a SQL query for each transcript ID to fetch
        # its particular information, might be more efficient if we
        # just get all the columns here, but how do we keep that modular?
        return [
            self.genome.transcript_by_id(result[0]) for result in transcript_id_results
        ]

    @memoized_property
    def exons(self):
        """
        Sorted list of the distinct exons of all transcripts of this gene.
        """
        exon_set = {
            exon for transcript in self.transcripts for exon in transcript.exons
        }
        return sorted(exon_set)


# --------------------------------------------------------------------------------
# /pyensembl/locus.py:
# --------------------------------------------------------------------------------
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from serializable import Serializable

from .normalization import normalize_chromosome, normalize_strand


class Locus(Serializable):
    """
    Base class for any entity which can be localized at a range of positions
    on a particular strand of a chromosome/contig.
    """

    def __init__(self, contig, start, end, strand):
        """
        contig : str
            Chromosome or other sequence name in the reference assembly

        start : int
            Start position of locus on the contig

        end : int
            Inclusive end position on the contig

        strand : str
            Should we read the locus forwards ('+') or backwards ('-')?
        """

        self.contig = normalize_chromosome(contig)
        self.strand = normalize_strand(strand)

        start = int(start)
        end = int(end)

        if start == 0:
            raise ValueError("Expected start > 0 (using base 1 coordinates)")
        elif end == 0:
            raise ValueError("Expected end > 0 (using base 1 coordinates)")

        if end < start:
            raise ValueError(
                "Expected start <= end, got start = %d, end = %d" % (start, end)
            )
        self.start = start
        self.end = end

    def __str__(self):
        return "Locus(contig='%s', start=%d, end=%d, strand='%s')" % (
            self.contig,
            self.start,
            self.end,
            self.strand,
        )

    def __len__(self):
        return self.end - self.start + 1

    def _require_locus(self, other):
        """Raise TypeError unless ``other`` is a Locus."""
        if not isinstance(other, Locus):
            # type(other) rather than a double-underscore attribute: inside
            # a class body "other.__class" is name-mangled and would fail
            # with AttributeError before the TypeError could be raised
            raise TypeError(
                "Cannot compare %s and %s"
                % (self.__class__.__name__, type(other).__name__)
            )

    def __eq__(self, other):
        """
        Loci are equal when contig, start, end and strand all agree.
        Raises TypeError when compared with a non-Locus object.
        """
        self._require_locus(other)
        return (
            self.contig == other.contig
            and self.start == other.start
            and self.end == other.end
            and self.strand == other.strand
        )

    def to_tuple(self):
        return (self.contig, self.start, self.end, self.strand)

    def __lt__(self, other):
        self._require_locus(other)
        return self.to_tuple() < other.to_tuple()

    def __le__(self, other):
        return (self == other) or (self < other)

    def __gt__(self, other):
        self._require_locus(other)
        return self.to_tuple() > other.to_tuple()

    def __ge__(self, other):
        return (self == other) or (self > other)

    def to_dict(self):
        return {
            "contig": self.contig,
            "start": self.start,
            "end": self.end,
            "strand": self.strand,
        }

    @property
    def length(self):
        return self.end - self.start + 1

    def offset(self, position):
        """Offset of given position from stranded start of this locus.

        For example, if a Locus goes from 10..20 and is on the negative strand,
        then the offset of position 13 is 7, whereas if the Locus is on the
        positive strand, then the offset is 3.

        Raises ValueError if the position lies outside of the locus.
        """
        if position > self.end or position < self.start:
            raise ValueError(
                "Position %d outside valid range %d..%d of %s"
                % (position, self.start, self.end, self)
            )
        elif self.on_forward_strand:
            return position - self.start
        else:
            return self.end - position

    def offset_range(self, start, end):
        """
        Database start/end entries are always ordered such that
        start < end. This makes computing a relative position (e.g. of a stop
        codon relative to its transcript) complicated since the "end"
        position of a backwards locus is actually earlier on the strand.
        This function correctly selects a start vs. end value depending
        on this locus's strand and determines that position's offset from
        the earliest position in this locus.

        Returns a pair of offsets; raises ValueError if start > end or if
        the range is not contained in this locus.
        """
        if start > end:
            raise ValueError(
                "Locus should always have start <= end, got start=%d, end=%d"
                % (start, end)
            )

        if start < self.start or end > self.end:
            raise ValueError("Range (%d, %d) falls outside %s" % (start, end, self))

        if self.on_forward_strand:
            return (start - self.start, end - self.start)

        else:
            return (self.end - end, self.end - start)

    def on_contig(self, contig):
        return normalize_chromosome(contig) == self.contig

    def on_strand(self, strand):
        return normalize_strand(strand) == self.strand

    @property
    def on_forward_strand(self):
        return self.on_strand("+")

    @property
    def on_positive_strand(self):
        return self.on_forward_strand

    @property
    def on_backward_strand(self):
        return self.on_strand("-")

    @property
    def on_negative_strand(self):
        return self.on_backward_strand

    def can_overlap(self, contig, strand=None):
        """
        Is this locus on the same contig and (optionally) on the same strand?
        """
        return self.on_contig(contig) and (strand is None or self.on_strand(strand))

    def distance_to_interval(self, start, end):
        """
        Find the distance between this locus and the interval [start, end].
        If the intervals overlap then the distance is 0.
        """
        if self.start > end:
            # interval is before this locus
            return self.start - end
        elif self.end < start:
            # this locus is before the interval
            return start - self.end
        else:
            return 0

    def distance_to_locus(self, other):
        if not self.can_overlap(other.contig, other.strand):
            # if two loci are on different contigs or strands,
            # can't compute a distance between them
            return float("inf")
        return self.distance_to_interval(other.start, other.end)

    def overlaps(self, contig, start, end, strand=None):
        """
        Does this locus overlap with a given range of positions?

        Since locus position ranges are inclusive, we should make sure
        that e.g. chr1:10-10 overlaps with chr1:10-10
        """
        return (
            self.can_overlap(contig, strand)
            and self.distance_to_interval(start, end) == 0
        )

    def overlaps_locus(self, other_locus):
        return self.overlaps(
            other_locus.contig, other_locus.start, other_locus.end, other_locus.strand
        )

    def contains(self, contig, start, end, strand=None):
        return (
            self.can_overlap(contig, strand) and start >= self.start and end <= self.end
        )

    def contains_locus(self, other_locus):
        return self.contains(
            other_locus.contig, other_locus.start, other_locus.end, other_locus.strand
        )


# --------------------------------------------------------------------------------
# /pyensembl/locus_with_genome.py:
# --------------------------------------------------------------------------------
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | 14 | from .locus import Locus 15 | 16 | 17 | class LocusWithGenome(Locus): 18 | """ 19 | Common base class for Gene and Transcript to avoid copying 20 | their shared logic. 21 | """ 22 | 23 | def __init__(self, contig, start, end, strand, biotype, genome): 24 | Locus.__init__(self, contig, start, end, strand) 25 | self.genome = genome 26 | self.db = self.genome.db 27 | self.biotype = biotype 28 | 29 | def to_dict(self): 30 | return dict( 31 | contig=self.contig, 32 | start=self.start, 33 | end=self.end, 34 | strand=self.strand, 35 | biotype=self.biotype, 36 | genome=self.genome, 37 | ) 38 | 39 | @property 40 | def is_protein_coding(self): 41 | """ 42 | We're not counting immunoglobulin-like genes from the T-cell receptor or 43 | or antibodies since they occur in fragments that must be recombined. 44 | It might be worth consider counting non-sense mediated decay and 45 | non-stop decay since variants in these could potentially make a 46 | functional protein. To read more about the biotypes used in Ensembl: 47 | http://vega.sanger.ac.uk/info/about/gene_and_transcript_types.html 48 | http://www.gencodegenes.org/gencode_biotypes.html 49 | 50 | For now let's stick with the simple category of 'protein_coding', which 51 | means that there is an open reading frame in this gene/transcript 52 | whose successful transcription has been observed. 
53 | """ 54 | return self.biotype == "protein_coding" 55 | -------------------------------------------------------------------------------- /pyensembl/logging.conf: -------------------------------------------------------------------------------- 1 | [loggers] 2 | keys=root,pyensembl,datacache 3 | 4 | [formatters] 5 | keys=simpleFormatter 6 | 7 | [handlers] 8 | keys=consoleHandler,consoleHandlerCritical 9 | 10 | [logger_root] 11 | level=INFO 12 | handlers=consoleHandlerCritical 13 | 14 | [handler_consoleHandler] 15 | class=StreamHandler 16 | level=INFO 17 | formatter=simpleFormatter 18 | args=(sys.stdout,) 19 | 20 | [handler_consoleHandlerCritical] # only for root logger: essentially silent 21 | class=StreamHandler 22 | level=CRITICAL 23 | formatter=simpleFormatter 24 | args=(sys.stdout,) 25 | 26 | [formatter_simpleFormatter] 27 | format=%(asctime)s - %(name)s - %(levelname)s - %(message)s 28 | datefmt= 29 | 30 | # pyensembl 31 | 32 | [logger_pyensembl] 33 | level=DEBUG 34 | qualname=pyensembl 35 | handlers=consoleHandler 36 | 37 | # datacache 38 | 39 | [logger_datacache] 40 | level=DEBUG 41 | qualname=datacache 42 | handlers=consoleHandler 43 | -------------------------------------------------------------------------------- /pyensembl/normalization.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 
from sys import intern

# Manually memoizing here, since our simple common.memoize function has
# noticeable overhead in this instance.
NORMALIZE_CHROMOSOME_CACHE = {}


def normalize_chromosome(c):
    """
    Return the canonical, interned string form of a chromosome/contig name.

    Results are memoized in NORMALIZE_CHROMOSOME_CACHE, keyed by the raw
    argument.

    Parameters
    ----------
    c : str or int
        Chromosome name, e.g. "chrx", "x", 1 or "MT".

    Returns
    -------
    str
        Names starting with "chr" and containing no underscore get the part
        after "chr" upper-cased ("chrx" -> "chrX"); purely alphabetic names
        are upper-cased ("x" -> "X"); everything else is returned as
        `str(c)`.

    Raises
    ------
    TypeError
        If `c` is neither a string nor an integer.
    ValueError
        If the name is "0" or the empty string.
    """
    try:
        return NORMALIZE_CHROMOSOME_CACHE[c]
    except KeyError:
        # first time we see this name: compute it below
        pass
    except TypeError:
        # `c` is unhashable (e.g. a list). Fall through so the type check
        # below rejects it with a descriptive message instead of a bare
        # "unhashable type" error.
        pass

    # Only needed on a cache miss, so the import stays off the hot path.
    from typechecks import is_string, is_integer

    if not (is_string(c) or is_integer(c)):
        raise TypeError("Chromosome cannot be '%s' : %s" % (c, type(c)))

    result = str(c)

    if result == "0":
        raise ValueError("Chromosome name cannot be 0")
    elif result == "":
        raise ValueError("Chromosome name cannot be empty")

    if result.startswith("chr") and "_" not in result:
        # excluding "_" for names like "chrUn_gl000212"
        # capitalize "chrx" -> "chrX"
        result = "chr" + result[3:].upper()
    elif result.isalpha():
        # capitalize e.g. "x" -> "X"
        result = result.upper()

    # interning strings since the chromosome names probably get constructed
    # or parsed millions of times, can save memory in tight situations
    # (such as parsing GTF files)
    result = intern(result)

    NORMALIZE_CHROMOSOME_CACHE[c] = result

    return result


def normalize_strand(strand):
    """
    Map the accepted spellings of a strand onto "+" or "-".

    "+", 1, "+1" and "1" become "+"; "-", -1 and "-1" become "-".

    Raises
    ------
    ValueError
        For any other value.
    """
    if strand == "+" or strand == 1 or strand == "+1" or strand == "1":
        return "+"
    elif strand == "-" or strand == -1 or strand == "-1":
        return "-"
    raise ValueError("Invalid strand: %s" % (strand,))


# --------------------------------------------------------------------------------
# /pyensembl/reference_name.py:
# --------------------------------------------------------------------------------
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .ensembl_release import EnsemblRelease
from .species import Species, find_species_by_name


def normalize_reference_name(name):
    """
    Search the dictionary of species-specific references to find a reference
    name that matches aside from capitalization (and surrounding whitespace).

    Returns the reference name exactly as it is spelled in
    `Species._reference_names_to_species`.

    If no matching reference is found, raise a ValueError.
    """
    lower_name = name.strip().lower()
    for reference in Species._reference_names_to_species:
        if reference.lower() == lower_name:
            return reference
    raise ValueError("Reference genome '%s' not found" % name)


def find_species_by_reference(reference_name):
    """
    Return the species registered for the given reference name
    (matched case-insensitively).
    """
    return Species._reference_names_to_species[normalize_reference_name(reference_name)]


def which_reference(species_name, ensembl_release):
    """
    Return the reference used by the named species in the given Ensembl
    release, as reported by that species' `which_reference` method.
    """
    return find_species_by_name(species_name).which_reference(ensembl_release)


def max_ensembl_release(reference_name):
    """
    Return the last Ensembl release recorded for the given reference name.

    The name is normalized first, so that any capitalization accepted by
    `find_species_by_reference` can also be used to index
    `species.reference_assemblies`.
    """
    reference_name = normalize_reference_name(reference_name)
    species = find_species_by_reference(reference_name)
    (_, max_release) = species.reference_assemblies[reference_name]
    return max_release


def genome_for_reference_name(reference_name, allow_older_downloaded_release=True):
    """
    Given a genome reference name, such as "GRCh38", returns the
    corresponding Ensembl Release object.

    If `allow_older_downloaded_release` is True, and some older releases have
    been downloaded, then return the most recent locally available release.

    Otherwise, return the newest release of Ensembl (even if its data hasn't
    already been downloaded).
    """
    reference_name = normalize_reference_name(reference_name)
    species = find_species_by_reference(reference_name)
    # local names deliberately differ from the module-level function
    # `max_ensembl_release` to avoid shadowing it
    (min_release, max_release) = species.reference_assemblies[reference_name]
    if allow_older_downloaded_release:
        # see if any of the releases between [max, min] are already locally
        # available, going through candidates in descending order
        for release in range(max_release, min_release - 1, -1):
            candidate = EnsemblRelease.cached(release=release, species=species)
            if candidate.required_local_files_exist():
                return candidate
    # nothing usable is installed (or older releases are not allowed):
    # fall back to the newest release for this reference
    return EnsemblRelease.cached(release=max_release, species=species)


ensembl_grch36 = genome_for_reference_name("ncbi36")
ensembl_grch37 = genome_for_reference_name("grch37")
ensembl_grch38 = genome_for_reference_name("grch38")

# --------------------------------------------------------------------------------
# /pyensembl/search.py:
# --------------------------------------------------------------------------------
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Helper functions for searching over collections of PyEnsembl objects
"""


def find_nearest_locus(start, end, loci):
    """
    Finds nearest locus (object with method `distance_to_interval`) to the
    interval defined by the given `start` and `end` positions.
    Returns the distance to that locus, along with the locus object itself.

    When `loci` is empty the result is `(float("inf"), None)`. On ties the
    first locus encountered wins.
    """
    best_distance = float("inf")
    best_locus = None
    for locus in loci:
        distance = locus.distance_to_interval(start, end)
        if best_distance > distance:
            best_distance = distance
            best_locus = locus
    return best_distance, best_locus


# --------------------------------------------------------------------------------
# /pyensembl/sequence_data.py:
# --------------------------------------------------------------------------------
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from os import remove
from os.path import exists, abspath, split, join
import logging
from collections import Counter
import pickle


logger = logging.getLogger(__name__)


class SequenceData(object):
    """
    Container for reference nucleotide and amino acid sequences.

    Sequences are read from one or more FASTA files and merged into a single
    identifier -> sequence dictionary. The parsed content of each FASTA file
    is cached as "<fasta filename>.pickle", either next to the FASTA file or
    in `cache_directory_path` when one is given. Loading is lazy: nothing is
    parsed until the dictionary is first needed.
    """

    def __init__(self, fasta_paths, cache_directory_path=None):
        """
        Parameters
        ----------
        fasta_paths : str or list of str
            Path(s) to FASTA file(s); a single string is treated as a
            one-element list.

        cache_directory_path : str, optional
            Directory for the pickled dictionaries. Defaults to the directory
            of each FASTA file.

        Raises
        ------
        ValueError
            If any of the FASTA files does not exist.
        """
        if isinstance(fasta_paths, str):
            fasta_paths = [fasta_paths]

        self.fasta_paths = [abspath(path) for path in fasta_paths]
        self.fasta_directory_paths = [split(path)[0] for path in self.fasta_paths]
        self.fasta_filenames = [split(path)[1] for path in self.fasta_paths]
        if cache_directory_path:
            self.cache_directory_paths = [cache_directory_path] * len(self.fasta_paths)
        else:
            self.cache_directory_paths = self.fasta_directory_paths
        for path in self.fasta_paths:
            if not exists(path):
                raise ValueError("Couldn't find FASTA file %s" % (path,))
        self.fasta_dictionary_filenames = [
            filename + ".pickle" for filename in self.fasta_filenames
        ]
        self.fasta_dictionary_pickle_paths = [
            join(cache_path, filename)
            for cache_path, filename in zip(
                self.cache_directory_paths, self.fasta_dictionary_filenames
            )
        ]
        self._init_lazy_fields()

    def _init_lazy_fields(self):
        """Forget everything that has been loaded into memory."""
        self._fasta_dictionary = None
        self._fasta_keys = None

    def clear_cache(self):
        """Drop the in-memory data and delete the pickled dictionaries."""
        self._init_lazy_fields()
        for path in self.fasta_dictionary_pickle_paths:
            if exists(path):
                remove(path)

    def __str__(self):
        return "SequenceData(fasta_paths=%s)" % (self.fasta_paths,)

    def __repr__(self):
        return str(self)

    def __contains__(self, sequence_id):
        if self._fasta_keys is None:
            self._fasta_keys = set(self.fasta_dictionary.keys())
        return sequence_id in self._fasta_keys

    def __eq__(self, other):
        # test to see if self.fasta_paths and other.fasta_paths contain
        # the same list of paths, regardless of order
        return (other.__class__ is SequenceData) and Counter(
            self.fasta_paths
        ) == Counter(other.fasta_paths)

    def __hash__(self):
        # `fasta_paths` is a list and therefore unhashable; hash a sorted
        # tuple instead so that objects which compare equal (same paths in
        # any order) also hash equal.
        return hash(tuple(sorted(self.fasta_paths)))

    def _add_to_fasta_dictionary(self, fasta_dictionary_tmp):
        """
        Merge the entries of one FASTA file into the combined dictionary.
        An identifier that is already present keeps its first sequence and
        a warning is logged.
        """
        for identifier, sequence in fasta_dictionary_tmp.items():
            if identifier in self._fasta_dictionary:
                logger.warning(
                    "Sequence identifier %s is duplicated in your FASTA files!",
                    identifier,
                )
                continue
            self._fasta_dictionary[identifier] = sequence

    def _load_or_create_fasta_dictionary_pickle(self):
        """
        (Re)build the combined dictionary. For each FASTA file, use its
        pickled dictionary when it exists and loads cleanly; otherwise parse
        the FASTA file and write a fresh pickle.
        """
        # only this method needs the project helpers
        from .common import load_pickle, dump_pickle
        from .fasta import parse_fasta_dictionary

        self._fasta_dictionary = dict()
        # the key set is derived from the dictionary, so it has to be
        # recomputed whenever the dictionary is rebuilt
        self._fasta_keys = None
        for fasta_path, pickle_path in zip(
            self.fasta_paths, self.fasta_dictionary_pickle_paths
        ):
            if exists(pickle_path):
                # try loading the cached file
                # but we'll fall back on recreating it if loading fails
                try:
                    fasta_dictionary_tmp = load_pickle(pickle_path)
                    self._add_to_fasta_dictionary(fasta_dictionary_tmp)
                    logger.info("Loaded sequence dictionary from %s", pickle_path)
                    continue
                except (pickle.UnpicklingError, AttributeError, EOFError):
                    # catch an UnpicklingError, an AttributeError
                    # resulting from pickled objects referring to classes
                    # that no longer exist, or an EOFError from a
                    # truncated cache file
                    logger.warning(
                        "Failed to load %s, attempting to read FASTA directly",
                        pickle_path,
                    )
            logger.info("Parsing sequences from FASTA file at %s", fasta_path)

            fasta_dictionary_tmp = parse_fasta_dictionary(fasta_path)
            self._add_to_fasta_dictionary(fasta_dictionary_tmp)
            logger.info("Saving sequence dictionary to %s", pickle_path)
            dump_pickle(fasta_dictionary_tmp, pickle_path)

    def index(self, overwrite=False):
        """
        Load (or create) the pickled dictionaries. With `overwrite=True` the
        existing pickles are deleted first and rebuilt from the FASTA files.
        """
        if overwrite:
            self.clear_cache()
        self._load_or_create_fasta_dictionary_pickle()

    @property
    def fasta_dictionary(self):
        """Combined identifier -> sequence dictionary, loaded on first use."""
        # compare against None: an empty dictionary is a valid loaded state
        # and must not trigger a reload on every access
        if self._fasta_dictionary is None:
            self._load_or_create_fasta_dictionary_pickle()
        return self._fasta_dictionary

    def get(self, sequence_id):
        """Get sequence associated with given ID or return None if missing"""
        return self.fasta_dictionary.get(sequence_id)
-------------------------------------------------------------------------------- /pyensembl/shell.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | """ 14 | Manipulate pyensembl's local cache. 15 | 16 | %(prog)s {install, delete, delete-sequence-cache} [--release XXX --species human...] 17 | 18 | To install particular Ensembl human release(s): 19 | %(prog)s install --release 75 77 20 | 21 | To install particular Ensembl mouse release(s): 22 | %(prog)s install --release 75 77 --species mouse 23 | 24 | To delete all downloaded and cached data for a particular Ensembl release: 25 | %(prog)s delete-all-files --release 75 --species human 26 | 27 | To delete only cached data related to transcript and protein sequences: 28 | %(prog)s delete-index-files --release 75 29 | 30 | To list all installed genomes: 31 | %(prog)s list 32 | 33 | To install a genome from source files: 34 | %(prog)s install \ 35 | --reference-name "GRCh38" \ 36 | --gtf URL_OR_PATH \ 37 | --transcript-fasta URL_OR_PATH \ 38 | --protein-fasta URL_OR_PATH 39 | """ 40 | 41 | import argparse 42 | import logging.config 43 | import pkg_resources 44 | import os 45 | 46 | from .ensembl_release import EnsemblRelease 47 | from .ensembl_versions import MAX_ENSEMBL_RELEASE 48 | from .genome import Genome 49 | from .species import Species 50 | from .version import __version__ 51 | 52 | 
logging.config.fileConfig(pkg_resources.resource_filename(__name__, "logging.conf"))
logger = logging.getLogger(__name__)


parser = argparse.ArgumentParser(usage=__doc__)

parser.add_argument(
    "--version",
    action="version",
    version='%(prog)s {version}'.format(version=__version__)
)

parser.add_argument(
    "--overwrite",
    default=False,
    action="store_true",
    help="Force download and indexing even if files already exist locally",
)

# The two option groups are created directly on the parser. Creating
# argument groups on a mutually exclusive group is deprecated since
# Python 3.11 and was never supported by argparse; the incompatible
# combination of --release with explicit paths is rejected explicitly in
# `collect_selected_genomes` below.

# options for selecting Ensembl releases
release_group = parser.add_argument_group()
release_group.add_argument(
    "--release",
    type=int,
    nargs="+",
    default=[],
    help="Ensembl release version(s) (default=%d)" % MAX_ENSEMBL_RELEASE,
)

release_group.add_argument(
    "--species",
    default=[],
    nargs="+",
    help="Which species to download Ensembl data for (default=human)",
)

release_group.add_argument(
    "--custom-mirror",
    default=None,
    help="URL and directory to use instead of the default Ensembl FTP server",
)

# options for describing a genome by its source files
path_group = parser.add_argument_group()

path_group.add_argument(
    "--reference-name",
    type=str,
    default=None,
    help="Name of the reference, e.g. GRCh38",
)

path_group.add_argument(
    "--annotation-name", default=None, help="Name of annotation source (e.g. refseq)"
)

path_group.add_argument(
    "--annotation-version", default=None, help="Version of annotation database"
)

path_group.add_argument(
    "--gtf",
    type=str,
    default=None,
    help="URL or local path to a GTF file containing annotations.",
)

path_group.add_argument(
    "--transcript-fasta",
    type=str,
    action="append",
    default=[],
    help="URL or local path to a FASTA files containing the transcript "
    "data. This option can be specified multiple times for multiple "
    "FASTA files.",
)

path_group.add_argument(
    "--protein-fasta",
    type=str,
    default=[],
    action="append",
    help="URL or local path to a FASTA file containing protein data.",
)

path_group.add_argument(
    "--shared-prefix",
    default="",
    help="Add this prefix to URLs or paths specified by --gtf, --transcript-fasta, --protein-fasta",
)

parser.add_argument(
    "action",
    type=lambda arg: arg.lower().strip(),
    choices=(
        "install",
        "delete-all-files",
        "delete-index-files",
        "list",
    ),
    help=(
        '"install" will download and index any data that is not '
        'currently downloaded or indexed. "delete-all-files" will delete all data '
        'associated with a genome annotation. "delete-index-files" deletes '
        "all files other than the original GTF and FASTA files for a genome. "
        '"list" will show you all installed Ensembl genomes.'
    ),
)


def collect_all_installed_ensembl_releases():
    """
    Return every Ensembl release whose required local files exist, sorted
    by (species latin name, release number).
    """
    genomes = []
    for species, release in Species.all_species_release_pairs():
        genome = EnsemblRelease(release, species=species)
        if genome.required_local_files_exist():
            genomes.append(genome)
    return sorted(genomes, key=lambda g: (g.species.latin_name, g.release))


def all_combinations_of_ensembl_genomes(args):
    """
    Use all combinations of species and release versions specified by the
    commandline arguments to return a list of EnsemblRelease or Genome objects.
    The results will typically be of type EnsemblRelease unless the
    --custom-mirror argument was given.
    """
    species_list = args.species if args.species else ["human"]
    release_list = args.release if args.release else [MAX_ENSEMBL_RELEASE]
    genomes = []
    for species in species_list:
        for version in release_list:
            ensembl_release = EnsemblRelease(version, species=species)

            if not args.custom_mirror:
                genomes.append(ensembl_release)
            else:
                # if we're using a custom mirror then we expect the provided
                # URL to be a directory with all the same filenames as
                # would be provided by Ensembl
                gtf_url = os.path.join(
                    args.custom_mirror, os.path.basename(ensembl_release.gtf_url)
                )
                transcript_fasta_urls = [
                    os.path.join(
                        args.custom_mirror, os.path.basename(transcript_fasta_url)
                    )
                    for transcript_fasta_url in ensembl_release.transcript_fasta_urls
                ]
                protein_fasta_urls = [
                    os.path.join(
                        args.custom_mirror, os.path.basename(protein_fasta_url)
                    )
                    for protein_fasta_url in ensembl_release.protein_fasta_urls
                ]
                reference_name = ensembl_release.reference_name
                genome = Genome(
                    reference_name=reference_name,
                    annotation_name="ensembl",
                    annotation_version=version,
                    gtf_path_or_url=gtf_url,
                    transcript_fasta_paths_or_urls=transcript_fasta_urls,
                    protein_fasta_paths_or_urls=protein_fasta_urls,
                )
                genomes.append(genome)
    return genomes


def collect_selected_genomes(args):
    """
    Build the list of genomes selected on the command line.

    When any of --gtf, --transcript-fasta or --protein-fasta is given, a
    single Genome is built from those sources; otherwise the species/release
    combinations are expanded by `all_combinations_of_ensembl_genomes`.

    Raises ValueError if explicit sources are combined with --release, or
    given without --reference-name or --annotation-name.
    """
    # If specific genome source URLs are provided, use those
    if args.gtf or args.transcript_fasta or args.protein_fasta:
        if args.release:
            raise ValueError(
                "An Ensembl release cannot be specified if "
                "specific paths are also given"
            )
        if not args.reference_name:
            raise ValueError("Must specify a reference name")
        if not args.annotation_name:
            raise ValueError("Must specify the name of the annotation source")

        # --gtf is optional when only FASTA sources are given; joining the
        # prefix with None would raise a TypeError, so pass None through
        if args.gtf:
            gtf_path_or_url = os.path.join(args.shared_prefix, args.gtf)
        else:
            gtf_path_or_url = None

        return [
            Genome(
                reference_name=args.reference_name,
                annotation_name=args.annotation_name,
                annotation_version=args.annotation_version,
                gtf_path_or_url=gtf_path_or_url,
                transcript_fasta_paths_or_urls=[
                    os.path.join(args.shared_prefix, transcript_fasta)
                    for transcript_fasta in args.transcript_fasta
                ],
                protein_fasta_paths_or_urls=[
                    os.path.join(args.shared_prefix, protein_fasta)
                    for protein_fasta in args.protein_fasta
                ],
            )
        ]
    else:
        return all_combinations_of_ensembl_genomes(args)


def run():
    """
    Entry point of the `pyensembl` command: parse the command line and run
    the requested action ("list", "install", "delete-all-files" or
    "delete-index-files") for every selected genome.
    """
    args = parser.parse_args()
    if args.action == "list":
        # TODO: how do we also identify which non-Ensembl genomes are
        # installed?
        genomes = collect_all_installed_ensembl_releases()
        for genome in genomes:
            # print every directory in which downloaded files are located
            # in most case this will be only one directory
            filepaths = genome.required_local_files()
            directories = {os.path.split(path)[0] for path in filepaths}
            # sorted so the output is deterministic
            print("-- %s: %s" % (genome, ", ".join(sorted(directories))))
        return

    genomes = collect_selected_genomes(args)

    if not genomes:
        logger.error("ERROR: No genomes selected!")
        parser.print_help()
        return

    for genome in genomes:
        logger.info("Running '%s' for %s", args.action, genome)
        if args.action == "delete-all-files":
            genome.download_cache.delete_cache_directory()
        elif args.action == "delete-index-files":
            genome.delete_index_files()
        elif args.action == "install":
            genome.download(overwrite=args.overwrite)
            genome.index(overwrite=args.overwrite)
        else:
            raise ValueError("Invalid action: %s" % args.action)


# --------------------------------------------------------------------------------
# /pyensembl/version.py:
-------------------------------------------------------------------------------- 1 | __version__ = "2.3.13" 2 | 3 | def print_version(): 4 | print(f"v{__version__}") 5 | 6 | if __name__ == "__main__": 7 | print_version() 8 | -------------------------------------------------------------------------------- /pylintrc: -------------------------------------------------------------------------------- 1 | [TYPECHECK] 2 | # Without ignoring this, we get errors like: 3 | # E:249,20: Module 'numpy' has no 'nan' member (no-member) 4 | ignored-modules = numpy 5 | 6 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | typechecks>=0.0.2,<1.0.0 2 | datacache>=1.4.0,<2.0.0 3 | memoized-property>=1.0.2 4 | tinytimer>=0.0.0,<1.0.0 5 | gtfparse>=2.5.0,<3.0.0 6 | serializable>=0.2.1,<1.0.0 7 | pylint>=2.17.2,<3.0.0 8 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 
from __future__ import print_function
import os
import re

# TODO: replace setup.py with pyproject.toml
from setuptools import setup

package_name = "pyensembl"
current_directory = os.path.dirname(__file__)
readme_filename = "README.md"
readme_path = os.path.join(current_directory, readme_filename)
github_url = "https://github.com/openvax/%s" % package_name

# The long description is optional: fall back to an empty string when the
# README cannot be read.
try:
    with open(readme_path, "r", encoding="utf-8") as f:
        readme_markdown = f.read()
except IOError as e:
    print(e)
    print("Failed to open %s" % readme_path)
    readme_markdown = ""

# Read the version from the package source without importing the package.
# The path is anchored at this file's directory so the script does not
# depend on the current working directory.
version_path = os.path.join(current_directory, package_name, "version.py")
with open(version_path, "r") as f:
    version_match = re.search(
        r'^__version__\s*=\s*[\'"]([^\'"]*)[\'"]', f.read(), re.MULTILINE
    )

# `re.search` returns None when nothing matches, so check the match object
# before asking for the group
version = version_match.group(1) if version_match else None

if not version:
    raise RuntimeError("Cannot find version information")

if __name__ == "__main__":
    requirements_path = os.path.join(current_directory, "requirements.txt")
    with open(requirements_path) as f:
        stripped_lines = [line.strip() for line in f]
    # blank lines and comment lines are not requirement specifiers
    requirements = [
        line for line in stripped_lines if line and not line.startswith("#")
    ]

    setup(
        name=package_name,
        version=version,
        description="Python interface to Ensembl reference genome metadata",
        author="Alex Rubinsteyn",
        author_email="alex.rubinsteyn@unc.edu",
        url=github_url,
        license="http://www.apache.org/licenses/LICENSE-2.0.html",
        entry_points={
            "console_scripts": ["pyensembl = %s.shell:run" % package_name],
        },
        classifiers=[
            "Development Status :: 4 - Beta",
            "Environment :: Console",
            "Operating System :: OS Independent",
            "Intended Audience :: Science/Research",
            "License :: OSI Approved :: Apache Software License",
            "Programming Language :: Python",
            "Topic :: Scientific/Engineering :: Bio-Informatics",
        ],
        install_requires=requirements,
        long_description=readme_markdown,
        long_description_content_type="text/markdown",
        packages=[package_name],
        package_data={
            package_name: ["logging.conf", "../requirements.txt"],
        },
    )
75 | -------------------------------------------------------------------------------- /test.sh: -------------------------------------------------------------------------------- 1 | pytest --cov=pyensembl/ --cov-report=term-missing tests 2 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvax/pyensembl/d292b1749875904b380a209f4ff44b7d75dafdc3/tests/__init__.py -------------------------------------------------------------------------------- /tests/common.py: -------------------------------------------------------------------------------- 1 | import functools 2 | 3 | from pyensembl import genome_for_reference_name, cached_release 4 | 5 | import pytest 6 | 7 | 8 | grch37 = genome_for_reference_name("GRCh37") 9 | grch38 = genome_for_reference_name("GRCh38") 10 | 11 | major_releases = [grch37, grch38] 12 | 13 | contigs = [str(c) for c in range(1, 23)] + ["X", "Y", "M"] 14 | 15 | 16 | def run_multiple_genomes(*versions): 17 | if len(versions) == 1 and callable(versions[0]): 18 | return pytest.mark.parametrize("genome", major_releases)(versions[0]) 19 | if not versions: 20 | genomes = major_releases 21 | else: 22 | genomes = [cached_release(v) for v in versions] 23 | return lambda fn: pytest.mark.parametrize("genome", genomes)(fn) 24 | 25 | 26 | # TemporaryDirectory only got added to Python in version 3.2 27 | try: 28 | # pylint: disable=no-name-in-module 29 | from tempfile import TemporaryDirectory 30 | 31 | except ImportError: 32 | # only added in Python 3.2 33 | from tempfile import mkdtemp 34 | from shutil import rmtree 35 | 36 | class TemporaryDirectory(object): 37 | def __init__(self): 38 | self.name = mkdtemp() 39 | 40 | def __enter__(self, *args, **kwargs): 41 | return self.name 42 | 43 | def __exit__(self, type, value, traceback): 44 | rmtree(self.name) 45 | # don't suppress exceptions 46 | 
return False 47 | 48 | 49 | def ok_(b): 50 | assert b 51 | 52 | 53 | def eq_(x, y, msg=None): 54 | if msg is None: 55 | assert x == y 56 | else: 57 | assert x == y, msg 58 | 59 | 60 | def neq_(x, y, msg=None): 61 | if msg is None: 62 | assert x != y 63 | else: 64 | assert x != y, msg 65 | 66 | 67 | def gt_(x, y, msg=None): 68 | if msg is None: 69 | assert x > y 70 | else: 71 | assert x > y, msg 72 | 73 | 74 | def gte_(x, y, msg=None): 75 | if msg is None: 76 | assert x >= y 77 | else: 78 | assert x >= y, msg 79 | 80 | 81 | def lt_(x, y, msg=None): 82 | if msg is None: 83 | assert x < y 84 | else: 85 | assert x < y, msg 86 | 87 | 88 | def lte_(x, y, msg=None): 89 | if msg is None: 90 | assert x <= y 91 | else: 92 | assert x <= y, msg 93 | -------------------------------------------------------------------------------- /tests/data.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pyensembl import Locus, Genome 3 | 4 | 5 | def data_path(name): 6 | """ 7 | Return the absolute path to a file in the test/data directory. 8 | The name specified should be relative to test/data. 9 | """ 10 | return os.path.join(os.path.dirname(__file__), "data", name) 11 | 12 | 13 | # mapping of ensembl releases to transcript IDs for FOXP3-001 14 | FOXP3_001_transcript_id = "ENST00000376207" 15 | 16 | TP53_gene_id = "ENSG00000141510" 17 | 18 | # beta-catenin interacting protein from the negative strand of chromosome 1 19 | # URL: http://useast.ensembl.org/Homo_sapiens/Transcript/Sequence_cDNA? 
20 | # db=core;g=ENSG00000178585;r=1:9848276-9910336;t=ENST00000377256 21 | CTNNBIP1_004_transcript_id = "ENST00000377256" 22 | 23 | # coding sequence for beta-catenin interacting protein (CTNNBIP1-004) 24 | CTNNBIP1_004_CDS = "".join([ 25 | "ATG", 26 | "AACCGCGAGGGAGCTCCCGGGAAGAGTCCGGAG", 27 | "GAGATGTACATTCAGCAGAAGGTCCGAGTGCTGCTCATGCTGCGGAAGATGGGATCAAAC", 28 | "CTGACAGCCAGCGAGGAGGAGTTCCTGCGCACCTATGCAGGGGTGGTCAACAGCCAGCTC", 29 | "AGCCAGCTGCCTCCGCACTCCATCGACCAGG", 30 | "GTGCAGAGGACGTGGTGATGGCGTTTTCCAGGTCGGAGACGGAAGACCGGAGGCAG", 31 | "TAG" 32 | ]) 33 | 34 | # 5' UTR for beta-catenin interacting protein (CTNNBIP1-004) 35 | CTNNBIP1_004_UTR5 = "".join([ 36 | "TGTGGGTGCAGGTTTCCTGGGCTTGCCAGACACACAGGGCGGCACCTTCCTACTTCTGCC", 37 | "CAGCCACAGCCCTCCCCTCACAGTTGAGCACCTGTTTGCCTGAAGTTAATTTCCAGAAGC", 38 | "AGGAGTCCCCAGAGCCAGGCAGGGGG"]) 39 | 40 | # 3' UTR for beta-catenin interacting protein (CTNNBIP1-004) 41 | CTNNBIP1_004_UTR3 = \ 42 | "CTGCAAAGCCCTTGGAACACCCTGGATGCTGTTGAGGGCCAAGAGATCTGTGTGGCTCC" 43 | 44 | CTNNBIP1_004_locus = Locus("1", 9850659, 9878176, "-") 45 | 46 | # properties of CTNNBIP1-004's exons copied from 47 | # http://useast.ensembl.org/Homo_sapiens/Transcript/Exons?g=ENSG00000178585; 48 | # r=1:9850659-9878176;redirect=no;t=ENST00000377256 49 | CTTNNIP1_004_exon_ids = [ 50 | 'ENSE00001473268', 51 | 'ENSE00001643659', 52 | 'ENSE00001600669', 53 | 'ENSE00001267940', 54 | 'ENSE00001473265', 55 | ] 56 | 57 | CTTNNIP1_004_exon_lengths = [ 58 | 37, 59 | 85, 60 | 120, 61 | 91, 62 | 118 63 | ] 64 | 65 | 66 | # 67 | # Information for EGFR from Ensembl website 68 | # Date: March 25th, 2015 69 | # Ensembl Release: 79 70 | # 71 | EGFR_001_name = "EGFR-001" 72 | EGFR_001_transcript_id = "ENST00000275493" 73 | EGFR_001_ccds_id = "CCDS5514" 74 | EGFR_001_protein_id = "ENSP00000275493" 75 | EGFR_001_protein_sequence = "".join([ 76 | "MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFEDHFLSLQRMFNNCEVVLGNLEITYV", 77 | 
"QRNYDLSFLKTIQEVAGYVLIALNTVERIPLENLQIIRGNMYYENSYALAVLSNYDANKTGLKELPMRNL", 78 | "QEILHGAVRFSNNPALCNVESIQWRDIVSSDFLSNMSMDFQNHLGSCQKCDPSCPNGSCWGAGEENCQKL", 79 | "TKIICAQQCSGRCRGKSPSDCCHNQCAAGCTGPRESDCLVCRKFRDEATCKDTCPPLMLYNPTTYQMDVN", 80 | "PEGKYSFGATCVKKCPRNYVVTDHGSCVRACGADSYEMEEDGVRKCKKCEGPCRKVCNGIGIGEFKDSLS", 81 | "INATNIKHFKNCTSISGDLHILPVAFRGDSFTHTPPLDPQELDILKTVKEITGFLLIQAWPENRTDLHAF", 82 | "ENLEIIRGRTKQHGQFSLAVVSLNITSLGLRSLKEISDGDVIISGNKNLCYANTINWKKLFGTSGQKTKI", 83 | "ISNRGENSCKATGQVCHALCSPEGCWGPEPRDCVSCRNVSRGRECVDKCNLLEGEPREFVENSECIQCHP", 84 | "ECLPQAMNITCTGRGPDNCIQCAHYIDGPHCVKTCPAGVMGENNTLVWKYADAGHVCHLCHPNCTYGCTG", 85 | "PGLEGCPTNGPKIPSIATGMVGALLLLLVVALGIGLFMRRRHIVRKRTLRRLLQERELVEPLTPSGEAPN", 86 | "QALLRILKETEFKKIKVLGSGAFGTVYKGLWIPEGEKVKIPVAIKELREATSPKANKEILDEAYVMASVD", 87 | "NPHVCRLLGICLTSTVQLITQLMPFGCLLDYVREHKDNIGSQYLLNWCVQIAKGMNYLEDRRLVHRDLAA", 88 | "RNVLVKTPQHVKITDFGLAKLLGAEEKEYHAEGGKVPIKWMALESILHRIYTHQSDVWSYGVTVWELMTF", 89 | "GSKPYDGIPASEISSILEKGERLPQPPICTIDVYMIMVKCWMIDADSRPKFRELIIEFSKMARDPQRYLV", 90 | "IQGDERMHLPSPTDSNFYRALMDEEDMDDVVDADEYLIPQQGFFSSPSTSRTPLLSSLSATSNNSTVACI", 91 | "DRNGLQSCPIKEDSFLQRYSSDPTGALTEDSIDDTFLPVPEYINQSVPKRPAGSVQNPVYHNQPLNPAPS" 92 | "RDPHYQDPHSTAVGNPEYLNTVQPTCVNSTFDSPAHWAQKGSHQISLDNPDYQQDFFPKEAKPNGIFKGS" 93 | "TAENAEYLRVAPQSSEFIGA" 94 | ]) 95 | 96 | 97 | # GTF cropped from ftp://ftp.ensembl.org/pub/release-81/gtf/mus_musculus/ 98 | # Mus_musculus.GRCm38.81.gtf.gz via: 99 | # grep "ENSMUSG00000017167" Mus_musculus.GRCm38.81.gtf 100 | 101 | # Transcript FASTA cropped from ftp://ftp.ensembl.org/pub/release-81/ 102 | # fasta/mus_musculus/cdna/Mus_musculus.GRCm38.cdna.all.fa.gz via: 103 | # grep "ENSMUSG00000017167" Mus_musculus.GRCm38.cdna.all.fa -A 50 104 | 105 | # ncRNA FASTA cropped from ftp://ftp.ensembl.org/pub/release-81/ 106 | # fasta/mus_musculus/cdna/Mus_musculus.GRCm38.ncrna.fa.gz via: 107 | # grep "ENSMUSG00000088969" Mus_musculus.GRCm38.ncrna.fa -A 2 108 | 109 | # Protein FASTA cropped from 
ftp://ftp.ensembl.org/pub/release-81/fasta/ 110 | # mus_musculus/pep/Mus_musculus.GRCm38.pep.all.fa.gz via: 111 | # grep "ENSMUSG00000017167" Mus_musculus.GRCm38.pep.all.fa -A 50 112 | 113 | # Tested against: 114 | # http://useast.ensembl.org/Mus_musculus/Gene/Summary?db=core;g=ENSMUSG00000017167 115 | 116 | MOUSE_ENSMUSG00000017167_PATH = data_path( 117 | "mouse.ensembl.81.partial.ENSMUSG00000017167.gtf") 118 | MOUSE_ENSMUSG00000017167_TRANSCRIPT_FASTA_PATH = data_path( 119 | "mouse.ensembl.81.partial.ENSMUSG00000017167.fa") 120 | MOUSE_ENSMUSG00000088969_NCRNA_FASTA_PATH = data_path( 121 | "mouse.ensembl.81.partial.ncrna.ENSMUSG00000017167.fa") 122 | MOUSE_ENSMUSG00000017167_PROTEIN_FASTA_PATH = data_path( 123 | "mouse.ensembl.81.partial.ENSMUSG00000017167.pep") 124 | 125 | 126 | custom_mouse_genome_grcm38_subset = Genome( 127 | reference_name="GRCm38", 128 | annotation_name="_test_mouse_ensembl81_subset", 129 | gtf_path_or_url=MOUSE_ENSMUSG00000017167_PATH, 130 | transcript_fasta_paths_or_urls=[MOUSE_ENSMUSG00000017167_TRANSCRIPT_FASTA_PATH], 131 | protein_fasta_paths_or_urls=[MOUSE_ENSMUSG00000017167_PROTEIN_FASTA_PATH]) 132 | 133 | 134 | def setup_init_custom_mouse_genome(): 135 | """ 136 | If a unit test needs to start from a cleared cache, add this to the test 137 | setup. 138 | """ 139 | custom_mouse_genome_grcm38_subset.clear_cache() 140 | custom_mouse_genome_grcm38_subset.index() 141 | -------------------------------------------------------------------------------- /tests/data/gencode.ucsc.small.gtf: -------------------------------------------------------------------------------- 1 | # expected format is [attributes] [comments] 2 | chr1 hg38_knownGene exon 17369 17436 0.000000 - . gene_id "uc031tla.1"; transcript_id "uc031tla.1"; 3 | chr1 hg38_knownGene exon 29554 30039 0.000000 + . gene_id "uc057aty.1"; transcript_id "uc057aty.1"; 4 | chr1 hg38_knownGene exon 30564 30667 0.000000 + . 
gene_id "uc057aty.1"; transcript_id "uc057aty.1"; 5 | chr1 hg38_knownGene exon 30976 31097 0.000000 + . gene_id "uc057aty.1"; transcript_id "uc057aty.1"; 6 | chr1 hg38_knownGene exon 30267 30667 0.000000 + . gene_id "uc057atz.1"; transcript_id "uc057atz.1"; 7 | chr1 hg38_knownGene exon 30976 31109 0.000000 + . gene_id "uc057atz.1"; transcript_id "uc057atz.1"; 8 | chr1 hg38_knownGene exon 30366 30503 0.000000 + . gene_id "uc031tlb.1"; transcript_id "uc031tlb.1"; 9 | chr1 hg38_knownGene exon 34554 35174 0.000000 - . gene_id "uc001aak.4"; transcript_id "uc001aak.4"; 10 | chr1 hg38_knownGene exon 35277 35481 0.000000 - . gene_id "uc001aak.4"; transcript_id "uc001aak.4"; 11 | chr1 hg38_knownGene exon 35721 36081 0.000000 - . gene_id "uc001aak.4"; transcript_id "uc001aak.4"; 12 | chr1 hg38_knownGene exon 35245 35481 0.000000 - . gene_id "uc057aua.1"; transcript_id "uc057aua.1"; 13 | chr1 hg38_knownGene exon 35721 36073 0.000000 - . gene_id "uc057aua.1"; transcript_id "uc057aua.1"; 14 | chr1 hg38_knownGene start_codon 69091 69093 0.000000 + . gene_id "uc001aal.1"; transcript_id "uc001aal.1"; 15 | chr1 hg38_knownGene CDS 69091 70005 0.000000 + 0 gene_id "uc001aal.1"; transcript_id "uc001aal.1"; 16 | chr1 hg38_knownGene stop_codon 70006 70008 0.000000 + . 
gene_id "uc001aal.1"; transcript_id "uc001aal.1"; 17 | -------------------------------------------------------------------------------- /tests/data/mouse.ensembl.81.partial.ENSMUSG00000017167.fa: -------------------------------------------------------------------------------- 1 | >ENSMUST00000138942 havana:known chromosome:GRCm38:11:101170523:101178316:1 gene:ENSMUSG00000017167 gene_biotype:protein_coding transcript_biotype:processed_transcript 2 | CAGCGCGAAGCCCACAGGCGCATCCCTAGTAGGGCTACTTGCCCCTGGAGCTCCCGGGGC 3 | TCTGGCCCTCAGACAAGAATCTCCCCCACATTTGCAGTTGGCCAAGAGGACTGCGTTTGG 4 | CCCAAGTATGGAGCAGGCTCAGGCGTGACGGCCGGTTGTAGTGAGAAAGATTGAACTCGG 5 | TTCTAAATCCCTGTAGACTTACCCTCCCGCCGCCCGCTGGACTCGGGGTCCTTAGCTCAA 6 | AGGTCTCGTCGTCCTCGTCTTCGTCCCCGTCCCCGCTAAGCTCGCCGTCCCCGTAGTCCC 7 | GGTGCAGAAGAGTGAAGCCTCGACGGCAGCAGAGAAGCCACCACAATCCCCCGGGGAGAG 8 | GCATCCGGGCGAGCAGCCTGGGAATGGGGGCGCAGGCAGTGTTGCCTGAGCAGCAGGGAA 9 | TCTGAGAAACTGGAGACCTTCTTCGGGAATGTCAATGACTCGGCAGTGGTCCGCCATGAC 10 | CTTCACTACCACTTTACGGCTCGCTACATCCGCATCGTGCCACTGGCCTGGAACCCACGC 11 | GGCAAGATTGGCCTGAGGCTGGGCATCTATGGTTGTCCCTACACATCCAGCATCCTGTAT 12 | TTTGACGGCGACGATGCCATCTCATACCGCTTCCAGCGAGGCGCCAGCCAAAGTCTTTGG 13 | GACG 14 | >ENSMUST00000103109 ensembl_havana_transcript:known chromosome:GRCm38:11:101176041:101190724:1 gene:ENSMUSG00000017167 gene_biotype:protein_coding transcript_biotype:protein_coding 15 | GAGAGAAGGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGA 16 | GAGAGAGAGAGATTGGGGGTAGGAGAGAGGGAAGGGTGGATAAGGACGGAAAAAAGCTTT 17 | GGTCAGCCGTGAACCCAGGAGAAAAGCTGGGGGCCTGAGCCAGAACGGGAGCCCTAGCGG 18 | CGCAACAAGGCTGACACCCAGCGTTGGTCAGCTCCGCATGATGAGTCTCCGGCTCTTCAG 19 | CATCCTGCTCGCCACGGTGGTCTCTGGAGCTTGGGGCTGGGGCTACTACGGTTGCAATGA 20 | GGAGCTGGTGGGGCCTCTGTATGCACGGTCTCTGGGCGCTTCTTCCTACTATGGACTCTT 21 | TACCACAGCCCGCTTTGCCCGGCTACATGGCATCAGTGGATGGTCGCCCCGGATTGGGGA 22 | CCCGAATCCCTGGCTGCAGATCGACTTAATGAAGAAGCATCGGATCCGGGCTGTGGCCAC 23 | ACAGGGAGCTTTTAATTCTTGGGATTGGGTCACACGTTACATGCTGCTCTACGGGGACCG 24 | 
TGTGGACAGCTGGACACCATTCTACCAAAAAGGGCACAATGCGACCTTCTTCGGGAATGT 25 | CAATGACTCGGCAGTGGTCCGCCATGACCTTCACTACCACTTTACGGCTCGCTACATCCG 26 | CATCGTGCCACTGGCCTGGAACCCACGCGGCAAGATTGGCCTGAGGCTGGGCATCTATGG 27 | TTGTCCCTACACATCCAGCATCCTGTATTTTGACGGCGACGATGCCATCTCATACCGCTT 28 | CCAGCGAGGCGCCAGCCAAAGTCTTTGGGACGTGTTCGCTTTTAGTTTCAAGACAGAGGA 29 | GAAGGATGGGCTGTTGCTGCACACCGAGGGCTCCCAGGGGGATTATGTGACGCTTGAACT 30 | GCAAGGGGCGCACCTGCTGCTGCACATGAGCCTGGGCAGCAGTCCCATCCAGCCAAGACC 31 | TGGTCACACCACGGTGAGCCTTGGTGGCGTTCTTAACGACCTAAGCTGGCACTATGTGCG 32 | GGTGGATCGATATGGCCGAGATGCAAATTTCACCCTGGATGGTTACGCCCATCACTTTGT 33 | GCTCAACGGCGACTTTGAAAGGCTGAATCTTGAAAATGAGATATTCATCGGGGGTCTAGT 34 | GGGCGCAGCCCGTAAGAACCTGGCCTACCGCCATAACTTCCGTGGCTGCATAGAAAACGT 35 | GATCTACAACCGGATCAACATTGCAGAAATGGCAGTGATGCGCCATTCGCGGATCACCTT 36 | TGAGGGTAATGTGGCTTTCCGTTGCTTGGATCCCGTTCCACACCCCATCAACTTCGGAGG 37 | CCCTCACAACTTCGTCCAAGTGCCTGGCTTTCCACGCCGAGGACGCTTAGCCGTCTCTTT 38 | TCGTTTCCGCACCTGGGACCTCACAGGGCTGCTCCTTTTCTCCCACTTGGGGGACGGGCT 39 | GGGTCATGTGGAGCTGATGCTTAGCGAAGGGCAAGTGAATGTATCCATCGCGCAGACTGG 40 | CCGCAAGAAGCTTCAGTTTGCTGCTGGGTACCGCCTGAATGATGGCTTCTGGCACGAGGT 41 | GAACTTTGTGGCACAGGAAAACCATGCAGTCATCAGTATTGATGATGTGGAAGGGGCAGA 42 | GGTCAGGGTTTCATACCCACTGCTGATCCGCACAGGGACTTCATACTTCTTTGGTGGTTG 43 | TCCCAAACCAGCCAGTCGATGGGGCTGCCACTCCAACCAGACAGCATTCCATGGCTGCAT 44 | GGAGCTGCTCAAGGTGGACGGTCAACTGGTCAACCTCACTCTGGTAGAGTTTCGGAAGCT 45 | CGGTTATTTTGCTGAGGTCCTCTTTGACACATGTGGCATCACAGACAGATGCAGCCCTAA 46 | CATGTGTGAGCATGACGGACGATGCTACCAGTCTTGGGATGACTTCATCTGCTACTGCGA 47 | ACTTACCGGCTACAAGGGAGTTACCTGCCACGAACCATTGTACAAGGAGTCCTGTGAGGC 48 | CTATCGGCTCAGTGGGAAATATTCTGGAAACTACACCATTGATCCTGATGGCAGTGGACC 49 | CCTGAAGCCGTTTGTGGTGTATTGTGACATCCGAGAGAACCGAGCGTGGACAGTTGTGCG 50 | GCATGACAGGCTGTGGACCACTCGAGTGACTGGTTCCAGCATGGACCGGCCCTTTCTGGG 51 | GGCCATCCAATACTGGAATGCCTCCTGGGAGGAAGTCAGCGCTCTGGCCAATGCTTCCCA 52 | ACACTGTGAGCAGTGGATCGAGTTTTCCTGCTACAATTCCCGGCTGCTCAACACTGCAGG 53 | AGGCTACCCCTACAGCTTTTGGATTGGCCGCAATGAGGAACAGCATTTCTACTGGGGAGG 54 | 
CTCCCAGCCTGGGATCCAGCGCTGTGCCTGTGGGCTGGACCAGAGCTGTGTGGACCCTGC 55 | ACTGCACTGCAATTGTGATGCCGACCAGCCACAGTGGAGAACAGACAAGGGGCTCCTGAC 56 | CTTTGTAGACCATCTGCCTGTCACTCAGGTAGTGGTAGGTGATACAAACCGCTCAAATTC 57 | TGAAGCTCAGTTCTTCCTGAGGCCTCTGCGCTGCTATGGTGACCGCAATTCCTGGAACAC 58 | CATCTCCTTCCACACTGGAGCTGCACTGCGTTTCCCTCCGATCCGAGCCAACCACAGCCT 59 | CGATGTCTCATTCTACTTCAGGACCTCGGCTCCCTCGGGTGTCTTCCTAGAGAACATGGG 60 | GGGTCCTTTCTGCCGGTGGCGCCGACCTTACGTGAGAGTGGAGCTCAACACATCCCGGGA 61 | TGTGGTCTTTGCCTTTGATATTGGCAATGGGGATGAGAACCTGACAGTGCACTCGGATGA 62 | CTTTGAGTTTAACGATGATGAGTGGCATTTGGTCCGAGCTGAAATCAACGTGAAGCAGGC 63 | CCGGCTGCGAGTGGATCACCGGCCCTGGGTGCTAAGGCCCATGCCCCTGCAGACCTACAT 64 | CTGGCTGGTGTATGACCAACCCCTCTATGTGGGATCTGCAGAGCTTAAGAGGCGCCCTTT 65 | -------------------------------------------------------------------------------- /tests/data/mouse.ensembl.81.partial.ENSMUSG00000017167.pep: -------------------------------------------------------------------------------- 1 | >ENSMUSP00000099398 pep:known chromosome:GRCm38:11:101176041:101190724:1 gene:ENSMUSG00000017167 transcript:ENSMUST00000103109 gene_biotype:protein_coding transcript_biotype:protein_coding 2 | MMSLRLFSILLATVVSGAWGWGYYGCNEELVGPLYARSLGASSYYGLFTTARFARLHGIS 3 | GWSPRIGDPNPWLQIDLMKKHRIRAVATQGAFNSWDWVTRYMLLYGDRVDSWTPFYQKGH 4 | NATFFGNVNDSAVVRHDLHYHFTARYIRIVPLAWNPRGKIGLRLGIYGCPYTSSILYFDG 5 | DDAISYRFQRGASQSLWDVFAFSFKTEEKDGLLLHTEGSQGDYVTLELQGAHLLLHMSLG 6 | SSPIQPRPGHTTVSLGGVLNDLSWHYVRVDRYGRDANFTLDGYAHHFVLNGDFERLNLEN 7 | EIFIGGLVGAARKNLAYRHNFRGCIENVIYNRINIAEMAVMRHSRITFEGNVAFRCLDPV 8 | PHPINFGGPHNFVQVPGFPRRGRLAVSFRFRTWDLTGLLLFSHLGDGLGHVELMLSEGQV 9 | NVSIAQTGRKKLQFAAGYRLNDGFWHEVNFVAQENHAVISIDDVEGAEVRVSYPLLIRTG 10 | TSYFFGGCPKPASRWGCHSNQTAFHGCMELLKVDGQLVNLTLVEFRKLGYFAEVLFDTCG 11 | ITDRCSPNMCEHDGRCYQSWDDFICYCELTGYKGVTCHEPLYKESCEAYRLSGKYSGNYT 12 | IDPDGSGPLKPFVVYCDIRENRAWTVVRHDRLWTTRVTGSSMDRPFLGAIQYWNASWEEV 13 | SALANASQHCEQWIEFSCYNSRLLNTAGGYPYSFWIGRNEEQHFYWGGSQPGIQRCACGL 14 | DQSCVDPALHCNCDADQPQWRTDKGLLTFVDHLPVTQVVVGDTNRSNSEAQFFLRPLRCY 15 | 
GDRNSWNTISFHTGAALRFPPIRANHSLDVSFYFRTSAPSGVFLENMGGPFCRWRRPYVR 16 | VELNTSRDVVFAFDIGNGDENLTVHSDDFEFNDDEWHLVRAEINVKQARLRVDHRPWVLR 17 | PMPLQTYIWLVYDQPLYVGSAELKRRPFVGCLRAMRLNGVTLNLEGRANASEGTFPNCTG 18 | HCTHPRFPCFHGGRCVERYSYYTCDCDLTAFDGPYCNHDIGGFFETGTWMRYNLQSALRS 19 | AAREFSHMLSRPVPGYEPGYVPGYDTPGYVPGYHGPGYRLPEYPRPGRPVPGYRGPVYNV 20 | TGEEVSFSFSTNSAPAVLLYVSSFVRDYMAVLIKEDGTLQLRYQLGTSPYVYQLTTRPVT 21 | DGQPHSVNITRVYRNLFIQVDYFPLTEQKFSLLVDSQLDSPKALYLGRVMETGVIDPEIQ 22 | RYNTPGFSGCLSGVRFNNVAPLKTHFRTPRPMTAELAEAMRVQGELSESNCGAMPRLVSE 23 | VPPELDPWYLPPDFPYYHDDGWIAILLGFLVAFLLLGLVGMLVLFYLQNHRYKGSYHTNE 24 | PKATHDSHPGGKAPLPPSGPAQAPAPTPAPTQLPTPAPAPAPAPASGPGPRDQNLPQILE 25 | ESRSE 26 | >ENSMUSP00000006660 pep:known chromosome:GRCm38:13:27658956:27668036:-1 gene:ENSMUSG00000046899 transcript:ENSMUST00000006660 gene_biotype:protein_coding transcript_biotype:protein_coding 27 | MSFSFSQPCPSGALLLVVVSSLLLWENVASVPLSSNETDGYPLSINGLFHNAMRLTWNIK 28 | NLNMELRKTYTVNQVSEKLYENYMLDFIEDMEYLVKALTCCHNYSIKTPENLDEAQQIPF 29 | NEFPKLILSRMWAWNETSKVLLTTLRSIPGMHDDVISLAKNIETKLAELFEYTQSILNSI 30 | YGTTTTGNVEYTVFSGLEDLKSSDEEFSLFDLCKFSYCLRVDIHMVELYLKLLECVVYVS 31 | SDVCLSKNIRDAS 32 | >ENSMUSP00000046761 pep:known chromosome:GRCm38:12:66469568:67222549:-1 gene:ENSMUSG00000034912 transcript:ENSMUST00000037181 gene_biotype:protein_coding transcript_biotype:protein_coding 33 | MDLVYGLVWLLTVLLEGISGQGVYAPPTVRIVHSGLACNIEEERYSERVYTIREGETLEL 34 | TCLVTGHPRPQIRWTKTAGSASDRFQDSSVFNETLRITNIQRHQGGRYYCKAENGLGSPA 35 | IKSIRVDVYYLDDPVVTVHQSIGEAKEQFYYERTVFLRCVANSNPPVRYSWRRGQEVLLQ 36 | GSDKGVEIYEPFFTQGETKILKLKNLRPQDYANYSCIASVRNVCNIPDKMVSFRLSNKTA 37 | SPSIKLLVDDPIVVNPGEAITLVCVTTGGEPTPSLTWVRSFGTLPEKIVLNGGTLTIPAI 38 | TSDDAGTYSCIANNNVGNPAKKSTNIIVRALKKGRFWITPDPYHKDDNIQIGREVKISCQ 39 | VEAVPSEELTFSWFKNGRPLRSSERMVITQTDPDVSPGTTNLDIIDLKFTDFGTYTCVAS 40 | LKGGGISDISIDVNISSSTVPPNLTVPQEKSPLVTREGDTIELQCQVTGKPKPIILWSRA 41 | DKEVAMPDGTMQMESYDGTLRIVNVSREMSGMYRCQTSQYNGFNVKPREALVQLIVQYPP 42 | AVEPAFLEIRQGQDRSVTMSCRVLRAYPIRVLTYEWRLGNKLLRTGQFDSQEYTEYPLKS 43 | 
LSNENYGVYNCSIINEAGAGRCSFLVTGKAYAPEFYYDTYNPVWQNRHRVYSYSLQWTQM 44 | NPDAVDRIVAYRLGIRQAGQQRWWEQEIKINGNIQKGELITYNLTELIKPEAYEVRLTPL 45 | TKFGEGDSTIRVIKYTAPVNPHLREFHCGFEDGNICLFTQDDTDNFDWTKQSTATRNTKY 46 | TPNTGPSADRSGSKEGFYMYIETSRPRLEGEKARLLSPVFSIAPKNPYGPTNSAYCFSFF 47 | YHMYGQHIGVLNVYLRLKGQTTIENPLWSSSGNKGQRWNEAHVNIYPITSFQLIFEGIRG 48 | PGIEGDIAIDDVSIAEGECAKQDLPTKNSVDGAVGILVHIWLFPVIILISILSPRR 49 | >ENSMUSP00000137608 pep:known chromosome:GRCm38:12:66471182:67221221:-1 gene:ENSMUSG00000034912 transcript:ENSMUST00000178814 gene_biotype:protein_coding transcript_biotype:protein_coding 50 | MDLVYGLVWLLTVLLEGISGQGVYAPPTVRIVHSGLACNIEEERYSERVYTIREGETLEL 51 | TCLVTGHPRPQIRWTKTAGSASDRFQDSSVFNETLRITNIQRHQGGRYYCKAENGLGSPA 52 | -------------------------------------------------------------------------------- /tests/data/mouse.ensembl.81.partial.ncrna.ENSMUSG00000017167.fa: -------------------------------------------------------------------------------- 1 | >ENSMUST00000158344 ncrna:known chromosome:GRCm38:8:107009502:107009611:-1 gene:ENSMUSG00000088969 gene_biotype:snRNA transcript_biotype:snRNA 2 | GCACCTATTTTGACAGCACAAATACTAAAATTGGAACAAATCAGGGAAGATTAGCATGCT 3 | CTCATGCAAGGATGACACGGAAATTCATGGAACAGCGGATTCATATTTTA 4 | -------------------------------------------------------------------------------- /tests/data/refseq.ucsc.small.gtf: -------------------------------------------------------------------------------- 1 | chr1 hg38_refGene exon 67092176 67093604 0.000000 - . gene_id "NR_075077"; transcript_id "NR_075077"; 2 | chr1 hg38_refGene exon 67096252 67096321 0.000000 - . gene_id "NR_075077"; transcript_id "NR_075077"; 3 | chr1 hg38_refGene exon 67103238 67103382 0.000000 - . gene_id "NR_075077"; transcript_id "NR_075077"; 4 | chr1 hg38_refGene exon 67111577 67111644 0.000000 - . gene_id "NR_075077"; transcript_id "NR_075077"; 5 | chr1 hg38_refGene exon 67113614 67113756 0.000000 - . 
gene_id "NR_075077"; transcript_id "NR_075077"; 6 | chr1 hg38_refGene exon 67115352 67115464 0.000000 - . gene_id "NR_075077"; transcript_id "NR_075077"; 7 | chr1 hg38_refGene exon 67125752 67125909 0.000000 - . gene_id "NR_075077"; transcript_id "NR_075077"; 8 | chr1 hg38_refGene exon 67127166 67127257 0.000000 - . gene_id "NR_075077"; transcript_id "NR_075077"; 9 | chr1 hg38_refGene exon 67131142 67131227 0.000000 - . gene_id "NR_075077"; transcript_id "NR_075077"; 10 | chr1 hg38_refGene exon 67134930 67134971 0.000000 - . gene_id "NR_075077"; transcript_id "NR_075077"; 11 | chr1 hg38_refGene stop_codon 67093580 67093582 0.000000 - . gene_id "NM_001276352"; transcript_id "NM_001276352"; 12 | chr1 hg38_refGene CDS 67093583 67093604 0.000000 - 1 gene_id "NM_001276352"; transcript_id "NM_001276352"; 13 | chr1 hg38_refGene exon 67092176 67093604 0.000000 - . gene_id "NM_001276352"; transcript_id "NM_001276352"; 14 | chr1 hg38_refGene CDS 67096252 67096321 0.000000 - 2 gene_id "NM_001276352"; transcript_id "NM_001276352"; 15 | chr1 hg38_refGene exon 67096252 67096321 0.000000 - . gene_id "NM_001276352"; transcript_id "NM_001276352"; 16 | chr1 hg38_refGene CDS 67103238 67103382 0.000000 - 0 gene_id "NM_001276352"; transcript_id "NM_001276352"; 17 | chr1 hg38_refGene exon 67103238 67103382 0.000000 - . gene_id "NM_001276352"; transcript_id "NM_001276352"; 18 | chr1 hg38_refGene CDS 67111577 67111644 0.000000 - 2 gene_id "NM_001276352"; transcript_id "NM_001276352"; 19 | chr1 hg38_refGene exon 67111577 67111644 0.000000 - . gene_id "NM_001276352"; transcript_id "NM_001276352"; 20 | chr1 hg38_refGene CDS 67115352 67115464 0.000000 - 1 gene_id "NM_001276352"; transcript_id "NM_001276352"; 21 | chr1 hg38_refGene exon 67115352 67115464 0.000000 - . gene_id "NM_001276352"; transcript_id "NM_001276352"; 22 | chr1 hg38_refGene CDS 67125752 67125909 0.000000 - 0 gene_id "NM_001276352"; transcript_id "NM_001276352"; 23 | chr1 hg38_refGene exon 67125752 67125909 0.000000 - . 
def test_download_cache_custom_location():
    """
    Check that a DownloadCache created while PYENSEMBL_CACHE_DIR is set copies
    a local file to <cache dir>/pyensembl/<reference>/<annotation>/<file>.
    """
    test_file = "refseq.ucsc.small.gtf"
    tmp_dir = tempfile.gettempdir()

    print("DIR: %s" % tmp_dir)
    assert tmp_dir is not None

    # remember any value the developer already had so it can be put back
    previous_cache_dir = os.environ.get("PYENSEMBL_CACHE_DIR")
    os.environ["PYENSEMBL_CACHE_DIR"] = tmp_dir
    try:
        # We need another instance of DownloadCache
        # that copies files over to cache folder
        # (named differently from the module-level `download_cache`
        # so that instance is not shadowed)
        custom_cache = DownloadCache(
            reference_name="test_reference",
            annotation_name="test_annotation",
            copy_local_files_to_cache=True,
        )

        # start from an empty cache directory
        custom_cache.delete_cache_directory()
        custom_cache.download_or_copy_if_necessary(
            download_if_missing=True, path_or_url=data_path(test_file)
        )

        full_path = os.path.join(
            tmp_dir, "pyensembl", "test_reference", "test_annotation", test_file
        )
        print("FULL PATH: %s" % full_path)
        assert len(full_path) > 0

        ok_(os.path.exists(full_path))
    finally:
        # restore the environment even when an assertion above fails, so the
        # override cannot leak into tests that run later in the same process
        if previous_cache_dir is None:
            os.environ.pop("PYENSEMBL_CACHE_DIR", None)
        else:
            os.environ["PYENSEMBL_CACHE_DIR"] = previous_cache_dir
def test_human_reference_name():
    """
    Human Ensembl releases should report the expected reference assembly:
    NCBI36 for release 54, GRCh37 for 74 and 75, GRCh38 from 76 onwards.
    """
    eq_(EnsemblRelease(release=54).reference_name, "NCBI36")
    eq_(EnsemblRelease(release=74).reference_name, "GRCh37")
    eq_(EnsemblRelease(release=75).reference_name, "GRCh37")
    # range() excludes its stop value, so add 1 to also check the newest
    # release (assumes MAX_ENSEMBL_RELEASE is itself a valid release number,
    # as its name indicates -- confirm against pyensembl.ensembl_versions)
    for release in range(76, MAX_ENSEMBL_RELEASE + 1):
        eq_(EnsemblRelease(release=release).reference_name, "GRCh38")
def _check_tp53_exon_ids(exon_ids):
    """Assert that `exon_ids` is exactly the TP53 exon set listed for release 77."""
    assert len(exon_ids) == len(TP53_EXON_IDS_RELEASE_77), \
        "Wrong number of exons, expected %d but got %d (n_distinct=%d)" % (
            len(TP53_EXON_IDS_RELEASE_77),
            len(exon_ids),
            len(set(exon_ids)))
    # Compare as sets: a matching length plus "every result is expected" would
    # still pass if one ID were returned twice in place of a missing one.
    expected = set(TP53_EXON_IDS_RELEASE_77)
    actual = set(exon_ids)
    assert actual == expected, \
        "Exon IDs differ, unexpected: %s, missing: %s" % (
            sorted(actual - expected),
            sorted(expected - actual))


def test_exon_ids_of_gene_id():
    """
    test_exon_ids_of_gene_id: Ensure that gene_id ENSG00000141510 (name=TP53),
    has all the same exon IDs found on the Ensembl website.
    """
    _check_tp53_exon_ids(ensembl.exon_ids_of_gene_id('ENSG00000141510'))


def test_exon_ids_of_gene_name():
    """
    test_exon_ids_of_gene_name: Ensure that TP53 has the same exon IDs found
    on the Ensembl website.
    """
    _check_tp53_exon_ids(ensembl.exon_ids_of_gene_name("TP53"))
def test_exon_ids_of_transcript_id():
    """
    test_exon_ids_of_transcript_id : Look up exon IDs of transcript
    ENST00000610623 (name: TP53-026) by its ID and make sure they match
    what we find on the Ensembl website.
    """
    exon_ids = ensembl.exon_ids_of_transcript_id("ENST00000610623")
    assert len(exon_ids) == len(TP53_TRANSCRIPT_26_EXON_IDS_RELEASE_77), \
        "Expected %d exons, got %d" % (
            len(TP53_TRANSCRIPT_26_EXON_IDS_RELEASE_77),
            len(exon_ids))
    assert all(
        exon_id in TP53_TRANSCRIPT_26_EXON_IDS_RELEASE_77
        for exon_id in exon_ids)


# The function was previously named without the `test_` prefix, so pytest
# never collected or ran it. The old name is kept as an alias so any existing
# reference still resolves; the alias itself is not collected.
exon_ids_of_transcript_id = test_exon_ids_of_transcript_id
16 | """ 17 | exon = ensembl.exon_by_id("ENSE00003464041") 18 | assert exon.gene_name == "CTNNB1", \ 19 | "Unexpected gene name: %s" % exon.gene_name 20 | assert exon.contig == "3", exon.contig 21 | assert exon.strand == "+" 22 | assert exon.on_forward_strand 23 | assert exon.on_positive_strand 24 | assert exon.start == 41224526, "Unexpected exon start: %s" % exon.start 25 | assert exon.end == 41224753, "Unexpected exon end: %s" % exon.end 26 | assert exon.length == len(exon) == 228 27 | 28 | def test_exon_object_by_id_on_negative_strand(): 29 | """ 30 | test_exon_object_by_id_on_negative_strand : check properties of exon 1 31 | from CXCR3 when looked up by ID in Ensembl 77. 32 | """ 33 | exon = ensembl.exon_by_id("ENSE00001817013") 34 | assert exon.gene_name == "CXCR3", \ 35 | "Unexpected gene name: %s" % exon.gene_name 36 | assert exon.contig == "X", exon.contig 37 | assert exon.strand == "-" 38 | assert exon.on_backward_strand 39 | assert exon.on_negative_strand 40 | assert exon.start == 71618438, "Unexpected exon start: %s" % exon.start 41 | assert exon.end == 71618517, "Unexpected exon end: %s" % exon.end 42 | assert exon.length == len(exon) == 80 43 | 44 | 45 | def test_exon_object_at_locus(): 46 | """ 47 | test_exon_object_at_locus : check properties of exon 4 of CTNNB1 when looked 48 | up by its location on the forward strand of chr3 49 | """ 50 | exons = ensembl.exons_at_locus(3, 41224526, strand="+") 51 | for exon in exons: 52 | assert exon.gene_name == "CTNNB1", exon.transcript_name 53 | assert exon.contig == "3", exon.contig 54 | assert exon.strand == "+" 55 | assert exon.on_forward_strand 56 | assert exon.on_positive_strand 57 | assert exon.start <= 41224526, "Unexpected exon start: %s" % exon.start 58 | assert exon.end >= 41224526, "Unexpected exon end: %s" % exon.end 59 | 60 | def test_exon_object_at_locus_on_negative_strand(): 61 | """ 62 | test_exon_object_at_locus : check properties of exon 1 of CXCR3 when looked 63 | up by its location on the 
def test_exon_basic_properties_hash():
    """
    Hashing an exon should give an int, be stable across calls, and differ
    between two different exons.
    """
    exon = ensembl.exon_by_id("ENSE00001817013")
    exon_hash = hash(exon)
    # the comma belongs to the formatting tuple, not inside the type() call
    assert isinstance(exon_hash, int), \
        "Hash function returns %s instead of int" % (type(exon_hash),)
    # hash again rather than reusing exon_hash, to check determinism
    assert exon_hash == hash(exon), "Hash function is non-deterministic!"
    other_exon = ensembl.exon_by_id("ENSE00003464041")
    assert exon != other_exon
    assert exon_hash != hash(other_exon)
def test_gene_ids_grch38_hla_a():
    """Position chr6:29,945,884 in GRCh38 should resolve to the HLA-A gene ID only."""
    # chr6:29,945,884 is a position for HLA-A
    # Gene ID = ENSG00000206503
    # based on:
    # http://useast.ensembl.org/Homo_sapiens/Gene/
    # Summary?db=core;g=ENSG00000206503;r=6:29941260-29945884
    ids = ensembl_grch38.gene_ids_at_locus(6, 29945884)
    expected = "ENSG00000206503"
    # compare against `expected` so the assertion and its message cannot drift apart
    assert ids == [expected], "Expected HLA-A, gene ID = %s, got: %s" % (
        expected,
        ids,
    )
@run_multiple_genomes()
def test_all_gene_names(genome):
    """
    test_all_gene_names : Make sure some known gene names such as
    SMAD4, TP53, ERBB2, &c are among the names returned by
    genome.gene_names()
    """
    # build the set once so that every membership check is a hash lookup
    # instead of a scan over all gene names
    gene_names = set(genome.gene_names())
    for gene_name in KNOWN_GENE_NAMES:
        assert gene_name in gene_names, "Missing gene name %s from %s" % (
            gene_name,
            genome,
        )
def test_gene_name_of_HLA_gene_id():
    """
    Every gene ID registered under the name HLA-A should map back to that
    one name.
    """
    distinct_names = list(
        {
            grch38.gene_name_of_gene_id(hla_gene_id)
            for hla_gene_id in grch38.gene_ids_of_gene_name("HLA-A")
        }
    )
    n_distinct = len(distinct_names)
    assert n_distinct == 1, (n_distinct, distinct_names)
    only_name = distinct_names[0]
    assert only_name == "HLA-A", only_name
def check_id_length(method_name):
    """
    Call the method called `method_name` on every release in
    `major_releases` and assert that it returns a non-empty collection of
    identifiers which are all exactly 15 characters long.
    """
    for ensembl_release in major_releases:
        # restrict the lookup to chromosome Y to keep the test fast
        identifiers = getattr(ensembl_release, method_name)(contig="Y")
        assert len(identifiers) > 0, "No values returned by %s" % method_name
        wrong_length = [ident for ident in identifiers if len(ident) != 15]
        assert not wrong_length, "Invalid IDs for %s: %s" % (
            method_name,
            wrong_length,
        )
def test_locus_overlaps():
    """
    Locus.overlaps should be true for any range sharing at least one
    position with the locus on the same contig (and on the same strand when
    a strand is given), and false otherwise.
    """
    locus = Locus("1", 10, 20, "+")
    assert locus.overlaps("1", 10, 20, "+")
    assert locus.overlaps("1", 10, 20)
    assert locus.overlaps("1", 5, 30)
    assert locus.overlaps("1", 15, 16)
    assert locus.overlaps("1", 15, 30)
    assert locus.overlaps("1", 5, 15)
    assert locus.overlaps("1", 10, 10)
    assert locus.overlaps("1", 20, 20)
    # before start: keep the contig identical to the locus so that only the
    # coordinates can rule the range out
    assert not locus.overlaps("1", 9, 9)
    # after end: the first argument is the contig, so it has to stay "1";
    # passing 21 there would reject the range for being on another contig
    # instead of for lying past position 20
    assert not locus.overlaps("1", 21, 30)
    # wrong contig
    assert not locus.overlaps("2", 10, 20)
    # wrong strand
    assert not locus.overlaps("1", 10, 20, "-")
def test_position_offset():
    """
    Locus.offset counts from the strand-relative start of the locus
    (position 10 on "+", position 20 on "-") and raises ValueError for
    positions outside of the locus.
    """
    plus_locus = Locus("1", 10, 20, "+")
    minus_locus = Locus("1", 10, 20, "-")

    # (locus, position, expected offset)
    expectations = [
        (plus_locus, 10, 0),
        (plus_locus, 15, 5),
        (plus_locus, 19, 9),
        (plus_locus, 20, 10),
        (minus_locus, 10, 10),
        (minus_locus, 15, 5),
        (minus_locus, 19, 1),
        (minus_locus, 20, 0),
    ]
    for locus, position, expected_offset in expectations:
        assert locus.offset(position) == expected_offset

    # position 9 lies just before the locus and 21 just after it; neither
    # strand may accept either of them
    for outside_position in (9, 21):
        for locus in (plus_locus, minus_locus):
            with assert_raises(ValueError):
                locus.offset(outside_position)
def test_locus_distance():
    """
    Adjacent loci on the same contig and strand are one base apart in
    either direction; loci on another contig or the opposite strand are
    infinitely far away.
    """
    reference = Locus("1", 10, 20, "+")
    neighbor = Locus("1", 21, 25, "+")
    other_contig = Locus("2", 21, 25, "+")
    other_strand = Locus("1", 21, 25, "-")

    # distance is symmetric
    assert reference.distance_to_locus(neighbor) == 1
    assert neighbor.distance_to_locus(reference) == 1

    unreachable = float("inf")
    for far_locus in (other_contig, other_strand):
        assert reference.distance_to_locus(far_locus) == unreachable
def no_transcript_(e):
    """
    Print what is being checked, then assert that the string form of `e`
    mentions 'transcript'.
    """
    described = (type(e), e)
    print("Testing for 'transcript' in %s : %s" % described)
    message = str(e)
    assert "transcript" in message
def test_mouse_ENSMUSG00000017167():
    """
    Check the Cntnap1 gene (ENSMUSG00000017167) in the custom mouse genome
    built from cropped Ensembl release 81 files.

    GTF cropped from ftp://ftp.ensembl.org/pub/release-81/gtf/mus_musculus/
    Mus_musculus.GRCm38.81.gtf.gz via:
    grep "ENSMUSG00000017167" Mus_musculus.GRCm38.81.gtf

    Transcript FASTA cropped from ftp://ftp.ensembl.org/pub/release-81/
    fasta/mus_musculus/cdna/Mus_musculus.GRCm38.cdna.all.fa.gz via:
    grep "ENSMUSG00000017167" Mus_musculus.GRCm38.cdna.all.fa -A 50

    ncRNA FASTA cropped from ftp://ftp.ensembl.org/pub/release-81/
    fasta/mus_musculus/cdna/Mus_musculus.GRCm38.ncrna.fa.gz via:
    grep "ENSMUSG00000088969" Mus_musculus.GRCm38.ncrna.fa -A 2

    Protein FASTA cropped from ftp://ftp.ensembl.org/pub/release-81/fasta/
    mus_musculus/pep/Mus_musculus.GRCm38.pep.all.fa.gz via:
    grep "ENSMUSG00000017167" Mus_musculus.GRCm38.pep.all.fa -A 50

    Tested against:
    http://useast.ensembl.org/Mus_musculus/Gene/Summary?db=core;g=ENSMUSG00000017167
    """
    # first 120 bases of the coding transcript's cDNA
    expected_cdna_prefix = (
        "GAGAGAAGGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGA"
        "GAGAGAGAGAGATTGGGGGTAGGAGAGAGGGAAGGGTGGATAAGGACGGAAAAAAGCTTT"
    )
    # first 120 residues of its protein product
    expected_protein_prefix = (
        "MMSLRLFSILLATVVSGAWGWGYYGCNEELVGPLYARSLGASSYYGLFTTARFARLHGIS"
        "GWSPRIGDPNPWLQIDLMKKHRIRAVATQGAFNSWDWVTRYMLLYGDRVDSWTPFYQKGH"
    )

    setup_init_custom_mouse_genome()

    matching_genes = custom_mouse_genome_grcm38_subset.genes_by_name("Cntnap1")
    eq_(len(matching_genes), 1)

    all_transcripts = matching_genes[0].transcripts
    eq_(len(all_transcripts), 2)

    coding_transcripts = []
    for candidate in all_transcripts:
        if candidate.biotype == "protein_coding":
            coding_transcripts.append(candidate)
    eq_(len(coding_transcripts), 1)

    coding_transcript = coding_transcripts[0]
    eq_(coding_transcript.sequence[:120], expected_cdna_prefix)
    eq_(coding_transcript.protein_sequence[:120], expected_protein_prefix)
def test_int_version():
    """
    Every integer release number from 54 up to and including
    MAX_ENSEMBL_RELEASE should be accepted by the EnsemblRelease
    constructor.
    """
    # range() excludes its stop value, so add one to make sure the newest
    # release is constructed as well
    for version in range(54, MAX_ENSEMBL_RELEASE + 1):
        EnsemblRelease(version)
@run_multiple_genomes()
def test_find_nearest_BRAF_transcript(genome):
    """
    With the first BRAF and first EGFR transcripts as candidate loci,
    query ranges next to or overlapping one of them should resolve to that
    transcript with the expected distance.
    """
    candidates = [
        genome.genes_by_name(gene_name)[0].transcripts[0]
        for gene_name in ("BRAF", "EGFR")
    ]
    for target in candidates:
        first, last = target.start, target.end
        # (query start, query end, expected distance to `target`)
        queries = [
            # immediately before transcript
            (first - 2, first - 1, 1),
            # overlapping with transcript
            (first - 2, first + 1, 0),
            # immediately after transcript
            # may overlap with other transcripts
            (last + 1, last + 2, 1),
        ]
        for query_start, query_end, expected_distance in queries:
            nearest = find_nearest_locus(
                start=query_start, end=query_end, loci=candidates
            )
            eq_(nearest, (expected_distance, target))
def test_clear_cache():
    """
    The FASTA dictionary should start out empty, be filled by
    _load_or_create_fasta_dictionary_pickle(), and be emptied again by
    clear_cache(), which should also delete the pickle files that a later
    load recreates.
    """
    with TemporaryDirectory() as cache_dir:
        sequence_data = SequenceData([FASTA_PATH], cache_directory_path=cache_dir)
        assert (
            not sequence_data._fasta_dictionary
        ), "Expected _fasta_dictionary to load lazily"

        sequence_data._load_or_create_fasta_dictionary_pickle()
        n_loaded = len(sequence_data._fasta_dictionary)
        assert n_loaded > 0, "FASTA dictionary didn't get created"

        sequence_data.clear_cache()
        assert (
            not sequence_data._fasta_dictionary
        ), "Expected FASTA dictionary to be empty after clear_cache()"
        assert not any(
            exists(path) for path in sequence_data.fasta_dictionary_pickle_paths
        ), "Cached pickle file should have been deleted"

        # loading again has to write the pickle files back to disk
        sequence_data._load_or_create_fasta_dictionary_pickle()
        assert all(
            exists(path) for path in sequence_data.fasta_dictionary_pickle_paths
        ), "Cached pickle file should have been created"
# run_multiple_genomes is called as a decorator factory by every other test
# in this file (optionally with explicit release numbers), so call it here
# too instead of applying the bare name to the test function
@run_multiple_genomes()
def test_pickle_ensembl_gene(genome):
    """
    A Gene should compare equal to its copy after a pickle round trip.
    """
    gene = genome.gene_by_id(TP53_gene_id)
    gene_reconstructed = pickle.loads(pickle.dumps(gene))
    assert gene == gene_reconstructed
def test_custom_genome_to_json():
    """
    The custom mouse genome should be equal to the Genome rebuilt from its
    own JSON representation.
    """
    setup_init_custom_mouse_genome()
    original = custom_mouse_genome_grcm38_subset
    serialized = original.to_json()
    eq_(original, Genome.from_json(serialized))
def test_genome_selection_grch38():
    """
    Parsing an install command for human release 100 should select exactly
    one genome, with species homo_sapiens and release 100.
    """
    argv = ["install", "--release", "100", "--species", "human"]
    selected = all_combinations_of_ensembl_genomes(parser.parse_args(argv))
    assert len(selected) == 1
    only_genome = selected[0]
    eq_(only_genome.species.latin_name, "homo_sapiens")
    eq_(only_genome.release, 100)
def make_repeat_lookup_fn(lookup_fn, n_positions):
    """
    Make a thunk which calls the lookup_fn at a number of loci
    for each human chromosome (excluding MT).

    Parameters
    ----------
    lookup_fn : callable
        Called as lookup_fn(contig, position); its return value is ignored.

    n_positions : int
        Number of positions to look up on every contig. Positions start at
        1,000,000 and are spaced 1,000,000 apart.

    Returns a function of no arguments which performs all of the lookups,
    iterating over the module-level `contigs` list each time it is called.
    """
    # the positions are the same for every contig and every call, so build
    # the list once instead of once per contig per call
    positions = [10**6 + i * 10**6 for i in range(n_positions)]

    def repeat_lookup_fn():
        for contig in contigs:
            for position in positions:
                lookup_fn(contig, position)

    return repeat_lookup_fn
def run_all_benchmarks():
    """
    Call every function in this module's global namespace whose name
    starts with "test_", so that the timings they print can be inspected.
    """
    import types

    # run all local test functions to see their timings printed;
    # iterate over a snapshot so that a test which binds a new global
    # cannot change the dictionary while it is being traversed
    for variable_name, value in list(globals().items()):
        # require the prefix: a substring match would also run helpers
        # whose names merely contain "test_"
        if not variable_name.startswith("test_"):
            continue
        if isinstance(value, types.FunctionType):
            value()
/tests/test_transcript_ids.py:
--------------------------------------------------------------------------------
"""
Tests for methods which return collections of transcript IDs that aren't
converting from some type of name or ID.
"""
from __future__ import absolute_import

from pyensembl import genome_for_reference_name
from .common import eq_
from .common import run_multiple_genomes

grch38 = genome_for_reference_name("GRCh38")

# subset of transcript IDs for HLA-A
HLA_A_TRANSCRIPT_IDS = [
    "ENST00000396634",
    "ENST00000376809",
    "ENST00000376806",
    "ENST00000376802",
    "ENST00000496081",
    "ENST00000495183",
    "ENST00000461903",
    "ENST00000479320",
]


def test_transcript_ids_ensembl_grch38_hla_a():
    """Every known HLA-A transcript ID should overlap the queried HLA-A locus."""
    # The queried range chr6:29941260-29945884 is the HLA-A locus,
    # based on:
    # http://useast.ensembl.org/Homo_sapiens/Gene/
    # Summary?db=core;g=ENSG00000206503;r=6:29941260-29945884
    transcript_ids = grch38.transcript_ids_at_locus(6, 29941260, 29945884)
    for transcript_id in HLA_A_TRANSCRIPT_IDS:
        assert transcript_id in transcript_ids, (
            "Transcript %s of HLA-A not found overlapping locus" % transcript_id
        )


KNOWN_TRANSCRIPT_IDS = HLA_A_TRANSCRIPT_IDS + [
    "ENST00000398417",  # transcript ID of SMAD4-001
    "ENST00000334701",  # transcript ID of HSP90AA1-001
    "ENST00000599837",  # transcript ID of CTAG1A-002
]


# TODO: add release 54 after transcript IDs for older GTFs are filled in
# See https://github.com/hammerlab/pyensembl/issues/20
@run_multiple_genomes(75, grch38.release)
def test_all_transcript_ids(genome):
    """All KNOWN_TRANSCRIPT_IDS should appear in genome.transcript_ids()."""
    # build a set once so each membership check below is a hash lookup
    transcript_ids = set(genome.transcript_ids())
    for transcript_id in KNOWN_TRANSCRIPT_IDS:
        assert transcript_id in transcript_ids, "Missing transcript ID %s from %s" % (
            transcript_id,
            genome,
        )


def test_transcript_id_of_protein_id_CCR2():
    """Protein ID ENSP00000399285 should map back to transcript ENST00000445132."""
    # Looked up on Oct 9 2021:
    # CCR2-203 ENST00000445132.3 maps to ENSP00000399285.2
    # Ensembl release 104, GRCh38.p13
    transcript_id = grch38.transcript_id_of_protein_id("ENSP00000399285")
    eq_("ENST00000445132", transcript_id)

--------------------------------------------------------------------------------
/tests/test_transcript_objects.py:
--------------------------------------------------------------------------------
from pyensembl import Locus, cached_release

from .common import eq_, neq_, gt_, run_multiple_genomes
from .data import (
    FOXP3_001_transcript_id,
    CTNNBIP1_004_transcript_id,
    CTNNBIP1_004_UTR5,
    CTNNBIP1_004_UTR3,
    CTNNBIP1_004_CDS,
    CTNNBIP1_004_locus,
    CTTNNIP1_004_exon_lengths,
    CTTNNIP1_004_exon_ids,
    EGFR_001_protein_sequence,
    TP53_gene_id,
)

ensembl77 = cached_release(77)


def test_transcript_start_codon():
    """
    test_transcript_start_codon : Check that fields of the Transcript
    named CTNNBIP1-004 match known values (locus and start codon offsets).
    """
    CTNNBIP1_004_transcript = ensembl77.transcript_by_id(CTNNBIP1_004_transcript_id)

    # Call Locus.__eq__ / Locus.__str__ unbound so that the comparison and
    # the message use Locus's own implementations, whatever the class of the
    # transcript object defines.
    assert Locus.__eq__(
        CTNNBIP1_004_transcript, CTNNBIP1_004_locus
    ), "Expected locus %s but got %s" % (
        CTNNBIP1_004_locus,
        Locus.__str__(CTNNBIP1_004_transcript),
    )

    # a start codon is three nucleotides, so three spliced offsets are expected
    start_offsets = CTNNBIP1_004_transcript.start_codon_spliced_offsets
    assert len(start_offsets) == 3, "Wrong length for start codon: %d (%s)" % (
        len(start_offsets),
        start_offsets,
    )

    assert all(
        isinstance(i, int) for i in start_offsets
    ), "Wrong type %s for beginning start codon offset" % (
        [type(i) for i in start_offsets],
    )

    # the first start codon base should come right after the 5' UTR
    expected_start_codon_offset = len(CTNNBIP1_004_UTR5)
    start_codon_offset = min(start_offsets)
    assert (
        start_codon_offset == expected_start_codon_offset
    ), "Incorrect start codon offset, expected %d but got %d" % (
        expected_start_codon_offset,
        start_codon_offset,
    )


def test_transcript_exons():
    """
    test_transcript_exons : Ensure that properties of CTNNBIP1-004's
    Exon objects (IDs and lengths, in order) match known values.
    """
    transcript = ensembl77.transcript_by_id(CTNNBIP1_004_transcript_id)
    exons = transcript.exons
    assert isinstance(exons, list), "Expected list of Exon objects, got %s : %s" % (
        exons,
        type(exons),
    )

    # the number of exons must match the number of expected exon lengths
    # (CTNNBIP1-004 has 5 exons)
    assert len(exons) == len(
        CTTNNIP1_004_exon_lengths
    ), "Expected %d exons but got %d" % (len(CTTNNIP1_004_exon_lengths), len(exons))

    for i, exon in enumerate(exons):
        expected_id = CTTNNIP1_004_exon_ids[i]
        assert (
            exon.id == expected_id
        ), "Expected exon #%d of %s to have ID %s but got %s" % (
            i + 1,
            transcript,
            expected_id,
            exon.id,
        )

        expected_length = CTTNNIP1_004_exon_lengths[i]
        assert (
            len(exon) == expected_length
        ), "Expected exon #%d of %s (%s) to have length %d but got %d" % (
            i + 1,
            transcript,
            exon,
            expected_length,
            len(exon),
        )


# Not testing NCBI/Release 54 because its feature='transcript' entries
# don't have a gene ID.
# TODO: Add gene_id patching to gtf_parsing, add ensembl54 to the list
# below
@run_multiple_genomes(75, 77)
def test_sequence_parts(genome):
    """5' UTR + CDS + 3' UTR of FOXP3-001 should equal its full sequence."""
    # Ensure that the UTRs and coding sequence can be
    # combined to make the full transcript.
    transcript = genome.transcript_by_id(FOXP3_001_transcript_id)

    # The full sequence and each of its parts (upstream untranslated region,
    # coding sequence, downstream untranslated region) must be non-empty.
    full_sequence = transcript.sequence
    gt_(len(full_sequence), 0)

    utr5 = transcript.five_prime_utr_sequence
    gt_(len(utr5), 0)

    cds = transcript.coding_sequence
    gt_(len(cds), 0)

    utr3 = transcript.three_prime_utr_sequence
    gt_(len(utr3), 0)

    # The three parts are concatenated directly with `+`.
    # NOTE(review): an earlier comment here referred to a `seq` property of
    # Sequence objects, which this code does not use; presumably these
    # properties now return plain strings -- confirm.
    combined_string = utr5 + cds + utr3

    combined_sequence_length = len(combined_string)
    # make sure length property of transcript matches the sequence length
    eq_(
        combined_sequence_length,
        len(transcript),
        "Length 5' UTR(%dnt) + CDS(%dnt) + 3' UTR(%d) = %d, expected %d"
        % (len(utr5), len(cds), len(utr3), combined_sequence_length, len(transcript)),
    )
    eq_(
        combined_string,
        full_sequence,
        "Expected FOXP3-001 sequence:\n%s\n\n5' UTR + CDS + 3' UTR:\n%s"
        % (full_sequence, combined_string),
    )


def test_transcript_utr5_sequence_CTNNIP1_004():
    """The 5' UTR of CTNNBIP1-004 should match the known length and sequence."""
    transcript = ensembl77.transcript_by_id(CTNNBIP1_004_transcript_id)
    utr5 = transcript.five_prime_utr_sequence
    expected_utr5_length = len(CTNNBIP1_004_UTR5)
    # compare lengths first so a mismatch gives a short, readable message
    eq_(
        len(utr5),
        expected_utr5_length,
        "Expected 5' UTR length %d, got %d" % (expected_utr5_length, len(utr5)),
    )
    eq_(utr5, CTNNBIP1_004_UTR5)


def test_transcript_utr3_sequence_CTNNIP1_004():
    """The 3' UTR of CTNNBIP1-004 should match the known length and sequence."""
    transcript = ensembl77.transcript_by_id(CTNNBIP1_004_transcript_id)
    utr3 = transcript.three_prime_utr_sequence
    expected_utr3_length = len(CTNNBIP1_004_UTR3)
    # compare lengths first so a mismatch gives a short, readable message
    eq_(
        len(utr3),
        expected_utr3_length,
        "Expected 3' UTR length %d, got %d" % (expected_utr3_length, len(utr3)),
    )
    eq_(utr3, CTNNBIP1_004_UTR3)


def test_transcript_cds_CTNNIP1_004():
    """The CDS of CTNNBIP1-004 should match the known length and sequence."""
    transcript = ensembl77.transcript_by_id(CTNNBIP1_004_transcript_id)
    cds = transcript.coding_sequence
    expected_cds_length = len(CTNNBIP1_004_CDS)
    # compare lengths first so a mismatch gives a short, readable message
    eq_(
        len(cds),
        expected_cds_length,
        "Expected CDS length %d, got %d" % (expected_cds_length, len(cds)),
    )
    eq_(cds, CTNNBIP1_004_CDS)


@run_multiple_genomes()
def test_equal_transcripts(genome):
    """The same transcript fetched two ways should be equal and hash equally."""
    t1 = genome.genes_by_name("TP53")[0].transcripts[0]
    # look up the same transcript again, this time by its ID
    t2 = genome.transcript_by_id(t1.id)
    eq_(t1, t2)
    eq_(hash(t1), hash(t2))


@run_multiple_genomes()
def test_not_equal_transcripts(genome):
    """Transcripts of different genes (MUC1 vs. BRCA1) should not be equal."""
    t1 = genome.genes_by_name("MUC1")[0].transcripts[0]
    t2 = genome.genes_by_name("BRCA1")[0].transcripts[0]
    neq_(t1, t2)


def test_protein_id():
    """EGFR-001 in release 77 should have protein ID ENSP00000275493."""
    transcript = ensembl77.transcripts_by_name("EGFR-001")[0]
    eq_(transcript.protein_id, "ENSP00000275493")


def test_protein_protein_sequence():
    """EGFR-001 in release 77 should have the known protein sequence."""
    transcript = ensembl77.transcripts_by_name("EGFR-001")[0]
    eq_(transcript.protein_sequence, EGFR_001_protein_sequence)


def test_transcript_gene_should_match_parent_gene():
    """Each transcript of the TP53 gene should report that gene as its `gene`."""
    gene = ensembl77.gene_by_id(TP53_gene_id)
    for transcript in gene.transcripts:
        eq_(transcript.gene, gene)


@run_multiple_genomes()
def test_BRCA1_201_has_protein_coding_biotype(genome):
    """BRCA1-201 should be flagged protein coding and have that biotype."""
    transcript = genome.transcripts_by_name("BRCA1-201")[0]
    assert (
        transcript.is_protein_coding
    ), "Expected BRCA1-201 transcript %s to have a protein coding biotype in %s" % (
        transcript,
        genome,
    )
    eq_(transcript.biotype, "protein_coding")

--------------------------------------------------------------------------------
/tests/test_transcript_sequences.py: -------------------------------------------------------------------------------- 1 | """Make sure we're getting correct transcritp sequence from Ensembl and that 2 | it's a sequence type which correctly implements `complement` 3 | and `reverse_complement` 4 | """ 5 | 6 | from __future__ import absolute_import 7 | from .common import eq_ 8 | from pyensembl import genome_for_reference_name 9 | 10 | grch38 = genome_for_reference_name("GRCh38") 11 | 12 | 13 | def test_transcript_sequence_ensembl_grch38(): 14 | # extremely short TRD gene 15 | seq = grch38.transcript_sequence("ENST00000448914") 16 | expected = "ACTGGGGGATACG" 17 | eq_(seq, expected) 18 | # now try via a Transcript object 19 | eq_(grch38.transcript_by_id("ENST00000448914").sequence, expected) 20 | -------------------------------------------------------------------------------- /tests/test_transcript_support_level.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for methods which return collections of transcript IDs that aren't 3 | converting from some type of name or ID. 4 | """ 5 | from __future__ import absolute_import 6 | 7 | from .common import eq_ 8 | 9 | from pyensembl import cached_release 10 | 11 | 12 | def test_transcript_support_level(): 13 | """The Transcript Support Level (TSL) is a method to highlight the well-supported and poorly-supported transcript 14 | models for users, based on the type and quality of the alignments used to annotate the transcript. 15 | In the Ensembl database, it can be assigned to a value 1 through 5, or reported as NA, or missing, or missing 16 | completely in older releases. We translate it to an integer value, otherwise to None. 
17 | """ 18 | ensembl93 = cached_release(93) 19 | transcript = ensembl93.transcripts_by_name("DDX11L1-202")[0] 20 | eq_(transcript.support_level, 1) 21 | 22 | # For this transcript, the transcript_support_level value is missing in the database record: 23 | transcript = ensembl93.transcripts_by_name("OR4G11P-202")[0] 24 | eq_(transcript.support_level, None) 25 | 26 | # Some features are reported as "NA" in Ensembl: those are features like pseudogenes, single exon transcripts, 27 | # HLA, T-cell receptor and Ig transcripts that are not analysed in terms of TSL and therefore not given any 28 | # of the TSL categories. We translate NA to None as well. 29 | transcript = ensembl93.transcripts_by_name("MIR1302-2-201")[0] 30 | eq_(transcript.support_level, None) 31 | 32 | # Transcript_support_level column was missing completely in GRCh37 and older releases of GRCh38: 33 | ensembl77 = cached_release(77) 34 | transcript = ensembl77.transcripts_by_name("DDX11L1-002")[0] 35 | eq_(transcript.support_level, None) 36 | -------------------------------------------------------------------------------- /tests/test_ucsc_gtf.py: -------------------------------------------------------------------------------- 1 | from pyensembl import Genome, Database 2 | 3 | from .common import TemporaryDirectory, eq_ 4 | from .data import data_path 5 | 6 | UCSC_GENCODE_PATH = data_path("gencode.ucsc.small.gtf") 7 | UCSC_REFSEQ_PATH = data_path("refseq.ucsc.small.gtf") 8 | 9 | 10 | def test_ucsc_gencode_gtf(): 11 | with TemporaryDirectory() as tmpdir: 12 | db = Database(UCSC_GENCODE_PATH, cache_directory_path=tmpdir) 13 | df = db._load_gtf_as_dataframe() 14 | exons = df[df["feature"] == "exon"] 15 | # expect 12 exons from the dataframe 16 | assert len(exons) == 12, "Expected 12 exons, got %d: %s" % (len(exons), exons) 17 | 18 | 19 | def test_ucsc_gencode_genome(): 20 | """ 21 | Testing with a small GENCODE GTF file downloaded from 22 | http://genome.ucsc.edu/cgi-bin/hgTables 23 | """ 24 | with 
TemporaryDirectory() as tmpdir: 25 | genome = Genome( 26 | reference_name="GRCh38", 27 | annotation_name="ucsc_test", 28 | gtf_path_or_url=UCSC_GENCODE_PATH, 29 | cache_directory_path=tmpdir, 30 | ) 31 | genome.index() 32 | genes = genome.genes() 33 | for gene in genes: 34 | assert gene.id, "Gene with missing ID in %s" % (genome,) 35 | assert len(genes) == 7, "Expected 7 genes, got %d: %s" % (len(genes), genes) 36 | transcripts = genome.transcripts() 37 | for transcript in transcripts: 38 | assert transcript.id, "Transcript with missing ID in %s" % (genome,) 39 | assert len(transcripts) == 7, "Expected 7 transcripts, got %d: %s" % ( 40 | len(transcripts), 41 | transcripts, 42 | ) 43 | 44 | gene_uc001aak4 = genome.gene_by_id("uc001aak.4") 45 | eq_(gene_uc001aak4.id, "uc001aak.4") 46 | eq_(gene_uc001aak4.name, None) 47 | eq_(gene_uc001aak4.biotype, None) 48 | 49 | gene_1_17369 = genome.genes_at_locus("chr1", 17369) 50 | eq_(gene_1_17369[0].id, "uc031tla.1") 51 | 52 | transcript_1_30564 = genome.transcripts_at_locus("chr1", 30564) 53 | eq_(transcript_1_30564[0].id, "uc057aty.1") 54 | 55 | 56 | def test_ucsc_refseq_gtf(): 57 | """ 58 | Test GTF object with a small RefSeq GTF file downloaded from 59 | http://genome.ucsc.edu/cgi-bin/hgTables 60 | """ 61 | with TemporaryDirectory() as tmpdir: 62 | db = Database(UCSC_REFSEQ_PATH, cache_directory_path=tmpdir) 63 | df = db._load_gtf_as_dataframe() 64 | exons = df[df["feature"] == "exon"] 65 | # expect 16 exons from the GTF 66 | assert len(exons) == 16, "Expected 16 exons, got %d: %s" % (len(exons), exons) 67 | 68 | 69 | def test_ucsc_refseq_genome(): 70 | """ 71 | Test Genome object with a small RefSeq GTF file downloaded from 72 | http://genome.ucsc.edu/cgi-bin/hgTables 73 | """ 74 | with TemporaryDirectory() as tmpdir: 75 | genome = Genome( 76 | reference_name="GRCh38", 77 | annotation_name="ucsc_test", 78 | gtf_path_or_url=UCSC_REFSEQ_PATH, 79 | cache_directory_path=tmpdir, 80 | ) 81 | genome.index() 82 | genes = 
genome.genes() 83 | for gene in genes: 84 | assert gene.id, "Gene with missing ID in %s" % ( 85 | genome.db._load_gtf_as_dataframe(), 86 | ) 87 | assert len(genes) == 2, "Expected 2 genes, got %d: %s" % (len(genes), genes) 88 | transcripts = genome.transcripts() 89 | for transcript in transcripts: 90 | assert transcript.id, "Transcript with missing ID in %s" % ( 91 | genome.db._load_gtf_as_dataframe(), 92 | ) 93 | assert len(transcripts) == 2, "Expected 2 transcripts, got %d: %s" % ( 94 | len(transcripts), 95 | transcripts, 96 | ) 97 | genes_at_locus = genome.genes_at_locus("chr1", 67092176) 98 | assert ( 99 | len(genes_at_locus) == 2 100 | ), "Expected 2 genes at locus chr1:67092176, got %d: %s" % ( 101 | len(genes_at_locus), 102 | genes_at_locus, 103 | ) 104 | ids = set([gene.id for gene in genes_at_locus]) 105 | eq_(set(["NM_001276352", "NR_075077"]), ids) 106 | --------------------------------------------------------------------------------