├── .github
└── workflows
│ └── tests.yml
├── .gitignore
├── CONTRIBUTING.md
├── LICENSE
├── MANIFEST.in
├── README.md
├── RELEASING.md
├── deploy.sh
├── develop.sh
├── docs
├── Makefile
├── conf.py
├── index.rst
├── make.bat
├── modules.rst
└── pyensembl.rst
├── lint-and-test.sh
├── lint.sh
├── pyensembl
├── __init__.py
├── common.py
├── database.py
├── download_cache.py
├── ensembl_release.py
├── ensembl_url_templates.py
├── ensembl_versions.py
├── exon.py
├── fasta.py
├── gene.py
├── genome.py
├── locus.py
├── locus_with_genome.py
├── logging.conf
├── normalization.py
├── reference_name.py
├── search.py
├── sequence_data.py
├── shell.py
├── species.py
├── transcript.py
└── version.py
├── pylintrc
├── requirements.txt
├── setup.py
├── test.sh
└── tests
├── __init__.py
├── common.py
├── data.py
├── data
├── gencode.ucsc.small.gtf
├── mouse.ensembl.81.partial.ENSMUSG00000017167.fa
├── mouse.ensembl.81.partial.ENSMUSG00000017167.gtf
├── mouse.ensembl.81.partial.ENSMUSG00000017167.pep
├── mouse.ensembl.81.partial.ncrna.ENSMUSG00000017167.fa
└── refseq.ucsc.small.gtf
├── test_contigs.py
├── test_download_cache.py
├── test_ensembl_gtf.py
├── test_ensembl_object_properties.py
├── test_exon_id.py
├── test_exon_object.py
├── test_gene_ids.py
├── test_gene_names.py
├── test_gene_objects.py
├── test_id_length.py
├── test_locus.py
├── test_missing_genome_sources.py
├── test_mouse.py
├── test_release_versions.py
├── test_search.py
├── test_sequence_data.py
├── test_serialization.py
├── test_shell.py
├── test_string_representation.py
├── test_timings.py
├── test_transcript_ids.py
├── test_transcript_objects.py
├── test_transcript_sequences.py
├── test_transcript_support_level.py
└── test_ucsc_gtf.py
/.github/workflows/tests.yml:
--------------------------------------------------------------------------------
1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions
2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
3 |
4 | # TODO:
5 | # - cache this directory $HOME/.cache/pyensembl/
6 | # - update coveralls
7 | # - get a badge for tests passing
8 | # - download binary dependencies from conda
9 | name: Tests
10 | on: [push, pull_request]
11 |
12 | jobs:
13 | build:
14 | runs-on: ubuntu-latest
15 | strategy:
16 | fail-fast: true
17 | matrix:
18 | python-version: ["3.9", "3.10", "3.11"]
19 |
20 | steps:
21 | - name: Checkout repository
22 | uses: actions/checkout@v3
23 | - name: Set up Python ${{ matrix.python-version }}
24 | uses: actions/setup-python@v3
25 | with:
26 | python-version: ${{ matrix.python-version }}
27 | cache: "pip"
28 | - name: Install dependencies
29 | run: |
30 | python -m pip install --upgrade pip
31 | python -m pip install flake8 pytest pytest-cov coveralls
32 | pip install -r requirements.txt
33 | pip install .
34 | - name: Lint with flake8
35 | run: |
36 | # stop the build if there are Python syntax errors or undefined names
37 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
38 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
39 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
40 | - name: Run default linting script
41 | run: |
42 | ./lint.sh
43 | - name: Install Ensembl data
44 | run: |
45 | echo "Before installing Ensembl releases" && df -h
46 | pyensembl install --release 75 --species human --custom-mirror https://github.com/openvax/ensembl-data/releases/download/GRCh37.75/
47 | pyensembl install --release 77 --species human --custom-mirror https://github.com/openvax/ensembl-data/releases/download/GRCh38.77/
48 | pyensembl install --release 93 --species human --custom-mirror https://github.com/openvax/ensembl-data/releases/download/GRCh38.93/
49 | pyensembl install --release 93 --species mouse --custom-mirror https://github.com/openvax/ensembl-data/releases/download/GRCm38.93/
50 | - name: Run unit tests
51 | run: |
52 | ./test.sh
53 | - name: Publish coverage to Coveralls
54 | uses: coverallsapp/github-action@v2.2.3
55 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 |
5 | # C extensions
6 | *.so
7 |
8 | # Distribution / packaging
9 | .Python
10 | env/
11 | bin/
12 | build/
13 | develop-eggs/
14 | dist/
15 | eggs/
16 | lib/
17 | lib64/
18 | parts/
19 | sdist/
20 | var/
21 | *.egg-info/
22 | .installed.cfg
23 | *.egg
24 |
25 | # Installer logs
26 | pip-log.txt
27 | pip-delete-this-directory.txt
28 |
29 | # Unit test / coverage reports
30 | htmlcov/
31 | .tox/
32 | .coverage
33 | .cache
34 | nosetests.xml
35 | coverage.xml
36 |
37 | # Translations
38 | *.mo
39 |
40 | # Mr Developer
41 | .mr.developer.cfg
42 | .project
43 | .pydevproject
44 |
45 | # Rope
46 | .ropeproject
47 |
48 | # Django stuff:
49 | *.log
50 | *.pot
51 |
52 | # Sphinx documentation
53 | docs/_build/
54 |
55 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing to PyEnsembl
2 |
3 | [PyEnsembl](http://www.github.com/hammerlab/pyensembl) is open source software and
4 | we welcome your contributions. This document should help you get started
5 | contributing to PyEnsembl.
6 |
7 | ## Filing Issues
8 |
9 | If you find any bugs or problems while using PyEnsembl or have any feature requests, please feel free to file an issue against the project. When doing so, please follow the guidelines below:
10 |
11 | To report any bugs, issues, or feature requests, please [open an issue](https://github.com/hammerlab/pyensembl/issues)
12 | Please check the [current open issues](https://github.com/hammerlab/pyensembl/issues) to see if the request already exists
13 | If you are filing a bug report, please describe the version of PyEnsembl and Python you are using. If your problem involves a particular gene, transcript, or genomic locus, please include that information (e.g. "Missing transcript sequence for BRCA1-002 for Ensembl release 74").
14 |
15 | ## Coding Guidelines
16 |
17 | - PyEnsembl is written in Python and adheres to the [PEP8](https://www.python.org/dev/peps/pep-0008/)
18 | style guidelines.
19 | - Contributions should come in the form of GitHub pull requests.
20 | - New features should start with a GitHub issue explaining their scope and rationale.
21 | - If the work is based on an existing issue, please reference the issue in the PR.
22 | - All new code should be accompanied by comprehensive unit tests.
23 | - If the PR fixes or implements an issue, please state "Closes #XYZ" or "Fixes #XYZ", where XYZ is the issue number.
24 | - Please ensure that your code works under Python >= 3.7.
25 |
26 | ## Licensing
27 |
28 | PyEnsembl is licensed under the Apache 2.0 license. Your code is assumed to be as well.
29 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "{}"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright {yyyy} {name of copyright owner}
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.md
2 | include LICENSE
3 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [Tests](https://github.com/openvax/pyensembl/actions/workflows/tests.yml)
2 | [Coverage Status](https://coveralls.io/github/openvax/pyensembl?branch=main)
3 |
4 |
5 |
6 |
7 | # PyEnsembl
8 |
9 | PyEnsembl is a Python interface to [Ensembl](http://www.ensembl.org) reference genome metadata such as exons and transcripts. PyEnsembl downloads [GTF](https://en.wikipedia.org/wiki/Gene_transfer_format) and [FASTA](https://en.wikipedia.org/wiki/FASTA_format) files from the [Ensembl FTP server](ftp://ftp.ensembl.org) and loads them into a local database. PyEnsembl can also work with custom reference data specified using user-supplied GTF and FASTA files.
10 |
11 | # Example Usage
12 |
13 | ```python
14 | from pyensembl import EnsemblRelease
15 |
16 | # release 77 uses human reference genome GRCh38
17 | data = EnsemblRelease(77)
18 |
19 | # will return ['HLA-A']
20 | gene_names = data.gene_names_at_locus(contig=6, position=29945884)
21 |
22 | # get all exons associated with HLA-A
23 | exon_ids = data.exon_ids_of_gene_name('HLA-A')
24 | ```
25 |
26 | # Installation
27 |
28 | You can install PyEnsembl using [pip](https://pip.pypa.io/en/latest/quickstart.html):
29 |
30 | ```sh
31 | pip install pyensembl
32 | ```
33 |
34 | This should also install any required packages such as [datacache](https://github.com/openvax/datacache).
35 |
36 | Before using PyEnsembl, run the following command to download and install
37 | Ensembl data:
38 |
39 | ```
40 | pyensembl install --release <list of Ensembl release numbers> --species <species-name>
41 | ```
42 |
43 | For example, `pyensembl install --release 75 76 --species human` will download and install all
44 | human reference data from Ensembl releases 75 and 76.
45 |
46 | Alternatively, you can create the `EnsemblRelease` object from inside a Python
47 | process and call `ensembl_object.download()` followed by `ensembl_object.index()`.
48 |
49 | ## Cache Location
50 |
51 | By default, PyEnsembl uses the platform-specific `Cache` folder
52 | and caches the files into the `pyensembl` sub-directory.
53 | You can override this default by setting the environment key `PYENSEMBL_CACHE_DIR`
54 | as your preferred location for caching:
55 |
56 | ```sh
57 | export PYENSEMBL_CACHE_DIR=/custom/cache/dir
58 | ```
59 |
60 | or
61 |
62 | ```python
63 | import os
64 |
65 | os.environ['PYENSEMBL_CACHE_DIR'] = '/custom/cache/dir'
66 | # ... PyEnsembl API usage
67 | ```
68 |
69 | # Usage tips
70 |
71 | ## List installed genomes
72 |
73 | To see the genomes for which PyEnsembl has already downloaded and indexed metadata you can run:
74 |
75 | ```sh
76 | pyensembl list
77 | ```
78 |
79 | Or equivalently do this in Python:
80 |
81 | ```python
82 | from pyensembl.shell import collect_all_installed_ensembl_releases
83 | collect_all_installed_ensembl_releases()
84 | ```
85 |
86 | ## Load genome in Python
87 |
88 | Here's an example Python snippet that loads fly genome data from Ensembl release v100:
89 |
90 | ```python
91 | from pyensembl import EnsemblRelease
92 | data = EnsemblRelease(release=100, species='drosophila_melanogaster')
93 | ```
94 |
95 | ## Data structures
96 |
97 | ### Gene
98 |
99 | ```python
100 | gene = genome.gene_by_id(gene_id='FBgn0011747')
101 | ```
102 |
103 | ### Transcript
104 |
105 | ```python
106 | transcript = gene.transcripts[0]
107 | ```
108 |
109 | ### Protein information
110 |
111 | ```python
112 | transcript.protein_id
113 | transcript.protein_sequence
114 | ```
115 |
116 | # Non-Ensembl Data
117 |
118 | PyEnsembl also allows arbitrary genomes via the specification
119 | of local file paths or remote URLs to both Ensembl and non-Ensembl GTF
120 | and FASTA files. (Warning: GTF formats can vary, and handling of
121 | non-Ensembl data is still very much in development.)
122 |
123 | For example:
124 |
125 | ```python
126 | from pyensembl import Genome
127 | data = Genome(
128 | reference_name='GRCh38',
129 | annotation_name='my_genome_features',
130 | # annotation_version=None,
131 | gtf_path_or_url='/My/local/gtf/path_to_my_genome_features.gtf', # Path or URL of GTF file
132 | # transcript_fasta_paths_or_urls=None, # List of paths or URLs of FASTA files containing transcript sequences
133 | # protein_fasta_paths_or_urls=None, # List of paths or URLs of FASTA files containing protein sequences
134 | # cache_directory_path=None, # Where to place downloaded and cached files for this genome
135 | )
136 | # parse GTF and construct database of genomic features
137 | data.index()
138 | gene_names = data.gene_names_at_locus(contig=6, position=29945884)
139 | ```
140 |
141 | # API
142 |
143 | The `EnsemblRelease` object has methods to let you access all possible
144 | combinations of the annotation features _gene_name_, _gene_id_,
145 | _transcript_name_, _transcript_id_, _exon_id_ as well as the location of
146 | these genomic elements (contig, start position, end position, strand).
147 |
148 | ## Genes
149 |
150 |
151 | genes(contig=None, strand=None)
152 | Returns a list of Gene objects, optionally restricted to a particular contig
153 | or strand.
154 |
155 | genes_at_locus(contig, position, end=None, strand=None)
156 | Returns a list of Gene objects overlapping a particular position on a contig,
157 | optionally extend into a range with the end parameter and restrict to
158 | forward or backward strand by passing strand='+' or strand='-'.
159 |
160 | gene_by_id(gene_id)
161 | Return a Gene object for given Ensembl gene ID (e.g. "ENSG00000068793").
162 |
163 | gene_names(contig=None, strand=None)
164 | Returns all gene names in the annotation database, optionally restricted
165 | to a particular contig or strand.
166 |
167 | genes_by_name(gene_name)
168 | Get all the unique genes with the given name (there might be multiple
169 | due to copies in the genome), return a list containing a Gene object for each
170 | distinct ID.
171 |
172 | gene_by_protein_id(protein_id)
173 | Find Gene associated with the given Ensembl protein ID (e.g. "ENSP00000350283")
174 |
175 | gene_names_at_locus(contig, position, end=None, strand=None)
176 |
177 | Names of genes overlapping with the given locus, optionally restricted by strand.
178 | (returns a list to account for overlapping genes)
179 |
180 | gene_name_of_gene_id(gene_id)
181 |
182 | Returns name of gene with given gene ID.
183 |
184 | gene_name_of_transcript_id(transcript_id)
185 | Returns name of gene associated with given transcript ID.
186 |
187 | gene_name_of_transcript_name(transcript_name)
188 |
189 | Returns name of gene associated with given transcript name.
190 |
191 | gene_name_of_exon_id(exon_id)
192 | Returns name of gene associated with given exon ID.
193 |
194 | gene_ids(contig=None, strand=None)
195 |
196 | Return all gene IDs in the annotation database, optionally restricted by
197 | chromosome name or strand.
198 |
199 | gene_ids_of_gene_name(gene_name)
200 |
201 | Returns all Ensembl gene IDs with the given name.
202 |
203 |
204 |
205 | ## Transcripts
206 |
207 |
208 | transcripts(contig=None, strand=None)
209 | Returns a list of Transcript objects for all transcript entries in the
210 | Ensembl database, optionally restricted to a particular contig or strand.
211 |
212 | transcript_by_id(transcript_id)
213 | Construct a Transcript object for given Ensembl transcript ID (e.g. "ENST00000369985")
214 |
215 | transcripts_by_name(transcript_name)
216 | Returns a list of Transcript objects for every transcript matching the given name.
217 |
218 | transcript_names(contig=None, strand=None)
219 | Returns all transcript names in the annotation database.
220 |
221 | transcript_ids(contig=None, strand=None)
222 | Returns all transcript IDs in the annotation database.
223 |
224 | transcript_ids_of_gene_id(gene_id)
225 | Return IDs of all transcripts associated with given gene ID.
226 |
227 | transcript_ids_of_gene_name(gene_name)
228 | Return IDs of all transcripts associated with given gene name.
229 |
230 | transcript_ids_of_transcript_name(transcript_name)
231 | Find all Ensembl transcript IDs with the given name.
232 |
233 | transcript_ids_of_exon_id(exon_id)
234 | Return IDs of all transcripts associated with given exon ID.
235 |
236 |
237 | ## Exons
238 |
239 |
240 | exon_ids(contig=None, strand=None)
241 | Returns a list of exons IDs in the annotation database, optionally restricted
242 | by the given chromosome and strand.
243 |
244 | exon_by_id(exon_id)
245 | Construct an Exon object for given Ensembl exon ID (e.g. "ENSE00001209410")
246 |
247 | exon_ids_of_gene_id(gene_id)
248 | Returns a list of exon IDs associated with a given gene ID.
249 |
250 | exon_ids_of_gene_name(gene_name)
251 | Returns a list of exon IDs associated with a given gene name.
252 |
253 | exon_ids_of_transcript_id(transcript_id)
254 | Returns a list of exon IDs associated with a given transcript ID.
255 |
256 | exon_ids_of_transcript_name(transcript_name)
257 | Returns a list of exon IDs associated with a given transcript name.
258 |
259 |
--------------------------------------------------------------------------------
/RELEASING.md:
--------------------------------------------------------------------------------
1 | # Releasing Pyensembl
2 |
3 | This document explains what to do once your [Pull Request](https://www.atlassian.com/git/tutorials/making-a-pull-request/) has been reviewed and all final changes applied. Now you're ready to merge your branch into master and release it to the world:
4 |
5 | 1. Bump the [version](http://semver.org/) in `version.py`, as part of the PR you want to release.
6 | 2. Merge your branch into master.
7 | 3. Run `deploy.sh`
8 |
--------------------------------------------------------------------------------
/deploy.sh:
--------------------------------------------------------------------------------
1 | ./lint.sh && \
2 | ./test.sh && \
3 | python3 -m pip install --upgrade build && \
4 | python3 -m pip install --upgrade twine && \
5 | rm -rf dist && \
6 | python3 -m build && \
7 | git --version && \
8 | python3 -m twine upload dist/* && \
9 | git tag "$(python3 pyensembl/version.py)" && \
10 | git push --tags
11 |
--------------------------------------------------------------------------------
/develop.sh:
--------------------------------------------------------------------------------
1 | set -e
2 |
3 | pip install -e .
4 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line.
5 | SPHINXOPTS =
6 | SPHINXBUILD = sphinx-build
7 | PAPER =
8 | BUILDDIR = _build
9 |
10 | # User-friendly check for sphinx-build
11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
13 | endif
14 |
15 | # Internal variables.
16 | PAPEROPT_a4 = -D latex_paper_size=a4
17 | PAPEROPT_letter = -D latex_paper_size=letter
18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
19 | # the i18n builder cannot share the environment and doctrees with the others
20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
21 |
22 | .PHONY: help
23 | help:
24 | @echo "Please use \`make <target>' where <target> is one of"
25 | @echo " html to make standalone HTML files"
26 | @echo " dirhtml to make HTML files named index.html in directories"
27 | @echo " singlehtml to make a single large HTML file"
28 | @echo " pickle to make pickle files"
29 | @echo " json to make JSON files"
30 | @echo " htmlhelp to make HTML files and a HTML help project"
31 | @echo " qthelp to make HTML files and a qthelp project"
32 | @echo " applehelp to make an Apple Help Book"
33 | @echo " devhelp to make HTML files and a Devhelp project"
34 | @echo " epub to make an epub"
35 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
36 | @echo " latexpdf to make LaTeX files and run them through pdflatex"
37 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
38 | @echo " text to make text files"
39 | @echo " man to make manual pages"
40 | @echo " texinfo to make Texinfo files"
41 | @echo " info to make Texinfo files and run them through makeinfo"
42 | @echo " gettext to make PO message catalogs"
43 | @echo " changes to make an overview of all changed/added/deprecated items"
44 | @echo " xml to make Docutils-native XML files"
45 | @echo " pseudoxml to make pseudoxml-XML files for display purposes"
46 | @echo " linkcheck to check all external links for integrity"
47 | @echo " doctest to run all doctests embedded in the documentation (if enabled)"
48 | @echo " coverage to run coverage check of the documentation (if enabled)"
49 |
50 | .PHONY: clean
51 | clean:
52 | rm -rf $(BUILDDIR)/*
53 |
54 | .PHONY: html
55 | html:
56 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
57 | @echo
58 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
59 |
60 | .PHONY: dirhtml
61 | dirhtml:
62 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
63 | @echo
64 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
65 |
66 | .PHONY: singlehtml
67 | singlehtml:
68 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
69 | @echo
70 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
71 |
72 | .PHONY: pickle
73 | pickle:
74 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
75 | @echo
76 | @echo "Build finished; now you can process the pickle files."
77 |
78 | .PHONY: json
79 | json:
80 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
81 | @echo
82 | @echo "Build finished; now you can process the JSON files."
83 |
84 | .PHONY: htmlhelp
85 | htmlhelp:
86 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
87 | @echo
88 | @echo "Build finished; now you can run HTML Help Workshop with the" \
89 | ".hhp project file in $(BUILDDIR)/htmlhelp."
90 |
91 | .PHONY: qthelp
92 | qthelp:
93 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
94 | @echo
95 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \
96 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
97 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/pyensembl.qhcp"
98 | @echo "To view the help file:"
99 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/pyensembl.qhc"
100 |
101 | .PHONY: applehelp
102 | applehelp:
103 | $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp
104 | @echo
105 | @echo "Build finished. The help book is in $(BUILDDIR)/applehelp."
106 | @echo "N.B. You won't be able to view it unless you put it in" \
107 | "~/Library/Documentation/Help or install it in your application" \
108 | "bundle."
109 |
110 | .PHONY: devhelp
111 | devhelp:
112 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
113 | @echo
114 | @echo "Build finished."
115 | @echo "To view the help file:"
116 | @echo "# mkdir -p $$HOME/.local/share/devhelp/pyensembl"
117 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/pyensembl"
118 | @echo "# devhelp"
119 |
120 | .PHONY: epub
121 | epub:
122 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
123 | @echo
124 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub."
125 |
126 | .PHONY: latex
127 | latex:
128 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
129 | @echo
130 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
131 | @echo "Run \`make' in that directory to run these through (pdf)latex" \
132 | "(use \`make latexpdf' here to do that automatically)."
133 |
134 | .PHONY: latexpdf
135 | latexpdf:
136 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
137 | @echo "Running LaTeX files through pdflatex..."
138 | $(MAKE) -C $(BUILDDIR)/latex all-pdf
139 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
140 |
141 | .PHONY: latexpdfja
142 | latexpdfja:
143 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
144 | @echo "Running LaTeX files through platex and dvipdfmx..."
145 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
146 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
147 |
148 | .PHONY: text
149 | text:
150 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
151 | @echo
152 | @echo "Build finished. The text files are in $(BUILDDIR)/text."
153 |
154 | .PHONY: man
155 | man:
156 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
157 | @echo
158 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man."
159 |
160 | .PHONY: texinfo
161 | texinfo:
162 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
163 | @echo
164 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
165 | @echo "Run \`make' in that directory to run these through makeinfo" \
166 | "(use \`make info' here to do that automatically)."
167 |
# Build the Texinfo sources, then run them through makeinfo via a sub-make.
# The sub-make is invoked as $(MAKE) rather than a literal `make` so that it
# uses the same make program and inherits command-line flags (-j, -n, ...),
# matching the latexpdf and latexpdfja targets.
.PHONY: info
info:
	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
	@echo "Running Texinfo files through makeinfo..."
	$(MAKE) -C $(BUILDDIR)/texinfo info
	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
174 |
175 | .PHONY: gettext
176 | gettext:
177 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
178 | @echo
179 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
180 |
181 | .PHONY: changes
182 | changes:
183 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
184 | @echo
185 | @echo "The overview file is in $(BUILDDIR)/changes."
186 |
187 | .PHONY: linkcheck
188 | linkcheck:
189 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
190 | @echo
191 | @echo "Link check complete; look for any errors in the above output " \
192 | "or in $(BUILDDIR)/linkcheck/output.txt."
193 |
194 | .PHONY: doctest
195 | doctest:
196 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
197 | @echo "Testing of doctests in the sources finished, look at the " \
198 | "results in $(BUILDDIR)/doctest/output.txt."
199 |
200 | .PHONY: coverage
201 | coverage:
202 | $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage
203 | @echo "Testing of coverage in the sources finished, look at the " \
204 | "results in $(BUILDDIR)/coverage/python.txt."
205 |
206 | .PHONY: xml
207 | xml:
208 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
209 | @echo
210 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml."
211 |
212 | .PHONY: pseudoxml
213 | pseudoxml:
214 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
215 | @echo
216 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
217 |
--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #
3 | # pyensembl documentation build configuration file, created by
4 | # sphinx-quickstart on Sat Mar 26 22:47:25 2016.
5 | #
6 | # This file is execfile()d with the current directory set to its
7 | # containing dir.
8 | #
9 | # Note that not all possible configuration values are present in this
10 | # autogenerated file.
11 | #
12 | # All configuration values have a default; values that are commented out
13 | # serve to show the default.
14 |
15 | import sys
16 | import os
17 |
18 | # If extensions (or modules to document with autodoc) are in another directory,
19 | # add these directories to sys.path here. If the directory is relative to the
20 | # documentation root, use os.path.abspath to make it absolute, like shown here.
21 | sys.path.insert(0, os.path.abspath('..'))
22 |
23 | # -- General configuration ------------------------------------------------
24 |
25 | # If your documentation needs a minimal Sphinx version, state it here.
26 | #needs_sphinx = '1.0'
27 |
28 | # Add any Sphinx extension module names here, as strings. They can be
29 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
30 | # ones.
31 | extensions = [
32 | 'sphinx.ext.autodoc',
33 | ]
34 |
35 | # Add any paths that contain templates here, relative to this directory.
36 | templates_path = ['_templates']
37 |
38 | # The suffix(es) of source filenames.
39 | # You can specify multiple suffix as a list of string:
40 | # source_suffix = ['.rst', '.md']
41 | source_suffix = '.rst'
42 |
43 | # The encoding of source files.
44 | #source_encoding = 'utf-8-sig'
45 |
46 | # The master toctree document.
47 | master_doc = 'index'
48 |
49 | # General information about the project.
50 | project = u'pyensembl'
51 | copyright = u'2016, Hammer Lab'
52 | author = u'Hammer Lab'
53 |
54 | # The version info for the project you're documenting, acts as replacement for
55 | # |version| and |release|, also used in various other places throughout the
56 | # built documents.
57 | #
58 | # The short X.Y version.
59 | version = u'0.8.10'
60 | # The full version, including alpha/beta/rc tags.
61 | release = u'0.8.10'
62 |
63 | # The language for content autogenerated by Sphinx. Refer to documentation
64 | # for a list of supported languages.
65 | #
66 | # This is also used if you do content translation via gettext catalogs.
67 | # Usually you set "language" from the command line for these cases.
68 | language = None
69 |
70 | # There are two options for replacing |today|: either, you set today to some
71 | # non-false value, then it is used:
72 | #today = ''
73 | # Else, today_fmt is used as the format for a strftime call.
74 | #today_fmt = '%B %d, %Y'
75 |
76 | # List of patterns, relative to source directory, that match files and
77 | # directories to ignore when looking for source files.
78 | exclude_patterns = ['_build']
79 |
80 | # The reST default role (used for this markup: `text`) to use for all
81 | # documents.
82 | #default_role = None
83 |
84 | # If true, '()' will be appended to :func: etc. cross-reference text.
85 | #add_function_parentheses = True
86 |
87 | # If true, the current module name will be prepended to all description
88 | # unit titles (such as .. function::).
89 | #add_module_names = True
90 |
91 | # If true, sectionauthor and moduleauthor directives will be shown in the
92 | # output. They are ignored by default.
93 | #show_authors = False
94 |
95 | # The name of the Pygments (syntax highlighting) style to use.
96 | pygments_style = 'sphinx'
97 |
98 | # A list of ignored prefixes for module index sorting.
99 | #modindex_common_prefix = []
100 |
101 | # If true, keep warnings as "system message" paragraphs in the built documents.
102 | #keep_warnings = False
103 |
104 | # If true, `todo` and `todoList` produce output, else they produce nothing.
105 | todo_include_todos = False
106 |
107 |
108 | # -- Options for HTML output ----------------------------------------------
109 |
110 | # The theme to use for HTML and HTML Help pages. See the documentation for
111 | # a list of builtin themes.
112 | html_theme = 'alabaster'
113 |
114 | # Theme options are theme-specific and customize the look and feel of a theme
115 | # further. For a list of options available for each theme, see the
116 | # documentation.
117 | #html_theme_options = {}
118 |
119 | # Add any paths that contain custom themes here, relative to this directory.
120 | #html_theme_path = []
121 |
122 | # The name for this set of Sphinx documents. If None, it defaults to
123 | # "<project> v<release> documentation".
124 | #html_title = None
125 |
126 | # A shorter title for the navigation bar. Default is the same as html_title.
127 | #html_short_title = None
128 |
129 | # The name of an image file (relative to this directory) to place at the top
130 | # of the sidebar.
131 | #html_logo = None
132 |
133 | # The name of an image file (relative to this directory) to use as a favicon of
134 | # the docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
135 | # pixels large.
136 | #html_favicon = None
137 |
138 | # Add any paths that contain custom static files (such as style sheets) here,
139 | # relative to this directory. They are copied after the builtin static files,
140 | # so a file named "default.css" will overwrite the builtin "default.css".
141 | html_static_path = ['_static']
142 |
143 | # Add any extra paths that contain custom files (such as robots.txt or
144 | # .htaccess) here, relative to this directory. These files are copied
145 | # directly to the root of the documentation.
146 | #html_extra_path = []
147 |
148 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
149 | # using the given strftime format.
150 | #html_last_updated_fmt = '%b %d, %Y'
151 |
152 | # If true, SmartyPants will be used to convert quotes and dashes to
153 | # typographically correct entities.
154 | #html_use_smartypants = True
155 |
156 | # Custom sidebar templates, maps document names to template names.
157 | #html_sidebars = {}
158 |
159 | # Additional templates that should be rendered to pages, maps page names to
160 | # template names.
161 | #html_additional_pages = {}
162 |
163 | # If false, no module index is generated.
164 | #html_domain_indices = True
165 |
166 | # If false, no index is generated.
167 | #html_use_index = True
168 |
169 | # If true, the index is split into individual pages for each letter.
170 | #html_split_index = False
171 |
172 | # If true, links to the reST sources are added to the pages.
173 | #html_show_sourcelink = True
174 |
175 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
176 | #html_show_sphinx = True
177 |
178 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
179 | #html_show_copyright = True
180 |
181 | # If true, an OpenSearch description file will be output, and all pages will
182 | # contain a <link> tag referring to it. The value of this option must be the
183 | # base URL from which the finished HTML is served.
184 | #html_use_opensearch = ''
185 |
186 | # This is the file name suffix for HTML files (e.g. ".xhtml").
187 | #html_file_suffix = None
188 |
189 | # Language to be used for generating the HTML full-text search index.
190 | # Sphinx supports the following languages:
191 | # 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja'
192 | # 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr'
193 | #html_search_language = 'en'
194 |
195 | # A dictionary with options for the search language support, empty by default.
196 | # Now only 'ja' uses this config value
197 | #html_search_options = {'type': 'default'}
198 |
199 | # The name of a javascript file (relative to the configuration directory) that
200 | # implements a search results scorer. If empty, the default will be used.
201 | #html_search_scorer = 'scorer.js'
202 |
203 | # Output file base name for HTML help builder.
204 | htmlhelp_basename = 'pyensembldoc'
205 |
206 | # -- Options for LaTeX output ---------------------------------------------
207 |
208 | latex_elements = {
209 | # The paper size ('letterpaper' or 'a4paper').
210 | #'papersize': 'letterpaper',
211 |
212 | # The font size ('10pt', '11pt' or '12pt').
213 | #'pointsize': '10pt',
214 |
215 | # Additional stuff for the LaTeX preamble.
216 | #'preamble': '',
217 |
218 | # Latex figure (float) alignment
219 | #'figure_align': 'htbp',
220 | }
221 |
222 | # Grouping the document tree into LaTeX files. List of tuples
223 | # (source start file, target name, title,
224 | # author, documentclass [howto, manual, or own class]).
225 | latex_documents = [
226 | (master_doc, 'pyensembl.tex', u'pyensembl Documentation',
227 | u'Hammer Lab', 'manual'),
228 | ]
229 |
230 | # The name of an image file (relative to this directory) to place at the top of
231 | # the title page.
232 | #latex_logo = None
233 |
234 | # For "manual" documents, if this is true, then toplevel headings are parts,
235 | # not chapters.
236 | #latex_use_parts = False
237 |
238 | # If true, show page references after internal links.
239 | #latex_show_pagerefs = False
240 |
241 | # If true, show URL addresses after external links.
242 | #latex_show_urls = False
243 |
244 | # Documents to append as an appendix to all manuals.
245 | #latex_appendices = []
246 |
247 | # If false, no module index is generated.
248 | #latex_domain_indices = True
249 |
250 |
251 | # -- Options for manual page output ---------------------------------------
252 |
253 | # One entry per manual page. List of tuples
254 | # (source start file, name, description, authors, manual section).
255 | man_pages = [
256 | (master_doc, 'pyensembl', u'pyensembl Documentation',
257 | [author], 1)
258 | ]
259 |
260 | # If true, show URL addresses after external links.
261 | #man_show_urls = False
262 |
263 |
264 | # -- Options for Texinfo output -------------------------------------------
265 |
266 | # Grouping the document tree into Texinfo files. List of tuples
267 | # (source start file, target name, title, author,
268 | # dir menu entry, description, category)
269 | texinfo_documents = [
270 | (master_doc, 'pyensembl', u'pyensembl Documentation',
271 | author, 'pyensembl', 'One line description of project.',
272 | 'Miscellaneous'),
273 | ]
274 |
275 | # Documents to append as an appendix to all manuals.
276 | #texinfo_appendices = []
277 |
278 | # If false, no module index is generated.
279 | #texinfo_domain_indices = True
280 |
281 | # How to display URL addresses: 'footnote', 'no', or 'inline'.
282 | #texinfo_show_urls = 'footnote'
283 |
284 | # If true, do not generate a @detailmenu in the "Top" node's menu.
285 | #texinfo_no_detailmenu = False
286 |
--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
1 | .. pyensembl documentation master file, created by
2 | sphinx-quickstart on Sat Mar 26 22:47:25 2016.
3 | You can adapt this file completely to your liking, but it should at least
4 | contain the root `toctree` directive.
5 |
6 | Welcome to pyensembl's documentation!
7 | =====================================
8 |
9 | Contents:
10 |
11 | .. toctree::
12 | :maxdepth: 2
13 |
14 | modules
15 |
16 | Indices and tables
17 | ==================
18 |
19 | * :ref:`genindex`
20 | * :ref:`modindex`
21 | * :ref:`search`
22 |
23 |
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | REM Command file for Sphinx documentation
4 |
5 | if "%SPHINXBUILD%" == "" (
6 | set SPHINXBUILD=sphinx-build
7 | )
8 | set BUILDDIR=_build
9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% .
10 | set I18NSPHINXOPTS=%SPHINXOPTS% .
11 | if NOT "%PAPER%" == "" (
12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS%
13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS%
14 | )
15 |
16 | if "%1" == "" goto help
17 |
18 | if "%1" == "help" (
19 | :help
20 | echo.Please use `make ^<target^>` where ^<target^> is one of
21 | echo. html to make standalone HTML files
22 | echo. dirhtml to make HTML files named index.html in directories
23 | echo. singlehtml to make a single large HTML file
24 | echo. pickle to make pickle files
25 | echo. json to make JSON files
26 | echo. htmlhelp to make HTML files and a HTML help project
27 | echo. qthelp to make HTML files and a qthelp project
28 | echo. devhelp to make HTML files and a Devhelp project
29 | echo. epub to make an epub
30 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter
31 | echo. text to make text files
32 | echo. man to make manual pages
33 | echo. texinfo to make Texinfo files
34 | echo. gettext to make PO message catalogs
35 | echo. changes to make an overview over all changed/added/deprecated items
36 | echo. xml to make Docutils-native XML files
37 | echo. pseudoxml to make pseudoxml-XML files for display purposes
38 | echo. linkcheck to check all external links for integrity
39 | echo. doctest to run all doctests embedded in the documentation if enabled
40 | echo. coverage to run coverage check of the documentation if enabled
41 | goto end
42 | )
43 |
44 | if "%1" == "clean" (
45 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i
46 | del /q /s %BUILDDIR%\*
47 | goto end
48 | )
49 |
50 |
51 | REM Check if sphinx-build is available and fallback to Python version if any
52 | %SPHINXBUILD% 1>NUL 2>NUL
53 | if errorlevel 9009 goto sphinx_python
54 | goto sphinx_ok
55 |
56 | :sphinx_python
57 |
58 | set SPHINXBUILD=python -m sphinx.__init__
59 | %SPHINXBUILD% 2> nul
60 | if errorlevel 9009 (
61 | echo.
62 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
63 | echo.installed, then set the SPHINXBUILD environment variable to point
64 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
65 | echo.may add the Sphinx directory to PATH.
66 | echo.
67 | echo.If you don't have Sphinx installed, grab it from
68 | echo.http://sphinx-doc.org/
69 | exit /b 1
70 | )
71 |
72 | :sphinx_ok
73 |
74 |
75 | if "%1" == "html" (
76 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html
77 | if errorlevel 1 exit /b 1
78 | echo.
79 | echo.Build finished. The HTML pages are in %BUILDDIR%/html.
80 | goto end
81 | )
82 |
83 | if "%1" == "dirhtml" (
84 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml
85 | if errorlevel 1 exit /b 1
86 | echo.
87 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml.
88 | goto end
89 | )
90 |
91 | if "%1" == "singlehtml" (
92 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml
93 | if errorlevel 1 exit /b 1
94 | echo.
95 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml.
96 | goto end
97 | )
98 |
99 | if "%1" == "pickle" (
100 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle
101 | if errorlevel 1 exit /b 1
102 | echo.
103 | echo.Build finished; now you can process the pickle files.
104 | goto end
105 | )
106 |
107 | if "%1" == "json" (
108 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json
109 | if errorlevel 1 exit /b 1
110 | echo.
111 | echo.Build finished; now you can process the JSON files.
112 | goto end
113 | )
114 |
115 | if "%1" == "htmlhelp" (
116 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp
117 | if errorlevel 1 exit /b 1
118 | echo.
119 | echo.Build finished; now you can run HTML Help Workshop with the ^
120 | .hhp project file in %BUILDDIR%/htmlhelp.
121 | goto end
122 | )
123 |
REM Build the Qt help project, then print the commands needed to compile and
REM view it. The collection file produced from the .qhcp project has the
REM extension .qhc (the message previously said ".ghc").
if "%1" == "qthelp" (
	%SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp
	if errorlevel 1 exit /b 1
	echo.
	echo.Build finished; now you can run "qcollectiongenerator" with the ^
.qhcp project file in %BUILDDIR%/qthelp, like this:
	echo.^> qcollectiongenerator %BUILDDIR%\qthelp\pyensembl.qhcp
	echo.To view the help file:
	echo.^> assistant -collectionFile %BUILDDIR%\qthelp\pyensembl.qhc
	goto end
)
135 |
136 | if "%1" == "devhelp" (
137 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp
138 | if errorlevel 1 exit /b 1
139 | echo.
140 | echo.Build finished.
141 | goto end
142 | )
143 |
144 | if "%1" == "epub" (
145 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub
146 | if errorlevel 1 exit /b 1
147 | echo.
148 | echo.Build finished. The epub file is in %BUILDDIR%/epub.
149 | goto end
150 | )
151 |
152 | if "%1" == "latex" (
153 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
154 | if errorlevel 1 exit /b 1
155 | echo.
156 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex.
157 | goto end
158 | )
159 |
160 | if "%1" == "latexpdf" (
161 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
162 | cd %BUILDDIR%/latex
163 | make all-pdf
164 | cd %~dp0
165 | echo.
166 | echo.Build finished; the PDF files are in %BUILDDIR%/latex.
167 | goto end
168 | )
169 |
170 | if "%1" == "latexpdfja" (
171 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
172 | cd %BUILDDIR%/latex
173 | make all-pdf-ja
174 | cd %~dp0
175 | echo.
176 | echo.Build finished; the PDF files are in %BUILDDIR%/latex.
177 | goto end
178 | )
179 |
180 | if "%1" == "text" (
181 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text
182 | if errorlevel 1 exit /b 1
183 | echo.
184 | echo.Build finished. The text files are in %BUILDDIR%/text.
185 | goto end
186 | )
187 |
188 | if "%1" == "man" (
189 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man
190 | if errorlevel 1 exit /b 1
191 | echo.
192 | echo.Build finished. The manual pages are in %BUILDDIR%/man.
193 | goto end
194 | )
195 |
196 | if "%1" == "texinfo" (
197 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo
198 | if errorlevel 1 exit /b 1
199 | echo.
200 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo.
201 | goto end
202 | )
203 |
204 | if "%1" == "gettext" (
205 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale
206 | if errorlevel 1 exit /b 1
207 | echo.
208 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale.
209 | goto end
210 | )
211 |
212 | if "%1" == "changes" (
213 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes
214 | if errorlevel 1 exit /b 1
215 | echo.
216 | echo.The overview file is in %BUILDDIR%/changes.
217 | goto end
218 | )
219 |
220 | if "%1" == "linkcheck" (
221 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck
222 | if errorlevel 1 exit /b 1
223 | echo.
224 | echo.Link check complete; look for any errors in the above output ^
225 | or in %BUILDDIR%/linkcheck/output.txt.
226 | goto end
227 | )
228 |
229 | if "%1" == "doctest" (
230 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest
231 | if errorlevel 1 exit /b 1
232 | echo.
233 | echo.Testing of doctests in the sources finished, look at the ^
234 | results in %BUILDDIR%/doctest/output.txt.
235 | goto end
236 | )
237 |
238 | if "%1" == "coverage" (
239 | %SPHINXBUILD% -b coverage %ALLSPHINXOPTS% %BUILDDIR%/coverage
240 | if errorlevel 1 exit /b 1
241 | echo.
242 | echo.Testing of coverage in the sources finished, look at the ^
243 | results in %BUILDDIR%/coverage/python.txt.
244 | goto end
245 | )
246 |
247 | if "%1" == "xml" (
248 | %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml
249 | if errorlevel 1 exit /b 1
250 | echo.
251 | echo.Build finished. The XML files are in %BUILDDIR%/xml.
252 | goto end
253 | )
254 |
255 | if "%1" == "pseudoxml" (
256 | %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml
257 | if errorlevel 1 exit /b 1
258 | echo.
259 | echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml.
260 | goto end
261 | )
262 |
263 | :end
264 |
--------------------------------------------------------------------------------
/docs/modules.rst:
--------------------------------------------------------------------------------
1 | pyensembl
2 | =========
3 |
4 | .. toctree::
5 | :maxdepth: 4
6 |
7 | pyensembl
8 |
--------------------------------------------------------------------------------
/docs/pyensembl.rst:
--------------------------------------------------------------------------------
1 | pyensembl package
2 | =================
3 |
4 | Submodules
5 | ----------
6 |
7 | pyensembl.biotypes module
8 | -------------------------
9 |
10 | .. automodule:: pyensembl.biotypes
11 | :members:
12 | :undoc-members:
13 | :show-inheritance:
14 |
15 | pyensembl.common module
16 | -----------------------
17 |
18 | .. automodule:: pyensembl.common
19 | :members:
20 | :undoc-members:
21 | :show-inheritance:
22 |
23 | pyensembl.database module
24 | -------------------------
25 |
26 | .. automodule:: pyensembl.database
27 | :members:
28 | :undoc-members:
29 | :show-inheritance:
30 |
31 | pyensembl.download_cache module
32 | -------------------------------
33 |
34 | .. automodule:: pyensembl.download_cache
35 | :members:
36 | :undoc-members:
37 | :show-inheritance:
38 |
39 | pyensembl.ensembl_release module
40 | --------------------------------
41 |
42 | .. automodule:: pyensembl.ensembl_release
43 | :members:
44 | :undoc-members:
45 | :show-inheritance:
46 |
47 | pyensembl.ensembl_versions module
48 | -----------------------------------------
49 |
50 | .. automodule:: pyensembl.ensembl_versions
51 | :members:
52 | :undoc-members:
53 | :show-inheritance:
54 |
55 | pyensembl.ensembl_url_templates module
56 | --------------------------------------
57 |
58 | .. automodule:: pyensembl.ensembl_url_templates
59 | :members:
60 | :undoc-members:
61 | :show-inheritance:
62 |
63 | pyensembl.exon module
64 | ---------------------
65 |
66 | .. automodule:: pyensembl.exon
67 | :members:
68 | :undoc-members:
69 | :show-inheritance:
70 |
71 | pyensembl.gene module
72 | ---------------------
73 |
74 | .. automodule:: pyensembl.gene
75 | :members:
76 | :undoc-members:
77 | :show-inheritance:
78 |
79 | pyensembl.genome module
80 | -----------------------
81 |
82 | .. automodule:: pyensembl.genome
83 | :members:
84 | :undoc-members:
85 | :show-inheritance:
86 |
87 | pyensembl.gtf module
88 | --------------------
89 |
90 | .. automodule:: pyensembl.gtf
91 | :members:
92 | :undoc-members:
93 | :show-inheritance:
94 |
95 | pyensembl.locus module
96 | ----------------------
97 |
98 | .. automodule:: pyensembl.locus
99 | :members:
100 | :undoc-members:
101 | :show-inheritance:
102 |
103 | pyensembl.memory_cache module
104 | -----------------------------
105 |
106 | .. automodule:: pyensembl.memory_cache
107 | :members:
108 | :undoc-members:
109 | :show-inheritance:
110 |
111 | pyensembl.search module
112 | -----------------------
113 |
114 | .. automodule:: pyensembl.search
115 | :members:
116 | :undoc-members:
117 | :show-inheritance:
118 |
119 | pyensembl.sequence_data module
120 | ------------------------------
121 |
122 | .. automodule:: pyensembl.sequence_data
123 | :members:
124 | :undoc-members:
125 | :show-inheritance:
126 |
127 | pyensembl.shell module
128 | ----------------------
129 |
130 | .. automodule:: pyensembl.shell
131 | :members:
132 | :undoc-members:
133 | :show-inheritance:
134 |
135 | pyensembl.species module
136 | ------------------------
137 |
138 | .. automodule:: pyensembl.species
139 | :members:
140 | :undoc-members:
141 | :show-inheritance:
142 |
143 | pyensembl.transcript module
144 | ---------------------------
145 |
146 | .. automodule:: pyensembl.transcript
147 | :members:
148 | :undoc-members:
149 | :show-inheritance:
150 |
151 |
152 | Module contents
153 | ---------------
154 |
155 | .. automodule:: pyensembl
156 | :members:
157 | :undoc-members:
158 | :show-inheritance:
159 |
--------------------------------------------------------------------------------
/lint-and-test.sh:
--------------------------------------------------------------------------------
1 | ./lint.sh && ./test.sh
2 |
--------------------------------------------------------------------------------
/lint.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -o errexit
3 |
4 |
5 | # disabling several categories of errors due to false positives in pylint,
6 | # see these issues:
7 | # - https://bitbucket.org/logilab/pylint/issues/701/false-positives-with-not-an-iterable-and
8 | # - https://bitbucket.org/logilab/pylint/issues/58
9 |
10 | find pyensembl -name '*.py' \
11 | | xargs pylint \
12 | --errors-only \
13 | --disable=unsubscriptable-object,not-an-iterable,no-member
14 |
15 | echo 'Passes pylint check'
16 |
--------------------------------------------------------------------------------
/pyensembl/__init__.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | from .database import Database
14 | from .download_cache import DownloadCache
15 | from .ensembl_release import EnsemblRelease, cached_release
16 | from .ensembl_versions import MAX_ENSEMBL_RELEASE
17 | from .exon import Exon
18 | from .genome import Genome
19 | from .gene import Gene
20 | from .locus import Locus
21 | from .reference_name import (
22 | ensembl_grch36,
23 | ensembl_grch37,
24 | ensembl_grch38,
25 | normalize_reference_name,
26 | find_species_by_reference,
27 | which_reference,
28 | genome_for_reference_name,
29 | )
30 |
31 | from .search import find_nearest_locus
32 | from .sequence_data import SequenceData
33 | from .species import find_species_by_name, check_species_object, normalize_species_name
34 | from .transcript import Transcript
35 | from .version import __version__
36 |
# Names exported by `from pyensembl import *`; every name is listed exactly once.
__all__ = [
    "__version__",
    "DownloadCache",
    "Database",
    "EnsemblRelease",
    "cached_release",
    "MAX_ENSEMBL_RELEASE",
    "Gene",
    "Transcript",
    "Exon",
    "SequenceData",
    "find_nearest_locus",
    "find_species_by_name",
    "find_species_by_reference",
    "genome_for_reference_name",
    "which_reference",
    "check_species_object",
    "normalize_reference_name",
    "normalize_species_name",
    "Genome",
    "Locus",
    "ensembl_grch36",
    "ensembl_grch37",
    "ensembl_grch38",
]
63 |
--------------------------------------------------------------------------------
/pyensembl/common.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | import pickle
14 |
15 | from functools import wraps
16 |
17 |
def dump_pickle(obj, filepath):
    """Serialize `obj` with pickle and write the result to `filepath`."""
    # protocol 2 is used so the file stays readable from both Python 2 and 3
    with open(filepath, "wb") as handle:
        pickle.dump(obj, handle, 2)
22 |
23 |
def load_pickle(filepath):
    """Read `filepath` and return the object that was pickled into it."""
    # NOTE: unpickling can execute arbitrary code, only load trusted files
    with open(filepath, "rb") as handle:
        return pickle.load(handle)
28 |
29 |
30 | def _memoize_cache_key(args, kwargs):
31 | """Turn args tuple and kwargs dictionary into a hashable key.
32 |
33 | Expects that all arguments to a memoized function are either hashable
34 | or can be uniquely identified from type(arg) and repr(arg).
35 | """
36 | cache_key_list = []
37 |
38 | # hack to get around the unhashability of lists,
39 | # add a special case to convert them to tuples
40 | for arg in args:
41 | if type(arg) is list:
42 | cache_key_list.append(tuple(arg))
43 | else:
44 | cache_key_list.append(arg)
45 | for k, v in sorted(kwargs.items()):
46 | if type(v) is list:
47 | cache_key_list.append((k, tuple(v)))
48 | else:
49 | cache_key_list.append((k, v))
50 | return tuple(cache_key_list)
51 |
52 |
def memoize(fn):
    """Reset-able memoization decorator for functions and methods.

    Results are stored in a dictionary keyed by `_memoize_cache_key(args,
    kwargs)`, so all arguments must be hashable (plain lists are accepted and
    treated as tuples). The returned wrapper carries three extra attributes:

    clear_cache : callable which empties the cache
    cache : the dictionary holding the memoized values
    make_cache_key : function building a cache key from (args, kwargs)
    """
    cache = {}

    @wraps(fn)
    def wrapped_fn(*args, **kwargs):
        key = _memoize_cache_key(args, kwargs)
        if key in cache:
            return cache[key]
        result = fn(*args, **kwargs)
        cache[key] = result
        return result

    # lets EnsemblRelease.clear_cache drop the memoized values of its methods
    wrapped_fn.clear_cache = cache.clear
    # exposed so callers can check whether a value has already been computed
    wrapped_fn.cache = cache
    # checking membership requires the same key construction as wrapped_fn
    wrapped_fn.make_cache_key = _memoize_cache_key
    return wrapped_fn
82 |
--------------------------------------------------------------------------------
/pyensembl/ensembl_release.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | """
14 | Contains the EnsemblRelease class, which extends the Genome class
15 | to be specific to a particular release of Ensembl.
16 | """
17 | from weakref import WeakValueDictionary
18 |
19 | from .genome import Genome
20 | from .ensembl_versions import check_release_number, MAX_ENSEMBL_RELEASE
21 | from .species import check_species_object, human
22 |
23 | from .ensembl_url_templates import ENSEMBL_FTP_SERVER, make_gtf_url, make_fasta_url
24 |
25 |
class EnsemblRelease(Genome):
    """
    Bundles together the genomic annotation and sequence data associated with
    a particular release of the Ensembl database.
    """

    @classmethod
    def normalize_init_values(cls, release, species, server):
        """
        Normalizes the arguments which uniquely specify an EnsemblRelease
        genome.

        Returns a (release, species, server) tuple in which the release has
        been validated by check_release_number and the species has been passed
        through check_species_object; the server is returned unchanged.
        """
        release = check_release_number(release)
        species = check_species_object(species)
        return (release, species, server)

    # Using a WeakValueDictionary instead of an ordinary dict to prevent a
    # memory leak in cases where we test many different releases in sequence.
    # When all the references to a particular EnsemblRelease die then that
    # genome should also be removed from this cache.
    _genome_cache = WeakValueDictionary()

    @classmethod
    def cached(
        cls, release=MAX_ENSEMBL_RELEASE, species=human, server=ENSEMBL_FTP_SERVER
    ):
        """
        Construct EnsemblRelease if it's never been made before, otherwise
        return an old instance.
        """
        init_args_tuple = cls.normalize_init_values(release, species, server)
        # A single .get() instead of "key in cache" followed by "cache[key]":
        # entries of a WeakValueDictionary can disappear between two lookups
        # when the garbage collector reclaims the referenced genome.
        genome = cls._genome_cache.get(init_args_tuple)
        if genome is None:
            genome = cls(*init_args_tuple)
            cls._genome_cache[init_args_tuple] = genome
        return genome

    def __init__(
        self, release=MAX_ENSEMBL_RELEASE, species=human, server=ENSEMBL_FTP_SERVER
    ):
        """
        Normalize the arguments, derive the GTF and FASTA URLs for this
        release/species/server combination and initialize the Genome base
        class with them.
        """
        self.release, self.species, self.server = self.normalize_init_values(
            release=release, species=species, server=server
        )

        self.gtf_url = make_gtf_url(
            ensembl_release=self.release, species=self.species, server=self.server
        )

        # transcript sequences are spread over the cDNA and the ncRNA files
        self.transcript_fasta_urls = [
            make_fasta_url(
                ensembl_release=self.release,
                species=self.species.latin_name,
                sequence_type=sequence_type,
                server=self.server,
                is_plant=self.species.is_plant,
            )
            for sequence_type in ("cdna", "ncrna")
        ]

        self.protein_fasta_urls = [
            make_fasta_url(
                ensembl_release=self.release,
                species=self.species.latin_name,
                sequence_type="pep",
                server=self.server,
                is_plant=self.species.is_plant,
            )
        ]

        self.reference_name = self.species.which_reference(self.release)

        Genome.__init__(
            self,
            reference_name=self.reference_name,
            annotation_name="ensembl",
            annotation_version=self.release,
            gtf_path_or_url=self.gtf_url,
            transcript_fasta_paths_or_urls=self.transcript_fasta_urls,
            protein_fasta_paths_or_urls=self.protein_fasta_urls,
        )

    def install_string(self):
        """Return the shell command which installs the data of this release."""
        return "pyensembl install --release %d --species %s" % (
            self.release,
            self.species.latin_name,
        )

    def __str__(self):
        return "EnsemblRelease(release=%d, species='%s')" % (
            self.release,
            self.species.latin_name,
        )

    def __eq__(self, other):
        # equality (and the hash below) consider release and species only,
        # the server is not part of the comparison
        return (
            other.__class__ is EnsemblRelease
            and self.release == other.release
            and self.species == other.species
        )

    def __hash__(self):
        return hash((self.release, self.species))

    def to_dict(self):
        """Return the constructor arguments needed to recreate this object."""
        return {"release": self.release, "species": self.species, "server": self.server}

    @classmethod
    def from_dict(cls, state_dict):
        """
        Deserialize EnsemblRelease without creating duplicate instances.
        """
        return cls.cached(**state_dict)
144 |
145 |
def cached_release(release, species="human"):
    """
    Return the EnsemblRelease for the given release and species, creating it
    only if it hasn't been made already and reusing the old instance
    otherwise.

    Kept for backwards compatibility; the functionality itself lives in the
    `cached` class method of EnsemblRelease.
    """
    return EnsemblRelease.cached(release, species)
154 |
--------------------------------------------------------------------------------
/pyensembl/ensembl_url_templates.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | """
14 | Templates for URLs and paths to specific release, species, and file type
15 | on the Ensembl ftp server.
16 |
17 | For example, the human chromosomal DNA sequences for release 78 are in:
18 |
19 | https://ftp.ensembl.org/pub/release-78/fasta/homo_sapiens/dna/
20 |
21 | """
22 |
23 | from .species import Species, find_species_by_name
24 | from .ensembl_versions import check_release_number
25 |
# Base URLs of the download servers. Neither ends with a slash because every
# *_SUBDIR_TEMPLATE below already starts with one; a trailing slash here
# would produce "//pub/..." in the assembled URLs.
ENSEMBL_FTP_SERVER = "https://ftp.ensembl.org"
ENSEMBL_PLANTS_FTP_SERVER = "https://ftp.ensemblgenomes.ebi.ac.uk"

# Example directories
# FASTA files: /pub/release-78/fasta/homo_sapiens/
# GTF annotation files: /pub/release-78/gtf/homo_sapiens/
FASTA_SUBDIR_TEMPLATE = "/pub/release-%(release)d/fasta/%(species)s/%(type)s/"
PLANTS_FASTA_SUBDIR_TEMPLATE = "/pub/release-%(release)d/plants/fasta/%(species)s/%(type)s/"
GTF_SUBDIR_TEMPLATE = "/pub/release-%(release)d/gtf/%(species)s/"
PLANTS_GTF_SUBDIR_TEMPLATE = "/pub/release-%(release)d/plants/gtf/%(species)s/"

# List of plants: names of all the plant species that were added and for
# which the custom (plant-specific) URLs are made.
lPlants = ("arabidopsis_thaliana", "arabidopsis")
40 |
def normalize_release_properties(ensembl_release, species):
    """
    Validate the release and resolve the species.

    Returns a tuple (release as int, latin name of the species, name of the
    reference associated with that species and release). `species` may be a
    Species object or anything accepted by find_species_by_name.
    """
    release_number = check_release_number(ensembl_release)
    if isinstance(species, Species):
        species_object = species
    else:
        species_object = find_species_by_name(species)
    reference = species_object.which_reference(release_number)
    return release_number, species_object.latin_name, reference
51 |
52 |
# GTF annotation file example: Homo_sapiens.GRCh38.78.gtf.gz
# (capitalized species name, reference name and release number)
GTF_FILENAME_TEMPLATE = "%(Species)s.%(reference)s.%(release)d.gtf.gz"
55 |
56 |
def make_gtf_filename(ensembl_release, species):
    """
    Return the GTF filename expected on the Ensembl FTP server for a specific
    species/release combination.
    """
    release_number, latin_name, reference_name = normalize_release_properties(
        ensembl_release, species
    )
    template_fields = dict(
        Species=latin_name.capitalize(),
        reference=reference_name,
        release=release_number,
    )
    return GTF_FILENAME_TEMPLATE % template_fields
70 |
71 |
def make_gtf_url(ensembl_release, species, server=ENSEMBL_FTP_SERVER, gtf_subdir=GTF_SUBDIR_TEMPLATE):
    """
    Return the full URL (server + directory + filename) of the GTF annotation
    file for the given release and species.

    `species` may be a Species object or a species name. For plant species
    the `server` and `gtf_subdir` arguments are replaced by the Ensembl
    Plants server and directory template.
    """
    # resolve names first, a plain string has no `is_plant` attribute
    if not isinstance(species, Species):
        species = find_species_by_name(species)

    if species.is_plant:
        server = ENSEMBL_PLANTS_FTP_SERVER
        gtf_subdir = PLANTS_GTF_SUBDIR_TEMPLATE

    ensembl_release, species_name, _ = normalize_release_properties(
        ensembl_release, species
    )
    subdir = gtf_subdir % {"release": ensembl_release, "species": species_name}
    filename = make_gtf_filename(ensembl_release=ensembl_release, species=species_name)
    # the subdir templates start with "/", so a server given with a trailing
    # slash would otherwise yield "//pub/..."
    return server.rstrip("/") + subdir + filename
86 |
87 |
# Releases up to and including Ensembl 75 put the release number into the
# FASTA filenames.
#   cDNA & protein example: Homo_sapiens.NCBI36.54.cdna.all.fa.gz
#   ncRNA example:          Homo_sapiens.NCBI36.54.ncrna.fa.gz
OLD_FASTA_FILENAME_TEMPLATE = "%(Species)s.%(reference)s.%(release)d.%(sequence_type)s.all.fa.gz"
OLD_FASTA_FILENAME_TEMPLATE_NCRNA = "%(Species)s.%(reference)s.%(release)d.ncrna.fa.gz"

# Releases after Ensembl 75 leave the release number out.
#   cDNA & protein example: Homo_sapiens.GRCh37.cdna.all.fa.gz
#   ncRNA example:          Homo_sapiens.GRCh37.ncrna.fa.gz
NEW_FASTA_FILENAME_TEMPLATE = "%(Species)s.%(reference)s.%(sequence_type)s.all.fa.gz"
NEW_FASTA_FILENAME_TEMPLATE_NCRNA = "%(Species)s.%(reference)s.ncrna.fa.gz"
106 |
107 |
def make_fasta_filename(ensembl_release, species, sequence_type, is_plant):
    """
    Return the name of the FASTA file for the given release, species and
    sequence type.

    Non-plant releases up to and including 75 use the old naming scheme which
    contains the release number; everything else uses the new scheme. The
    "ncrna" sequence type has its own templates in both schemes.
    """
    release_number, latin_name, reference_name = normalize_release_properties(
        ensembl_release, species
    )
    uses_old_naming = release_number <= 75 and not is_plant

    fields = {"Species": latin_name.capitalize(), "reference": reference_name}
    if uses_old_naming:
        fields["release"] = release_number

    if sequence_type == "ncrna":
        if uses_old_naming:
            template = OLD_FASTA_FILENAME_TEMPLATE_NCRNA
        else:
            template = NEW_FASTA_FILENAME_TEMPLATE_NCRNA
    else:
        fields["sequence_type"] = sequence_type
        if uses_old_naming:
            template = OLD_FASTA_FILENAME_TEMPLATE
        else:
            template = NEW_FASTA_FILENAME_TEMPLATE
    return template % fields
138 |
139 |
def make_fasta_url(ensembl_release, species, sequence_type, is_plant, server=ENSEMBL_FTP_SERVER, fasta_subdir=FASTA_SUBDIR_TEMPLATE):
    """Construct URL to FASTA file with cDNA transcript or protein sequences

    Parameter examples:
        ensembl_release = 75
        species = "Homo_sapiens"
        sequence_type = "cdna" (other options: "ncrna", "pep")
        is_plant = False

    When `is_plant` is true the `server` and `fasta_subdir` arguments are
    replaced by the Ensembl Plants server and directory template.

    Returns the URL as a single string (server + directory + filename).
    """
    ensembl_release, species, _ = normalize_release_properties(
        ensembl_release, species
    )

    if is_plant:
        server = ENSEMBL_PLANTS_FTP_SERVER
        fasta_subdir = PLANTS_FASTA_SUBDIR_TEMPLATE

    subdir = fasta_subdir % {
        "release": ensembl_release,
        "species": species,
        "type": sequence_type,
    }
    filename = make_fasta_filename(
        ensembl_release=ensembl_release,
        species=species,
        sequence_type=sequence_type,
        is_plant=is_plant,
    )
    # the subdir templates start with "/", so a server given with a trailing
    # slash would otherwise yield "//pub/..."
    return server.rstrip("/") + subdir + filename
165 |
--------------------------------------------------------------------------------
/pyensembl/ensembl_versions.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
# Oldest release accepted by check_release_number.
MIN_ENSEMBL_RELEASE = 40
# Latest known releases. NOTE(review): check_release_number does not enforce
# either upper bound; presumably the plants value is the Ensembl Plants
# counterpart of MAX_ENSEMBL_RELEASE -- confirm against its users.
MAX_ENSEMBL_RELEASE = 111
MAX_PLANTS_ENSEMBL_RELEASE = 58

def check_release_number(release):
    """
    Check to make sure a release is a valid Ensembl release number.

    Parameters
    ----------
    release : int or anything accepted by int()
        Release number, e.g. 75 or "75".

    Returns the release as an int.

    Raises ValueError if the release cannot be converted to an int or if it
    is smaller than MIN_ENSEMBL_RELEASE. No upper bound is checked.
    """
    try:
        release = int(release)
    except (TypeError, ValueError, OverflowError):
        # wrap in a 1-tuple so that tuple inputs can't break the formatting
        raise ValueError("Invalid Ensembl release: %s" % (release,))

    if release < MIN_ENSEMBL_RELEASE:
        raise ValueError(
            "Invalid Ensembl release %d, must be at least %d"
            % (release, MIN_ENSEMBL_RELEASE)
        )
    return release
33 |
--------------------------------------------------------------------------------
/pyensembl/exon.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 |
14 | from .locus import Locus
15 |
16 |
class Exon(Locus):
    """
    Locus of a single exon together with its exon ID and the name and ID of
    the gene it belongs to.
    """

    def __init__(self, exon_id, contig, start, end, strand, gene_name, gene_id):
        Locus.__init__(self, contig, start, end, strand)
        self.exon_id = exon_id
        self.gene_name = gene_name
        self.gene_id = gene_id

    @property
    def id(self):
        """
        Alias for exon_id necessary for backward compatibility.
        """
        return self.exon_id

    def __str__(self):
        return (
            "Exon(exon_id='%s',"
            " gene_id='%s',"
            " gene_name='%s',"
            " contig='%s',"
            " start=%d,"
            " end=%s,"
            " strand='%s')"
        ) % (
            self.exon_id,
            self.gene_id,
            self.gene_name,
            self.contig,
            self.start,
            self.end,
            self.strand,
        )

    def __eq__(self, other):
        """
        Exons are equal when contig, start, end, strand and exon ID all match.

        Raises TypeError when `other` is not an Exon.
        """
        if not isinstance(other, Exon):
            # must be spelled `__class__`: inside a class body `other.__class`
            # is name-mangled and raises AttributeError instead of the
            # intended TypeError
            raise TypeError(
                "Cannot compare %s and %s"
                % (self.__class__.__name__, other.__class__.__name__)
            )
        return (
            self.contig == other.contig
            and self.start == other.start
            and self.end == other.end
            and self.strand == other.strand
            and self.id == other.id
        )

    def __hash__(self):
        return hash(self.id)

    def to_dict(self):
        """Extend the Locus state with exon_id, gene_name and gene_id."""
        state_dict = Locus.to_dict(self)
        state_dict["exon_id"] = self.id
        state_dict["gene_name"] = self.gene_name
        state_dict["gene_id"] = self.gene_id
        return state_dict
73 |
--------------------------------------------------------------------------------
/pyensembl/fasta.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2015-2016. Mount Sinai School of Medicine
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | """
16 | The worst sin in bioinformatics is to write your own FASTA parser.
17 | Unfortunately, small errors creep in to different FASTA files on the
18 | Ensembl FTP server that no proper FASTA parser lets you skip over.
19 | """
20 |
21 |
22 | from gzip import GzipFile
23 | import logging
24 |
25 |
# module-level logger, FastaParser uses it to warn about unusable entries
logger = logging.getLogger(__name__)
27 |
28 |
29 | def _parse_header_id(line):
30 | """
31 | Pull the transcript or protein identifier from the header line
32 | which starts with '>'
33 | """
34 | if type(line) is not bytes:
35 | raise TypeError(
36 | "Expected header line to be of type %s but got %s" % (bytes, type(line))
37 | )
38 |
39 | if len(line) <= 1:
40 | raise ValueError("No identifier on FASTA line")
41 |
42 | # split line at first space to get the unique identifier for
43 | # this sequence
44 | space_index = line.find(b" ")
45 | if space_index >= 0:
46 | identifier = line[1:space_index]
47 | else:
48 | identifier = line[1:]
49 |
50 | # annoyingly Ensembl83 reformatted the transcript IDs of its
51 | # cDNA FASTA to include sequence version numbers
52 | # .e.g.
53 | # "ENST00000448914.1" instead of "ENST00000448914"
54 | # So now we have to parse out the identifier
55 |
56 | # only split name of ENSEMBL naming. In other database, such as TAIR,
57 | # the '.1' notation is the isoform not the version.
58 | if identifier.startswith(b"ENS"):
59 | dot_index = identifier.find(b".")
60 | if dot_index >= 0:
61 | identifier = identifier[:dot_index]
62 |
63 | return identifier.decode("ascii")
64 |
65 |
class FastaParser(object):
    """
    FastaParser object consumes lines of a FASTA file incrementally
    while building up a dictionary mapping sequence identifiers to sequences.

    The identifier and the sequence lines of the entry currently being read
    are kept in `current_id` and `current_lines`; both are reset at the start
    of every file, so a single parser can be used for several files.
    """

    def __init__(self):
        self._reset()

    def _reset(self):
        """Forget any entry that has been read only partially."""
        self.current_id = None
        self.current_lines = []

    def read_file(self, fasta_path):
        """
        Read the contents of a FASTA file into a dictionary
        """
        # if an identifier occurs more than once the last sequence wins
        return dict(self.iterate_over_file(fasta_path))

    def iterate_over_file(self, fasta_path):
        """
        Generator that yields identifiers paired with sequences.
        """
        # start from a clean slate, otherwise the last entry of a previously
        # parsed file would be reported again as part of this file
        self._reset()
        with self._open(fasta_path) as f:
            for line in f:
                line = line.rstrip()

                if not line:
                    continue

                # have to slice into a bytes object or else I get a single integer
                first_char = line[0:1]

                if first_char == b">":
                    id_and_seq = self._read_header(line)
                    if id_and_seq is not None:
                        yield id_and_seq

                elif first_char == b";":
                    # semicolon are comment characters
                    continue
                else:
                    self.current_lines.append(line)
        # the last sequence is still in the lines buffer after we're done with
        # the file so make sure to yield it
        id_and_seq = self._current_entry()
        if id_and_seq is not None:
            yield id_and_seq

    def _open(self, fasta_path):
        """
        Open either a text file or compressed gzip file as a stream of bytes.
        """
        if fasta_path.endswith(("gz", "gzip")):
            return GzipFile(fasta_path, "rb")
        else:
            return open(fasta_path, "rb")

    def _current_entry(self):
        """
        Return (identifier, sequence) for the entry read so far, or None if
        there is no current identifier or no sequence data for it.
        """
        if not self.current_id:
            return None
        if len(self.current_lines) == 0:
            logger.warning("No sequence data for '%s'", self.current_id)
            return None
        sequence = b"".join(self.current_lines)
        return self.current_id, sequence.decode("ascii")

    def _read_header(self, line):
        """
        Start a new entry from a header line and return the entry which was
        completed by it (None if there was none).
        """
        previous_entry = self._current_entry()

        self.current_id = _parse_header_id(line)

        if len(self.current_id) == 0:
            logger.warning("Unable to parse ID from header line: %s", line)

        self.current_lines = []
        return previous_entry
145 |
146 |
def parse_fasta_dictionary(fasta_path):
    """
    Parse a FASTA (or gzip compressed FASTA) file with a fresh FastaParser.

    Parameters
    ----------
    fasta_path : str
        Path to the FASTA file.

    Returns dictionary from string identifiers to string sequences.
    """
    return FastaParser().read_file(fasta_path)
161 |
--------------------------------------------------------------------------------
/pyensembl/gene.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 |
14 | from memoized_property import memoized_property
15 |
16 | from .locus_with_genome import LocusWithGenome
17 |
18 |
class Gene(LocusWithGenome):
    """
    Locus of a gene, identified by its gene ID and gene name, which is tied
    to the genome it comes from.
    """

    def __init__(self, gene_id, gene_name, contig, start, end, strand, biotype, genome):
        LocusWithGenome.__init__(
            self,
            contig=contig,
            start=start,
            end=end,
            strand=strand,
            biotype=biotype,
            genome=genome,
        )
        self.gene_id = gene_id
        self.gene_name = gene_name

    @property
    def id(self):
        """
        Backwards compatible alias of gene_id.
        """
        return self.gene_id

    @property
    def name(self):
        """
        Backwards compatible alias of gene_name.
        """
        return self.gene_name

    def __str__(self):
        template = (
            "Gene(gene_id='%s',"
            " gene_name='%s',"
            " biotype='%s',"
            " contig='%s',"
            " start=%d,"
            " end=%d, strand='%s', genome='%s')"
        )
        values = (
            self.gene_id,
            self.gene_name,
            self.biotype,
            self.contig,
            self.start,
            self.end,
            self.strand,
            self.genome.reference_name,
        )
        return template % values

    def __eq__(self, other):
        # only exact Gene instances with the same ID and genome are equal
        if other.__class__ is not Gene:
            return False
        return self.id == other.id and self.genome == other.genome

    def __hash__(self):
        return hash(self.id)

    def to_dict(self):
        """Extend the LocusWithGenome state with gene_id and gene_name."""
        state_dict = LocusWithGenome.to_dict(self)
        state_dict.update(gene_id=self.gene_id, gene_name=self.gene_name)
        return state_dict

    @memoized_property
    def transcripts(self):
        """
        List of transcript objects, one for every transcript ID which the
        database associates with this gene.
        """
        transcript_id_rows = self.db.query(
            select_column_names=["transcript_id"],
            filter_column="gene_id",
            filter_value=self.id,
            feature="transcript",
            distinct=False,
            required=False,
        )

        # Each transcript is fetched with its own lookup; selecting all
        # columns in the query above might be more efficient but would be
        # less modular.
        return [self.genome.transcript_by_id(row[0]) for row in transcript_id_rows]

    @memoized_property
    def exons(self):
        """Sorted list of the distinct exons of all transcripts of this gene."""
        distinct_exons = {
            exon for transcript in self.transcripts for exon in transcript.exons
        }
        return sorted(distinct_exons)
111 |
--------------------------------------------------------------------------------
/pyensembl/locus.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | from serializable import Serializable
14 |
15 | from .normalization import normalize_chromosome, normalize_strand
16 |
17 |
18 | class Locus(Serializable):
19 | """
20 | Base class for any entity which can be localized at a range of positions
21 | on a particular strand of a chromosome/contig.
22 | """
23 |
24 | def __init__(self, contig, start, end, strand):
25 | """
26 | contig : str
27 | Chromosome or other sequence name in the reference assembly
28 |
29 | start : int
30 | Start position of locus on the contig
31 |
32 | end : int
33 | Inclusive end position on the contig
34 |
35 | strand : str
36 | Should we read the locus forwards ('+') or backwards ('-')?
37 | """
38 |
39 | self.contig = normalize_chromosome(contig)
40 | self.strand = normalize_strand(strand)
41 |
42 | start = int(start)
43 | end = int(end)
44 |
45 | if start == 0:
46 | raise ValueError("Expected start > 0 (using base 1 coordinates)")
47 | elif end == 0:
48 | raise ValueError("Expected end > 0 (using base 1 coordinates)")
49 |
50 | if end < start:
51 | raise ValueError(
52 | "Expected start <= end, got start = %d, end = %d" % (start, end)
53 | )
54 | self.start = start
55 | self.end = end
56 |
57 | def __str__(self):
58 | return "Locus(contig='%s', start=%d, end=%d, strand='%s')" % (
59 | self.contig,
60 | self.start,
61 | self.end,
62 | self.strand,
63 | )
64 |
65 | def __len__(self):
66 | return self.end - self.start + 1
67 |
68 | def __eq__(self, other):
69 | if not isinstance(other, Locus):
70 | raise TypeError(
71 | "Cannot compare %s and %s"
72 | % (self.__class__.__name__, other.__class.__name__)
73 | )
74 | return (
75 | self.contig == other.contig
76 | and self.start == other.start
77 | and self.end == other.end
78 | and self.strand == other.strand
79 | )
80 |
81 | def to_tuple(self):
82 | return (self.contig, self.start, self.end, self.strand)
83 |
84 | def __lt__(self, other):
85 | if not isinstance(other, Locus):
86 | raise TypeError(
87 | "Cannot compare %s and %s"
88 | % (self.__class__.__name__, other.__class.__name__)
89 | )
90 | return self.to_tuple() < other.to_tuple()
91 |
92 | def __le__(self, other):
93 | return (self == other) or (self < other)
94 |
95 | def __gt__(self, other):
96 | if not isinstance(other, Locus):
97 | raise TypeError(
98 | "Cannot compare %s and %s"
99 | % (self.__class__.__name__, other.__class.__name__)
100 | )
101 | return self.to_tuple() > other.to_tuple()
102 |
103 | def __ge__(self, other):
104 | return (self == other) or (self > other)
105 |
106 | def to_dict(self):
107 | return {
108 | "contig": self.contig,
109 | "start": self.start,
110 | "end": self.end,
111 | "strand": self.strand,
112 | }
113 |
114 | @property
115 | def length(self):
116 | return self.end - self.start + 1
117 |
118 | def offset(self, position):
119 | """Offset of given position from stranded start of this locus.
120 |
121 | For example, if a Locus goes from 10..20 and is on the negative strand,
122 | then the offset of position 13 is 7, whereas if the Locus is on the
123 | positive strand, then the offset is 3.
124 | """
125 | if position > self.end or position < self.start:
126 | raise ValueError(
127 | "Position %d outside valid range %d..%d of %s"
128 | % (position, self.start, self.end, self)
129 | )
130 | elif self.on_forward_strand:
131 | return position - self.start
132 | else:
133 | return self.end - position
134 |
def offset_range(self, start, end):
    """
    Convert an absolute (start, end) range into offsets relative to the
    stranded start of this locus.

    Database start/end entries are always ordered so that start <= end,
    which makes relative positions (e.g. of a stop codon within its
    transcript) awkward on the negative strand, where the "end" coordinate
    comes earlier on the strand. On the negative strand the offsets are
    therefore measured back from this locus's `end`.

    Raises ValueError if start > end or if the range is not fully inside
    this locus.
    """
    if start > end:
        raise ValueError(
            "Locus should always have start <= end, got start=%d, end=%d"
            % (start, end)
        )

    fits_inside = not (start < self.start or end > self.end)
    if not fits_inside:
        raise ValueError("Range (%d, %d) falls outside %s" % (start, end, self))

    if not self.on_forward_strand:
        return (self.end - end, self.end - start)
    return (start - self.start, end - self.start)
159 |
def on_contig(self, contig):
    """Is this locus on `contig`, after normalizing the given name?"""
    normalized = normalize_chromosome(contig)
    return normalized == self.contig
162 |
def on_strand(self, strand):
    """Is this locus on `strand`, after normalizing the given value?"""
    normalized = normalize_strand(strand)
    return normalized == self.strand
165 |
@property
def on_forward_strand(self):
    """Whether this locus is on the "+" strand."""
    is_forward = self.on_strand("+")
    return is_forward
169 |
@property
def on_positive_strand(self):
    """Alias for `on_forward_strand`."""
    forward = self.on_forward_strand
    return forward
173 |
@property
def on_backward_strand(self):
    """Whether this locus is on the "-" strand."""
    is_backward = self.on_strand("-")
    return is_backward
177 |
@property
def on_negative_strand(self):
    """Alias for `on_backward_strand`."""
    backward = self.on_backward_strand
    return backward
181 |
def can_overlap(self, contig, strand=None):
    """
    Is this locus on the given contig and, when `strand` is provided,
    also on that strand?
    """
    if not self.on_contig(contig):
        return False
    return strand is None or self.on_strand(strand)
187 |
def distance_to_interval(self, start, end):
    """
    Distance between this locus's [self.start, self.end] and the interval
    [start, end]. Overlapping intervals have distance 0.
    """
    gap_before = self.start - end
    if gap_before > 0:
        # the interval ends before this locus begins
        return gap_before
    gap_after = start - self.end
    if gap_after > 0:
        # this locus ends before the interval begins
        return gap_after
    return 0
201 |
def distance_to_locus(self, other):
    """
    Distance from this locus to `other`, or infinity when the two loci
    cannot overlap (different contig or strand).
    """
    if self.can_overlap(other.contig, other.strand):
        return self.distance_to_interval(other.start, other.end)
    # no meaningful distance between loci on different contigs or strands
    return float("inf")
208 |
def overlaps(self, contig, start, end, strand=None):
    """
    Does this locus overlap the given range of positions?

    Position ranges are inclusive, so e.g. chr1:10-10 overlaps chr1:10-10.
    """
    if not self.can_overlap(contig, strand):
        return False
    return self.distance_to_interval(start, end) == 0
220 |
def overlaps_locus(self, other_locus):
    """Does this locus overlap `other_locus` (same contig and strand)?"""
    contig = other_locus.contig
    first, last = other_locus.start, other_locus.end
    return self.overlaps(contig, first, last, other_locus.strand)
225 |
def contains(self, contig, start, end, strand=None):
    """
    Does the range start..end on `contig` (and optionally `strand`) lie
    entirely within this locus?
    """
    if not self.can_overlap(contig, strand):
        return False
    return self.start <= start and end <= self.end
230 |
def contains_locus(self, other_locus):
    """Does `other_locus` lie entirely within this locus?"""
    contig = other_locus.contig
    first, last = other_locus.start, other_locus.end
    return self.contains(contig, first, last, other_locus.strand)
235 |
--------------------------------------------------------------------------------
/pyensembl/locus_with_genome.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 |
14 | from .locus import Locus
15 |
16 |
class LocusWithGenome(Locus):
    """
    Base class shared by Gene and Transcript so that their common logic
    lives in one place: a Locus that also carries a biotype and a reference
    to its genome.
    """

    def __init__(self, contig, start, end, strand, biotype, genome):
        Locus.__init__(self, contig, start, end, strand)
        self.genome = genome
        # keep a direct handle on the genome's database object
        self.db = self.genome.db
        self.biotype = biotype

    def to_dict(self):
        """Return the locus fields plus `biotype` and `genome` as a dict."""
        return {
            "contig": self.contig,
            "start": self.start,
            "end": self.end,
            "strand": self.strand,
            "biotype": self.biotype,
            "genome": self.genome,
        }

    @property
    def is_protein_coding(self):
        """
        True only for the biotype 'protein_coding', i.e. there is an open
        reading frame in this gene/transcript whose successful
        transcription has been observed.

        Immunoglobulin-like genes from the T-cell receptor or antibodies
        are not counted, since they occur in fragments that must be
        recombined. It might be worth considering nonsense-mediated decay
        and non-stop decay as well, since variants in these could
        potentially make a functional protein. To read more about the
        biotypes used in Ensembl:
        http://vega.sanger.ac.uk/info/about/gene_and_transcript_types.html
        http://www.gencodegenes.org/gencode_biotypes.html
        """
        coding_biotype = "protein_coding"
        return self.biotype == coding_biotype
55 |
--------------------------------------------------------------------------------
/pyensembl/logging.conf:
--------------------------------------------------------------------------------
1 | [loggers]
2 | keys=root,pyensembl,datacache
3 |
4 | [formatters]
5 | keys=simpleFormatter
6 |
7 | [handlers]
8 | keys=consoleHandler,consoleHandlerCritical
9 |
10 | [logger_root]
11 | level=INFO
12 | handlers=consoleHandlerCritical
13 |
14 | [handler_consoleHandler]
15 | class=StreamHandler
16 | level=INFO
17 | formatter=simpleFormatter
18 | args=(sys.stdout,)
19 |
20 | [handler_consoleHandlerCritical] # only for root logger: essentially silent
21 | class=StreamHandler
22 | level=CRITICAL
23 | formatter=simpleFormatter
24 | args=(sys.stdout,)
25 |
26 | [formatter_simpleFormatter]
27 | format=%(asctime)s - %(name)s - %(levelname)s - %(message)s
28 | datefmt=
29 |
30 | # pyensembl
31 |
32 | [logger_pyensembl]
33 | level=DEBUG
34 | qualname=pyensembl
35 | handlers=consoleHandler
36 |
37 | # datacache
38 |
39 | [logger_datacache]
40 | level=DEBUG
41 | qualname=datacache
42 | handlers=consoleHandler
43 |
--------------------------------------------------------------------------------
/pyensembl/normalization.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | from sys import intern
14 | from typechecks import is_string, is_integer
15 |
16 | # Manually memoizing here, since our simple common.memoize function has
17 | # noticeable overhead in this instance.
18 | NORMALIZE_CHROMOSOME_CACHE = {}
19 |
20 |
def normalize_chromosome(c):
    """
    Return the canonical string form of a chromosome/contig name.

    Accepts strings and integers. Names starting with "chr" and containing
    no underscore get their suffix upper-cased ("chrx" -> "chrX"), purely
    alphabetic names are upper-cased ("x" -> "X"), everything else is kept
    as is. Results are interned and memoized in NORMALIZE_CHROMOSOME_CACHE.

    Raises TypeError for other input types and ValueError for "0" or "".
    """
    # cached values are always non-empty strings, so None means "not cached"
    cached = NORMALIZE_CHROMOSOME_CACHE.get(c)
    if cached is not None:
        return cached

    if not is_string(c) and not is_integer(c):
        raise TypeError("Chromosome cannot be '%s' : %s" % (c, type(c)))

    name = str(c)

    if name == "0":
        raise ValueError("Chromosome name cannot be 0")
    if name == "":
        raise ValueError("Chromosome name cannot be empty")

    has_chr_prefix = name.startswith("chr")
    if has_chr_prefix and "_" not in name:
        # names with "_" such as "chrUn_gl000212" are left untouched;
        # otherwise capitalize the suffix, e.g. "chrx" -> "chrX"
        name = "chr" + name[3:].upper()
    elif name.isalpha():
        # capitalize e.g. "x" -> "X"
        name = name.upper()

    # chromosome names get constructed or parsed millions of times (e.g.
    # while parsing GTF files), so interning them can save memory
    name = intern(name)
    NORMALIZE_CHROMOSOME_CACHE[c] = name
    return name
53 |
54 |
def normalize_strand(strand):
    """
    Map the accepted spellings of a strand onto "+" or "-".

    "+", 1, "+1" and "1" give "+"; "-", -1 and "-1" give "-".
    Raises ValueError for anything else.
    """
    forward_aliases = ("+", 1, "+1", "1")
    backward_aliases = ("-", -1, "-1")
    if strand in forward_aliases:
        return "+"
    if strand in backward_aliases:
        return "-"
    raise ValueError("Invalid strand: %s" % (strand,))
61 |
--------------------------------------------------------------------------------
/pyensembl/reference_name.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | from .ensembl_release import EnsemblRelease
14 | from .species import Species, find_species_by_name
15 |
16 |
def normalize_reference_name(name):
    """
    Look up `name` among the known species reference names, ignoring
    capitalization and surrounding whitespace, and return the matching
    name exactly as it is registered.

    Raises ValueError when no reference matches.
    """
    wanted = name.strip().lower()
    known_references = Species._reference_names_to_species
    for candidate in known_references:
        if candidate.lower() == wanted:
            return candidate
    raise ValueError("Reference genome '%s' not found" % name)
29 |
30 |
def find_species_by_reference(reference_name):
    """Return the Species registered for the given reference name."""
    canonical_name = normalize_reference_name(reference_name)
    return Species._reference_names_to_species[canonical_name]
33 |
34 |
def which_reference(species_name, ensembl_release):
    """Return the reference used by `species_name` in `ensembl_release`."""
    species = find_species_by_name(species_name)
    return species.which_reference(ensembl_release)
37 |
38 |
def max_ensembl_release(reference_name):
    """
    Return the newest Ensembl release number recorded for a reference.

    `reference_name` is matched case-insensitively (e.g. "grch38").
    Raises ValueError if the reference is unknown.
    """
    # Normalize first so the `reference_assemblies` lookup below uses the
    # same canonical key as the species lookup; indexing with the raw
    # argument raised KeyError for names such as "grch38".
    reference_name = normalize_reference_name(reference_name)
    species = find_species_by_reference(reference_name)
    (_, max_release) = species.reference_assemblies[reference_name]
    return max_release
43 |
44 |
def genome_for_reference_name(reference_name, allow_older_downloaded_release=True):
    """
    Given a genome reference name, such as "GRCh38", return the
    corresponding Ensembl release object.

    When `allow_older_downloaded_release` is True and some release of this
    reference has already been downloaded, the most recent locally
    available release is returned.

    Otherwise the newest release for the reference is returned, even if
    its data has not been downloaded yet.
    """
    reference_name = normalize_reference_name(reference_name)
    species = find_species_by_reference(reference_name)
    oldest, newest = species.reference_assemblies[reference_name]
    if allow_older_downloaded_release:
        # walk the candidate releases from newest to oldest and stop at the
        # first one whose required files are already present locally
        for release in range(newest, oldest - 1, -1):
            genome = EnsemblRelease.cached(release=release, species=species)
            if genome.required_local_files_exist():
                return genome
    # nothing usable locally (or older releases not allowed): newest release
    return EnsemblRelease.cached(release=newest, species=species)
71 |
72 |
# Module-level genome objects for the "ncbi36", "grch37" and "grch38"
# references. These calls run when this module is imported.
ensembl_grch36 = genome_for_reference_name("ncbi36")
ensembl_grch37 = genome_for_reference_name("grch37")
ensembl_grch38 = genome_for_reference_name("grch38")
76 |
--------------------------------------------------------------------------------
/pyensembl/search.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | """
14 | Helper functions for searching over collections of PyEnsembl objects
15 | """
16 |
17 |
def find_nearest_locus(start, end, loci):
    """
    Find the locus (any object with a `distance_to_interval` method) that
    is closest to the interval given by `start` and `end`.

    Returns a `(distance, locus)` pair. Ties go to the earliest locus in
    `loci`; if no locus has a finite distance the result is `(inf, None)`.
    """
    nearest = (float("inf"), None)
    for candidate in loci:
        candidate_distance = candidate.distance_to_interval(start, end)
        if candidate_distance < nearest[0]:
            nearest = (candidate_distance, candidate)
    return nearest
32 |
--------------------------------------------------------------------------------
/pyensembl/sequence_data.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | from os import remove
14 | from os.path import exists, abspath, split, join
15 | import logging
16 | from collections import Counter
17 | import pickle
18 | from .common import load_pickle, dump_pickle
19 | from .fasta import parse_fasta_dictionary
20 |
21 |
22 | logger = logging.getLogger(__name__)
23 |
24 |
class SequenceData(object):
    """
    Container for reference nucleotide and amino acid sequences.

    Sequences are parsed from one or more FASTA files. The dictionary
    parsed from each file is cached as "<fasta filename>.pickle", either
    next to the FASTA file or in `cache_directory_path` when one is given.
    """

    def __init__(self, fasta_paths, cache_directory_path=None):
        """
        fasta_paths : str or list of str
            Path(s) of the FASTA file(s); a single string is treated as a
            one-element list.

        cache_directory_path : str, optional
            Directory for the pickle caches; defaults to the directory of
            each FASTA file.

        Raises ValueError if any of the FASTA files does not exist.
        """
        if isinstance(fasta_paths, str):
            fasta_paths = [fasta_paths]

        self.fasta_paths = [abspath(path) for path in fasta_paths]
        self.fasta_directory_paths = [split(path)[0] for path in self.fasta_paths]
        self.fasta_filenames = [split(path)[1] for path in self.fasta_paths]
        if cache_directory_path:
            self.cache_directory_paths = [cache_directory_path] * len(self.fasta_paths)
        else:
            self.cache_directory_paths = self.fasta_directory_paths
        for path in self.fasta_paths:
            if not exists(path):
                raise ValueError("Couldn't find FASTA file %s" % (path,))
        self.fasta_dictionary_filenames = [
            filename + ".pickle" for filename in self.fasta_filenames
        ]
        self.fasta_dictionary_pickle_paths = [
            join(cache_path, filename)
            for cache_path, filename in zip(
                self.cache_directory_paths, self.fasta_dictionary_filenames
            )
        ]
        self._init_lazy_fields()

    def _init_lazy_fields(self):
        """Reset the lazily populated in-memory state."""
        self._fasta_dictionary = None
        self._fasta_keys = None

    def clear_cache(self):
        """Drop the in-memory sequences and delete the pickle cache files."""
        self._init_lazy_fields()
        for path in self.fasta_dictionary_pickle_paths:
            if exists(path):
                remove(path)

    def __str__(self):
        return "SequenceData(fasta_paths=%s)" % (self.fasta_paths,)

    def __repr__(self):
        return str(self)

    def __contains__(self, sequence_id):
        # build the key set once; it is reset by _init_lazy_fields
        if self._fasta_keys is None:
            self._fasta_keys = set(self.fasta_dictionary.keys())
        return sequence_id in self._fasta_keys

    def __eq__(self, other):
        # test to see if self.fasta_paths and other.fasta_paths contain
        # the same list of paths, regardless of order
        return (other.__class__ is SequenceData) and Counter(
            self.fasta_paths
        ) == Counter(other.fasta_paths)

    def __hash__(self):
        # fasta_paths is a list, which is unhashable, so hash a tuple
        # instead; sorting keeps the hash independent of path order, in
        # agreement with the order-insensitive __eq__ above
        return hash(tuple(sorted(self.fasta_paths)))

    def _add_to_fasta_dictionary(self, fasta_dictionary_tmp):
        """
        Merge one file's sequences into the combined dictionary. The first
        occurrence of an identifier wins; duplicates are logged and skipped.
        """
        for identifier, sequence in fasta_dictionary_tmp.items():
            if identifier in self._fasta_dictionary:
                logger.warning(
                    "Sequence identifier %s is duplicated in your FASTA files!",
                    identifier,
                )
                continue
            self._fasta_dictionary[identifier] = sequence

    def _load_or_create_fasta_dictionary_pickle(self):
        """
        Rebuild the combined sequence dictionary, using each file's pickle
        cache when it can be loaded and otherwise parsing the FASTA file
        and writing a fresh pickle.
        """
        self._fasta_dictionary = dict()
        for fasta_path, pickle_path in zip(
            self.fasta_paths, self.fasta_dictionary_pickle_paths
        ):
            if exists(pickle_path):
                # try loading the cached file
                # but we'll fall back on recreating it if loading fails
                # NOTE(review): load_pickle presumably unpickles this file;
                # unpickling is only safe if the cache directory is trusted
                try:
                    fasta_dictionary_tmp = load_pickle(pickle_path)
                    self._add_to_fasta_dictionary(fasta_dictionary_tmp)
                    logger.info("Loaded sequence dictionary from %s", pickle_path)
                    continue
                except (pickle.UnpicklingError, AttributeError):
                    # catch either an UnpicklingError or an AttributeError
                    # resulting from pickled objects referring to classes
                    # that no longer exist
                    logger.warning(
                        "Failed to load %s, attempting to read FASTA directly",
                        pickle_path,
                    )
            logger.info("Parsing sequences from FASTA file at %s", fasta_path)

            fasta_dictionary_tmp = parse_fasta_dictionary(fasta_path)
            self._add_to_fasta_dictionary(fasta_dictionary_tmp)
            logger.info("Saving sequence dictionary to %s", pickle_path)
            dump_pickle(fasta_dictionary_tmp, pickle_path)

    def index(self, overwrite=False):
        """
        Load the sequences, creating pickle caches where needed. With
        `overwrite=True` the existing caches are deleted first.
        """
        if overwrite:
            self.clear_cache()
        self._load_or_create_fasta_dictionary_pickle()

    @property
    def fasta_dictionary(self):
        """Dictionary from sequence identifier to sequence, loaded lazily."""
        if not self._fasta_dictionary:
            self._load_or_create_fasta_dictionary_pickle()
        return self._fasta_dictionary

    def get(self, sequence_id):
        """Get sequence associated with given ID or return None if missing"""
        return self.fasta_dictionary.get(sequence_id)
138 |
--------------------------------------------------------------------------------
/pyensembl/shell.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | """
14 | Manipulate pyensembl's local cache.
15 |
16 | %(prog)s {install, delete-all-files, delete-index-files, list} [--release XXX --species human...]
17 |
18 | To install particular Ensembl human release(s):
19 | %(prog)s install --release 75 77
20 |
21 | To install particular Ensembl mouse release(s):
22 | %(prog)s install --release 75 77 --species mouse
23 |
24 | To delete all downloaded and cached data for a particular Ensembl release:
25 | %(prog)s delete-all-files --release 75 --species human
26 |
27 | To delete only cached data related to transcript and protein sequences:
28 | %(prog)s delete-index-files --release 75
29 |
30 | To list all installed genomes:
31 | %(prog)s list
32 |
33 | To install a genome from source files:
34 | %(prog)s install \
35 | --reference-name "GRCh38" \
36 | --gtf URL_OR_PATH \
37 | --transcript-fasta URL_OR_PATH \
38 | --protein-fasta URL_OR_PATH
39 | """
40 |
41 | import argparse
42 | import logging.config
43 | import pkg_resources
44 | import os
45 |
46 | from .ensembl_release import EnsemblRelease
47 | from .ensembl_versions import MAX_ENSEMBL_RELEASE
48 | from .genome import Genome
49 | from .species import Species
50 | from .version import __version__
51 |
# Configure logging from the logging.conf file located alongside this
# module; this runs when the module is imported.
logging.config.fileConfig(pkg_resources.resource_filename(__name__, "logging.conf"))
logger = logging.getLogger(__name__)
54 |
55 |
# Command-line interface definition. The module docstring doubles as the
# usage text.
parser = argparse.ArgumentParser(usage=__doc__)

parser.add_argument(
    "--version",
    action="version",
    version='%(prog)s {version}'.format(version=__version__)
)

parser.add_argument(
    "--overwrite",
    default=False,
    action="store_true",
    help="Force download and indexing even if files already exist locally",
)

# The two option groups below are attached directly to the parser. They
# used to be created with add_argument_group() on a mutually exclusive
# group, a nesting argparse never supported: it enforced no exclusivity
# and is deprecated (an error on newer Python versions). The conflict
# between --release and explicit source paths is checked in
# collect_selected_genomes().

# Options for selecting genomes by Ensembl release and species
release_group = parser.add_argument_group("Ensembl release options")
release_group.add_argument(
    "--release",
    type=int,
    nargs="+",
    default=[],
    help="Ensembl release version(s) (default=%d)" % MAX_ENSEMBL_RELEASE,
)

release_group.add_argument(
    "--species",
    default=[],
    nargs="+",
    help="Which species to download Ensembl data for (default=human)",
)

release_group.add_argument(
    "--custom-mirror",
    default=None,
    help="URL and directory to use instead of the default Ensembl FTP server",
)

# Options for describing a genome by explicit GTF/FASTA sources
path_group = parser.add_argument_group("Custom genome source options")

path_group.add_argument(
    "--reference-name",
    type=str,
    default=None,
    help="Name of the reference, e.g. GRCh38",
)

path_group.add_argument(
    "--annotation-name", default=None, help="Name of annotation source (e.g. refseq)"
)

path_group.add_argument(
    "--annotation-version", default=None, help="Version of annotation database"
)

path_group.add_argument(
    "--gtf",
    type=str,
    default=None,
    help="URL or local path to a GTF file containing annotations.",
)

path_group.add_argument(
    "--transcript-fasta",
    type=str,
    action="append",
    default=[],
    help="URL or local path to a FASTA files containing the transcript "
    "data. This option can be specified multiple times for multiple "
    "FASTA files.",
)

path_group.add_argument(
    "--protein-fasta",
    type=str,
    default=[],
    action="append",
    help="URL or local path to a FASTA file containing protein data.",
)

path_group.add_argument(
    "--shared-prefix",
    default="",
    help="Add this prefix to URLs or paths specified by --gtf, --transcript-fasta, --protein-fasta",
)

# The positional action is lower-cased and stripped before being checked
# against `choices`.
parser.add_argument(
    "action",
    type=lambda arg: arg.lower().strip(),
    choices=(
        "install",
        "delete-all-files",
        "delete-index-files",
        "list",
    ),
    help=(
        '"install" will download and index any data that is not '
        'currently downloaded or indexed. "delete-all-files" will delete all data '
        'associated with a genome annotation. "delete-index-files" deletes '
        "all files other than the original GTF and FASTA files for a genome. "
        '"list" will show you all installed Ensembl genomes.'
    ),
)
161 |
162 |
def collect_all_installed_ensembl_releases():
    """
    Return every Ensembl release whose required local files exist, sorted
    by species latin name and then by release number.
    """
    candidates = (
        EnsemblRelease(release, species=species)
        for species, release in Species.all_species_release_pairs()
    )
    installed = [
        genome for genome in candidates if genome.required_local_files_exist()
    ]
    installed.sort(key=lambda g: (g.species.latin_name, g.release))
    return installed
170 |
171 |
def all_combinations_of_ensembl_genomes(args):
    """
    Build one genome object for every combination of the species and
    release versions given on the command line (defaulting to human and
    MAX_ENSEMBL_RELEASE).

    The results are EnsemblRelease objects unless --custom-mirror was
    given, in which case they are Genome objects pointing at the mirror.
    """
    species_list = args.species or ["human"]
    release_list = args.release or [MAX_ENSEMBL_RELEASE]
    mirror = args.custom_mirror

    def mirrored(url):
        # a custom mirror is expected to be a directory holding files with
        # the same names as the ones Ensembl provides
        return os.path.join(mirror, os.path.basename(url))

    genomes = []
    for species in species_list:
        for version in release_list:
            ensembl_release = EnsemblRelease(version, species=species)
            if not mirror:
                genomes.append(ensembl_release)
                continue

            gtf_url = mirrored(ensembl_release.gtf_url)
            transcript_fasta_urls = [
                mirrored(url) for url in ensembl_release.transcript_fasta_urls
            ]
            protein_fasta_urls = [
                mirrored(url) for url in ensembl_release.protein_fasta_urls
            ]
            genomes.append(
                Genome(
                    reference_name=ensembl_release.reference_name,
                    annotation_name="ensembl",
                    annotation_version=version,
                    gtf_path_or_url=gtf_url,
                    transcript_fasta_paths_or_urls=transcript_fasta_urls,
                    protein_fasta_paths_or_urls=protein_fasta_urls,
                )
            )
    return genomes
219 |
220 |
def collect_selected_genomes(args):
    """
    Turn parsed command-line arguments into a list of genome objects.

    If any explicit source (--gtf, --transcript-fasta, --protein-fasta) is
    given, a single Genome built from those sources is returned; otherwise
    the Ensembl species/release combinations are used.

    Raises ValueError if explicit sources are combined with --release, or
    if the reference name or annotation name is missing.
    """
    has_explicit_sources = bool(
        args.gtf or args.transcript_fasta or args.protein_fasta
    )
    if not has_explicit_sources:
        return all_combinations_of_ensembl_genomes(args)

    if args.release:
        raise ValueError(
            "An Ensembl release cannot be specified if "
            "specific paths are also given"
        )
    if not args.reference_name:
        raise ValueError("Must specify a reference name")
    if not args.annotation_name:
        raise ValueError("Must specify the name of the annotation source")

    prefix = args.shared_prefix
    # --gtf is optional when only FASTA sources are supplied; joining the
    # prefix with None would raise TypeError, so pass None through instead.
    # NOTE(review): assumes Genome accepts gtf_path_or_url=None - confirm.
    gtf_path_or_url = os.path.join(prefix, args.gtf) if args.gtf else None

    return [
        Genome(
            reference_name=args.reference_name,
            annotation_name=args.annotation_name,
            annotation_version=args.annotation_version,
            gtf_path_or_url=gtf_path_or_url,
            transcript_fasta_paths_or_urls=[
                os.path.join(prefix, transcript_fasta)
                for transcript_fasta in args.transcript_fasta
            ],
            protein_fasta_paths_or_urls=[
                os.path.join(prefix, protein_fasta)
                for protein_fasta in args.protein_fasta
            ],
        )
    ]
252 |
253 |
def run():
    """
    Entry point of the command-line tool: parse the arguments and either
    list the installed Ensembl genomes or apply the requested action to
    every selected genome.
    """
    args = parser.parse_args()
    if args.action == "list":
        # TODO: how do we also identify which non-Ensembl genomes are
        # installed?
        for genome in collect_all_installed_ensembl_releases():
            # print every directory in which downloaded files are located;
            # in most cases this will be only one directory
            filepaths = genome.required_local_files()
            directories = {os.path.split(path)[0] for path in filepaths}
            print("-- %s: %s" % (genome, ", ".join(directories)))
        return

    genomes = collect_selected_genomes(args)

    if not genomes:
        logger.error("ERROR: No genomes selected!")
        parser.print_help()

    action = args.action
    for genome in genomes:
        logger.info("Running '%s' for %s", action, genome)
        if action == "install":
            genome.download(overwrite=args.overwrite)
            genome.index(overwrite=args.overwrite)
        elif action == "delete-index-files":
            genome.delete_index_files()
        elif action == "delete-all-files":
            genome.download_cache.delete_cache_directory()
        else:
            raise ValueError("Invalid action: %s" % args.action)
284 |
--------------------------------------------------------------------------------
/pyensembl/version.py:
--------------------------------------------------------------------------------
__version__ = "2.3.13"


def print_version():
    """Print the package version prefixed with "v"."""
    print("v{}".format(__version__))


if __name__ == "__main__":
    print_version()
8 |
--------------------------------------------------------------------------------
/pylintrc:
--------------------------------------------------------------------------------
1 | [TYPECHECK]
2 | # Without ignoring this, we get errors like:
3 | # E:249,20: Module 'numpy' has no 'nan' member (no-member)
4 | ignored-modules = numpy
5 |
6 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | typechecks>=0.0.2,<1.0.0
2 | datacache>=1.4.0,<2.0.0
3 | memoized-property>=1.0.2
4 | tinytimer>=0.0.0,<1.0.0
5 | gtfparse>=2.5.0,<3.0.0
6 | serializable>=0.2.1,<1.0.0
7 | pylint>=2.17.2,<3.0.0
8 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | from __future__ import print_function
14 | import os
15 | import re
16 |
17 | # TODO: replace setup.py with pyproject.toml
18 | from setuptools import setup
19 |
package_name = "pyensembl"
current_directory = os.path.dirname(__file__)
readme_filename = "README.md"
readme_path = os.path.join(current_directory, readme_filename)
github_url = "https://github.com/openvax/%s" % package_name

# The README becomes the long description; a missing README is reported
# but does not abort the build.
try:
    with open(readme_path, "r") as f:
        readme_markdown = f.read()
except IOError as e:
    print(e)
    print("Failed to open %s" % readme_path)
    readme_markdown = ""

# Read the version string out of <package>/version.py without importing the
# package. The path is resolved relative to this file, like the README.
version_path = os.path.join(current_directory, package_name, "version.py")
with open(version_path, "r") as f:
    version_match = re.search(
        r'^__version__\s*=\s*[\'"]([^\'"]*)[\'"]', f.read(), re.MULTILINE
    )

# re.search returns None when nothing matches, so test the match object
# before calling .group(); otherwise an AttributeError would hide the
# RuntimeError below.
version = version_match.group(1) if version_match else None

if not version:
    raise RuntimeError("Cannot find version information")

if __name__ == "__main__":
    requirements_path = os.path.join(current_directory, "requirements.txt")
    with open(requirements_path) as f:
        # skip blank lines and comments, which are not valid requirements
        requirements = [
            line.strip()
            for line in f
            if line.strip() and not line.strip().startswith("#")
        ]

    setup(
        name=package_name,
        version=version,
        description="Python interface to Ensembl reference genome metadata",
        author="Alex Rubinsteyn",
        author_email="alex.rubinsteyn@unc.edu",
        url=github_url,
        license="http://www.apache.org/licenses/LICENSE-2.0.html",
        entry_points={
            "console_scripts": ["pyensembl = %s.shell:run" % package_name],
        },
        classifiers=[
            "Development Status :: 4 - Beta",
            "Environment :: Console",
            "Operating System :: OS Independent",
            "Intended Audience :: Science/Research",
            "License :: OSI Approved :: Apache Software License",
            "Programming Language :: Python",
            "Topic :: Scientific/Engineering :: Bio-Informatics",
        ],
        install_requires=requirements,
        long_description=readme_markdown,
        long_description_content_type="text/markdown",
        packages=[package_name],
        package_data={
            package_name: ["logging.conf", "../requirements.txt"],
        },
    )
75 |
--------------------------------------------------------------------------------
/test.sh:
--------------------------------------------------------------------------------
1 | pytest --cov=pyensembl/ --cov-report=term-missing tests
2 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openvax/pyensembl/d292b1749875904b380a209f4ff44b7d75dafdc3/tests/__init__.py
--------------------------------------------------------------------------------
/tests/common.py:
--------------------------------------------------------------------------------
1 | import functools
2 |
3 | from pyensembl import genome_for_reference_name, cached_release
4 |
5 | import pytest
6 |
7 |
# Genome objects for the GRCh37 and GRCh38 references, looked up once when
# this module is imported.
grch37, grch38 = (
    genome_for_reference_name(reference_name)
    for reference_name in ("GRCh37", "GRCh38")
)

# Default set of genomes used when a test is parametrized without explicit
# versions.
major_releases = [grch37, grch38]

# Contig names "1" through "22" followed by "X", "Y" and "M".
contigs = list(map(str, range(1, 23))) + ["X", "Y", "M"]
14 |
15 |
def run_multiple_genomes(*versions):
    """
    Decorator which parametrizes a test function over several genomes through
    a "genome" argument.

    Supported forms:
      - bare (`@run_multiple_genomes`): the only positional argument is the
        decorated function itself, which is parametrized over `major_releases`
      - called without arguments (`@run_multiple_genomes()`): also uses
        `major_releases`
      - called with versions (`@run_multiple_genomes(75, 77)`): each version
        is turned into a genome via `cached_release`
    """
    used_as_bare_decorator = len(versions) == 1 and callable(versions[0])
    if used_as_bare_decorator:
        test_fn = versions[0]
        return pytest.mark.parametrize("genome", major_releases)(test_fn)

    if versions:
        genomes = [cached_release(version) for version in versions]
    else:
        genomes = major_releases

    def decorator(fn):
        return pytest.mark.parametrize("genome", genomes)(fn)

    return decorator
24 |
25 |
# tempfile.TemporaryDirectory only exists from Python 3.2 onwards; when the
# import fails, a minimal stand-in with the same name is defined instead.
try:
    # pylint: disable=no-name-in-module
    from tempfile import TemporaryDirectory

except ImportError:
    from shutil import rmtree
    from tempfile import mkdtemp

    class TemporaryDirectory(object):
        """
        Context manager which creates a temporary directory when constructed
        and removes it (with its contents) when the `with` block exits.
        """

        def __init__(self):
            self.name = mkdtemp()

        def __enter__(self, *args, **kwargs):
            # the `with ... as` target is the directory path
            return self.name

        def __exit__(self, type, value, traceback):
            rmtree(self.name)
            # a falsy return value lets any pending exception propagate
            return False
47 |
48 |
def ok_(b, msg=None):
    """
    Assert that `b` is truthy.

    Parameters
    ----------
    b : object
        Value whose truthiness is checked.

    msg : object, optional
        Failure message attached to the AssertionError; accepted for parity
        with the other assertion helpers in this module (eq_, neq_, ...).

    Raises
    ------
    AssertionError
        If `b` is falsy.
    """
    if msg is None:
        assert b
    else:
        assert b, msg
51 |
52 |
def eq_(x, y, msg=None):
    """
    Assert that `x == y`; `msg`, when given, becomes the failure message.
    """
    if msg is not None:
        assert x == y, msg
        return
    assert x == y
58 |
59 |
def neq_(x, y, msg=None):
    """
    Assert that `x != y`; `msg`, when given, becomes the failure message.
    """
    if msg is not None:
        assert x != y, msg
        return
    assert x != y
65 |
66 |
def gt_(x, y, msg=None):
    """
    Assert that `x > y`; `msg`, when given, becomes the failure message.
    """
    if msg is not None:
        assert x > y, msg
        return
    assert x > y
72 |
73 |
def gte_(x, y, msg=None):
    """
    Assert that `x >= y`; `msg`, when given, becomes the failure message.
    """
    if msg is not None:
        assert x >= y, msg
        return
    assert x >= y
79 |
80 |
def lt_(x, y, msg=None):
    """
    Assert that `x < y`; `msg`, when given, becomes the failure message.
    """
    if msg is not None:
        assert x < y, msg
        return
    assert x < y
86 |
87 |
def lte_(x, y, msg=None):
    """
    Assert that `x <= y`; `msg`, when given, becomes the failure message.
    """
    if msg is not None:
        assert x <= y, msg
        return
    assert x <= y
93 |
--------------------------------------------------------------------------------
/tests/data.py:
--------------------------------------------------------------------------------
1 | import os
2 | from pyensembl import Locus, Genome
3 |
4 |
def data_path(name):
    """
    Return the path of a test fixture file.

    `name` is interpreted relative to the "data" directory which sits next
    to this module.
    """
    module_directory = os.path.dirname(__file__)
    return os.path.join(module_directory, "data", name)
11 |
12 |
# Ensembl transcript ID for FOXP3-001
14 | FOXP3_001_transcript_id = "ENST00000376207"
15 |
16 | TP53_gene_id = "ENSG00000141510"
17 |
18 | # beta-catenin interacting protein from the negative strand of chromosome 1
19 | # URL: http://useast.ensembl.org/Homo_sapiens/Transcript/Sequence_cDNA?
20 | # db=core;g=ENSG00000178585;r=1:9848276-9910336;t=ENST00000377256
21 | CTNNBIP1_004_transcript_id = "ENST00000377256"
22 |
23 | # coding sequence for beta-catenin interacting protein (CTNNBIP1-004)
24 | CTNNBIP1_004_CDS = "".join([
25 | "ATG",
26 | "AACCGCGAGGGAGCTCCCGGGAAGAGTCCGGAG",
27 | "GAGATGTACATTCAGCAGAAGGTCCGAGTGCTGCTCATGCTGCGGAAGATGGGATCAAAC",
28 | "CTGACAGCCAGCGAGGAGGAGTTCCTGCGCACCTATGCAGGGGTGGTCAACAGCCAGCTC",
29 | "AGCCAGCTGCCTCCGCACTCCATCGACCAGG",
30 | "GTGCAGAGGACGTGGTGATGGCGTTTTCCAGGTCGGAGACGGAAGACCGGAGGCAG",
31 | "TAG"
32 | ])
33 |
34 | # 5' UTR for beta-catenin interacting protein (CTNNBIP1-004)
35 | CTNNBIP1_004_UTR5 = "".join([
36 | "TGTGGGTGCAGGTTTCCTGGGCTTGCCAGACACACAGGGCGGCACCTTCCTACTTCTGCC",
37 | "CAGCCACAGCCCTCCCCTCACAGTTGAGCACCTGTTTGCCTGAAGTTAATTTCCAGAAGC",
38 | "AGGAGTCCCCAGAGCCAGGCAGGGGG"])
39 |
40 | # 3' UTR for beta-catenin interacting protein (CTNNBIP1-004)
41 | CTNNBIP1_004_UTR3 = \
42 | "CTGCAAAGCCCTTGGAACACCCTGGATGCTGTTGAGGGCCAAGAGATCTGTGTGGCTCC"
43 |
44 | CTNNBIP1_004_locus = Locus("1", 9850659, 9878176, "-")
45 |
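# properties of CTNNBIP1-004's exons copied from
# http://useast.ensembl.org/Homo_sapiens/Transcript/Exons?g=ENSG00000178585;
# r=1:9850659-9878176;redirect=no;t=ENST00000377256
# NOTE: the "CTTNNIP1" prefix of the two names below is a misspelling of
# "CTNNBIP1"; renaming them would require updating every module which
# imports them.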
49 | CTTNNIP1_004_exon_ids = [
50 | 'ENSE00001473268',
51 | 'ENSE00001643659',
52 | 'ENSE00001600669',
53 | 'ENSE00001267940',
54 | 'ENSE00001473265',
55 | ]
56 |
57 | CTTNNIP1_004_exon_lengths = [
58 | 37,
59 | 85,
60 | 120,
61 | 91,
62 | 118
63 | ]
64 |
65 |
66 | #
67 | # Information for EGFR from Ensembl website
68 | # Date: March 25th, 2015
69 | # Ensembl Release: 79
70 | #
71 | EGFR_001_name = "EGFR-001"
72 | EGFR_001_transcript_id = "ENST00000275493"
73 | EGFR_001_ccds_id = "CCDS5514"
74 | EGFR_001_protein_id = "ENSP00000275493"
# Amino acid sequence of EGFR-001, stored as one string literal per line and
# joined into a single string. Every element ends with an explicit comma so
# that no two lines are merged by implicit string-literal concatenation.
EGFR_001_protein_sequence = "".join([
    "MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFEDHFLSLQRMFNNCEVVLGNLEITYV",
    "QRNYDLSFLKTIQEVAGYVLIALNTVERIPLENLQIIRGNMYYENSYALAVLSNYDANKTGLKELPMRNL",
    "QEILHGAVRFSNNPALCNVESIQWRDIVSSDFLSNMSMDFQNHLGSCQKCDPSCPNGSCWGAGEENCQKL",
    "TKIICAQQCSGRCRGKSPSDCCHNQCAAGCTGPRESDCLVCRKFRDEATCKDTCPPLMLYNPTTYQMDVN",
    "PEGKYSFGATCVKKCPRNYVVTDHGSCVRACGADSYEMEEDGVRKCKKCEGPCRKVCNGIGIGEFKDSLS",
    "INATNIKHFKNCTSISGDLHILPVAFRGDSFTHTPPLDPQELDILKTVKEITGFLLIQAWPENRTDLHAF",
    "ENLEIIRGRTKQHGQFSLAVVSLNITSLGLRSLKEISDGDVIISGNKNLCYANTINWKKLFGTSGQKTKI",
    "ISNRGENSCKATGQVCHALCSPEGCWGPEPRDCVSCRNVSRGRECVDKCNLLEGEPREFVENSECIQCHP",
    "ECLPQAMNITCTGRGPDNCIQCAHYIDGPHCVKTCPAGVMGENNTLVWKYADAGHVCHLCHPNCTYGCTG",
    "PGLEGCPTNGPKIPSIATGMVGALLLLLVVALGIGLFMRRRHIVRKRTLRRLLQERELVEPLTPSGEAPN",
    "QALLRILKETEFKKIKVLGSGAFGTVYKGLWIPEGEKVKIPVAIKELREATSPKANKEILDEAYVMASVD",
    "NPHVCRLLGICLTSTVQLITQLMPFGCLLDYVREHKDNIGSQYLLNWCVQIAKGMNYLEDRRLVHRDLAA",
    "RNVLVKTPQHVKITDFGLAKLLGAEEKEYHAEGGKVPIKWMALESILHRIYTHQSDVWSYGVTVWELMTF",
    "GSKPYDGIPASEISSILEKGERLPQPPICTIDVYMIMVKCWMIDADSRPKFRELIIEFSKMARDPQRYLV",
    "IQGDERMHLPSPTDSNFYRALMDEEDMDDVVDADEYLIPQQGFFSSPSTSRTPLLSSLSATSNNSTVACI",
    "DRNGLQSCPIKEDSFLQRYSSDPTGALTEDSIDDTFLPVPEYINQSVPKRPAGSVQNPVYHNQPLNPAPS",
    "RDPHYQDPHSTAVGNPEYLNTVQPTCVNSTFDSPAHWAQKGSHQISLDNPDYQQDFFPKEAKPNGIFKGS",
    "TAENAEYLRVAPQSSEFIGA",
])
95 |
96 |
97 | # GTF cropped from ftp://ftp.ensembl.org/pub/release-81/gtf/mus_musculus/
98 | # Mus_musculus.GRCm38.81.gtf.gz via:
99 | # grep "ENSMUSG00000017167" Mus_musculus.GRCm38.81.gtf
100 |
101 | # Transcript FASTA cropped from ftp://ftp.ensembl.org/pub/release-81/
102 | # fasta/mus_musculus/cdna/Mus_musculus.GRCm38.cdna.all.fa.gz via:
103 | # grep "ENSMUSG00000017167" Mus_musculus.GRCm38.cdna.all.fa -A 50
104 |
# ncRNA FASTA cropped from ftp://ftp.ensembl.org/pub/release-81/
# fasta/mus_musculus/ncrna/Mus_musculus.GRCm38.ncrna.fa.gz via:
# grep "ENSMUSG00000088969" Mus_musculus.GRCm38.ncrna.fa -A 2
108 |
109 | # Protein FASTA cropped from ftp://ftp.ensembl.org/pub/release-81/fasta/
110 | # mus_musculus/pep/Mus_musculus.GRCm38.pep.all.fa.gz via:
111 | # grep "ENSMUSG00000017167" Mus_musculus.GRCm38.pep.all.fa -A 50
112 |
113 | # Tested against:
114 | # http://useast.ensembl.org/Mus_musculus/Gene/Summary?db=core;g=ENSMUSG00000017167
115 |
# Paths of the cropped mouse fixture files inside the test data directory.
MOUSE_ENSMUSG00000017167_PATH = data_path(
    "mouse.ensembl.81.partial.ENSMUSG00000017167.gtf"
)
MOUSE_ENSMUSG00000017167_TRANSCRIPT_FASTA_PATH = data_path(
    "mouse.ensembl.81.partial.ENSMUSG00000017167.fa"
)
MOUSE_ENSMUSG00000088969_NCRNA_FASTA_PATH = data_path(
    "mouse.ensembl.81.partial.ncrna.ENSMUSG00000017167.fa"
)
MOUSE_ENSMUSG00000017167_PROTEIN_FASTA_PATH = data_path(
    "mouse.ensembl.81.partial.ENSMUSG00000017167.pep"
)


# Custom genome assembled from the fixture GTF, transcript FASTA and protein
# FASTA above; the ncRNA FASTA path is defined but not passed in here.
custom_mouse_genome_grcm38_subset = Genome(
    reference_name="GRCm38",
    annotation_name="_test_mouse_ensembl81_subset",
    gtf_path_or_url=MOUSE_ENSMUSG00000017167_PATH,
    transcript_fasta_paths_or_urls=[
        MOUSE_ENSMUSG00000017167_TRANSCRIPT_FASTA_PATH,
    ],
    protein_fasta_paths_or_urls=[
        MOUSE_ENSMUSG00000017167_PROTEIN_FASTA_PATH,
    ],
)
132 |
133 |
def setup_init_custom_mouse_genome():
    """
    Reset the custom mouse genome: clear its cache, then index it again.

    Intended to be used as the setup step of unit tests which must start
    from a cleared cache.
    """
    genome = custom_mouse_genome_grcm38_subset
    genome.clear_cache()
    genome.index()
141 |
--------------------------------------------------------------------------------
/tests/data/gencode.ucsc.small.gtf:
--------------------------------------------------------------------------------
1 | # expected format is [attributes] [comments]
2 | chr1 hg38_knownGene exon 17369 17436 0.000000 - . gene_id "uc031tla.1"; transcript_id "uc031tla.1";
3 | chr1 hg38_knownGene exon 29554 30039 0.000000 + . gene_id "uc057aty.1"; transcript_id "uc057aty.1";
4 | chr1 hg38_knownGene exon 30564 30667 0.000000 + . gene_id "uc057aty.1"; transcript_id "uc057aty.1";
5 | chr1 hg38_knownGene exon 30976 31097 0.000000 + . gene_id "uc057aty.1"; transcript_id "uc057aty.1";
6 | chr1 hg38_knownGene exon 30267 30667 0.000000 + . gene_id "uc057atz.1"; transcript_id "uc057atz.1";
7 | chr1 hg38_knownGene exon 30976 31109 0.000000 + . gene_id "uc057atz.1"; transcript_id "uc057atz.1";
8 | chr1 hg38_knownGene exon 30366 30503 0.000000 + . gene_id "uc031tlb.1"; transcript_id "uc031tlb.1";
9 | chr1 hg38_knownGene exon 34554 35174 0.000000 - . gene_id "uc001aak.4"; transcript_id "uc001aak.4";
10 | chr1 hg38_knownGene exon 35277 35481 0.000000 - . gene_id "uc001aak.4"; transcript_id "uc001aak.4";
11 | chr1 hg38_knownGene exon 35721 36081 0.000000 - . gene_id "uc001aak.4"; transcript_id "uc001aak.4";
12 | chr1 hg38_knownGene exon 35245 35481 0.000000 - . gene_id "uc057aua.1"; transcript_id "uc057aua.1";
13 | chr1 hg38_knownGene exon 35721 36073 0.000000 - . gene_id "uc057aua.1"; transcript_id "uc057aua.1";
14 | chr1 hg38_knownGene start_codon 69091 69093 0.000000 + . gene_id "uc001aal.1"; transcript_id "uc001aal.1";
15 | chr1 hg38_knownGene CDS 69091 70005 0.000000 + 0 gene_id "uc001aal.1"; transcript_id "uc001aal.1";
16 | chr1 hg38_knownGene stop_codon 70006 70008 0.000000 + . gene_id "uc001aal.1"; transcript_id "uc001aal.1";
17 |
--------------------------------------------------------------------------------
/tests/data/mouse.ensembl.81.partial.ENSMUSG00000017167.fa:
--------------------------------------------------------------------------------
1 | >ENSMUST00000138942 havana:known chromosome:GRCm38:11:101170523:101178316:1 gene:ENSMUSG00000017167 gene_biotype:protein_coding transcript_biotype:processed_transcript
2 | CAGCGCGAAGCCCACAGGCGCATCCCTAGTAGGGCTACTTGCCCCTGGAGCTCCCGGGGC
3 | TCTGGCCCTCAGACAAGAATCTCCCCCACATTTGCAGTTGGCCAAGAGGACTGCGTTTGG
4 | CCCAAGTATGGAGCAGGCTCAGGCGTGACGGCCGGTTGTAGTGAGAAAGATTGAACTCGG
5 | TTCTAAATCCCTGTAGACTTACCCTCCCGCCGCCCGCTGGACTCGGGGTCCTTAGCTCAA
6 | AGGTCTCGTCGTCCTCGTCTTCGTCCCCGTCCCCGCTAAGCTCGCCGTCCCCGTAGTCCC
7 | GGTGCAGAAGAGTGAAGCCTCGACGGCAGCAGAGAAGCCACCACAATCCCCCGGGGAGAG
8 | GCATCCGGGCGAGCAGCCTGGGAATGGGGGCGCAGGCAGTGTTGCCTGAGCAGCAGGGAA
9 | TCTGAGAAACTGGAGACCTTCTTCGGGAATGTCAATGACTCGGCAGTGGTCCGCCATGAC
10 | CTTCACTACCACTTTACGGCTCGCTACATCCGCATCGTGCCACTGGCCTGGAACCCACGC
11 | GGCAAGATTGGCCTGAGGCTGGGCATCTATGGTTGTCCCTACACATCCAGCATCCTGTAT
12 | TTTGACGGCGACGATGCCATCTCATACCGCTTCCAGCGAGGCGCCAGCCAAAGTCTTTGG
13 | GACG
14 | >ENSMUST00000103109 ensembl_havana_transcript:known chromosome:GRCm38:11:101176041:101190724:1 gene:ENSMUSG00000017167 gene_biotype:protein_coding transcript_biotype:protein_coding
15 | GAGAGAAGGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGA
16 | GAGAGAGAGAGATTGGGGGTAGGAGAGAGGGAAGGGTGGATAAGGACGGAAAAAAGCTTT
17 | GGTCAGCCGTGAACCCAGGAGAAAAGCTGGGGGCCTGAGCCAGAACGGGAGCCCTAGCGG
18 | CGCAACAAGGCTGACACCCAGCGTTGGTCAGCTCCGCATGATGAGTCTCCGGCTCTTCAG
19 | CATCCTGCTCGCCACGGTGGTCTCTGGAGCTTGGGGCTGGGGCTACTACGGTTGCAATGA
20 | GGAGCTGGTGGGGCCTCTGTATGCACGGTCTCTGGGCGCTTCTTCCTACTATGGACTCTT
21 | TACCACAGCCCGCTTTGCCCGGCTACATGGCATCAGTGGATGGTCGCCCCGGATTGGGGA
22 | CCCGAATCCCTGGCTGCAGATCGACTTAATGAAGAAGCATCGGATCCGGGCTGTGGCCAC
23 | ACAGGGAGCTTTTAATTCTTGGGATTGGGTCACACGTTACATGCTGCTCTACGGGGACCG
24 | TGTGGACAGCTGGACACCATTCTACCAAAAAGGGCACAATGCGACCTTCTTCGGGAATGT
25 | CAATGACTCGGCAGTGGTCCGCCATGACCTTCACTACCACTTTACGGCTCGCTACATCCG
26 | CATCGTGCCACTGGCCTGGAACCCACGCGGCAAGATTGGCCTGAGGCTGGGCATCTATGG
27 | TTGTCCCTACACATCCAGCATCCTGTATTTTGACGGCGACGATGCCATCTCATACCGCTT
28 | CCAGCGAGGCGCCAGCCAAAGTCTTTGGGACGTGTTCGCTTTTAGTTTCAAGACAGAGGA
29 | GAAGGATGGGCTGTTGCTGCACACCGAGGGCTCCCAGGGGGATTATGTGACGCTTGAACT
30 | GCAAGGGGCGCACCTGCTGCTGCACATGAGCCTGGGCAGCAGTCCCATCCAGCCAAGACC
31 | TGGTCACACCACGGTGAGCCTTGGTGGCGTTCTTAACGACCTAAGCTGGCACTATGTGCG
32 | GGTGGATCGATATGGCCGAGATGCAAATTTCACCCTGGATGGTTACGCCCATCACTTTGT
33 | GCTCAACGGCGACTTTGAAAGGCTGAATCTTGAAAATGAGATATTCATCGGGGGTCTAGT
34 | GGGCGCAGCCCGTAAGAACCTGGCCTACCGCCATAACTTCCGTGGCTGCATAGAAAACGT
35 | GATCTACAACCGGATCAACATTGCAGAAATGGCAGTGATGCGCCATTCGCGGATCACCTT
36 | TGAGGGTAATGTGGCTTTCCGTTGCTTGGATCCCGTTCCACACCCCATCAACTTCGGAGG
37 | CCCTCACAACTTCGTCCAAGTGCCTGGCTTTCCACGCCGAGGACGCTTAGCCGTCTCTTT
38 | TCGTTTCCGCACCTGGGACCTCACAGGGCTGCTCCTTTTCTCCCACTTGGGGGACGGGCT
39 | GGGTCATGTGGAGCTGATGCTTAGCGAAGGGCAAGTGAATGTATCCATCGCGCAGACTGG
40 | CCGCAAGAAGCTTCAGTTTGCTGCTGGGTACCGCCTGAATGATGGCTTCTGGCACGAGGT
41 | GAACTTTGTGGCACAGGAAAACCATGCAGTCATCAGTATTGATGATGTGGAAGGGGCAGA
42 | GGTCAGGGTTTCATACCCACTGCTGATCCGCACAGGGACTTCATACTTCTTTGGTGGTTG
43 | TCCCAAACCAGCCAGTCGATGGGGCTGCCACTCCAACCAGACAGCATTCCATGGCTGCAT
44 | GGAGCTGCTCAAGGTGGACGGTCAACTGGTCAACCTCACTCTGGTAGAGTTTCGGAAGCT
45 | CGGTTATTTTGCTGAGGTCCTCTTTGACACATGTGGCATCACAGACAGATGCAGCCCTAA
46 | CATGTGTGAGCATGACGGACGATGCTACCAGTCTTGGGATGACTTCATCTGCTACTGCGA
47 | ACTTACCGGCTACAAGGGAGTTACCTGCCACGAACCATTGTACAAGGAGTCCTGTGAGGC
48 | CTATCGGCTCAGTGGGAAATATTCTGGAAACTACACCATTGATCCTGATGGCAGTGGACC
49 | CCTGAAGCCGTTTGTGGTGTATTGTGACATCCGAGAGAACCGAGCGTGGACAGTTGTGCG
50 | GCATGACAGGCTGTGGACCACTCGAGTGACTGGTTCCAGCATGGACCGGCCCTTTCTGGG
51 | GGCCATCCAATACTGGAATGCCTCCTGGGAGGAAGTCAGCGCTCTGGCCAATGCTTCCCA
52 | ACACTGTGAGCAGTGGATCGAGTTTTCCTGCTACAATTCCCGGCTGCTCAACACTGCAGG
53 | AGGCTACCCCTACAGCTTTTGGATTGGCCGCAATGAGGAACAGCATTTCTACTGGGGAGG
54 | CTCCCAGCCTGGGATCCAGCGCTGTGCCTGTGGGCTGGACCAGAGCTGTGTGGACCCTGC
55 | ACTGCACTGCAATTGTGATGCCGACCAGCCACAGTGGAGAACAGACAAGGGGCTCCTGAC
56 | CTTTGTAGACCATCTGCCTGTCACTCAGGTAGTGGTAGGTGATACAAACCGCTCAAATTC
57 | TGAAGCTCAGTTCTTCCTGAGGCCTCTGCGCTGCTATGGTGACCGCAATTCCTGGAACAC
58 | CATCTCCTTCCACACTGGAGCTGCACTGCGTTTCCCTCCGATCCGAGCCAACCACAGCCT
59 | CGATGTCTCATTCTACTTCAGGACCTCGGCTCCCTCGGGTGTCTTCCTAGAGAACATGGG
60 | GGGTCCTTTCTGCCGGTGGCGCCGACCTTACGTGAGAGTGGAGCTCAACACATCCCGGGA
61 | TGTGGTCTTTGCCTTTGATATTGGCAATGGGGATGAGAACCTGACAGTGCACTCGGATGA
62 | CTTTGAGTTTAACGATGATGAGTGGCATTTGGTCCGAGCTGAAATCAACGTGAAGCAGGC
63 | CCGGCTGCGAGTGGATCACCGGCCCTGGGTGCTAAGGCCCATGCCCCTGCAGACCTACAT
64 | CTGGCTGGTGTATGACCAACCCCTCTATGTGGGATCTGCAGAGCTTAAGAGGCGCCCTTT
65 |
--------------------------------------------------------------------------------
/tests/data/mouse.ensembl.81.partial.ENSMUSG00000017167.pep:
--------------------------------------------------------------------------------
1 | >ENSMUSP00000099398 pep:known chromosome:GRCm38:11:101176041:101190724:1 gene:ENSMUSG00000017167 transcript:ENSMUST00000103109 gene_biotype:protein_coding transcript_biotype:protein_coding
2 | MMSLRLFSILLATVVSGAWGWGYYGCNEELVGPLYARSLGASSYYGLFTTARFARLHGIS
3 | GWSPRIGDPNPWLQIDLMKKHRIRAVATQGAFNSWDWVTRYMLLYGDRVDSWTPFYQKGH
4 | NATFFGNVNDSAVVRHDLHYHFTARYIRIVPLAWNPRGKIGLRLGIYGCPYTSSILYFDG
5 | DDAISYRFQRGASQSLWDVFAFSFKTEEKDGLLLHTEGSQGDYVTLELQGAHLLLHMSLG
6 | SSPIQPRPGHTTVSLGGVLNDLSWHYVRVDRYGRDANFTLDGYAHHFVLNGDFERLNLEN
7 | EIFIGGLVGAARKNLAYRHNFRGCIENVIYNRINIAEMAVMRHSRITFEGNVAFRCLDPV
8 | PHPINFGGPHNFVQVPGFPRRGRLAVSFRFRTWDLTGLLLFSHLGDGLGHVELMLSEGQV
9 | NVSIAQTGRKKLQFAAGYRLNDGFWHEVNFVAQENHAVISIDDVEGAEVRVSYPLLIRTG
10 | TSYFFGGCPKPASRWGCHSNQTAFHGCMELLKVDGQLVNLTLVEFRKLGYFAEVLFDTCG
11 | ITDRCSPNMCEHDGRCYQSWDDFICYCELTGYKGVTCHEPLYKESCEAYRLSGKYSGNYT
12 | IDPDGSGPLKPFVVYCDIRENRAWTVVRHDRLWTTRVTGSSMDRPFLGAIQYWNASWEEV
13 | SALANASQHCEQWIEFSCYNSRLLNTAGGYPYSFWIGRNEEQHFYWGGSQPGIQRCACGL
14 | DQSCVDPALHCNCDADQPQWRTDKGLLTFVDHLPVTQVVVGDTNRSNSEAQFFLRPLRCY
15 | GDRNSWNTISFHTGAALRFPPIRANHSLDVSFYFRTSAPSGVFLENMGGPFCRWRRPYVR
16 | VELNTSRDVVFAFDIGNGDENLTVHSDDFEFNDDEWHLVRAEINVKQARLRVDHRPWVLR
17 | PMPLQTYIWLVYDQPLYVGSAELKRRPFVGCLRAMRLNGVTLNLEGRANASEGTFPNCTG
18 | HCTHPRFPCFHGGRCVERYSYYTCDCDLTAFDGPYCNHDIGGFFETGTWMRYNLQSALRS
19 | AAREFSHMLSRPVPGYEPGYVPGYDTPGYVPGYHGPGYRLPEYPRPGRPVPGYRGPVYNV
20 | TGEEVSFSFSTNSAPAVLLYVSSFVRDYMAVLIKEDGTLQLRYQLGTSPYVYQLTTRPVT
21 | DGQPHSVNITRVYRNLFIQVDYFPLTEQKFSLLVDSQLDSPKALYLGRVMETGVIDPEIQ
22 | RYNTPGFSGCLSGVRFNNVAPLKTHFRTPRPMTAELAEAMRVQGELSESNCGAMPRLVSE
23 | VPPELDPWYLPPDFPYYHDDGWIAILLGFLVAFLLLGLVGMLVLFYLQNHRYKGSYHTNE
24 | PKATHDSHPGGKAPLPPSGPAQAPAPTPAPTQLPTPAPAPAPAPASGPGPRDQNLPQILE
25 | ESRSE
26 | >ENSMUSP00000006660 pep:known chromosome:GRCm38:13:27658956:27668036:-1 gene:ENSMUSG00000046899 transcript:ENSMUST00000006660 gene_biotype:protein_coding transcript_biotype:protein_coding
27 | MSFSFSQPCPSGALLLVVVSSLLLWENVASVPLSSNETDGYPLSINGLFHNAMRLTWNIK
28 | NLNMELRKTYTVNQVSEKLYENYMLDFIEDMEYLVKALTCCHNYSIKTPENLDEAQQIPF
29 | NEFPKLILSRMWAWNETSKVLLTTLRSIPGMHDDVISLAKNIETKLAELFEYTQSILNSI
30 | YGTTTTGNVEYTVFSGLEDLKSSDEEFSLFDLCKFSYCLRVDIHMVELYLKLLECVVYVS
31 | SDVCLSKNIRDAS
32 | >ENSMUSP00000046761 pep:known chromosome:GRCm38:12:66469568:67222549:-1 gene:ENSMUSG00000034912 transcript:ENSMUST00000037181 gene_biotype:protein_coding transcript_biotype:protein_coding
33 | MDLVYGLVWLLTVLLEGISGQGVYAPPTVRIVHSGLACNIEEERYSERVYTIREGETLEL
34 | TCLVTGHPRPQIRWTKTAGSASDRFQDSSVFNETLRITNIQRHQGGRYYCKAENGLGSPA
35 | IKSIRVDVYYLDDPVVTVHQSIGEAKEQFYYERTVFLRCVANSNPPVRYSWRRGQEVLLQ
36 | GSDKGVEIYEPFFTQGETKILKLKNLRPQDYANYSCIASVRNVCNIPDKMVSFRLSNKTA
37 | SPSIKLLVDDPIVVNPGEAITLVCVTTGGEPTPSLTWVRSFGTLPEKIVLNGGTLTIPAI
38 | TSDDAGTYSCIANNNVGNPAKKSTNIIVRALKKGRFWITPDPYHKDDNIQIGREVKISCQ
39 | VEAVPSEELTFSWFKNGRPLRSSERMVITQTDPDVSPGTTNLDIIDLKFTDFGTYTCVAS
40 | LKGGGISDISIDVNISSSTVPPNLTVPQEKSPLVTREGDTIELQCQVTGKPKPIILWSRA
41 | DKEVAMPDGTMQMESYDGTLRIVNVSREMSGMYRCQTSQYNGFNVKPREALVQLIVQYPP
42 | AVEPAFLEIRQGQDRSVTMSCRVLRAYPIRVLTYEWRLGNKLLRTGQFDSQEYTEYPLKS
43 | LSNENYGVYNCSIINEAGAGRCSFLVTGKAYAPEFYYDTYNPVWQNRHRVYSYSLQWTQM
44 | NPDAVDRIVAYRLGIRQAGQQRWWEQEIKINGNIQKGELITYNLTELIKPEAYEVRLTPL
45 | TKFGEGDSTIRVIKYTAPVNPHLREFHCGFEDGNICLFTQDDTDNFDWTKQSTATRNTKY
46 | TPNTGPSADRSGSKEGFYMYIETSRPRLEGEKARLLSPVFSIAPKNPYGPTNSAYCFSFF
47 | YHMYGQHIGVLNVYLRLKGQTTIENPLWSSSGNKGQRWNEAHVNIYPITSFQLIFEGIRG
48 | PGIEGDIAIDDVSIAEGECAKQDLPTKNSVDGAVGILVHIWLFPVIILISILSPRR
49 | >ENSMUSP00000137608 pep:known chromosome:GRCm38:12:66471182:67221221:-1 gene:ENSMUSG00000034912 transcript:ENSMUST00000178814 gene_biotype:protein_coding transcript_biotype:protein_coding
50 | MDLVYGLVWLLTVLLEGISGQGVYAPPTVRIVHSGLACNIEEERYSERVYTIREGETLEL
51 | TCLVTGHPRPQIRWTKTAGSASDRFQDSSVFNETLRITNIQRHQGGRYYCKAENGLGSPA
52 |
--------------------------------------------------------------------------------
/tests/data/mouse.ensembl.81.partial.ncrna.ENSMUSG00000017167.fa:
--------------------------------------------------------------------------------
1 | >ENSMUST00000158344 ncrna:known chromosome:GRCm38:8:107009502:107009611:-1 gene:ENSMUSG00000088969 gene_biotype:snRNA transcript_biotype:snRNA
2 | GCACCTATTTTGACAGCACAAATACTAAAATTGGAACAAATCAGGGAAGATTAGCATGCT
3 | CTCATGCAAGGATGACACGGAAATTCATGGAACAGCGGATTCATATTTTA
4 |
--------------------------------------------------------------------------------
/tests/data/refseq.ucsc.small.gtf:
--------------------------------------------------------------------------------
1 | chr1 hg38_refGene exon 67092176 67093604 0.000000 - . gene_id "NR_075077"; transcript_id "NR_075077";
2 | chr1 hg38_refGene exon 67096252 67096321 0.000000 - . gene_id "NR_075077"; transcript_id "NR_075077";
3 | chr1 hg38_refGene exon 67103238 67103382 0.000000 - . gene_id "NR_075077"; transcript_id "NR_075077";
4 | chr1 hg38_refGene exon 67111577 67111644 0.000000 - . gene_id "NR_075077"; transcript_id "NR_075077";
5 | chr1 hg38_refGene exon 67113614 67113756 0.000000 - . gene_id "NR_075077"; transcript_id "NR_075077";
6 | chr1 hg38_refGene exon 67115352 67115464 0.000000 - . gene_id "NR_075077"; transcript_id "NR_075077";
7 | chr1 hg38_refGene exon 67125752 67125909 0.000000 - . gene_id "NR_075077"; transcript_id "NR_075077";
8 | chr1 hg38_refGene exon 67127166 67127257 0.000000 - . gene_id "NR_075077"; transcript_id "NR_075077";
9 | chr1 hg38_refGene exon 67131142 67131227 0.000000 - . gene_id "NR_075077"; transcript_id "NR_075077";
10 | chr1 hg38_refGene exon 67134930 67134971 0.000000 - . gene_id "NR_075077"; transcript_id "NR_075077";
11 | chr1 hg38_refGene stop_codon 67093580 67093582 0.000000 - . gene_id "NM_001276352"; transcript_id "NM_001276352";
12 | chr1 hg38_refGene CDS 67093583 67093604 0.000000 - 1 gene_id "NM_001276352"; transcript_id "NM_001276352";
13 | chr1 hg38_refGene exon 67092176 67093604 0.000000 - . gene_id "NM_001276352"; transcript_id "NM_001276352";
14 | chr1 hg38_refGene CDS 67096252 67096321 0.000000 - 2 gene_id "NM_001276352"; transcript_id "NM_001276352";
15 | chr1 hg38_refGene exon 67096252 67096321 0.000000 - . gene_id "NM_001276352"; transcript_id "NM_001276352";
16 | chr1 hg38_refGene CDS 67103238 67103382 0.000000 - 0 gene_id "NM_001276352"; transcript_id "NM_001276352";
17 | chr1 hg38_refGene exon 67103238 67103382 0.000000 - . gene_id "NM_001276352"; transcript_id "NM_001276352";
18 | chr1 hg38_refGene CDS 67111577 67111644 0.000000 - 2 gene_id "NM_001276352"; transcript_id "NM_001276352";
19 | chr1 hg38_refGene exon 67111577 67111644 0.000000 - . gene_id "NM_001276352"; transcript_id "NM_001276352";
20 | chr1 hg38_refGene CDS 67115352 67115464 0.000000 - 1 gene_id "NM_001276352"; transcript_id "NM_001276352";
21 | chr1 hg38_refGene exon 67115352 67115464 0.000000 - . gene_id "NM_001276352"; transcript_id "NM_001276352";
22 | chr1 hg38_refGene CDS 67125752 67125909 0.000000 - 0 gene_id "NM_001276352"; transcript_id "NM_001276352";
23 | chr1 hg38_refGene exon 67125752 67125909 0.000000 - . gene_id "NM_001276352"; transcript_id "NM_001276352";
24 | chr1 hg38_refGene CDS 67127166 67127240 0.000000 - 0 gene_id "NM_001276352"; transcript_id "NM_001276352";
25 |
--------------------------------------------------------------------------------
/tests/test_contigs.py:
--------------------------------------------------------------------------------
1 | from pyensembl import genome_for_reference_name
2 |
3 | grch38 = genome_for_reference_name("GRCh38")
4 |
def test_contig_names():
    """
    GRCh38 should report contigs "1" through "22" as well as "X", "Y"
    and "MT".
    """
    available = set(grch38.contigs())
    expected = list(range(1, 23))
    expected.extend(["X", "Y", "MT"])
    for chrom in expected:
        # numeric entries are compared by their string form
        assert str(chrom) in available, (chrom, available)
9 |
--------------------------------------------------------------------------------
/tests/test_download_cache.py:
--------------------------------------------------------------------------------
1 | import os
2 | import tempfile
3 |
4 | from pytest import raises as assert_raises
5 | from pyensembl.download_cache import DownloadCache, MissingLocalFile, MissingRemoteFile
6 |
7 |
8 | from .data import data_path
9 | from .common import ok_
10 |
11 | download_cache = DownloadCache(
12 | reference_name="__test_reference",
13 | annotation_name="__test_annotation",
14 | copy_local_files_to_cache=False,
15 | )
16 |
17 |
def test_download_cache_missing_local_file():
    """
    Requesting a local path which does not exist should raise
    MissingLocalFile.
    """
    # start from an empty cache directory
    download_cache.delete_cache_directory()
    missing_path = "test_file_doesn_not_exist.file"
    with assert_raises(MissingLocalFile):
        download_cache.download_or_copy_if_necessary(path_or_url=missing_path)
25 |
26 |
def test_download_cache_missing_remote_file():
    """
    Requesting a URL which cannot be fetched should raise MissingRemoteFile.
    """
    # start from an empty cache directory
    download_cache.delete_cache_directory()
    unreachable_url = "ftp://NOTAURL.NOTAURL.NOTAURL"
    with assert_raises(MissingRemoteFile):
        download_cache.download_or_copy_if_necessary(
            path_or_url=unreachable_url
        )
34 |
35 |
def test_download_cache_custom_location():
    """
    With PYENSEMBL_CACHE_DIR pointing at the system temporary directory, a
    local file copied into the cache should end up under
    <tmp>/pyensembl/<reference name>/<annotation name>/.

    The environment variable is restored to its previous state (set or
    unset) when the test finishes, whether it passes or fails, so it cannot
    leak into other tests.
    """
    test_file = "refseq.ucsc.small.gtf"
    tmp_dir = tempfile.gettempdir()

    print("DIR: %s" % tmp_dir)
    assert tmp_dir is not None

    # remember any pre-existing value so that it can be put back afterwards
    previous_cache_dir = os.environ.get("PYENSEMBL_CACHE_DIR")
    os.environ["PYENSEMBL_CACHE_DIR"] = tmp_dir
    try:
        # A separate DownloadCache instance is needed here because, unlike
        # the module-level one, it must copy local files into the cache
        # folder.
        custom_cache = DownloadCache(
            reference_name="test_reference",
            annotation_name="test_annotation",
            copy_local_files_to_cache=True,
        )

        # clean up leftovers from earlier runs
        custom_cache.delete_cache_directory()
        custom_cache.download_or_copy_if_necessary(
            download_if_missing=True, path_or_url=data_path(test_file)
        )

        full_path = os.path.join(
            tmp_dir, "pyensembl", "test_reference", "test_annotation", test_file
        )
        print("FULL PATH: %s" % full_path)
        assert len(full_path) > 0

        ok_(os.path.exists(full_path))
    finally:
        if previous_cache_dir is None:
            os.environ.pop("PYENSEMBL_CACHE_DIR", None)
        else:
            os.environ["PYENSEMBL_CACHE_DIR"] = previous_cache_dir
67 |
--------------------------------------------------------------------------------
/tests/test_ensembl_gtf.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from os.path import exists
3 |
4 |
5 | from .common import run_multiple_genomes
6 |
7 |
# NOTE(review): this function's name does not start with `test_`, so pytest's
# default discovery presumably never collects it - confirm whether that is
# intentional.
# NOTE(review): `run_multiple_genomes` parametrizes an argument called
# "genome", whereas this function's parameter is called `ensembl`; the two
# names would have to agree for the parametrization to apply - confirm before
# enabling this check.
@run_multiple_genomes()
def gtf_path_endswith_gtf_gz(ensembl):
    """
    Check that the GTF path of the given genome exists on disk and ends
    with ".gtf.gz".
    """
    path = ensembl.gtf.gtf_path
    assert exists(path)
    assert path.endswith(".gtf.gz")
13 |
--------------------------------------------------------------------------------
/tests/test_ensembl_object_properties.py:
--------------------------------------------------------------------------------
1 | """
2 | Tests for simple properties of an EnsemblRelease object which don't
3 | require database lookups.
4 | """
5 |
6 | from __future__ import absolute_import
7 |
8 | from .common import eq_
9 | from pyensembl import EnsemblRelease, MAX_ENSEMBL_RELEASE
10 |
11 |
def test_human_reference_name():
    """
    Check the reference name reported by EnsemblRelease for human releases:
    54 -> NCBI36, 74 and 75 -> GRCh37, and every release from 76 up to and
    including MAX_ENSEMBL_RELEASE -> GRCh38.
    """
    eq_(EnsemblRelease(release=54).reference_name, "NCBI36")
    eq_(EnsemblRelease(release=74).reference_name, "GRCh37")
    eq_(EnsemblRelease(release=75).reference_name, "GRCh37")
    # range() excludes its stop value, so add one to cover the newest
    # supported release as well
    for release in range(76, MAX_ENSEMBL_RELEASE + 1):
        eq_(EnsemblRelease(release=release).reference_name, "GRCh38")
18 |
--------------------------------------------------------------------------------
/tests/test_exon_id.py:
--------------------------------------------------------------------------------
1 | """
2 | Exon IDs of the TP53 gene and one of its transcripts (TP53-026) were copied
3 | from the Ensembl website, make sure same IDs are found by pyensembl.
4 | """
5 | from __future__ import absolute_import
6 |
7 | from pyensembl import cached_release
8 |
9 | ensembl = cached_release(77)
10 |
11 | # all exons associated with TP53 gene in Ensembl release 77
12 | TP53_EXON_IDS_RELEASE_77 = [
13 | 'ENSE00002337729', 'ENSE00002419584',
14 | 'ENSE00003625790', 'ENSE00003518480',
15 | 'ENSE00003723991', 'ENSE00003712342',
16 | 'ENSE00001657961', 'ENSE00003725258',
17 | 'ENSE00003740946', 'ENSE00002204316',
18 | 'ENSE00002064269', 'ENSE00003750554',
19 | 'ENSE00003634848', 'ENSE00003492844',
20 | 'ENSE00003735852', 'ENSE00003545950',
21 | 'ENSE00003605891', 'ENSE00002051192',
22 | 'ENSE00002084733', 'ENSE00003726882',
23 | 'ENSE00001146308', 'ENSE00002667911',
24 | 'ENSE00003752869', 'ENSE00003739898',
25 | 'ENSE00003753508', 'ENSE00002034209',
26 | 'ENSE00002030826', 'ENSE00001596491',
27 | 'ENSE00002037735', 'ENSE00003736616',
28 | 'ENSE00002672443', 'ENSE00002226620',
29 | 'ENSE00003715195', 'ENSE00003750794',
30 | 'ENSE00003745267', 'ENSE00003746220',
31 | 'ENSE00003656695', 'ENSE00003669712',
32 | 'ENSE00002051873', 'ENSE00002048269',
33 | 'ENSE00002670535', 'ENSE00002677565',
34 | 'ENSE00003532881', 'ENSE00003520683',
35 | 'ENSE00002076714', 'ENSE00002062958',
36 | 'ENSE00002073243', 'ENSE00003670707',
37 | 'ENSE00002065802', 'ENSE00002362269'
38 | ]
39 |
def test_exon_ids_of_gene_id():
    """
    test_exon_ids_of_gene_id: Look up the exon IDs of gene ENSG00000141510
    (name=TP53) by gene ID and compare them against the IDs copied from the
    Ensembl website.
    """
    expected = TP53_EXON_IDS_RELEASE_77
    exon_ids = ensembl.exon_ids_of_gene_id('ENSG00000141510')
    n_expected = len(expected)
    n_found = len(exon_ids)
    assert n_found == n_expected, \
        "Wrong number of exons, expected %d but got %d (n_distinct=%d)" % (
            n_expected,
            n_found,
            len(set(exon_ids)))
    # every returned ID must be one of the expected IDs
    unexpected = [exon_id for exon_id in exon_ids if exon_id not in expected]
    assert not unexpected
52 |
def test_exon_ids_of_gene_name():
    """
    test_exon_ids_of_gene_name: Look up the exon IDs of TP53 by gene name and
    compare them against the IDs copied from the Ensembl website.
    """
    expected = TP53_EXON_IDS_RELEASE_77
    exon_ids = ensembl.exon_ids_of_gene_name("TP53")
    n_expected = len(expected)
    n_found = len(exon_ids)
    assert n_found == n_expected, \
        "Wrong number of exons, expected %d but got %d (n_distinct=%d)" % (
            n_expected,
            n_found,
            len(set(exon_ids)))
    # every returned ID must be one of the expected IDs
    unexpected = [exon_id for exon_id in exon_ids if exon_id not in expected]
    assert not unexpected
65 |
66 | # Exon IDs of transcript TP53-026
67 | TP53_TRANSCRIPT_26_EXON_IDS_RELEASE_77 = [
68 | 'ENSE00002064269',
69 | 'ENSE00003723991',
70 | 'ENSE00003712342',
71 | 'ENSE00003725258',
72 | 'ENSE00003740946',
73 | 'ENSE00003750554',
74 | 'ENSE00003634848',
75 | 'ENSE00003492844'
76 | ]
77 |
def test_exon_ids_of_transcript_name():
    """
    test_exon_ids_of_transcript_name : Look up the exon IDs of transcript
    TP53-026 by name and compare them against the IDs copied from Ensembl's
    website for release 77.
    """
    expected = TP53_TRANSCRIPT_26_EXON_IDS_RELEASE_77
    exon_ids = ensembl.exon_ids_of_transcript_name("TP53-026")
    assert len(exon_ids) == len(expected), \
        "Expected %d exons, got %d" % (len(expected), len(exon_ids))
    # every returned ID must be one of the expected IDs
    unexpected = [exon_id for exon_id in exon_ids if exon_id not in expected]
    assert not unexpected
92 |
def test_exon_ids_of_transcript_id():
    """
    test_exon_ids_of_transcript_id : Look up exon IDs of transcript
    ENST00000610623 (name: TP53-026) by its ID and make sure they match
    what we find on the Ensembl website.

    The function carries the `test_` prefix so that pytest collects it like
    the other checks in this module.
    """
    exon_ids = ensembl.exon_ids_of_transcript_id("ENST00000610623")
    assert len(exon_ids) == len(TP53_TRANSCRIPT_26_EXON_IDS_RELEASE_77), \
        "Expected %d exons, got %d" % (
            len(TP53_TRANSCRIPT_26_EXON_IDS_RELEASE_77),
            len(exon_ids))
    assert all(
        exon_id in TP53_TRANSCRIPT_26_EXON_IDS_RELEASE_77
        for exon_id in exon_ids)


# The previous, unprefixed name is kept as an alias for backward
# compatibility.
exon_ids_of_transcript_id = test_exon_ids_of_transcript_id
107 |
--------------------------------------------------------------------------------
/tests/test_exon_object.py:
--------------------------------------------------------------------------------
1 | """
2 | Check that pyensembl returns correct exon objects for exon IDs
3 | and loci. Make sure the information on the exon object matches
4 | the expected gene ID and location.
5 | """
6 | from __future__ import absolute_import
7 |
8 | from pyensembl import cached_release
9 |
10 | ensembl = cached_release(77)
11 |
def test_exon_object_by_id():
    """
    test_exon_object_by_id : look up exon 4 of CTNNB1 by its ID in Ensembl 77
    and verify gene name, contig, strand, coordinates and length.
    """
    exon = ensembl.exon_by_id("ENSE00003464041")

    gene_name = exon.gene_name
    assert gene_name == "CTNNB1", "Unexpected gene name: %s" % gene_name
    assert exon.contig == "3", exon.contig

    # strand, checked through the raw value and both boolean properties
    assert exon.strand == "+"
    assert exon.on_forward_strand
    assert exon.on_positive_strand

    # coordinates
    assert exon.start == 41224526, "Unexpected exon start: %s" % exon.start
    assert exon.end == 41224753, "Unexpected exon end: %s" % exon.end

    # the `length` attribute and len() must agree
    assert exon.length == len(exon) == 228
27 |
def test_exon_object_by_id_on_negative_strand():
    """
    Check the properties of exon 1 of CXCR3 (ENSE00001817013) when it is
    looked up by ID in Ensembl 77.
    """
    cxcr3_exon = ensembl.exon_by_id("ENSE00001817013")

    # gene membership
    gene_name = cxcr3_exon.gene_name
    assert gene_name == "CXCR3", "Unexpected gene name: %s" % gene_name

    # location: negative strand of chromosome X
    assert cxcr3_exon.contig == "X", cxcr3_exon.contig
    assert cxcr3_exon.strand == "-"
    assert cxcr3_exon.on_backward_strand
    assert cxcr3_exon.on_negative_strand

    start = cxcr3_exon.start
    assert start == 71618438, "Unexpected exon start: %s" % start
    end = cxcr3_exon.end
    assert end == 71618517, "Unexpected exon end: %s" % end

    # the `length` attribute and len() must agree
    assert cxcr3_exon.length == len(cxcr3_exon) == 80
43 |
44 |
def test_exon_object_at_locus():
    """
    Check the properties of exon 4 of CTNNB1 when it is looked up by its
    location on the forward strand of chr3.
    """
    position = 41224526
    exons = ensembl.exons_at_locus(3, position, strand="+")
    # Without this guard an empty result would skip the loop below and the
    # test would pass without checking anything.
    assert len(exons) > 0, "No exons found at chr3:%d" % position
    for exon in exons:
        assert exon.gene_name == "CTNNB1", exon.transcript_name
        assert exon.contig == "3", exon.contig
        assert exon.strand == "+"
        assert exon.on_forward_strand
        assert exon.on_positive_strand
        # the queried position must lie inside every returned exon
        assert exon.start <= position, "Unexpected exon start: %s" % exon.start
        assert exon.end >= position, "Unexpected exon end: %s" % exon.end
59 |
def test_exon_object_at_locus_on_negative_strand():
    """
    Check the properties of exon 1 of CXCR3 when it is looked up by its
    location on the negative strand of chrX.

    NOTE(review): if the query returns no exons the loop never runs and the
    test passes vacuously. The query uses "chrX" while the loop expects
    contig "X" -- confirm the lookup actually returns exons.
    """
    position = 71618517
    for cxcr3_exon in ensembl.exons_at_locus("chrX", position, strand="-"):
        assert cxcr3_exon.gene_name == "CXCR3", cxcr3_exon.transcript_name
        assert cxcr3_exon.contig == "X", cxcr3_exon.contig
        assert cxcr3_exon.strand == "-"
        assert cxcr3_exon.on_backward_strand
        assert cxcr3_exon.on_negative_strand
        # the queried position must lie inside every returned exon
        assert cxcr3_exon.start <= position, (
            "Unexpected exon start: %s" % cxcr3_exon.start
        )
        assert cxcr3_exon.end >= position, (
            "Unexpected exon end: %s" % cxcr3_exon.end
        )
74 |
def test_exon_basic_properties_str():
    """str() and repr() of an exon both yield strings and currently agree."""
    exon = ensembl.exon_by_id("ENSE00001817013")
    as_str = str(exon)
    assert isinstance(as_str, str)
    as_repr = repr(exon)
    assert isinstance(as_repr, str)
    # __repr__ and __str__ are assumed to be interchangeable for now; if that
    # ever changes, do it deliberately and update this test.
    assert as_str == as_repr, "%s != %s" % (as_str, as_repr)
83 |
def test_exon_basic_properties_hash():
    """Exon hashes are ints, repeatable, and differ between distinct exons."""
    cxcr3_exon = ensembl.exon_by_id("ENSE00001817013")
    exon_hash = hash(cxcr3_exon)
    assert isinstance(exon_hash, int), (
        "Hash function returns %s instead of int" % (type(exon_hash),)
    )
    # hashing the same object twice must give the same answer
    first, second = hash(cxcr3_exon), hash(cxcr3_exon)
    assert first == second, "Hash function is non-deterministic!"

    ctnnb1_exon = ensembl.exon_by_id("ENSE00003464041")
    assert cxcr3_exon != ctnnb1_exon
    assert hash(cxcr3_exon) != hash(ctnnb1_exon)
93 |
--------------------------------------------------------------------------------
/tests/test_gene_ids.py:
--------------------------------------------------------------------------------
1 | """
2 | Test all methods which return collections of gene IDs that aren't converting
3 | from some other type of name or ID.
4 |
5 | TODO: Implement tests for EnsemblRelease.gene_ids
6 | """
7 | from __future__ import absolute_import
8 |
9 | from pytest import raises
10 | from pyensembl import ensembl_grch38, cached_release
11 |
12 | from .common import run_multiple_genomes, eq_
13 |
# Human release 77, used by the protein-ID lookup test below.
ensembl77 = cached_release(77, "human")
15 |
16 |
def test_gene_ids_grch38_hla_a():
    """The only gene ID at chr6:29,945,884 on GRCh38 is that of HLA-A."""
    # chr6:29,945,884 is a position for HLA-A
    # Gene ID = ENSG00000206503
    # based on:
    # http://useast.ensembl.org/Homo_sapiens/Gene/
    # Summary?db=core;g=ENSG00000206503;r=6:29941260-29945884
    hla_a_gene_id = "ENSG00000206503"
    found_ids = ensembl_grch38.gene_ids_at_locus(6, 29945884)
    assert found_ids == [hla_a_gene_id], (
        "Expected HLA-A, gene ID = %s, got: %s" % (hla_a_gene_id, found_ids)
    )
29 |
30 |
def test_gene_ids_of_gene_name_hla_grch38():
    """Each classical HLA gene name maps to its known gene ID on GRCh38."""
    expected_id_by_name = (
        ("HLA-A", "ENSG00000206503"),
        ("HLA-B", "ENSG00000234745"),
        ("HLA-C", "ENSG00000204525"),
    )
    for gene_name, expected_id in expected_id_by_name:
        gene_ids = ensembl_grch38.gene_ids_of_gene_name(gene_name)
        assert expected_id in gene_ids, gene_ids
40 |
41 |
def test_gene_id_of_protein_id_release77():
    """Protein ENSP00000485677 maps to gene ENSG00000279634 in release 77."""
    protein_id = "ENSP00000485677"
    expected_gene_id = "ENSG00000279634"
    eq_(expected_gene_id, ensembl77.gene_id_of_protein_id(protein_id))
45 |
46 |
def test_gene_id_of_invalid_name():
    """Looking up a gene name that does not exist must raise."""
    bogus_name = "A wonderous pony sees through your soul"
    with raises(Exception):
        ensembl_grch38.gene_ids_of_gene_name(bogus_name)
50 |
51 |
@run_multiple_genomes()
def test_gene_ids_on_contig(genome):
    """TP53 is listed on chr17 and SMAD4 on chr18 for every tested genome."""
    checks = (
        # gene ID of TP53
        (
            17,
            "ENSG00000141510",
            "Missing %s from %s on chr17, example IDs: %s (total = %d)",
        ),
        # gene ID of SMAD4
        (
            18,
            "ENSG00000141646",
            "Missing %s from %s on chr18, example result: %s (total = %d)",
        ),
    )
    for contig, gene_id, message in checks:
        ids_on_contig = genome.gene_ids(contig=contig)
        assert gene_id in ids_on_contig, message % (
            gene_id,
            genome,
            ids_on_contig[:5],
            len(ids_on_contig),
        )
77 |
--------------------------------------------------------------------------------
/tests/test_gene_names.py:
--------------------------------------------------------------------------------
1 | """
2 | Test all methods which return collections of gene names that aren't converting
3 | from some other type of name or ID.
4 | """
5 | from __future__ import absolute_import, print_function
6 | from pyensembl import genome_for_reference_name
7 |
8 | from .common import run_multiple_genomes
9 |
# Genome for the GRCh38 reference, used by the tests that are not
# parametrized over several genomes.
grch38 = genome_for_reference_name("GRCh38")

# Gene names that test_all_gene_names expects to find in every tested genome.
KNOWN_GENE_NAMES = [
    "TP53",
    "ERBB2",
    "SMAD4",
    "CTAG1A",
    "HLA-A",
]
19 |
20 |
@run_multiple_genomes()
def test_all_gene_names(genome):
    """
    Make sure well-known gene names such as SMAD4, TP53 and ERBB2 are among
    the gene names reported by the genome.
    """
    all_names = genome.gene_names()
    print(type(all_names))
    for expected_name in KNOWN_GENE_NAMES:
        is_present = expected_name in all_names
        assert is_present, "Missing gene name %s from %s" % (
            expected_name,
            genome,
        )
34 |
35 |
def test_gene_names_at_locus_grch38_hla_a():
    """The only gene name at chr6:29,945,884 on GRCh38 is HLA-A."""
    # chr6:29,945,884 is a position for HLA-A
    # based on:
    # http://useast.ensembl.org/Homo_sapiens/Gene/
    # Summary?db=core;g=ENSG00000206503;r=6:29941260-29945884
    contig, position = 6, 29945884
    found_names = grch38.gene_names_at_locus(contig, position)
    assert found_names == ["HLA-A"], (
        "Expected gene name HLA-A, got: %s" % (found_names,)
    )
43 |
44 |
@run_multiple_genomes()
def test_gene_names_on_contig(genome):
    """TP53 is named on chr17 and SMAD4 on chr18 for every tested genome."""
    checks = (
        (
            17,
            "TP53",
            "No TP53 in gene names on chr17 of %s, gene names: %s ... (%d)",
        ),
        (
            18,
            "SMAD4",
            "No SMAD4 in gene names on chr18 of %s, gene names: %s ... (%d)",
        ),
    )
    for contig, gene_name, message in checks:
        names_on_contig = genome.gene_names(contig)
        assert gene_name in names_on_contig, message % (
            genome,
            list(names_on_contig[:4]),
            len(names_on_contig),
        )
64 |
65 |
def test_gene_name_of_HLA_gene_id():
    """Every gene ID found for the name HLA-A maps back to that one name."""
    hla_a_ids = grch38.gene_ids_of_gene_name("HLA-A")
    distinct_names = list(
        {grch38.gene_name_of_gene_id(gene_id) for gene_id in hla_a_ids}
    )
    n_distinct = len(distinct_names)
    assert n_distinct == 1, (n_distinct, distinct_names)
    only_name = distinct_names[0]
    assert only_name == "HLA-A", only_name
73 |
--------------------------------------------------------------------------------
/tests/test_gene_objects.py:
--------------------------------------------------------------------------------
1 | from .common import eq_
2 | from .common import run_multiple_genomes
3 | from .data import TP53_gene_id
4 |
5 |
@run_multiple_genomes()
def test_TP53_gene_object_by_id(genome):
    """Looking TP53 up by gene ID returns a gene named TP53 on contig 17."""
    tp53 = genome.gene_by_id(TP53_gene_id)

    name_message = "Incorrect gene name %s for gene ID %s in %s"
    assert tp53.name == "TP53", name_message % (tp53.name, tp53.id, genome)

    contig_message = "Incorrect gene contig %s for gene ID %s in %s"
    assert tp53.contig == "17", contig_message % (
        tp53.contig,
        tp53.id,
        genome,
    )
21 |
22 |
@run_multiple_genomes()
def test_TP53_gene_object_by_name(genome):
    """Looking TP53 up by name yields exactly one gene with the known ID."""
    tp53_genes = genome.genes_by_name("TP53")
    # there is only one TP53 gene (no copies)
    n_genes = len(tp53_genes)
    assert n_genes == 1, "Expected only one gene with name TP53, got %s" % (
        tp53_genes,
    )
    # and it carries the expected gene ID
    found_id = tp53_genes[0].id
    assert found_id == TP53_gene_id, "Expected gene to have ID %s, got %s" % (
        TP53_gene_id,
        found_id,
    )
33 |
34 |
@run_multiple_genomes()
def test_equal_genes(genome):
    """The same gene fetched by name and by ID compares and hashes equal."""
    by_name = genome.genes_by_name("TP53")[0]
    # fetch the identical gene again through its ID
    by_id = genome.gene_by_id(by_name.id)

    hash_by_name, hash_by_id = hash(by_name), hash(by_id)
    assert hash_by_name == hash_by_id
    assert by_name == by_id
43 |
44 |
@run_multiple_genomes()
def test_not_equal_genes(genome):
    """MUC1 and BRCA1 are different genes with different hashes."""
    muc1 = genome.genes_by_name("MUC1")[0]
    brca1 = genome.genes_by_name("BRCA1")[0]
    hash_muc1, hash_brca1 = hash(muc1), hash(brca1)
    assert hash_muc1 != hash_brca1
    assert muc1 != brca1
51 |
52 |
@run_multiple_genomes()
def test_BRCA1_protein_coding_biotype(genome):
    """BRCA1 is flagged as protein coding and has the matching biotype."""
    brca1_genes = genome.genes_by_name("BRCA1")
    brca1 = brca1_genes[0]
    assert brca1.is_protein_coding
    eq_(brca1.biotype, "protein_coding")
58 |
--------------------------------------------------------------------------------
/tests/test_id_length.py:
--------------------------------------------------------------------------------
1 | from .common import major_releases
2 |
3 |
def check_id_length(method_name):
    """
    For every release in ``major_releases``, call the method called
    ``method_name`` on chromosome Y and require a non-empty result whose
    identifiers are all exactly 15 characters long.
    """
    for release in major_releases:
        # only load chromosome Y to speed up tests
        idents = getattr(release, method_name)(contig="Y")
        assert len(idents) > 0, "No values returned by %s" % method_name
        wrong_length = [ident for ident in idents if len(ident) != 15]
        assert not wrong_length, "Invalid IDs for %s: %s" % (
            method_name,
            wrong_length,
        )
14 |
15 |
def test_gene_id_length():
    """Run the ID-length check against the ``gene_ids`` method."""
    check_id_length(method_name="gene_ids")
18 |
19 |
def test_transcript_id_length():
    """Run the ID-length check against the ``transcript_ids`` method."""
    check_id_length(method_name="transcript_ids")
22 |
23 |
def test_protein_id_length():
    """Run the ID-length check against the ``protein_ids`` method."""
    check_id_length(method_name="protein_ids")
26 |
--------------------------------------------------------------------------------
/tests/test_locus.py:
--------------------------------------------------------------------------------
1 | from pyensembl.locus import Locus
2 | from pyensembl.normalization import normalize_chromosome
3 |
4 | from pytest import raises as assert_raises
5 |
6 |
def test_normalize_chromosome():
    """
    normalize_chromosome upper-cases the contig part, keeps any "chr"
    prefix, stringifies ints, and rejects bad types and empty/zero names.
    """
    expected_by_input = (
        ("X", "X"),
        ("chrX", "chrX"),
        ("x", "X"),
        ("chrx", "chrX"),
        (1, "1"),
        ("1", "1"),
        ("chr1", "chr1"),
        ("chrM", "chrM"),
        ("chrMT", "chrMT"),
        ("M", "M"),
        ("MT", "MT"),
        ("m", "M"),
        ("chrm", "chrM"),
        ("mt", "MT"),
        ("chrmt", "chrMT"),
    )
    for raw, normalized in expected_by_input:
        assert normalize_chromosome(raw) == normalized

    # values that are neither strings nor integers
    for wrong_type in ({"a": "b"}, [], None):
        with assert_raises(TypeError):
            normalize_chromosome(wrong_type)

    # right type, but not a usable contig name
    for wrong_value in ("", 0):
        with assert_raises(ValueError):
            normalize_chromosome(wrong_value)
41 |
42 |
def test_locus_overlaps():
    """
    Locus.overlaps accepts any interval sharing at least one base with the
    locus on the same contig (and strand, when given) and rejects the rest.
    """
    locus = Locus("1", 10, 20, "+")
    assert locus.overlaps("1", 10, 20, "+")
    assert locus.overlaps("1", 10, 20)
    assert locus.overlaps("1", 5, 30)
    assert locus.overlaps("1", 15, 16)
    assert locus.overlaps("1", 15, 30)
    assert locus.overlaps("1", 5, 15)
    assert locus.overlaps("1", 10, 10)
    assert locus.overlaps("1", 20, 20)
    # before start
    assert not locus.overlaps(1, 9, 9)
    # after end: the original call passed 21 as the *contig*, so it only
    # re-tested a contig mismatch; query contig "1" past the end instead
    assert not locus.overlaps("1", 21, 30)
    # wrong contig
    assert not locus.overlaps("2", 10, 20)
    # wrong strand
    assert not locus.overlaps("1", 10, 20, "-")
61 |
62 |
def test_locus_contains():
    """
    Locus.contains accepts only intervals lying fully inside the locus on
    the same contig (and strand, when given).
    """
    locus = Locus("1", 10, 20, "+")
    assert locus.contains("1", 10, 20, "+")

    # intervals fully inside the locus, strand left unspecified
    for start, end in ((10, 20), (15, 16), (10, 10), (20, 20)):
        assert locus.contains("1", start, end)

    outside = (
        (5, 30),   # before start and after end
        (1, 9),    # entirely before start
        (5, 15),   # begins before start
        (21, 30),  # entirely after end
        (15, 30),  # runs past end
    )
    for start, end in outside:
        assert not locus.contains("1", start, end)

    # wrong contig
    assert not locus.contains("2", 10, 20)

    # wrong strand
    assert not locus.contains("1", 10, 20, "-")
87 |
88 |
def test_position_offset():
    """
    Locus.offset counts from the start on the forward strand and from the
    end on the negative strand, and raises ValueError for positions outside
    the locus.
    """
    forward_locus = Locus("1", 10, 20, "+")
    for position, expected in ((10, 0), (15, 5), (19, 9), (20, 10)):
        assert forward_locus.offset(position) == expected

    negative_locus = Locus("1", 10, 20, "-")
    for position, expected in ((10, 10), (15, 5), (19, 1), (20, 0)):
        assert negative_locus.offset(position) == expected

    # positions just before (9) and just after (21) the locus are rejected
    # on both strands
    for outside_position in (9, 21):
        for locus in (forward_locus, negative_locus):
            with assert_raises(ValueError):
                locus.offset(outside_position)
117 |
118 |
def test_range_offset():
    """
    Locus.offset_range maps a genomic interval to strand-relative offsets
    and raises ValueError for inverted ranges and for ranges that begin
    before the locus.
    """
    forward_locus = Locus("1", 10, 20, "+")
    assert forward_locus.offset_range(10, 20) == (0, 10)
    assert forward_locus.offset_range(11, 14) == (1, 4)
    assert forward_locus.offset_range(20, 20) == (10, 10)

    negative_locus = Locus("1", 10, 20, "-")
    assert negative_locus.offset_range(10, 20) == (0, 10)
    assert negative_locus.offset_range(11, 14) == (6, 9)
    assert negative_locus.offset_range(20, 20) == (0, 0)

    # start shouldn't be larger than end
    with assert_raises(ValueError):
        forward_locus.offset_range(21, 20)

    # start shouldn't be larger than end
    with assert_raises(ValueError):
        negative_locus.offset_range(21, 20)

    # A range beginning before the locus is rejected on either strand.
    # The original asserted the forward-strand case twice (copy-paste);
    # one check per strand is enough.
    with assert_raises(ValueError):
        forward_locus.offset_range(9, 10)

    with assert_raises(ValueError):
        negative_locus.offset_range(9, 10)
149 |
150 |
def test_locus_distance():
    """
    Adjacent loci on the same contig and strand are at distance 1; loci on
    another contig or strand are infinitely far away.
    """
    reference = Locus("1", 10, 20, "+")
    adjacent_same_strand = Locus("1", 21, 25, "+")
    other_contig = Locus("2", 21, 25, "+")
    adjacent_other_strand = Locus("1", 21, 25, "-")

    # distance is symmetric
    assert reference.distance_to_locus(adjacent_same_strand) == 1
    assert adjacent_same_strand.distance_to_locus(reference) == 1

    infinity = float("inf")
    assert reference.distance_to_locus(other_contig) == infinity
    assert reference.distance_to_locus(adjacent_other_strand) == infinity
161 |
--------------------------------------------------------------------------------
/tests/test_missing_genome_sources.py:
--------------------------------------------------------------------------------
1 | from pyensembl import Genome
2 | from pytest import raises
3 | from .common import eq_
4 | from .data import data_path
5 |
# Paths to the cropped mouse Ensembl 81 fixture files for gene
# ENSMUSG00000017167: GTF annotation, transcript FASTA and protein FASTA.
MOUSE_ENSMUSG00000017167_PATH = data_path(
    "mouse.ensembl.81.partial.ENSMUSG00000017167.gtf"
)
MOUSE_ENSMUSG00000017167_TRANSCRIPT_FASTA_PATH = data_path(
    "mouse.ensembl.81.partial.ENSMUSG00000017167.fa"
)
# The ncRNA FASTA fixture is currently not used by these tests:
# MOUSE_ENSMUSG00000088969_NCRNA_FASTA_PATH = data_path(
# "mouse.ensembl.81.partial.ncrna.ENSMUSG00000017167.fa")
MOUSE_ENSMUSG00000017167_PROTEIN_FASTA_PATH = data_path(
    "mouse.ensembl.81.partial.ENSMUSG00000017167.pep"
)
17 |
18 |
def no_gtf_(e):
    """
    Assert that an error message mentions 'GTF'.

    ``e`` may be an exception or the object bound by ``pytest.raises(...)
    as e``. That wrapper exposes the exception as ``.value`` and its own
    str() is a repr-style summary, so unwrap it when present and check the
    exception's message itself.
    """
    error = getattr(e, "value", e)
    print("Testing for 'GTF' in %s : %s" % (type(error), error))
    assert "GTF" in str(error)
22 |
23 |
def no_transcript_(e):
    """
    Assert that an error message mentions 'transcript'.

    ``e`` may be an exception or the object bound by ``pytest.raises(...)
    as e``. That wrapper exposes the exception as ``.value`` and its own
    str() is a repr-style summary, so unwrap it when present and check the
    exception's message itself.
    """
    error = getattr(e, "value", e)
    print("Testing for 'transcript' in %s : %s" % (type(error), error))
    assert "transcript" in str(error)
27 |
28 |
def no_protein_(e):
    """
    Assert that an error message mentions 'protein'.

    ``e`` may be an exception or the object bound by ``pytest.raises(...)
    as e``. That wrapper exposes the exception as ``.value`` and its own
    str() is a repr-style summary, so unwrap it when present and check the
    exception's message itself.
    """
    error = getattr(e, "value", e)
    print("Testing for 'protein' in %s : %s" % (type(error), error))
    assert "protein" in str(error)
32 |
33 |
def test_transcript_fasta_only():
    """
    A genome built from a transcript FASTA alone loads its sequences, but
    GTF-backed and protein-backed queries raise ValueError naming the
    missing source.
    """
    genome = Genome(
        reference_name="GRCm38",
        annotation_name="_test_mouse_ensembl81_subset",
        transcript_fasta_paths_or_urls=[
            MOUSE_ENSMUSG00000017167_TRANSCRIPT_FASTA_PATH
        ],
    )
    genome.index()

    n_sequences = len(genome.transcript_sequences.fasta_dictionary)
    eq_(2, n_sequences)

    # every annotation query needs the GTF that was never supplied
    gtf_backed_queries = (
        genome.genes,
        genome.gene_ids,
        lambda: genome.gene_ids_of_gene_name("test"),
        genome.transcript_names,
    )
    for query in gtf_backed_queries:
        with raises(ValueError) as excinfo:
            query()
        no_gtf_(excinfo)

    # protein lookups need the protein FASTA
    with raises(ValueError) as excinfo:
        genome.protein_sequence("test")
    no_protein_(excinfo)
63 |
64 |
def test_protein_fasta_only():
    """
    A genome built from a protein FASTA alone loads its sequences, but
    annotation and transcript-sequence queries raise ValueError naming the
    missing source.
    """
    proteins_only = Genome(
        reference_name="GRCm38",
        annotation_name="_test_mouse_ensembl81_subset",
        protein_fasta_paths_or_urls=[
            MOUSE_ENSMUSG00000017167_PROTEIN_FASTA_PATH
        ],
    )
    proteins_only.index()

    n_sequences = len(proteins_only.protein_sequences.fasta_dictionary)
    eq_(4, n_sequences)

    # no GTF was given
    with raises(ValueError) as excinfo:
        proteins_only.genes()
    no_gtf_(excinfo)

    # no transcript FASTA was given
    with raises(ValueError) as excinfo:
        proteins_only.transcript_sequence("DOES_NOT_EXIST")
    no_transcript_(excinfo)
82 |
83 |
def test_gtf_only():
    """
    A genome built from a GTF alone exposes its gene, but transcript and
    protein sequence lookups raise ValueError naming the missing source.
    """
    gtf_only = Genome(
        reference_name="GRCm38",
        annotation_name="_test_mouse_ensembl81_subset",
        gtf_path_or_url=MOUSE_ENSMUSG00000017167_PATH,
    )
    gtf_only.index()

    n_genes = len(gtf_only.genes())
    eq_(1, n_genes)

    # no transcript FASTA was given
    with raises(ValueError) as transcript_error:
        gtf_only.transcript_sequence("DOES_NOT_EXIST")
    no_transcript_(transcript_error)

    # no protein FASTA was given
    with raises(ValueError) as protein_error:
        gtf_only.protein_sequence("genome_only_gtf")
    no_protein_(protein_error)
103 |
104 |
def test_gtf_transcript_only():
    """
    With a GTF and a transcript FASTA, transcripts have sequences, but
    reading a protein sequence raises ValueError naming the missing source.
    """
    gtf_and_cdna = Genome(
        reference_name="GRCm38",
        annotation_name="_test_mouse_ensembl81_subset",
        gtf_path_or_url=MOUSE_ENSMUSG00000017167_PATH,
        transcript_fasta_paths_or_urls=[
            MOUSE_ENSMUSG00000017167_TRANSCRIPT_FASTA_PATH
        ],
    )
    gtf_and_cdna.index()

    n_genes = len(gtf_and_cdna.genes())
    eq_(1, n_genes)

    first_transcript = gtf_and_cdna.transcripts()[0]
    assert first_transcript.sequence is not None

    # the property access itself is what is expected to raise
    with raises(ValueError) as excinfo:
        first_transcript.protein_sequence
    no_protein_(excinfo)
122 |
123 |
def test_gtf_protein_only():
    """
    With a GTF and a protein FASTA, transcripts have protein sequences, but
    reading a transcript sequence raises ValueError naming the missing
    source.
    """
    gtf_and_proteins = Genome(
        reference_name="GRCm38",
        annotation_name="_test_mouse_ensembl81_subset",
        gtf_path_or_url=MOUSE_ENSMUSG00000017167_PATH,
        protein_fasta_paths_or_urls=[
            MOUSE_ENSMUSG00000017167_PROTEIN_FASTA_PATH
        ],
    )
    gtf_and_proteins.index()

    n_genes = len(gtf_and_proteins.genes())
    eq_(1, n_genes)

    first_transcript = gtf_and_proteins.transcripts()[0]
    assert first_transcript.protein_sequence is not None

    # the property access itself is what is expected to raise
    with raises(ValueError) as excinfo:
        first_transcript.sequence
    no_transcript_(excinfo)
141 |
--------------------------------------------------------------------------------
/tests/test_mouse.py:
--------------------------------------------------------------------------------
1 | from .common import eq_
2 | from .data import custom_mouse_genome_grcm38_subset, setup_init_custom_mouse_genome
3 |
4 |
def test_mouse_ENSMUSG00000017167():
    """
    Check gene Cntnap1 (ENSMUSG00000017167) in the custom mouse genome that
    is built from cropped Ensembl release 81 files.

    Provenance of the fixture files:

    GTF cropped from ftp://ftp.ensembl.org/pub/release-81/gtf/mus_musculus/
    Mus_musculus.GRCm38.81.gtf.gz via:
    grep "ENSMUSG00000017167" Mus_musculus.GRCm38.81.gtf

    Transcript FASTA cropped from ftp://ftp.ensembl.org/pub/release-81/
    fasta/mus_musculus/cdna/Mus_musculus.GRCm38.cdna.all.fa.gz via:
    grep "ENSMUSG00000017167" Mus_musculus.GRCm38.cdna.all.fa -A 50

    ncRNA FASTA cropped from ftp://ftp.ensembl.org/pub/release-81/
    fasta/mus_musculus/cdna/Mus_musculus.GRCm38.ncrna.fa.gz via:
    grep "ENSMUSG00000088969" Mus_musculus.GRCm38.ncrna.fa -A 2

    Protein FASTA cropped from ftp://ftp.ensembl.org/pub/release-81/fasta/
    mus_musculus/pep/Mus_musculus.GRCm38.pep.all.fa.gz via:
    grep "ENSMUSG00000017167" Mus_musculus.GRCm38.pep.all.fa -A 50

    Tested against:
    http://useast.ensembl.org/Mus_musculus/Gene/Summary?db=core;g=ENSMUSG00000017167
    """
    setup_init_custom_mouse_genome()

    # exactly one gene is called Cntnap1 ...
    cntnap1_genes = custom_mouse_genome_grcm38_subset.genes_by_name("Cntnap1")
    eq_(len(cntnap1_genes), 1)
    cntnap1 = cntnap1_genes[0]

    # ... it has two transcripts, one of which is protein coding
    all_transcripts = cntnap1.transcripts
    eq_(len(all_transcripts), 2)
    coding_transcripts = []
    for transcript in all_transcripts:
        if transcript.biotype == "protein_coding":
            coding_transcripts.append(transcript)
    eq_(len(coding_transcripts), 1)
    coding_transcript = coding_transcripts[0]

    # first 120 bases of the cDNA sequence
    expected_cdna_prefix = (
        "GAGAGAAGGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGA"
        "GAGAGAGAGAGATTGGGGGTAGGAGAGAGGGAAGGGTGGATAAGGACGGAAAAAAGCTTT"
    )
    eq_(coding_transcript.sequence[:120], expected_cdna_prefix)

    # first 120 residues of the protein sequence
    expected_protein_prefix = (
        "MMSLRLFSILLATVVSGAWGWGYYGCNEELVGPLYARSLGASSYYGLFTTARFARLHGIS"
        "GWSPRIGDPNPWLQIDLMKKHRIRAVATQGAFNSWDWVTRYMLLYGDRVDSWTPFYQKGH"
    )
    eq_(coding_transcript.protein_sequence[:120], expected_protein_prefix)
53 |
--------------------------------------------------------------------------------
/tests/test_release_versions.py:
--------------------------------------------------------------------------------
1 | from pyensembl import EnsemblRelease, MAX_ENSEMBL_RELEASE
2 |
3 | from pytest import raises
4 |
5 |
def test_version_too_old_1():
    """Constructing EnsemblRelease(1) is expected to raise."""
    too_old_release = 1
    with raises(Exception):
        EnsemblRelease(too_old_release)
9 |
10 |
def test_version_too_old_47():
    """Constructing EnsemblRelease(47) is expected to raise."""
    too_old_release = 47
    with raises(Exception):
        EnsemblRelease(too_old_release)
14 |
15 |
def test_version_is_not_numeric():
    """A release given as a non-numeric string is expected to raise."""
    not_a_number = "wuzzle"
    with raises(Exception):
        EnsemblRelease(not_a_number)
19 |
20 |
def test_version_is_none():
    """Passing None as the release is expected to raise."""
    missing_release = None
    with raises(Exception):
        EnsemblRelease(missing_release)
24 |
25 |
def test_max_ensembl_release():
    """MAX_ENSEMBL_RELEASE is an int in the plausible range [83, 1000)."""
    is_int = isinstance(MAX_ENSEMBL_RELEASE, int)
    assert is_int, "Unexpected type for MAX_ENSEMBL_RELEASE: %s" % (
        type(MAX_ENSEMBL_RELEASE),
    )
    in_plausible_range = 83 <= MAX_ENSEMBL_RELEASE < 1000
    assert in_plausible_range, (
        "Unexpected value for MAX_ENSEMBL_RELEASE: %d" % MAX_ENSEMBL_RELEASE
    )
33 |
34 |
def test_int_version():
    """
    Every integer release from 54 up to and including MAX_ENSEMBL_RELEASE
    can be used to construct an EnsemblRelease.
    """
    # range() excludes its stop value, so add 1 to cover the newest release
    for version in range(54, MAX_ENSEMBL_RELEASE + 1):
        EnsemblRelease(version)
38 |
39 |
def test_str_version():
    """
    Every release from 54 up to and including MAX_ENSEMBL_RELEASE can be
    given as a string to construct an EnsemblRelease.
    """
    # range() excludes its stop value, so add 1 to cover the newest release
    for version in range(54, MAX_ENSEMBL_RELEASE + 1):
        EnsemblRelease(str(version))
43 |
--------------------------------------------------------------------------------
/tests/test_search.py:
--------------------------------------------------------------------------------
1 | from pyensembl import find_nearest_locus
2 | from .common import eq_
3 | from .common import run_multiple_genomes
4 |
5 |
@run_multiple_genomes()
def test_find_nearest_BRAF_exon(genome):
    """
    For each exon of the first BRAF transcript, intervals just before,
    overlapping and just after it resolve to that exon at distance 1, 0
    and 1 respectively.
    """
    braf = genome.genes_by_name("BRAF")[0]
    exons = braf.transcripts[0].exons
    for exon in exons:
        probes = (
            # (query start, query end, expected distance)
            (exon.start - 2, exon.start - 1, 1),  # immediately before exon
            (exon.start - 2, exon.start + 1, 0),  # overlapping with exon
            (exon.end + 1, exon.end + 2, 1),      # immediately after exon
        )
        for query_start, query_end, expected_distance in probes:
            nearest = find_nearest_locus(
                start=query_start, end=query_end, loci=exons
            )
            eq_(nearest, (expected_distance, exon))
29 |
30 |
@run_multiple_genomes()
def test_find_nearest_BRAF_transcript(genome):
    """
    Given the first BRAF and first EGFR transcripts as candidates, intervals
    just before, overlapping and just after each one resolve to that
    transcript at distance 1, 0 and 1 respectively.
    """
    candidates = [
        genome.genes_by_name(gene_name)[0].transcripts[0]
        for gene_name in ("BRAF", "EGFR")
    ]
    for transcript in candidates:
        probes = (
            # (query start, query end, expected distance)
            # immediately before transcript
            (transcript.start - 2, transcript.start - 1, 1),
            # overlapping with transcript
            (transcript.start - 2, transcript.start + 1, 0),
            # immediately after transcript
            # may overlap with other transcripts
            (transcript.end + 1, transcript.end + 2, 1),
        )
        for query_start, query_end, expected_distance in probes:
            nearest = find_nearest_locus(
                start=query_start, end=query_end, loci=candidates
            )
            eq_(nearest, (expected_distance, transcript))
55 |
--------------------------------------------------------------------------------
/tests/test_sequence_data.py:
--------------------------------------------------------------------------------
1 | """
2 | Test SequenceData object to make sure it's correctly parsing FASTA files
3 | and that we're able to clear and regenerate its cached representation of
4 | a FASTA dictionary
5 | """
6 | from os.path import exists
7 |
8 | from pyensembl import SequenceData
9 |
10 | from .common import TemporaryDirectory
11 | from .data import data_path
12 |
13 |
# Cropped mouse Ensembl 81 transcript FASTA used by every test in this module.
FASTA_PATH = data_path("mouse.ensembl.81.partial.ENSMUSG00000017167.fa")
15 |
16 |
def test_sequence_type():
    """A sequence fetched from SequenceData exists and is a plain str."""
    transcript_id = "ENSMUST00000138942"
    with TemporaryDirectory() as cache_dir:
        sequences = SequenceData([FASTA_PATH], cache_directory_path=cache_dir)
        sequence = sequences.get(transcript_id)
        assert sequence is not None, (
            "Failed to find sequence for ENSMUST00000138942"
        )
        is_str = isinstance(sequence, str)
        assert is_str, "Wrong sequence type, expected %s but got %s" % (
            str,
            type(sequence),
        )
26 |
27 |
def test_missing_sequence():
    """Asking for an ID absent from the FASTA file returns None."""
    with TemporaryDirectory() as cache_dir:
        sequences = SequenceData([FASTA_PATH], cache_directory_path=cache_dir)
        missing = sequences.get("NotInFasta")
        assert missing is None, "Should get None back for missing sequence"
33 |
34 |
def test_clear_cache():
    """
    The FASTA dictionary loads lazily, clear_cache() empties it and removes
    the pickle files, and loading again recreates those files.
    """
    with TemporaryDirectory() as cache_dir:
        sequences = SequenceData([FASTA_PATH], cache_directory_path=cache_dir)
        assert not sequences._fasta_dictionary, (
            "Expected _fasta_dictionary to load lazily"
        )

        # first load populates the in-memory dictionary
        sequences._load_or_create_fasta_dictionary_pickle()
        n_entries = len(sequences._fasta_dictionary)
        assert n_entries > 0, "FASTA dictionary didn't get created"

        # clearing drops both the dictionary and the pickles on disk
        sequences.clear_cache()
        assert not sequences._fasta_dictionary, (
            "Expected FASTA dictionary to be empty after clear_cache()"
        )
        for pickle_path in sequences.fasta_dictionary_pickle_paths:
            still_there = exists(pickle_path)
            assert not still_there, (
                "Cached pickle file should have been deleted"
            )

        # loading again writes the pickles back
        sequences._load_or_create_fasta_dictionary_pickle()
        for pickle_path in sequences.fasta_dictionary_pickle_paths:
            recreated = exists(pickle_path)
            assert recreated, "Cached pickle file should have been created"
55 |
--------------------------------------------------------------------------------
/tests/test_serialization.py:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | import pickle
14 |
15 | from pyensembl import Genome, Transcript, Gene, Exon
16 | from pyensembl.species import Species, human
17 |
18 | from .common import run_multiple_genomes, eq_
19 | from .data import (
20 | TP53_gene_id,
21 | custom_mouse_genome_grcm38_subset,
22 | setup_init_custom_mouse_genome,
23 | )
24 |
25 |
# Called with parentheses like every other use of this decorator in the
# test suite; the original bare form passed the test function itself as the
# decorator factory's first argument.
@run_multiple_genomes()
def test_pickle_ensembl_gene(genome):
    """The TP53 Gene object survives a pickle round trip unchanged."""
    gene = genome.gene_by_id(TP53_gene_id)
    gene_new = pickle.loads(pickle.dumps(gene))
    assert gene == gene_new
31 |
32 |
@run_multiple_genomes()
def test_pickle_ensembl_transcript(genome):
    """The first TP53 transcript survives a pickle round trip unchanged."""
    original = genome.gene_by_id(TP53_gene_id).transcripts[0]
    pickled = pickle.dumps(original)
    eq_(original, pickle.loads(pickled))
39 |
40 |
@run_multiple_genomes()
def test_pickle_ensembl_exon(genome):
    """The first exon of the first TP53 transcript survives pickling."""
    first_transcript = genome.gene_by_id(TP53_gene_id).transcripts[0]
    original = first_transcript.exons[0]
    pickled = pickle.dumps(original)
    eq_(original, pickle.loads(pickled))
48 |
49 |
@run_multiple_genomes()
def test_json_ensembl_gene(genome):
    """The TP53 Gene object survives a JSON round trip unchanged."""
    original = genome.gene_by_id(TP53_gene_id)
    as_json = original.to_json()
    eq_(original, Gene.from_json(as_json))
55 |
56 |
@run_multiple_genomes()
def test_json_ensembl_transcript(genome):
    """The first TP53 transcript survives a JSON round trip unchanged."""
    original = genome.gene_by_id(TP53_gene_id).transcripts[0]
    as_json = original.to_json()
    eq_(original, Transcript.from_json(as_json))
63 |
64 |
@run_multiple_genomes()
def test_json_ensembl_exon(genome):
    """The first exon of the first TP53 transcript survives a JSON round trip."""
    first_transcript = genome.gene_by_id(TP53_gene_id).transcripts[0]
    original = first_transcript.exons[0]
    as_json = original.to_json()
    eq_(original, Exon.from_json(as_json))
72 |
73 |
@run_multiple_genomes()
def test_pickle_ensembl_genome(genome):
    """A genome, its release and its species all survive pickling."""
    restored = pickle.loads(pickle.dumps(genome))
    eq_(genome, restored)

    # This Genome happens to be an EnsemblRelease, so also compare the
    # release-specific attributes.
    for attribute in ("release", "species"):
        eq_(getattr(genome, attribute), getattr(restored, attribute))
83 |
84 |
@run_multiple_genomes()
def test_ensembl_genome_to_dict(genome):
    """A genome rebuilt from its to_dict() output equals the original."""
    genome_class = genome.__class__
    as_dict = genome.to_dict()
    eq_(genome, genome_class.from_dict(as_dict))
90 |
91 |
@run_multiple_genomes()
def test_ensembl_genome_to_json(genome):
    """A genome rebuilt from its to_json() output equals the original."""
    as_json = genome.to_json()
    rebuilt = genome.__class__.from_json(as_json)
    eq_(genome, rebuilt)
98 |
99 |
def test_custom_genome_to_json():
    """The custom mouse genome survives a JSON round trip unchanged."""
    setup_init_custom_mouse_genome()
    genome_json = custom_mouse_genome_grcm38_subset.to_json()
    eq_(custom_mouse_genome_grcm38_subset, Genome.from_json(genome_json))
105 |
106 |
def test_custom_genome_to_dict():
    """The custom mouse genome subset survives a dict round trip."""
    setup_init_custom_mouse_genome()
    as_dict = custom_mouse_genome_grcm38_subset.to_dict()
    restored = Genome.from_dict(as_dict)
    eq_(custom_mouse_genome_grcm38_subset, restored)
111 |
112 |
def test_species_to_dict():
    """The `human` species survives a dict round trip."""
    restored = Species.from_dict(human.to_dict())
    eq_(human, restored)
115 |
116 |
def test_species_to_json():
    """The `human` species survives a JSON round trip."""
    restored = Species.from_json(human.to_json())
    eq_(human, restored)
119 |
120 |
def test_species_to_pickle():
    """The `human` species survives a pickle round trip."""
    restored = pickle.loads(pickle.dumps(human))
    eq_(human, restored)
123 |
124 |
@run_multiple_genomes()
def test_unique_memory_address_of_unpickled_genomes(genome):
    """Unpickling a genome must hand back the very same object, not a copy."""
    round_tripped = pickle.loads(pickle.dumps(genome))
    same_instance = genome is round_tripped
    assert (
        same_instance
    ), "Expected same object for %s but got two different instances" % (
        round_tripped,
    )
131 |
--------------------------------------------------------------------------------
/tests/test_shell.py:
--------------------------------------------------------------------------------
1 | from pyensembl.shell import parser, all_combinations_of_ensembl_genomes
2 | from .common import eq_
3 |
4 |
def test_genome_selection_grch38():
    """`install --release 100 --species human` selects exactly one human genome."""
    command_line = ["install", "--release", "100", "--species", "human"]
    parsed_args = parser.parse_args(command_line)
    selected = all_combinations_of_ensembl_genomes(parsed_args)
    assert len(selected) == 1
    only_genome = selected[0]
    eq_(only_genome.species.latin_name, "homo_sapiens")
    eq_(only_genome.release, 100)
12 |
--------------------------------------------------------------------------------
/tests/test_string_representation.py:
--------------------------------------------------------------------------------
1 | from pyensembl import Locus, Gene, ensembl_grch37, Transcript, Exon
2 | from .common import eq_
3 |
4 |
def test_Locus_string_representation():
    """`str(Locus)` lists contig, start, end and strand."""
    expected = "Locus(contig='X', start=1000, end=1010, strand='+')"
    actual = str(Locus("X", 1000, 1010, "+"))
    eq_(actual, expected)
10 |
11 |
def test_Gene_string_representation():
    """`str(Gene)` lists every constructor field, with the genome shown by name."""
    expected = (
        "Gene(gene_id='ENSG0001',"
        " gene_name='CAPITALISM',"
        " biotype='protein_coding',"
        " contig='Y',"
        " start=1, end=5, strand='+', genome='GRCh37')"
    )
    fields = dict(
        gene_id="ENSG0001",
        gene_name="CAPITALISM",
        biotype="protein_coding",
        contig="Y",
        start=1,
        end=5,
        strand="+",
        genome=ensembl_grch37,
    )
    actual = str(Gene(**fields))
    eq_(actual, expected)
32 |
33 |
def test_Transcript_string_representation():
    """`str(Transcript)` lists every constructor field, with the genome shown by name."""
    expected = (
        "Transcript(transcript_id='ENST0001',"
        " transcript_name='CAPITALISM-001',"
        " gene_id='ENSG0001',"
        " biotype='protein_coding',"
        " contig='Y',"
        " start=1,"
        " end=5, strand='+', genome='GRCh37')"
    )
    fields = dict(
        transcript_id="ENST0001",
        transcript_name="CAPITALISM-001",
        gene_id="ENSG0001",
        biotype="protein_coding",
        contig="Y",
        start=1,
        end=5,
        strand="+",
        genome=ensembl_grch37,
    )
    actual = str(Transcript(**fields))
    eq_(actual, expected)
58 |
59 |
def test_Exon_string_representation():
    """`str(Exon)` lists every constructor field."""
    expected = (
        "Exon(exon_id='ENSE0001',"
        " gene_id='ENSG0001',"
        " gene_name='CAPITALISM',"
        " contig='Y',"
        " start=1,"
        " end=5, strand='+')"
    )
    fields = dict(
        exon_id="ENSE0001",
        gene_id="ENSG0001",
        gene_name="CAPITALISM",
        contig="Y",
        start=1,
        end=5,
        strand="+",
    )
    actual = str(Exon(**fields))
    eq_(actual, expected)
81 |
--------------------------------------------------------------------------------
/tests/test_timings.py:
--------------------------------------------------------------------------------
1 | from pyensembl import genome_for_reference_name
2 |
3 | from tinytimer import benchmark
4 |
# Genome shared by every benchmark in this module; created once at import time.
ensembl = genome_for_reference_name("GRCh38")
# Contig names "1" through "22" followed by "X" and "Y".
contigs = [str(i + 1) for i in range(22)] + ["X", "Y"]
7 |
8 |
def make_repeat_lookup_fn(lookup_fn, n_positions):
    """
    Make a thunk which calls the lookup_fn at a number of loci
    for each contig in the module-level `contigs` list.

    Parameters
    ----------
    lookup_fn : callable
        Called as `lookup_fn(contig, position)`; its return value is ignored.

    n_positions : int
        Number of positions to query per contig. Positions are
        1,000,000, 2,000,000, ... up to `n_positions` million.

    Returns a zero-argument function which performs all the lookups.
    """
    # The positions are identical for every contig, so build them once here
    # instead of once per contig inside the thunk. This keeps list
    # construction out of the region that gets timed.
    positions = tuple(10**6 + i * 10**6 for i in range(n_positions))

    def repeat_lookup_fn():
        for contig in contigs:
            for position in positions:
                lookup_fn(contig, position)

    return repeat_lookup_fn
21 |
22 |
def run_benchmark(lookup_fn, n_positions_per_contig=20, time_limit=60.0):
    """
    Take a lookup function (such as EnsemblRelease.genes_at_locus) and
    time how long it takes across multiple loci.

    The lookups are run through `benchmark`; its return value is printed,
    asserted to be below `time_limit`, and returned.
    """
    timed_thunk = make_repeat_lookup_fn(lookup_fn, n_positions_per_contig)
    n_loci = n_positions_per_contig * len(contigs)
    name = lookup_fn.__name__
    label = "%s for %d loci" % (name, n_loci)
    average_time = benchmark(timed_thunk, name=label)
    print("-- %s : %0.4fs" % (name, average_time))
    within_limit = average_time < time_limit
    assert within_limit, "%s took too long for %s loci: %0.4fs" % (
        name,
        n_loci,
        average_time,
    )
    return average_time
39 |
40 |
def test_timing_genes_at_locus():
    """Benchmark `genes_at_locus` on the shared genome."""
    lookup = ensembl.genes_at_locus
    run_benchmark(lookup)
43 |
44 |
def test_timing_transcripts_at_locus():
    """Benchmark `transcripts_at_locus` on the shared genome."""
    lookup = ensembl.transcripts_at_locus
    run_benchmark(lookup)
47 |
48 |
def test_timing_exons_at_locus():
    """Benchmark `exons_at_locus` on the shared genome."""
    lookup = ensembl.exons_at_locus
    run_benchmark(lookup)
51 |
52 |
def test_timing_transcript_sequences_at_locus():
    """Benchmark fetching the sequence of every transcript at each locus."""

    def transcript_sequences_at_locus(contig, position):
        # Collect the sequence of each transcript overlapping the locus.
        return [
            transcript.sequence
            for transcript in ensembl.transcripts_at_locus(contig, position)
        ]

    run_benchmark(transcript_sequences_at_locus)
61 |
62 |
def test_timing_transcript_coding_sequences_at_locus():
    """Benchmark fetching coding sequences of complete transcripts at each locus."""

    def transcript_coding_sequences_at_locus(contig, position):
        # Only transcripts with a truthy sequence that are also marked
        # complete contribute a coding sequence.
        return [
            transcript.coding_sequence
            for transcript in ensembl.transcripts_at_locus(contig, position)
            if transcript.sequence and transcript.complete
        ]

    run_benchmark(transcript_coding_sequences_at_locus)
72 |
73 |
def run_all_benchmarks():
    """
    Call every module-level function whose name starts with "test_" so that
    the timings are printed.

    Only plain Python functions are called; other objects that happen to
    have a matching name are ignored.
    """
    import types

    # Iterate over a snapshot of the module namespace: a called function may
    # add globals, and mutating a dict while iterating it raises RuntimeError.
    for variable_name, value in list(globals().items()):
        # Match on the "test_" prefix, so helpers whose names merely contain
        # "test_" are not invoked by accident.
        if variable_name.startswith("test_") and isinstance(
            value, types.FunctionType
        ):
            value()
84 |
85 |
# When this file is executed as a script, run every benchmark in the module.
if __name__ == "__main__":
    run_all_benchmarks()
88 |
--------------------------------------------------------------------------------
/tests/test_transcript_ids.py:
--------------------------------------------------------------------------------
1 | """
2 | Tests for methods which return collections of transcript IDs that aren't
3 | converting from some type of name or ID.
4 | """
5 | from __future__ import absolute_import
6 |
7 | from pyensembl import genome_for_reference_name
8 | from .common import eq_
9 | from .common import run_multiple_genomes
10 |
# Genome shared by the tests in this module; created once at import time.
grch38 = genome_for_reference_name("GRCh38")

# Subset of transcript IDs for HLA-A; each one is expected to overlap the
# HLA-A locus queried in test_transcript_ids_ensembl_grch38_hla_a.
HLA_A_TRANSCRIPT_IDS = [
    "ENST00000396634",
    "ENST00000376809",
    "ENST00000376806",
    "ENST00000376802",
    "ENST00000496081",
    "ENST00000495183",
    "ENST00000461903",
    "ENST00000479320",
]
24 |
25 |
def test_transcript_ids_ensembl_grch38_hla_a():
    """Every known HLA-A transcript ID overlaps the HLA-A locus in GRCh38."""
    # The range 6:29941260-29945884 comes from the Ensembl gene summary at
    # http://useast.ensembl.org/Homo_sapiens/Gene/
    # Summary?db=core;g=ENSG00000206503;r=6:29941260-29945884
    overlapping_ids = grch38.transcript_ids_at_locus(6, 29941260, 29945884)
    for expected_id in HLA_A_TRANSCRIPT_IDS:
        assert expected_id in overlapping_ids, (
            "Transcript %s of HLA-A not found overlapping locus" % expected_id
        )
36 |
37 |
# The HLA-A transcript IDs plus a few transcripts from other genes; used by
# test_all_transcript_ids, which expects all of them in each tested genome.
KNOWN_TRANSCRIPT_IDS = HLA_A_TRANSCRIPT_IDS + [
    "ENST00000398417",  # transcript ID of SMAD4-001
    "ENST00000334701",  # transcript ID of HSP90AA1-001
    "ENST00000599837",  # transcript ID of CTAG1A-002
]
43 |
44 |
45 | # TODO: add release 54 after transcript IDs for older GTFs are filled in
46 | # See https://github.com/hammerlab/pyensembl/issues/20
@run_multiple_genomes(75, grch38.release)
def test_all_transcript_ids(genome):
    """Every known transcript ID appears in the genome's full list of IDs."""
    available_ids = set(genome.transcript_ids())
    for known_id in KNOWN_TRANSCRIPT_IDS:
        assert known_id in available_ids, "Missing transcript ID %s from %s" % (
            known_id,
            genome,
        )
55 |
56 |
def test_transcript_id_of_protein_id_CCR2():
    """The CCR2 protein ID maps back to its transcript ID."""
    # Looked up on Oct 9 2021 (Ensembl release 104, GRCh38.p13):
    # CCR2-203 ENST00000445132.3 maps to ENSP00000399285.2
    protein_id = "ENSP00000399285"
    found_transcript_id = grch38.transcript_id_of_protein_id(protein_id)
    eq_("ENST00000445132", found_transcript_id)
63 |
--------------------------------------------------------------------------------
/tests/test_transcript_objects.py:
--------------------------------------------------------------------------------
1 | from pyensembl import Locus, cached_release
2 |
3 | from .common import eq_, neq_, gt_, run_multiple_genomes
4 | from .data import (
5 | FOXP3_001_transcript_id,
6 | CTNNBIP1_004_transcript_id,
7 | CTNNBIP1_004_UTR5,
8 | CTNNBIP1_004_UTR3,
9 | CTNNBIP1_004_CDS,
10 | CTNNBIP1_004_locus,
11 | CTTNNIP1_004_exon_lengths,
12 | CTTNNIP1_004_exon_ids,
13 | EGFR_001_protein_sequence,
14 | TP53_gene_id,
15 | )
16 |
# Ensembl release 77, shared by the tests below that do not take a `genome` argument.
ensembl77 = cached_release(77)
18 |
19 |
def test_transcript_start_codon():
    """
    test_transcript_start_codon : Check that the locus and start codon
    offsets of the transcript named CTNNBIP1-004 match known values.
    """
    transcript = ensembl77.transcript_by_id(CTNNBIP1_004_transcript_id)

    # Locus.__eq__ and Locus.__str__ are called explicitly on the transcript;
    # presumably so only the locus fields take part -- confirm against Locus.
    locus_matches = Locus.__eq__(transcript, CTNNBIP1_004_locus)
    assert locus_matches, "Expected locus %s but got %s" % (
        CTNNBIP1_004_locus,
        Locus.__str__(transcript),
    )

    offsets = transcript.start_codon_spliced_offsets
    n_offsets = len(offsets)
    assert n_offsets == 3, "Wrong length for start codon: %d (%s)" % (
        len(offsets),
        offsets,
    )

    all_integers = all(isinstance(offset, int) for offset in offsets)
    assert all_integers, "Wrong type %s for beginning start codon offset" % (
        [type(offset) for offset in offsets],
    )

    # The start codon should begin right after the 5' UTR.
    expected_first_offset = len(CTNNBIP1_004_UTR5)
    first_offset = min(offsets)
    assert (
        first_offset == expected_first_offset
    ), "Incorrect start codon offset, expected %d but got %d" % (
        expected_first_offset,
        first_offset,
    )
54 |
55 |
def test_transcript_exons():
    """
    test_transcript_exons : Ensure that the IDs and lengths of
    CTNNBIP1-004's Exon objects match known values.
    """
    transcript = ensembl77.transcript_by_id(CTNNBIP1_004_transcript_id)
    exons = transcript.exons
    is_list = isinstance(exons, list)
    assert is_list, "Expected list of Exon objects, got %s : %s" % (
        exons,
        type(exons),
    )

    # There must be exactly one exon for every expected exon length.
    n_expected = len(CTTNNIP1_004_exon_lengths)
    assert len(exons) == n_expected, "Expected %d exons but got %d" % (
        len(CTTNNIP1_004_exon_lengths),
        len(exons),
    )

    for exon_number, exon in enumerate(exons, start=1):
        index = exon_number - 1

        expected_id = CTTNNIP1_004_exon_ids[index]
        assert (
            exon.id == expected_id
        ), "Expected exon #%d of %s to have ID %s but got %s" % (
            exon_number,
            transcript,
            expected_id,
            exon.id,
        )

        expected_length = CTTNNIP1_004_exon_lengths[index]
        assert (
            len(exon) == expected_length
        ), "Expected exon #%d of %s (%s) to have length %d but got %d" % (
            exon_number,
            transcript,
            exon,
            expected_length,
            len(exon),
        )
94 |
95 |
96 | # not testing NCBI/Release 54 since I just discovered that ensembl54
97 | # feature='transcript' entries don't have a gene ID.
98 | # TODO: Add gene_id patching to gtf_parsing, add ensembl54 to the list
99 | # below
@run_multiple_genomes(75, 77)
def test_sequence_parts(genome):
    """The 5' UTR, CDS and 3' UTR of FOXP3-001 combine into the full transcript."""
    foxp3 = genome.transcript_by_id(FOXP3_001_transcript_id)

    # Each part, and the whole sequence, must be non-empty.
    whole = foxp3.sequence
    gt_(len(whole), 0)

    five_prime = foxp3.five_prime_utr_sequence
    gt_(len(five_prime), 0)

    coding = foxp3.coding_sequence
    gt_(len(coding), 0)

    three_prime = foxp3.three_prime_utr_sequence
    gt_(len(three_prime), 0)

    # The parts are concatenated directly and compared with the whole.
    stitched = five_prime + coding + three_prime
    stitched_length = len(stitched)

    # The transcript's own length must match the combined length of the parts.
    length_message = (
        "Length 5' UTR(%dnt) + CDS(%dnt) + 3' UTR(%d) = %d, expected %d"
        % (len(five_prime), len(coding), len(three_prime), stitched_length, len(foxp3))
    )
    eq_(stitched_length, len(foxp3), length_message)

    sequence_message = (
        "Expected FOXP3-001 sequence:\n%s\n\n5' UTR + CDS + 3' UTR:\n%s"
        % (whole, stitched)
    )
    eq_(stitched, whole, sequence_message)
138 |
139 |
def test_transcript_utr5_sequence_CTNNIP1_004():
    """The 5' UTR of CTNNBIP1-004 matches the known sequence and length."""
    transcript = ensembl77.transcript_by_id(CTNNBIP1_004_transcript_id)
    actual_utr5 = transcript.five_prime_utr_sequence
    expected_length = len(CTNNBIP1_004_UTR5)
    message = "Expected 5' UTR length %d, got %d" % (
        expected_length,
        len(actual_utr5),
    )
    eq_(len(actual_utr5), expected_length, message)
    eq_(actual_utr5, CTNNBIP1_004_UTR5)
150 |
151 |
def test_transcript_utr3_sequence_CTNNIP1_004():
    """The 3' UTR of CTNNBIP1-004 matches the known sequence and length."""
    transcript = ensembl77.transcript_by_id(CTNNBIP1_004_transcript_id)
    actual_utr3 = transcript.three_prime_utr_sequence
    expected_length = len(CTNNBIP1_004_UTR3)
    message = "Expected 3' UTR length %d, got %d" % (
        expected_length,
        len(actual_utr3),
    )
    eq_(len(actual_utr3), expected_length, message)
    eq_(actual_utr3, CTNNBIP1_004_UTR3)
162 |
163 |
def test_transcript_cds_CTNNIP1_004():
    """The coding sequence of CTNNBIP1-004 matches the known sequence and length."""
    transcript = ensembl77.transcript_by_id(CTNNBIP1_004_transcript_id)
    actual_cds = transcript.coding_sequence
    expected_length = len(CTNNBIP1_004_CDS)
    message = "Expected CDS length %d, got %d" % (expected_length, len(actual_cds))
    eq_(len(actual_cds), expected_length, message)
    eq_(actual_cds, CTNNBIP1_004_CDS)
174 |
175 |
@run_multiple_genomes()
def test_equal_transcripts(genome):
    """The same transcript fetched two different ways is equal and hashes equally."""
    tp53_gene = genome.genes_by_name("TP53")[0]
    via_gene = tp53_gene.transcripts[0]
    # Fetch the same transcript again, this time by its ID.
    via_id = genome.transcript_by_id(via_gene.id)
    eq_(via_gene, via_id)
    eq_(hash(via_gene), hash(via_id))
183 |
184 |
@run_multiple_genomes()
def test_not_equal_transcripts(genome):
    """Transcripts of two different genes (MUC1, BRCA1) are not equal."""
    muc1_gene = genome.genes_by_name("MUC1")[0]
    muc1_transcript = muc1_gene.transcripts[0]
    brca1_gene = genome.genes_by_name("BRCA1")[0]
    brca1_transcript = brca1_gene.transcripts[0]
    neq_(muc1_transcript, brca1_transcript)
190 |
191 |
def test_protein_id():
    """EGFR-001 in release 77 reports the expected protein ID."""
    matches = ensembl77.transcripts_by_name("EGFR-001")
    egfr_001 = matches[0]
    eq_(egfr_001.protein_id, "ENSP00000275493")
195 |
196 |
def test_protein_protein_sequence():
    """EGFR-001 in release 77 reports the expected protein sequence."""
    matches = ensembl77.transcripts_by_name("EGFR-001")
    egfr_001 = matches[0]
    eq_(egfr_001.protein_sequence, EGFR_001_protein_sequence)
200 |
201 |
def test_transcript_gene_should_match_parent_gene():
    """Each transcript of TP53 points back to the TP53 gene."""
    parent_gene = ensembl77.gene_by_id(TP53_gene_id)
    for child_transcript in parent_gene.transcripts:
        eq_(child_transcript.gene, parent_gene)
206 |
207 |
@run_multiple_genomes()
def test_BRCA1_201_has_protein_coding_biotype(genome):
    """BRCA1-201 is flagged as protein coding and has that biotype."""
    brca1_201 = genome.transcripts_by_name("BRCA1-201")[0]
    flagged_protein_coding = brca1_201.is_protein_coding
    assert (
        flagged_protein_coding
    ), "Expected BRCA1-201 transcript %s to have a protein coding biotype in %s" % (
        brca1_201,
        genome,
    )
    eq_(brca1_201.biotype, "protein_coding")
218 |
--------------------------------------------------------------------------------
/tests/test_transcript_sequences.py:
--------------------------------------------------------------------------------
"""Make sure we're getting the correct transcript sequence from Ensembl and that
it's a sequence type which correctly implements `complement`
and `reverse_complement`.
"""
5 |
6 | from __future__ import absolute_import
7 | from .common import eq_
8 | from pyensembl import genome_for_reference_name
9 |
# Genome shared by the tests in this module; created once at import time.
grch38 = genome_for_reference_name("GRCh38")
11 |
12 |
def test_transcript_sequence_ensembl_grch38():
    """A very short transcript returns the same sequence via both access paths."""
    # Transcript of the extremely short TRD gene.
    transcript_id = "ENST00000448914"
    expected = "ACTGGGGGATACG"

    # Look the sequence up directly on the genome...
    eq_(grch38.transcript_sequence(transcript_id), expected)
    # ...and again through a Transcript object.
    transcript = grch38.transcript_by_id(transcript_id)
    eq_(transcript.sequence, expected)
20 |
--------------------------------------------------------------------------------
/tests/test_transcript_support_level.py:
--------------------------------------------------------------------------------
"""
Tests for the `support_level` property of Transcript objects, which exposes
the Ensembl Transcript Support Level (TSL) as an integer or None.
"""
5 | from __future__ import absolute_import
6 |
7 | from .common import eq_
8 |
9 | from pyensembl import cached_release
10 |
11 |
def test_transcript_support_level():
    """Check the `support_level` reported for several kinds of transcripts.

    The Transcript Support Level (TSL) is a method to highlight the
    well-supported and poorly-supported transcript models for users, based on
    the type and quality of the alignments used to annotate the transcript.
    In the Ensembl database it can be assigned a value 1 through 5, be
    reported as NA, be missing for a record, or be missing completely in
    older releases. We translate it to an integer value, otherwise to None.
    """

    def support_level_of(release, transcript_name):
        # Support level of the first transcript found under this name.
        return release.transcripts_by_name(transcript_name)[0].support_level

    release_93 = cached_release(93)
    eq_(support_level_of(release_93, "DDX11L1-202"), 1)

    # For this transcript, the transcript_support_level value is missing in
    # the database record:
    eq_(support_level_of(release_93, "OR4G11P-202"), None)

    # Some features are reported as "NA" in Ensembl: those are features like
    # pseudogenes, single exon transcripts, HLA, T-cell receptor and Ig
    # transcripts that are not analysed in terms of TSL and therefore not
    # given any of the TSL categories. We translate NA to None as well.
    eq_(support_level_of(release_93, "MIR1302-2-201"), None)

    # The transcript_support_level column was missing completely in GRCh37
    # and older releases of GRCh38:
    release_77 = cached_release(77)
    eq_(support_level_of(release_77, "DDX11L1-002"), None)
36 |
--------------------------------------------------------------------------------
/tests/test_ucsc_gtf.py:
--------------------------------------------------------------------------------
1 | from pyensembl import Genome, Database
2 |
3 | from .common import TemporaryDirectory, eq_
4 | from .data import data_path
5 |
# Paths to the small UCSC-derived GTF fixtures used by the tests below.
UCSC_GENCODE_PATH = data_path("gencode.ucsc.small.gtf")
UCSC_REFSEQ_PATH = data_path("refseq.ucsc.small.gtf")
8 |
9 |
def test_ucsc_gencode_gtf():
    """The small GENCODE GTF loads into a dataframe with 12 exon rows."""
    with TemporaryDirectory() as tmpdir:
        database = Database(UCSC_GENCODE_PATH, cache_directory_path=tmpdir)
        gtf_frame = database._load_gtf_as_dataframe()
        is_exon = gtf_frame["feature"] == "exon"
        exons = gtf_frame[is_exon]
        n_exons = len(exons)
        assert n_exons == 12, "Expected 12 exons, got %d: %s" % (len(exons), exons)
17 |
18 |
def test_ucsc_gencode_genome():
    """
    Test a Genome object built from a small GENCODE GTF file downloaded from
    http://genome.ucsc.edu/cgi-bin/hgTables
    """
    with TemporaryDirectory() as tmpdir:
        genome = Genome(
            reference_name="GRCh38",
            annotation_name="ucsc_test",
            gtf_path_or_url=UCSC_GENCODE_PATH,
            cache_directory_path=tmpdir,
        )
        genome.index()

        # Every gene must have an ID, and there must be 7 of them.
        genes = genome.genes()
        for gene in genes:
            assert gene.id, "Gene with missing ID in %s" % (genome,)
        n_genes = len(genes)
        assert n_genes == 7, "Expected 7 genes, got %d: %s" % (len(genes), genes)

        # Likewise for transcripts.
        transcripts = genome.transcripts()
        for transcript in transcripts:
            assert transcript.id, "Transcript with missing ID in %s" % (genome,)
        n_transcripts = len(transcripts)
        assert n_transcripts == 7, "Expected 7 transcripts, got %d: %s" % (
            len(transcripts),
            transcripts,
        )

        # This gene is expected to have an ID but no name and no biotype.
        looked_up_gene = genome.gene_by_id("uc001aak.4")
        eq_(looked_up_gene.id, "uc001aak.4")
        eq_(looked_up_gene.name, None)
        eq_(looked_up_gene.biotype, None)

        genes_at_17369 = genome.genes_at_locus("chr1", 17369)
        eq_(genes_at_17369[0].id, "uc031tla.1")

        transcripts_at_30564 = genome.transcripts_at_locus("chr1", 30564)
        eq_(transcripts_at_30564[0].id, "uc057aty.1")
54 |
55 |
def test_ucsc_refseq_gtf():
    """
    Load a small RefSeq GTF file downloaded from
    http://genome.ucsc.edu/cgi-bin/hgTables
    into a dataframe and check that it has 16 exon rows.
    """
    with TemporaryDirectory() as tmpdir:
        database = Database(UCSC_REFSEQ_PATH, cache_directory_path=tmpdir)
        gtf_frame = database._load_gtf_as_dataframe()
        is_exon = gtf_frame["feature"] == "exon"
        exons = gtf_frame[is_exon]
        n_exons = len(exons)
        assert n_exons == 16, "Expected 16 exons, got %d: %s" % (len(exons), exons)
67 |
68 |
def test_ucsc_refseq_genome():
    """
    Test a Genome object built from a small RefSeq GTF file downloaded from
    http://genome.ucsc.edu/cgi-bin/hgTables
    """
    with TemporaryDirectory() as tmpdir:
        genome = Genome(
            reference_name="GRCh38",
            annotation_name="ucsc_test",
            gtf_path_or_url=UCSC_REFSEQ_PATH,
            cache_directory_path=tmpdir,
        )
        genome.index()

        # Every gene must have an ID, and there must be 2 of them. The GTF
        # dataframe is only loaded for the message if an assertion fails.
        genes = genome.genes()
        for gene in genes:
            assert gene.id, "Gene with missing ID in %s" % (
                genome.db._load_gtf_as_dataframe(),
            )
        n_genes = len(genes)
        assert n_genes == 2, "Expected 2 genes, got %d: %s" % (len(genes), genes)

        # Likewise for transcripts.
        transcripts = genome.transcripts()
        for transcript in transcripts:
            assert transcript.id, "Transcript with missing ID in %s" % (
                genome.db._load_gtf_as_dataframe(),
            )
        n_transcripts = len(transcripts)
        assert n_transcripts == 2, "Expected 2 transcripts, got %d: %s" % (
            len(transcripts),
            transcripts,
        )

        # Both genes are expected to overlap this position.
        genes_at_locus = genome.genes_at_locus("chr1", 67092176)
        n_at_locus = len(genes_at_locus)
        assert (
            n_at_locus == 2
        ), "Expected 2 genes at locus chr1:67092176, got %d: %s" % (
            len(genes_at_locus),
            genes_at_locus,
        )
        found_ids = {gene.id for gene in genes_at_locus}
        eq_({"NM_001276352", "NR_075077"}, found_ids)
106 |
--------------------------------------------------------------------------------