├── .gitignore ├── .idea ├── .gitignore ├── cdot.iml ├── inspectionProfiles │ ├── Project_Default.xml │ └── profiles_settings.xml ├── misc.xml ├── modules.xml └── vcs.xml ├── CHANGELOG.md ├── LICENSE ├── README.md ├── cdot ├── __init__.py ├── data_release.py ├── hgvs │ ├── __init__.py │ └── dataproviders │ │ ├── __init__.py │ │ ├── ensembl_tark_data_provider.py │ │ ├── fasta_seqfetcher.py │ │ ├── json_data_provider.py │ │ └── seqfetcher.py └── pyhgvs │ ├── __init__.py │ └── pyhgvs_transcript.py ├── generate_transcript_data ├── Mus_musculus │ ├── refseq_transcripts_grcm38.sh │ └── refseq_transcripts_grcm39.sh ├── Snakefile ├── __init__.py ├── all_transcripts.sh ├── cdot_gene_info.py ├── cdot_json.py ├── cdot_transcripts.yaml ├── ensembl_transcripts_chm13v2.sh ├── ensembl_transcripts_grch37.sh ├── ensembl_transcripts_grch38.sh ├── gene_info.sh ├── gff_parser.py ├── github_release_upload.sh ├── json_encoders.py ├── json_schema_version.py ├── mus_musculus.sh ├── refseq_transcripts_chm13v2.sh ├── refseq_transcripts_grch37.sh ├── refseq_transcripts_grch38.sh ├── requirements.txt ├── uta_20210129_grch37.sql ├── uta_20210129_grch38.sql └── uta_transcripts.sh ├── paper ├── HGVS cleaning.ipynb ├── clean_hgvs_search_csvs.py ├── combine_csv.py └── investigate_fails.py ├── pyproject.toml ├── setup.cfg └── tests ├── __init__.py ├── benchmark_hgvs.py ├── genome.py ├── mock_ensembl_tark.py ├── mock_seqfetcher.py ├── test_data ├── cdot.ensembl.grch38.json ├── cdot.refseq.grch37.json ├── clinvar_hgvs │ ├── clinvar_hgvs_010.tsv │ ├── clinvar_hgvs_050.tsv │ ├── clinvar_hgvs_100.tsv │ ├── clinvar_hgvs_500.tsv │ ├── clinvar_hgvs_ensembl.tsv │ ├── clinvar_hgvs_ensembl_100.tsv │ ├── clinvar_hgvs_ensembl_50.tsv │ └── clinvar_hgvs_ensembl_500.tsv ├── ensembl_tark │ └── transcript │ │ ├── assembly_name=GRCh38&expand=transcript_release_set&loc_end=36530514&loc_region=7&loc_start=36530417&page=2.json │ │ ├── 
assembly_name=GRCh38&expand=transcript_release_set&loc_end=36530514&loc_region=7&loc_start=36530417&page=3.json │ │ ├── assembly_name=GRCh38&expand=transcript_release_set&loc_end=36530514&loc_region=7&loc_start=36530417&page=4.json │ │ ├── assembly_name=GRCh38&expand=transcript_release_set&loc_end=36530514&loc_region=7&loc_start=36530417&page=5.json │ │ ├── assembly_name=GRCh38&expand=transcript_release_set&loc_end=36530514&loc_region=7&loc_start=36530417&page=6.json │ │ ├── assembly_name=GRCh38&expand=transcript_release_set&loc_end=36530514&loc_region=7&loc_start=36530417.json │ │ ├── search │ │ └── identifier_field=AOAH&expand=exons,genes,sequence.json │ │ └── stable_id=ENST00000617537&stable_id_version=5&expand_all=true.json ├── ensembl_test.GRCh38.104.gtf ├── ensembl_test.GRCh38.111.gtf ├── grch37.genome ├── hg19_chrY_300kb_genes.gtf ├── refseq_grch37_mt.gff ├── refseq_grch38.p14_mt.gff ├── refseq_test.GRCh38.p13_genomic.109.20210514.gff ├── refseq_test.GRCh38.p14_genomic.RS_2023_03.gff └── transcript_sequences.json ├── test_gff_parsers.py ├── test_json_data_provider_ensembl.py ├── test_json_data_provider_refseq.py ├── test_pyhgvs.py └── test_uta_conversion.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /.idea/.gitignore: -------------------------------------------------------------------------------- 1 | /workspace.xml 2 | misc.xml 3 | 4 | -------------------------------------------------------------------------------- /.idea/cdot.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/Project_Default.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 19 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/vcs.xml: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ## [0.2.26] - 2024-08-15 2 | 3 | Bumped version to 0.2.26 to catch up with data release. Only new client functionality is #81 'data_release' helper functions 4 | 5 | All other changes in this release were for data (and contained in data_v0.2.26) 6 | 7 | ### Added 8 | 9 | - #81 New 'data_release' code eg 'get_latest_combo_file_urls' that looks on GitHub to find latest data 10 | - New GFFs: RefSeq RS_2023_10, Ensembl 111, 112 11 | - #79 - RefSeq MT transcripts 12 | - #66 - We now store 'Note' field (thanks holtgrewe for suggestion) 13 | - Added requirements.txt for 'generate_transcript_data' sections 14 | - client / JSON data schema version compatibility check 15 | 16 | ### Changed 17 | 18 | - #56 - Fix occasional UTA duplicated exons 19 | - #57 - Correctly handle retrieving genomic position and dealing w/indels in GFF (thanks ltnetcase for reporting) 20 | - #60 - Fix for missing protein IDs due to Genbank / GenBank (thanks holtgrewe) 21 | - #64 - Split code/data versions. 
json.gz are now labelled according to data schema version (thanks holtgrewe) 22 | - Renamed 'CHM13v2.0' to 'T2T-CHM13v2.0' so it could work with biocommons bioutils 23 | - #72 - Correctly handle ncRNA_gene genes (thanks holtgrewe for reporting) 24 | - #73 - HGNC ID was missing for some chrMT genes in Ensembl 25 | 26 | ## [0.2.21] - 2023-08-14 27 | 28 | ### Changed 29 | 30 | - #45 - FastaSeqFetcher - fix alignment gaps properly 31 | - #52 - Added transcripts from Ensembl 110 GRCh38 release 32 | - #53 - UTA to cdot transcript start/end conversion issue 33 | 34 | ## [0.2.20] - 2023-07-10 35 | 36 | ### Changed 37 | 38 | - #50 - Biotype was missing in Ensembl transcripts 39 | 40 | ## [0.2.19] - 2023-07-06 41 | 42 | ### Changed 43 | 44 | - #49 - MT not converted to contigs correctly (GRCh37/Ensembl only) #49 45 | - Removed accidental logging 46 | 47 | ## [0.2.18] - 2023-07-05 48 | 49 | ### Added 50 | 51 | - #44 - Support for mouse transcripts (Mus Musculus GRCm38 and GRCm39) 52 | - #47 - Implement HGVS DataProvider get_alignments_for_region 53 | 54 | ### Changed 55 | 56 | - #45 - FastaSeqFetcher - handle deletions correctly (had swapped HGVS cigar projections around) 57 | - #46 - HGVS DataProvider get_tx_info should properly handle alt_ac and alt_aln_method 58 | 59 | ## [0.2.17] - 2023-05-08 60 | 61 | ### Added 62 | 63 | - #42 - Ensembl T2T CHM13v2.0 64 | 65 | ### Changed 66 | 67 | - #43 - Contigs not converted to accession numbers properly (this was breaking local Biocommons HGVS conversion using 0.2.16 data) 68 | 69 | ## [0.2.16] - 2023-04-12 70 | 71 | ### Added 72 | 73 | - Added historical release 110 (2022-04-12) for T2T CHM13v2.0 74 | - Added latest GRCh38.p14 release (2023-03-21) 75 | 76 | ## [0.2.15] - 2023-04-03 77 | 78 | ### Added 79 | 80 | - Support for T2T CHM13v2.0 81 | 82 | ## [0.2.14] - 2023-03-21 83 | 84 | ### Added 85 | 86 | - #39 - Fasta file SeqFetcher implementation 87 | - Add Ensembl 109 GTF 88 | 89 | ### Changed 90 | 91 | - #38 - Differing 
implementation of get_tx_for_region to hgvs one (reported by Manuel Holtgrewe) 92 | - #35 - Tags (ie MANE Select / RefSeq select etc) should be genome build specific 93 | - #34 - Stick to PyHGVS conventions, throw ValueError: transcript is required on missing transcript 94 | 95 | ## [0.2.13] - 2023-02-23 96 | 97 | ### Changed 98 | 99 | - Fix for #25 - Pyhgvs data conversion - non-coding transcripts have bad cds start/end conversion 100 | - Fix for #32 - Signature of get_pyhgvs_data consistent for all return statements 101 | 102 | ## [0.2.12] - 2022-12-08 103 | 104 | ### Added 105 | 106 | - #30 - We now store "tag" attributes (eg "MANE Select", "RefSeq Select") 107 | - Switch to using Ensembl GFF3 (so we can get tags out) 108 | - Add Ensembl 108 GFF3 109 | 110 | ### Changed 111 | 112 | - Fix for #25 - GeneInfo currently fails for some records 113 | - Fix for #27 - Change URL for missing RefSeq GFFs 114 | 115 | ## [0.2.11] - 2022-09-27 116 | 117 | ### Added 118 | 119 | - Now support all methods (get_gene_info, get_tx_for_gene, get_tx_for_region) for REST 120 | - Add Ensembl 107 GTF 121 | 122 | ### Changed 123 | 124 | - Ensembl gene info was missing "description" 125 | 126 | ## [0.2.10] - 2022-09-19 127 | 128 | ### Added 129 | 130 | - [Implement get_gene_info](https://github.com/SACGF/cdot/issues/20) - For local JSON data only 131 | 132 | ### Changed 133 | 134 | - Fixed issue [#23 UTA transcripts for PyHGVS](https://github.com/SACGF/cdot/issues/23) 135 | 136 | ## [0.2.9] - 2022-09-01 137 | 138 | ### Changed 139 | 140 | - [BugFix for get_tx_for_region](https://github.com/SACGF/cdot/issues/22) 141 | 142 | 143 | ## [0.2.8] - 2022-08-29 144 | 145 | ### Added 146 | 147 | - [Implemented get_pro_ac_for_tx_ac](https://github.com/SACGF/cdot/issues/14) (c_to_p can now generate p.HGVS) 148 | - [Implemented get_tx_for_region](https://github.com/SACGF/cdot/issues/18) for local JSON data only 149 | 150 | ## [0.2.7] - 2022-05-19 151 | 152 | ### Added 153 | 154 | - Add transcripts 
from latest RefSeq GRCh37 (105) and RefSeq GRCh38 (110) 155 | 156 | ### Changed 157 | 158 | - Fixed default arguments bug where PyHGVS only worked on SACGF fork 159 | - gtf_to_json now goes straight to cdot format (without intermediary PyReference format) 160 | - UTA is not included in generation scripts by default, to enable, set environment variable UTA_TRANSCRIPTS=True 161 | - Handle mismatches in UTA CIGAR alignments (convert to match (no indels) as GFF format has no support for mismatch) 162 | 163 | ## [0.2.6] - 2022-05-19 164 | 165 | ### Changed 166 | 167 | - Fixed issue [Ensembl contigs g_to_c](https://github.com/SACGF/cdot/issues/9) - Ensembl JSON was using chrom names ie "17" instead of "NC_000017.11" for contig 168 | 169 | ## [0.2.5] - 2022-04-14 170 | 171 | ### Changed 172 | 173 | - PyHGVS conversion fix - non-coding cds_start/cds_end is set to start/end (not None) 174 | 175 | ## [0.2.4] - 2022-04-13 176 | 177 | ### Added 178 | 179 | - Latest RefSeq (110) and Ensembl (106) transcripts 180 | 181 | ### Changed 182 | 183 | - Fixed bug where all UTA transcripts were '-' strand 184 | - Add "other_chroms" to combined historical file 185 | 186 | ## [0.2.3] - 2022-03-29 187 | 188 | ### Changed 189 | 190 | - Fixed bug where HGNC not extracted properly from Ensembl GTFs 191 | - Gene information is now included by default (only adds 5%) 192 | - Clean artifacts from UTA data 193 | - Support for [SACGF PyHGVS fork](https://github.com/SACGF/hgvs) (which adds alignment gap support) 194 | 195 | ## [0.2.2] - 2022-03-03 196 | 197 | ### Added 198 | 199 | - Support for HTTPS (bought SSL certificate for REST server) 200 | 201 | ## [0.2.1] - 2022-03-03 202 | 203 | ### Added 204 | 205 | - [Download/Convert UTA transcripts](https://github.com/SACGF/cdot/issues/1) 206 | - [REST client](https://github.com/SACGF/cdot/issues/4) for [REST Service](https://github.com/SACGF/cdot_rest/) 207 | 208 | ### Changed 209 | 210 | - [JSON format changed](https://github.com/SACGF/cdot/issues/2), 
separating common/build specific coordinates. This is so a transcript can contain data for multiple builds. 211 | - [Use ijson to reduce RAM usage](https://github.com/SACGF/cdot/issues/7) - uses iterator vs loading all JSON into RAM 212 | 213 | ## [0.1.1] - 2022-01-19 214 | 215 | ### Added 216 | 217 | - Initial commit 218 | 219 | [unreleased]: https://github.com/SACGF/cdot/compare/v0.2.26...HEAD 220 | [0.2.26]: https://github.com/SACGF/cdot/compare/v0.2.21...v0.2.26 221 | [0.2.21]: https://github.com/SACGF/cdot/compare/v0.2.20...v0.2.21 222 | [0.2.20]: https://github.com/SACGF/cdot/compare/v0.2.19...v0.2.20 223 | [0.2.19]: https://github.com/SACGF/cdot/compare/v0.2.18...v0.2.19 224 | [0.2.18]: https://github.com/SACGF/cdot/compare/v0.2.17...v0.2.18 225 | [0.2.17]: https://github.com/SACGF/cdot/compare/v0.2.16...v0.2.17 226 | [0.2.16]: https://github.com/SACGF/cdot/compare/v0.2.15...v0.2.16 227 | [0.2.15]: https://github.com/SACGF/cdot/compare/v0.2.14...v0.2.15 228 | [0.2.14]: https://github.com/SACGF/cdot/compare/v0.2.13...v0.2.14 229 | [0.2.13]: https://github.com/SACGF/cdot/compare/v0.2.12...v0.2.13 230 | [0.2.12]: https://github.com/SACGF/cdot/compare/v0.2.11...v0.2.12 231 | [0.2.11]: https://github.com/SACGF/cdot/compare/v0.2.10...v0.2.11 232 | [0.2.10]: https://github.com/SACGF/cdot/compare/v0.2.9...v0.2.10 233 | [0.2.9]: https://github.com/SACGF/cdot/compare/v0.2.8...v0.2.9 234 | [0.2.8]: https://github.com/SACGF/cdot/compare/v0.2.7...v0.2.8 235 | [0.2.7]: https://github.com/SACGF/cdot/compare/v0.2.6...v0.2.7 236 | [0.2.6]: https://github.com/SACGF/cdot/compare/v0.2.5...v0.2.6 237 | [0.2.5]: https://github.com/SACGF/cdot/compare/v0.2.4...v0.2.5 238 | [0.2.4]: https://github.com/SACGF/cdot/compare/v0.2.3...v0.2.4 239 | [0.2.3]: https://github.com/SACGF/cdot/compare/v0.2.2...v0.2.3 240 | [0.2.2]: https://github.com/SACGF/cdot/compare/v0.2.1...v0.2.2 241 | [0.2.1]: https://github.com/SACGF/cdot/compare/v0.1.1...v0.2.1 242 | [0.1.1]: 
https://github.com/SACGF/cdot/releases/tag/v0.1.1 243 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 SACGF 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # cdot 2 | 3 | [![PyPi version](https://img.shields.io/pypi/v/cdot.svg)](https://pypi.org/project/cdot/) [![Python versions](https://img.shields.io/pypi/pyversions/cdot.svg)](https://pypi.org/project/cdot/) [![DOI](https://zenodo.org/badge/448753921.svg)](https://zenodo.org/doi/10.5281/zenodo.13324621) 4 | 5 | 6 | cdot provides transcripts for the 2 most popular Python [HGVS](http://varnomen.hgvs.org/) libraries. 
7 | 8 | It works by: 9 | 10 | * Converting RefSeq/Ensembl GTFs to JSON 11 | * Providing loaders for the HGVS libraries, via JSON.gz files, or REST API via [cdot_rest](https://github.com/SACGF/cdot_rest) 12 | 13 | We currently support 1.58 million transcript/genome alignments (vs ~141k in UTA v.20210129) 14 | 15 | ## New 16 | 17 | See [changelog](https://github.com/SACGF/cdot/blob/main/CHANGELOG.md) 18 | 19 | 2024-08-15: 20 | 21 | * 'data_release' helper code 22 | * Many minor updates to data (see changelog) 23 | 24 | 2023-07-05: 25 | * BioCommons HGVS DataProvider fixes 26 | * Support for mouse transcripts (Mus Musculus GRCm38 and GRCm39) 27 | 28 | 2023-04-03: 29 | * #41 - Support for T2T CHM13v2.0 [example code](https://github.com/SACGF/cdot/wiki/Biocommons-T2T-CHM13v2.0-example-code) 30 | 31 | ## Install 32 | 33 | ``` 34 | pip install cdot 35 | ``` 36 | 37 | ## Examples 38 | 39 | [Biocommons HGVS](https://github.com/biocommons/hgvs) example: 40 | 41 | ``` 42 | import hgvs 43 | from hgvs.assemblymapper import AssemblyMapper 44 | from cdot.hgvs.dataproviders import JSONDataProvider, RESTDataProvider 45 | 46 | hdp = RESTDataProvider() # Uses API server at cdot.cc 47 | # hdp = JSONDataProvider(["./cdot-0.2.14.refseq.grch37.json.gz"]) # Uses local JSON file 48 | 49 | am = AssemblyMapper(hdp, 50 | assembly_name='GRCh37', 51 | alt_aln_method='splign', replace_reference=True) 52 | 53 | hp = hgvs.parser.Parser() 54 | var_c = hp.parse_hgvs_variant('NM_001637.3:c.1582G>A') 55 | am.c_to_g(var_c) 56 | ``` 57 | 58 | [more Biocommons examples](https://github.com/SACGF/cdot/wiki/Biocommons-HGVS-example-code): 59 | 60 | [PyHGVS](https://github.com/counsyl/hgvs) example: 61 | 62 | ``` 63 | import pyhgvs 64 | from pysam.libcfaidx import FastaFile 65 | from cdot.pyhgvs.pyhgvs_transcript import JSONPyHGVSTranscriptFactory, RESTPyHGVSTranscriptFactory 66 | 67 | genome = FastaFile("/data/annotation/fasta/GCF_000001405.25_GRCh37.p13_genomic.fna.gz") 68 | factory = 
RESTPyHGVSTranscriptFactory() 69 | # factory = JSONPyHGVSTranscriptFactory(["./cdot-0.2.14.refseq.grch37.json.gz"]) # Uses local JSON file 70 | pyhgvs.parse_hgvs_name('NM_001637.3:c.1582G>A', genome, get_transcript=factory.get_transcript_grch37) 71 | ``` 72 | 73 | [more PyHGVS examples](https://github.com/SACGF/cdot/wiki/PyHGVS-example-code): 74 | 75 | ## Q. What's the performance like? 76 | 77 | * UTA public DB: 1-1.5 seconds / transcript 78 | * cdot REST service: 10/second 79 | * cdot JSON.gz: 500-1k/second 80 | 81 | ## Q. Where can I download the JSON.gz files? 82 | 83 | [Download from GitHub releases](https://github.com/SACGF/cdot/releases) - RefSeq (37/38) - 72M, Ensembl (37/38) 61M 84 | 85 | Details on what the files contain [here](https://github.com/SACGF/cdot/wiki/GitHub-release-file-details) 86 | 87 | ## Q. How does this compare to Universal Transcript Archive? 88 | 89 | Both projects have similar goals of providing transcripts for loading HGVS, but they approach it in different ways 90 | 91 | * UTA aligns sequences, then stores coordinates in an SQL database. 92 | * cdot converts existing Ensembl/RefSeq GTFs into JSON 93 | 94 | See [wiki for more details](https://github.com/SACGF/cdot/wiki/cdot-vs-UTA) 95 | 96 | ## Q. How do you store transcripts in JSON? 97 | 98 | See [wiki page](https://github.com/SACGF/cdot/wiki/Transcript-JSON-format) for the format. 99 | 100 | We think a standard for JSON gene/transcript information would be a great thing, and are keen to collaborate to make it happen! 101 | 102 | ## Q. What does cdot stand for? 103 | 104 | cdot, pronounced "see dot", is a play on HGVS coding sequence ```:c.``` But if you want a backronym, it's "Complete Dict Of Transcripts" 105 | 106 | This was developed for the [Australian Genomics](https://www.australiangenomics.org.au/) [Shariant](https://shariant.org.au/) project, due to the need to load historical HGVS from lab archives. 
107 | -------------------------------------------------------------------------------- /cdot/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.2.26" 2 | # Data version is kept in generate_transcript_version.json_schema_version 3 | 4 | def get_data_schema_int(version: str) -> int: 5 | """ Return an int which increments upon breaking changes - ie anything other than patch """ 6 | major, minor, patch = version.split(".") 7 | return 1000 * int(major) + int(minor) 8 | -------------------------------------------------------------------------------- /cdot/data_release.py: -------------------------------------------------------------------------------- 1 | import re 2 | import requests 3 | import cdot 4 | 5 | from cdot import get_data_schema_int 6 | 7 | 8 | def get_latest_data_release_tag_name(): 9 | latest_data_release = get_latest_data_release() 10 | return latest_data_release.get('tag_name') 11 | 12 | def _get_version_from_tag_name(tag_name, data_version=False): 13 | """ Returns None if doesn't match required prefix """ 14 | release_prefix = "v" 15 | if data_version: 16 | release_prefix = "data_" + release_prefix 17 | 18 | if not tag_name.startswith(release_prefix): 19 | return None 20 | return tag_name.lstrip(release_prefix) 21 | 22 | 23 | def get_latest_data_release(): 24 | client_data_schema = get_data_schema_int(cdot.__version__) 25 | 26 | url = "https://api.github.com/repos/SACGF/cdot/releases" 27 | response = requests.get(url) 28 | json_data = response.json() 29 | for release in json_data: 30 | tag_name = release['tag_name'] # Should look like 'v0.2.25' for code or 'data_v0.2.25' for data 31 | # We require a data version 32 | data_version = _get_version_from_tag_name(tag_name, data_version=True) 33 | if data_version is None: 34 | continue 35 | 36 | data_schema = get_data_schema_int(data_version) 37 | if data_schema != client_data_schema: 38 | continue 39 | return release 40 | return {} 41 | 42 | def 
get_latest_combo_file_urls(annotation_consortia, genome_builds): 43 | # lower case everything to be case insensitive 44 | annotation_consortia = {x.lower() for x in annotation_consortia} 45 | genome_builds = {x.lower() for x in genome_builds} 46 | 47 | file_urls = [] 48 | if latest_data_release := get_latest_data_release(): 49 | for asset in latest_data_release["assets"]: 50 | browser_download_url = asset["browser_download_url"] 51 | filename = browser_download_url.rsplit("/")[-1] 52 | if m := re.match(r"cdot-(\d+\.\d+\.\d+)\.(refseq|ensembl)\.(.+)\.json\.gz", filename): 53 | _version, annotation_consortium, genome_build = m.groups() 54 | if annotation_consortium.lower() in annotation_consortia and genome_build.lower() in genome_builds: 55 | file_urls.append(browser_download_url) 56 | return file_urls -------------------------------------------------------------------------------- /cdot/hgvs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SACGF/cdot/ddeb78d58731dd4136689360d0fce4a8a91af87d/cdot/hgvs/__init__.py -------------------------------------------------------------------------------- /cdot/hgvs/dataproviders/__init__.py: -------------------------------------------------------------------------------- 1 | from .fasta_seqfetcher import * 2 | from .json_data_provider import * 3 | from .seqfetcher import * 4 | -------------------------------------------------------------------------------- /cdot/hgvs/dataproviders/fasta_seqfetcher.py: -------------------------------------------------------------------------------- 1 | import abc 2 | import re 3 | 4 | from pysam.libcfaidx import FastaFile 5 | from hgvs.dataproviders.interface import Interface 6 | from hgvs.exceptions import HGVSDataNotAvailableError 7 | from bioutils.sequences import reverse_complement 8 | 9 | from cdot.hgvs.dataproviders.seqfetcher import AbstractTranscriptSeqFetcher, PrefixSeqFetcher 10 | 11 | 12 | class 
GenomeFastaSeqFetcher: 13 | def __init__(self, *args): 14 | self.source = "Local Fasta file reference" 15 | self.contig_fastas = {} 16 | for fasta_filename in args: 17 | fasta_file = FastaFile(fasta_filename) 18 | for contig in fasta_file.references: 19 | self.contig_fastas[contig] = fasta_file 20 | 21 | if not self.contig_fastas: 22 | raise ValueError("Need to provide at least one of fasta file as argument") 23 | 24 | def fetch_seq(self, ac, start_i=None, end_i=None): 25 | if fasta_file := self.contig_fastas.get(ac): # Contig 26 | return fasta_file.fetch(ac, start_i, end_i).upper() 27 | 28 | raise HGVSDataNotAvailableError(f"Accession '{ac}' not in fasta contigs") 29 | 30 | 31 | class ExonsFromGenomeFastaSeqFetcher(AbstractTranscriptSeqFetcher): 32 | """ This produces artificial transcript sequences by pasting together exons from the genome 33 | It is possible that this does not exactly match the transcript sequences - USE AT OWN RISK! """ 34 | def __init__(self, *args, cache=True): 35 | self.cache = cache 36 | self.transcript_cache = {} 37 | self.hdp = None # Set when passed to data provider (via set_data_provider) 38 | self.source = "Transcript Exons using Genome Fasta file reference" 39 | self.contig_fastas = {} 40 | self.cigar_pattern = re.compile(r"(\d+)([=DIX])") 41 | for fasta_filename in args: 42 | fasta_file = FastaFile(fasta_filename) 43 | for contig in fasta_file.references: 44 | self.contig_fastas[contig] = fasta_file 45 | 46 | if not self.contig_fastas: 47 | raise ValueError("Need to provide at least one of fasta file as argument") 48 | super().__init__(*args, cache) 49 | 50 | def get_mapping_options(self, ac): 51 | return self.hdp.get_tx_mapping_options(ac) 52 | 53 | def _get_transcript_seq(self, ac): 54 | possible_contigs = set() 55 | for tx_mo in self.get_mapping_options(ac): 56 | alt_ac = tx_mo["alt_ac"] 57 | possible_contigs.add(alt_ac) 58 | if alt_ac in self.contig_fastas: 59 | return self._fetch_seq_from_fasta(ac, alt_ac, 
tx_mo["alt_aln_method"]) 60 | 61 | msg = f"Failed to fetch {ac} from {self.source}. " 62 | if possible_contigs: 63 | possible_contigs = sorted(possible_contigs) 64 | raise HGVSDataNotAvailableError(f"{msg} No Fasta provided with contigs: {possible_contigs}") 65 | raise HGVSDataNotAvailableError(f"{msg} Transcript '{ac}' not found.") 66 | 67 | def _fetch_seq_from_fasta(self, ac, alt_ac, alt_aln_method): 68 | fasta_file = self.contig_fastas[alt_ac] 69 | 70 | exons = self.hdp.get_tx_exons(ac, alt_ac, alt_aln_method) 71 | exon_sequences = [] 72 | expected_transcript_length = 0 73 | sorted_exons = list(sorted(exons, key=lambda ex: ex["ord"])) 74 | first_exon = sorted_exons[0] 75 | transcript_start_offset = first_exon["tx_start_i"] # HGVS/UTA starts w/0 76 | if transcript_start_offset: 77 | exon_sequences.append("N" * transcript_start_offset) 78 | expected_transcript_length += transcript_start_offset 79 | 80 | for exon in sorted_exons: 81 | exon_seq = fasta_file.fetch(alt_ac, exon["alt_start_i"], exon["alt_end_i"]) 82 | exon_seq = exon_seq.upper() 83 | 84 | exon_seq_list = [] 85 | start = 0 86 | # We are using HGVS cigar 87 | for (length_str, op) in self.cigar_pattern.findall(exon["cigar"]): 88 | length = int(length_str) 89 | if op == 'D': # Deletion in reference vs transcript 90 | exon_seq_list.append("N" * length) 91 | # Don't increment start (as we didn't move along genomic exon) 92 | elif op == 'I': # Insertion in reference vs transcript 93 | # Leave out of exon_seq 94 | start += length # We do increment through genomic sequence though 95 | else: # match/mismatch 96 | exon_seq_list.append(exon_seq[start:start+length]) 97 | start += length 98 | 99 | exon_seq = "".join(exon_seq_list) 100 | if exon["alt_strand"] == -1: 101 | exon_seq = reverse_complement(exon_seq) 102 | exon_sequences.append(exon_seq) 103 | expected_transcript_length += exon["tx_end_i"] - exon["tx_start_i"] 104 | 105 | transcript_sequence = "".join(exon_sequences) 106 | if len(transcript_sequence) != 
class FastaSeqFetcher(PrefixSeqFetcher):
    """Deprecated convenience wrapper, kept for backwards compatibility.

    Accessions starting with "NC_" are served by a GenomeFastaSeqFetcher; every
    other accession falls through to the default ExonsFromGenomeFastaSeqFetcher.
    Both are constructed from the same fasta filenames.
    """

    def __init__(self, *args, cache=True):
        """Build the prefix-dispatching seqfetcher.

        Args:
            *args: Fasta filenames, passed unchanged to both underlying seqfetchers.
            cache: Forwarded to ExonsFromGenomeFastaSeqFetcher. Previously this
                argument was accepted but ignored (always True).
        """
        default_seqfetcher = ExonsFromGenomeFastaSeqFetcher(*args, cache=cache)

        super().__init__(default_seqfetcher=default_seqfetcher)
        self.prefix_seqfetchers.update({
            "NC_": GenomeFastaSeqFetcher(*args),
        })
class MultiSeqFetcher(abc.ABC):
    """ Base class for seqfetchers that wrap an ordered list of other seqfetchers

        Subclasses implement fetch_seq() to decide how the wrapped seqfetchers are combined
    """

    def __init__(self, *args):
        # Each positional argument is a seqfetcher, order is preserved
        self.seqfetchers = list(args)

    def set_data_provider(self, hdp: Interface):
        """ Pass the data provider on to every wrapped seqfetcher that has a set_data_provider() method """
        for seqfetcher in self.seqfetchers:
            # Look the method up rather than wrapping the call in try/except AttributeError:
            # seqfetchers without the method are still skipped, but an AttributeError raised
            # *inside* a seqfetcher's set_data_provider() is no longer silently swallowed
            setter = getattr(seqfetcher, "set_data_provider", None)
            if setter is not None:
                setter(hdp)

    @abc.abstractmethod
    def fetch_seq(self, ac, start_i=None, end_i=None):
        pass

    @property
    def source(self):
        # This needs to execute after set_data_provider is called
        return ", ".join(s.source for s in self.seqfetchers)
class AbstractTranscriptSeqFetcher(abc.ABC):
    """ Base class for seqfetchers that obtain a whole transcript sequence, then return a slice of it

        Subclasses implement _get_transcript_seq(). Inherits abc.ABC so the abstract method is
        actually enforced (a subclass that forgets it now fails at construction, rather than
        returning None and failing later inside fetch_seq)

        set_data_provider() must be called before fetch_seq()
    """

    def __init__(self, *args, cache=True):
        self.cache = cache
        self.transcript_cache = {}  # accession -> full transcript sequence
        self.hdp = None  # Set when passed to data provider (via set_data_provider)

    @abc.abstractmethod
    def _get_transcript_seq(self, ac):
        """ Return the full sequence for accession 'ac' """
        pass

    def get_transcript_seq(self, ac):
        """ Return the full transcript sequence, storing it in transcript_cache if caching is enabled """
        transcript_seq = self.transcript_cache.get(ac)
        if not transcript_seq:
            transcript_seq = self._get_transcript_seq(ac)
            if self.cache:
                self.transcript_cache[ac] = transcript_seq
        return transcript_seq

    def set_data_provider(self, hdp: Interface):
        self.hdp = hdp

    def fetch_seq(self, ac, start_i=None, end_i=None):
        """ Return transcript sequence [start_i:end_i], defaulting to the whole transcript

            :raises HGVSDataNotAvailableError: if no data provider has been set
        """
        if self.hdp is None:
            raise HGVSDataNotAvailableError(f"{self}: You need to set set_data_provider() before calling fetch_seq()")

        transcript_seq = self.get_transcript_seq(ac)
        if start_i is None:
            start_i = 0
        if end_i is None:
            end_i = len(transcript_seq)
        return transcript_seq[start_i:end_i]
class AbstractPyHGVSTranscriptFactory(abc.ABC):
    """ Converts transcript JSON into PyHGVS transcripts; subclasses supply the JSON via _get_transcript() """

    def __init__(self):
        pass

    @abc.abstractmethod
    def _get_transcript(self, transcript_id):
        pass

    def get_transcript_grch37(self, transcript_id, sacgf_pyhgvs_fork=False):
        """ Shortcut for get_transcript() with genome build "GRCh37" """
        return self.get_transcript(transcript_id, "GRCh37", sacgf_pyhgvs_fork=sacgf_pyhgvs_fork)

    def get_transcript_grch38(self, transcript_id, sacgf_pyhgvs_fork=False):
        """ Shortcut for get_transcript() with genome build "GRCh38" """
        return self.get_transcript(transcript_id, "GRCh38", sacgf_pyhgvs_fork=sacgf_pyhgvs_fork)

    def get_transcript(self, transcript_id, genome_build, sacgf_pyhgvs_fork=False):
        """ Return make_transcript() of the PyHGVS data, or None if there is no data for this build """
        data = self.get_pyhgvs_data(transcript_id, genome_build, sacgf_pyhgvs_fork=sacgf_pyhgvs_fork)
        return make_transcript(data) if data else None

    def get_pyhgvs_data(self, transcript_id, genome_build, sacgf_pyhgvs_fork=False) -> Dict:
        """ Build the dict handed to PyHGVS make_transcript()

            Returns {} when the transcript is unknown or has no coordinates for genome_build.
            With sacgf_pyhgvs_fork=True, the extra keys "cdna_match", "start_codon_transcript_pos",
            "stop_codon_transcript_pos" (and "other_chroms" if present) are added.
        """
        tx_json = self._get_transcript(transcript_id) or {}
        coords = tx_json.get("genome_builds", {}).get(genome_build)
        if coords is None:
            return {}

        raw_exons = coords['exons']
        tx_start, tx_end = raw_exons[0][0], raw_exons[-1][1]

        result = {
            "id": tx_json["id"],
            "chrom": coords['contig'],
            "start": tx_start,
            "end": tx_end,
            "strand": coords["strand"],
            # PyHGVS has cds_start/cds_end equal end (so CDS length is 0) if non-coding
            "cds_start": coords.get('cds_start', tx_end),
            "cds_end": coords.get('cds_end', tx_end),
            "gene_name": tx_json['gene_name'],
        }

        if not sacgf_pyhgvs_fork:
            # Standard PyHGVS - only keep start/end
            result["exons"] = [exon[:2] for exon in raw_exons]
            return result

        # Remove the 3rd element (exon_number)
        cdna_match = [exon[:2] + exon[3:] for exon in raw_exons]
        result["cdna_match"] = cdna_match
        result["start_codon_transcript_pos"] = tx_json.get("start_codon")
        result["stop_codon_transcript_pos"] = tx_json.get("stop_codon")
        other_chroms = coords.get("other_chroms")
        if other_chroms:
            result["other_chroms"] = other_chroms
        result["exons"] = cdna_match
        return result
def is_sacgf_pyhgvs_fork(version=None):
    """ Return True if the PyHGVS version is >= 0.12.0 (the version bump used to identify the SACGF fork)

        :param version: version string to check. Defaults to the installed "pyhgvs" distribution's
                        version (importlib.metadata raises PackageNotFoundError if it has no metadata)
    """
    required_version = (0, 12, 0)  # Bumped version on 24 Nov 2021 - has mito and cDNA_match fixes
    if version is None:
        version = metadata.version("pyhgvs")

    release = []
    for component in version.split("."):
        # Only the leading digits count, so suffixed components such as "0rc1", "0+local"
        # or "dev1" no longer raise ValueError from int()
        leading_digits = component[:len(component) - len(component.lstrip("0123456789"))]
        if not leading_digits:
            break
        release.append(int(leading_digits))
        if leading_digits != component:
            break  # Anything after a suffixed component is not part of the release number

    # Pad so that eg "0.12" compares as (0, 12, 0) rather than sorting below it
    release.extend([0] * (len(required_version) - len(release)))
    return tuple(release) >= required_version
#!/bin/bash

# Downloads the RefSeq GRCm39 (mouse) GFF if not already present, then converts it to cdot JSON.
# Requires GENE_INFO_JSON to be set in the environment.

set -e

# Resolve the script's real location first - with a short relative invocation
# (eg "./refseq_transcripts_grcm39.sh") dirname-of-dirname evaluates to "." rather than the parent dir
SCRIPT_DIR="$(dirname "$(realpath "${BASH_SOURCE[0]}")")"
BASE_DIR="$(dirname "${SCRIPT_DIR}")"
CDOT_VERSION="$("${BASE_DIR}/cdot_json.py" --version)"

if [[ -z ${GENE_INFO_JSON} ]]; then
  echo "You need to set environment variable GENE_INFO_JSON, pointing to the filename produced by cdot_gene_info.py"
  exit 1
fi


filename=GCF_000001635.27_GRCm39_genomic.gff.gz
url=https://ftp.ncbi.nlm.nih.gov/refseq/M_musculus/annotation_releases/109/GCF_000001635.27_GRCm39/${filename}
cdot_file="cdot-${CDOT_VERSION}.$(basename "${filename}" .gz).json.gz"
if [[ ! -e ${filename} ]]; then
  wget "${url}"
fi
if [[ ! -e ${cdot_file} ]]; then
  "${BASE_DIR}/cdot_json.py" gff3_to_json "${filename}" --no-contig-conversion --url "${url}" --genome-build=GRCm39 --output "${cdot_file}" --gene-info-json="${GENE_INFO_JSON}"
fi
def get_build_input_files(wildcards):
    """ Snakemake input function: the per-release cdot JSON files for one annotation consortium / genome build """
    consortium = wildcards.annotation_consortium
    genome_build = wildcards.genome_build
    # Config entry maps name -> url; presumably expand() iterates the names (dict keys) - verify
    names_to_urls = config["config"][consortium][genome_build]
    pattern = f"{consortium}/{genome_build}/cdot-{cdot_data_version}" + "-{name}.json.gz"
    return expand(pattern, name=names_to_urls)
#!/bin/bash

# Generates cdot JSON for every RefSeq / Ensembl genome build under the current working directory,
# then combines the GRCh37 + GRCh38 files for each annotation consortium.

set -e

FULL_PATH_TO_SCRIPT="$(realpath "${BASH_SOURCE[-1]}")"
BASE_DIR="$(dirname "${FULL_PATH_TO_SCRIPT}")"

# Python scripts will import via generate_transcript_data
export PYTHONPATH="${BASE_DIR}/.."

CDOT_DATA_VERSION="$("${BASE_DIR}/cdot_json.py" --version)"

echo "Generating all transcripts for cdot data version ${CDOT_DATA_VERSION}"

# Exported so the scripts run below can read it from their environment.
# Assigned separately from the export so a failing command substitution is not masked under "set -e"
GENE_INFO_JSON="$(pwd)/Homo_sapiens.gene-info-${CDOT_DATA_VERSION}.json.gz"
export GENE_INFO_JSON

if [[ ! -e ${GENE_INFO_JSON} ]]; then
  "${BASE_DIR}/gene_info.sh"
fi

echo "Gene summary variable = ${GENE_INFO_JSON}"

# RefSeq
mkdir -p refseq
cd refseq

mkdir -p GRCh37
cd GRCh37
"${BASE_DIR}/refseq_transcripts_grch37.sh"
cd ..

mkdir -p GRCh38
cd GRCh38
"${BASE_DIR}/refseq_transcripts_grch38.sh"
cd ..

mkdir -p T2T-CHM13v2.0
cd T2T-CHM13v2.0
"${BASE_DIR}/refseq_transcripts_chm13v2.sh"
cd ..

# Combine genome builds (we're in refseq dir)
REFSEQ_COMBO="cdot-${CDOT_DATA_VERSION}.refseq.grch37_grch38.json.gz"
if [[ ! -e ${REFSEQ_COMBO} ]]; then
  "${BASE_DIR}/cdot_json.py" combine_builds \
    --grch37 "GRCh37/cdot-${CDOT_DATA_VERSION}.refseq.grch37.json.gz" \
    --grch38 "GRCh38/cdot-${CDOT_DATA_VERSION}.refseq.grch38.json.gz" \
    --output "${REFSEQ_COMBO}"
fi

cd ..

# Ensembl
mkdir -p ensembl
cd ensembl

mkdir -p GRCh37
cd GRCh37
"${BASE_DIR}/ensembl_transcripts_grch37.sh"
cd ..

mkdir -p GRCh38
cd GRCh38
"${BASE_DIR}/ensembl_transcripts_grch38.sh"
cd ..

mkdir -p T2T-CHM13v2.0
cd T2T-CHM13v2.0
"${BASE_DIR}/ensembl_transcripts_chm13v2.sh"
cd ..


# Combine genome builds (we're in ensembl dir)
ENSEMBL_COMBO="cdot-${CDOT_DATA_VERSION}.ensembl.grch37_grch38.json.gz"
if [[ ! -e ${ENSEMBL_COMBO} ]]; then
  "${BASE_DIR}/cdot_json.py" combine_builds \
    --grch37 "GRCh37/cdot-${CDOT_DATA_VERSION}.ensembl.grch37.json.gz" \
    --grch38 "GRCh38/cdot-${CDOT_DATA_VERSION}.ensembl.grch38.json.gz" \
    --output "${ENSEMBL_COMBO}"
fi

cd ..
def _get_entrez_gene_summary(id_list, max_attempts=3):
    """ Post gene IDs to Entrez and return the "DocumentSummary" entries of the resulting esummary

        Each failed attempt is logged and retried, up to max_attempts in total.

        :param id_list: gene ID strings (joined with "," for Entrez.epost)
        :param max_attempts: total number of attempts before giving up
        :raises RuntimeError: if every attempt failed (chained from the last error). Previously the
                              function fell off the end and returned None, so the caller failed
                              with an unrelated TypeError when iterating the result
    """
    last_error = None
    for attempt in range(1, max_attempts + 1):
        try:
            request = Entrez.epost("gene", id=",".join(id_list))
            result = Entrez.read(request)
            web_env = result["WebEnv"]
            query_key = result["QueryKey"]
            data = Entrez.esummary(db="gene", webenv=web_env, query_key=query_key)
            document = Entrez.read(data, ignore_errors=True, validate=False)  # Need recent BioPython
            return document["DocumentSummarySet"]["DocumentSummary"]
        except Exception as e:
            last_error = e
            logging.warning(e)
            if attempt < max_attempts:
                logging.warning("Trying again...")

    raise RuntimeError(f"Entrez gene summary failed after {max_attempts} attempts") from last_error
| args = handle_args() 61 | Entrez.email = args.email # Stop warning message 62 | start_date = datetime.now().isoformat() 63 | 64 | # 10k limit of return data from NCBI 65 | # NCBI_BATCH_SIZE = 10000 66 | NCBI_BATCH_SIZE = 1000 67 | 68 | gene_info = {} 69 | with gzip.open(args.gene_info, "rt") as f: 70 | reader = csv.DictReader(f, dialect='excel-tab') 71 | 72 | for entrez_ids in batch_iterator(iter_entrez_ids(reader), batch_size=NCBI_BATCH_SIZE): 73 | # We should really store it under the gene Id so dupe symbols don't wipe 74 | for gene_summary in _get_entrez_gene_summary(entrez_ids): 75 | gene_id = gene_summary.attributes["uid"] 76 | if error := gene_summary.get("error"): 77 | logging.warning("Skipping '%s' error: %s", gene_id, error) 78 | continue 79 | 80 | gene_info[gene_id] = { 81 | "gene_symbol": gene_summary["NomenclatureSymbol"], 82 | "map_location": gene_summary["MapLocation"], 83 | # Already have description for RefSeq but not Ensembl (will just overwrite) 84 | "description": gene_summary["NomenclatureName"], 85 | # "added": record["date_name_changed"], 86 | "aliases": gene_summary["OtherAliases"], 87 | "summary": gene_summary["Summary"], 88 | } 89 | 90 | print(f"Processed {len(gene_info)} records") 91 | 92 | if gene_info: 93 | with gzip.open(args.output, 'wt') as outfile: 94 | gene_info_file_dt = datetime.fromtimestamp(os.stat(args.gene_info).st_ctime) 95 | 96 | data = { 97 | "cdot_version": cdot.__version__, 98 | "api_retrieval_date": start_date, 99 | "gene_info_date": gene_info_file_dt.isoformat(), 100 | "gene_info": gene_info, 101 | } 102 | json.dump(data, outfile, cls=SortedSetEncoder, sort_keys=True) # Sort so diffs work 103 | 104 | 105 | if __name__ == '__main__': 106 | main() 107 | -------------------------------------------------------------------------------- /generate_transcript_data/cdot_transcripts.yaml: -------------------------------------------------------------------------------- 1 | config: 2 | Ensembl: 3 | # For Ensembl - we have to use 
GTFs as the GFF3s don't have protein versions in them 4 | GRCh37: 5 | #v81 (points to 75) and earlier at GTFs that don't have transcript versions - just skip them 6 | #82 is first GFF3 for GRCh37 7 | #83 has no data 8 | #84 is 82 again 9 | #86 is 85 again 10 | Homo_sapiens_GRCh37_Ensembl_82.gtf: "https://ftp.ensembl.org/pub/grch37/release-82/gtf/homo_sapiens/Homo_sapiens.GRCh37.82.gtf.gz" 11 | Homo_sapiens_GRCh37_Ensembl_85.gtf: "https://ftp.ensembl.org/pub/grch37/release-85/gtf/homo_sapiens/Homo_sapiens.GRCh37.85.gtf.gz" 12 | Homo_sapiens_GRCh37_Ensembl_87.gtf: "https://ftp.ensembl.org/pub/grch37/release-87/gtf/homo_sapiens/Homo_sapiens.GRCh37.87.gtf.gz" 13 | 14 | GRCh38: 15 | Homo_sapiens_GRCh38_Ensembl_81.gtf: "https://ftp.ensembl.org/pub/release-81/gtf/homo_sapiens/Homo_sapiens.GRCh38.81.gtf.gz" 16 | Homo_sapiens_GRCh38_Ensembl_82.gtf: "https://ftp.ensembl.org/pub/release-82/gtf/homo_sapiens/Homo_sapiens.GRCh38.82.gtf.gz" 17 | Homo_sapiens_GRCh38_Ensembl_83.gtf: "https://ftp.ensembl.org/pub/release-83/gtf/homo_sapiens/Homo_sapiens.GRCh38.83.gtf.gz" 18 | Homo_sapiens_GRCh38_Ensembl_84.gtf: "https://ftp.ensembl.org/pub/release-84/gtf/homo_sapiens/Homo_sapiens.GRCh38.84.gtf.gz" 19 | Homo_sapiens_GRCh38_Ensembl_85.gtf: "https://ftp.ensembl.org/pub/release-85/gtf/homo_sapiens/Homo_sapiens.GRCh38.85.gtf.gz" 20 | Homo_sapiens_GRCh38_Ensembl_86.gtf: "https://ftp.ensembl.org/pub/release-86/gtf/homo_sapiens/Homo_sapiens.GRCh38.86.gtf.gz" 21 | Homo_sapiens_GRCh38_Ensembl_87.gtf: "https://ftp.ensembl.org/pub/release-87/gtf/homo_sapiens/Homo_sapiens.GRCh38.87.gtf.gz" 22 | Homo_sapiens_GRCh38_Ensembl_88.gtf: "https://ftp.ensembl.org/pub/release-88/gtf/homo_sapiens/Homo_sapiens.GRCh38.88.gtf.gz" 23 | Homo_sapiens_GRCh38_Ensembl_89.gtf: "https://ftp.ensembl.org/pub/release-89/gtf/homo_sapiens/Homo_sapiens.GRCh38.89.gtf.gz" 24 | Homo_sapiens_GRCh38_Ensembl_90.gtf: "https://ftp.ensembl.org/pub/release-90/gtf/homo_sapiens/Homo_sapiens.GRCh38.90.gtf.gz" 25 | 
Homo_sapiens_GRCh38_Ensembl_91.gtf: "https://ftp.ensembl.org/pub/release-91/gtf/homo_sapiens/Homo_sapiens.GRCh38.91.gtf.gz" 26 | Homo_sapiens_GRCh38_Ensembl_92.gtf: "https://ftp.ensembl.org/pub/release-92/gtf/homo_sapiens/Homo_sapiens.GRCh38.92.gtf.gz" 27 | Homo_sapiens_GRCh38_Ensembl_93.gtf: "https://ftp.ensembl.org/pub/release-93/gtf/homo_sapiens/Homo_sapiens.GRCh38.93.gtf.gz" 28 | Homo_sapiens_GRCh38_Ensembl_94.gtf: "https://ftp.ensembl.org/pub/release-94/gtf/homo_sapiens/Homo_sapiens.GRCh38.94.gtf.gz" 29 | Homo_sapiens_GRCh38_Ensembl_95.gtf: "https://ftp.ensembl.org/pub/release-95/gtf/homo_sapiens/Homo_sapiens.GRCh38.95.gtf.gz" 30 | Homo_sapiens_GRCh38_Ensembl_96.gtf: "https://ftp.ensembl.org/pub/release-96/gtf/homo_sapiens/Homo_sapiens.GRCh38.96.gtf.gz" 31 | Homo_sapiens_GRCh38_Ensembl_97.gtf: "https://ftp.ensembl.org/pub/release-97/gtf/homo_sapiens/Homo_sapiens.GRCh38.97.gtf.gz" 32 | Homo_sapiens_GRCh38_Ensembl_98.gtf: "https://ftp.ensembl.org/pub/release-98/gtf/homo_sapiens/Homo_sapiens.GRCh38.98.gtf.gz" 33 | Homo_sapiens_GRCh38_Ensembl_99.gtf: "https://ftp.ensembl.org/pub/release-99/gtf/homo_sapiens/Homo_sapiens.GRCh38.99.gtf.gz" 34 | Homo_sapiens_GRCh38_Ensembl_100.gtf: "https://ftp.ensembl.org/pub/release-100/gtf/homo_sapiens/Homo_sapiens.GRCh38.100.gtf.gz" 35 | Homo_sapiens_GRCh38_Ensembl_101.gtf: "https://ftp.ensembl.org/pub/release-101/gtf/homo_sapiens/Homo_sapiens.GRCh38.101.gtf.gz" 36 | Homo_sapiens_GRCh38_Ensembl_102.gtf: "https://ftp.ensembl.org/pub/release-102/gtf/homo_sapiens/Homo_sapiens.GRCh38.102.gtf.gz" 37 | Homo_sapiens_GRCh38_Ensembl_103.gtf: "https://ftp.ensembl.org/pub/release-103/gtf/homo_sapiens/Homo_sapiens.GRCh38.103.gtf.gz" 38 | Homo_sapiens_GRCh38_Ensembl_104.gtf: "https://ftp.ensembl.org/pub/release-104/gtf/homo_sapiens/Homo_sapiens.GRCh38.104.gtf.gz" 39 | Homo_sapiens_GRCh38_Ensembl_105.gtf: "https://ftp.ensembl.org/pub/release-105/gtf/homo_sapiens/Homo_sapiens.GRCh38.105.gtf.gz" 40 | Homo_sapiens_GRCh38_Ensembl_106.gtf: 
"https://ftp.ensembl.org/pub/release-106/gtf/homo_sapiens/Homo_sapiens.GRCh38.106.gtf.gz" 41 | Homo_sapiens_GRCh38_Ensembl_107.gtf: "https://ftp.ensembl.org/pub/release-107/gtf/homo_sapiens/Homo_sapiens.GRCh38.107.gtf.gz" 42 | Homo_sapiens_GRCh38_Ensembl_108.gtf: "https://ftp.ensembl.org/pub/release-108/gtf/homo_sapiens/Homo_sapiens.GRCh38.108.gtf.gz" 43 | Homo_sapiens_GRCh38_Ensembl_109.gtf: "https://ftp.ensembl.org/pub/release-109/gtf/homo_sapiens/Homo_sapiens.GRCh38.109.gtf.gz" 44 | Homo_sapiens_GRCh38_Ensembl_110.gtf: "https://ftp.ensembl.org/pub/release-110/gtf/homo_sapiens/Homo_sapiens.GRCh38.110.gtf.gz" 45 | Homo_sapiens_GRCh38_Ensembl_111.gtf: "https://ftp.ensembl.org/pub/release-111/gtf/homo_sapiens/Homo_sapiens.GRCh38.111.gtf.gz" 46 | Homo_sapiens_GRCh38_Ensembl_112.gtf: "https://ftp.ensembl.org/pub/release-112/gtf/homo_sapiens/Homo_sapiens.GRCh38.112.gtf.gz" 47 | Homo_sapiens_GRCh38_Ensembl_113.gtf: "https://ftp.ensembl.org/pub/release-113/gtf/homo_sapiens/Homo_sapiens.GRCh38.113.gtf.gz" 48 | Homo_sapiens_GRCh38_Ensembl_114.gtf: "https://ftp.ensembl.org/pub/release-114/gtf/homo_sapiens/Homo_sapiens.GRCh38.114.gtf.gz" 49 | # Gives me a 403 - reported to Ensembl 50 | # Homo_sapiens_GRCh38_Ensembl_115.gtf: "https://ftp.ensembl.org/pub/release-115/gtf/homo_sapiens/Homo_sapiens.GRCh38.115.gtf.gz" 51 | 52 | T2T-CHM13v2.0: 53 | Homo_sapiens_T2T-CHM13v2.0_Ensembl_2022_06.gtf: "https://ftp.ensembl.org/pub/rapid-release/species/Homo_sapiens/GCA_009914755.4/ensembl/geneset/2022_06/Homo_sapiens-GCA_009914755.4-2022_06-genes.gtf.gz" 54 | Homo_sapiens_T2T-CHM13v2.0_Ensembl_2022_07.gtf: "https://ftp.ensembl.org/pub/rapid-release/species/Homo_sapiens/GCA_009914755.4/ensembl/geneset/2022_07/Homo_sapiens-GCA_009914755.4-2022_07-genes.gtf.gz" 55 | RefSeq: 56 | GRCh37: 57 | Homo_sapiens_GRCh37_RefSeq_p5.gff3: "https://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/BUILD.37.3/GFF/ref_GRCh37.p5_top_level.gff3.gz" 58 | Homo_sapiens_GRCh37_RefSeq_103.gff3: 
# NOTE(review): this entry previously repeated the ANNOTATION_RELEASE.106 URL, so release 107 was never downloaded.
# Release 107 is believed to be annotated on GRCh38.p2 - confirm the path exists on the NCBI archive before merging
Homo_sapiens_GRCh38_RefSeq_107.gff3: "https://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.107/GFF/ref_GRCh38.p2_top_level.gff3.gz"
"https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109/GCF_000001405.38_GRCh38.p12/GCF_000001405.38_GRCh38.p12_genomic.gff.gz" 72 | Homo_sapiens_GRCh38_RefSeq_109.20190607.gff: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20190607/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gff.gz" 73 | Homo_sapiens_GRCh38_RefSeq_109.20190905.gff: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20190905/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gff.gz" 74 | Homo_sapiens_GRCh38_RefSeq_109.20191205.gff: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20191205/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gff.gz" 75 | Homo_sapiens_GRCh38_RefSeq_109.20200228.gff: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20200228/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gff.gz" 76 | Homo_sapiens_GRCh38_RefSeq_109.20200522.gff: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20200522/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gff.gz" 77 | Homo_sapiens_GRCh38_RefSeq_109.20200815.gff: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20200815/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gff.gz" 78 | Homo_sapiens_GRCh38_RefSeq_109.20201120.gff: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20201120/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gff.gz" 79 | Homo_sapiens_GRCh38_RefSeq_109.20210226.gff: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20210226/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gff.gz" 80 | Homo_sapiens_GRCh38_RefSeq_109.20210514.gff: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20210514/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gff.gz" 81 | 
Homo_sapiens_GRCh38_RefSeq_109.20211119.gff: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20211119/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gff.gz" 82 | Homo_sapiens_GRCh38_RefSeq_110.gff: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/110/GCF_000001405.40_GRCh38.p14/GCF_000001405.40_GRCh38.p14_genomic.gff.gz" 83 | Homo_sapiens_GRCh38_RefSeq_RS_2023_03.gff: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/GCF_000001405.40-RS_2023_03/GCF_000001405.40_GRCh38.p14_genomic.gff.gz" 84 | Homo_sapiens_GRCh38_RefSeq_RS_2023_10.gff: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/GCF_000001405.40-RS_2023_10/GCF_000001405.40_GRCh38.p14_genomic.gff.gz" 85 | Homo_sapiens_GRCh38_RefSeq_RS_2024_08.gff: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/GCF_000001405.40-RS_2024_08/GCF_000001405.40_GRCh38.p14_genomic.gff.gz" 86 | 87 | T2T-CHM13v2.0: 88 | Homo_sapiens_T2T-CHM13v2.0_RefSeq_110.gff: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/110/GCF_009914755.1_T2T-CHM13v2.0/GCF_009914755.1_T2T-CHM13v2.0_genomic.gff.gz" 89 | Homo_sapiens_T2T-CHM13v2.0_RefSeq_RS_2023_03.gff: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/GCF_009914755.1-RS_2023_03/GCF_009914755.1_T2T-CHM13v2.0_genomic.gff.gz" 90 | Homo_sapiens_T2T-CHM13v2.0_RefSeq_RS_2023_10.gff: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/GCF_009914755.1-RS_2023_10/GCF_009914755.1_T2T-CHM13v2.0_genomic.gff.gz" 91 | Homo_sapiens_T2T-CHM13v2.0_RefSeq_RS_2024_08.gff: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/GCF_009914755.1-RS_2024_08/GCF_009914755.1_T2T-CHM13v2.0_genomic.gff.gz" 92 | -------------------------------------------------------------------------------- /generate_transcript_data/ensembl_transcripts_chm13v2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set 
-e 4 | 5 | BASE_DIR=$(dirname ${BASH_SOURCE[0]}) 6 | CDOT_VERSION=$(${BASE_DIR}/cdot_json.py --version) 7 | GENOME_BUILD=T2T-CHM13v2.0 8 | 9 | if [[ -z ${GENE_INFO_JSON} ]]; then 10 | echo "You need to set environment variable GENE_INFO_JSON, pointing to the filename produced by cdot_gene_info.py" 11 | exit 1 12 | fi 13 | 14 | merge_args=() 15 | for release in 2022_06 2022_07; do 16 | filename=Homo_sapiens-GCA_009914755.4-${release}-genes.gtf.gz 17 | url=https://ftp.ensembl.org/pub/rapid-release/species/Homo_sapiens/GCA_009914755.4/ensembl/geneset/${release}/${filename} 18 | cdot_file=cdot-${CDOT_VERSION}.ensembl.$(basename $filename .gz).json.gz 19 | 20 | if [[ ! -e ${filename} ]]; then 21 | wget ${url} 22 | fi 23 | if [[ ! -e ${cdot_file} ]]; then 24 | ${BASE_DIR}/cdot_json.py gtf_to_json "${filename}" --url "${url}" --genome-build=${GENOME_BUILD} --output "${cdot_file}" --gene-info-json="${GENE_INFO_JSON}" 25 | fi 26 | merge_args+=(${cdot_file}) 27 | done 28 | 29 | merged_file="cdot-${CDOT_VERSION}.ensembl.${GENOME_BUILD}.json.gz" 30 | if [[ ! 
-e ${merged_file} ]]; then 31 | ${BASE_DIR}/cdot_json.py merge_historical ${merge_args[@]} --genome-build=${GENOME_BUILD} --output "${merged_file}" 32 | fi 33 | -------------------------------------------------------------------------------- /generate_transcript_data/ensembl_transcripts_grch37.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | BASE_DIR=$(dirname ${BASH_SOURCE[0]}) 6 | CDOT_VERSION=$(${BASE_DIR}/cdot_json.py --version) 7 | 8 | if [[ -z ${GENE_INFO_JSON} ]]; then 9 | echo "You need to set environment variable GENE_INFO_JSON, pointing to the filename produced by cdot_gene_info.py" 10 | exit 1 11 | fi 12 | 13 | # v81 (points to 75) and earlier at GTFs that don't have transcript versions - just skip them 14 | 15 | #82 is first GFF3 for GRCh37 16 | #83 has no data 17 | #84 is 82 again 18 | #86 is 85 again 19 | merge_args=() 20 | for release in 82 85 87; do 21 | # Switched to using GTFs as they contain protein version 22 | filename=Homo_sapiens.GRCh37.${release}.gtf.gz 23 | url=ftp://ftp.ensembl.org/pub/grch37/release-${release}/gtf/homo_sapiens/${filename} 24 | cdot_file=cdot-${CDOT_VERSION}.ensembl.$(basename $filename .gz).json.gz 25 | if [[ ! -e ${filename} ]]; then 26 | wget ${url} 27 | fi 28 | if [[ ! -e ${cdot_file} ]]; then 29 | ${BASE_DIR}/cdot_json.py gtf_to_json "${filename}" --url "${url}" --genome-build=GRCh37 --output "${cdot_file}" --gene-info-json="${GENE_INFO_JSON}" 30 | fi 31 | merge_args+=(${cdot_file}) 32 | done 33 | 34 | merged_file="cdot-${CDOT_VERSION}.ensembl.grch37.json.gz" 35 | if [[ ! 
-e ${merged_file} ]]; then 36 | ${BASE_DIR}/cdot_json.py merge_historical ${merge_args[@]} --genome-build=GRCh37 --output "${merged_file}" 37 | fi 38 | -------------------------------------------------------------------------------- /generate_transcript_data/ensembl_transcripts_grch38.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | BASE_DIR=$(dirname ${BASH_SOURCE[0]}) 6 | CDOT_VERSION=$(${BASE_DIR}/cdot_json.py --version) 7 | 8 | if [[ -z ${GENE_INFO_JSON} ]]; then 9 | echo "You need to set environment variable GENE_INFO_JSON, pointing to the filename produced by cdot_gene_info.py" 10 | exit 1 11 | fi 12 | 13 | # Skip earlier GTFs as they don't have versions 14 | #for release in 76 77 78 79 80; do 15 | # filename=Homo_sapiens.GRCh38.${release}.gtf.gz 16 | # url=ftp://ftp.ensembl.org/pub/release-${release}/gtf/homo_sapiens/${filename} 17 | # if [[ ! -e ${filename} ]]; then 18 | # wget ${url} 19 | # fi 20 | 21 | # if [[ ! -e ${filename}.json.gz ]]; then 22 | # pyreference_gff_to_json.py --url "${url}" --gff3 "${filename}" 23 | # fi 24 | #done 25 | 26 | #81 is first GFF3 for GRCh38 27 | merge_args=() 28 | for release in 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112; do 29 | # Switched to using GTFs as they contain protein version while Ensembl GFF3s do not (required for c_to_p) 30 | filename=Homo_sapiens.GRCh38.${release}.gtf.gz 31 | url=ftp://ftp.ensembl.org/pub/release-${release}/gtf/homo_sapiens/${filename} 32 | cdot_file=cdot-${CDOT_VERSION}.ensembl.$(basename $filename .gz).json.gz 33 | 34 | if [[ ! -e ${filename} ]]; then 35 | wget ${url} 36 | fi 37 | if [[ ! 
-e ${cdot_file} ]]; then 38 | ${BASE_DIR}/cdot_json.py gtf_to_json "${filename}" --url "${url}" --genome-build=GRCh38 --output "${cdot_file}" --gene-info-json="${GENE_INFO_JSON}" 39 | fi 40 | merge_args+=(${cdot_file}) 41 | done 42 | 43 | merged_file="cdot-${CDOT_VERSION}.ensembl.grch38.json.gz" 44 | if [[ ! -e ${merged_file} ]]; then 45 | ${BASE_DIR}/cdot_json.py merge_historical ${merge_args[@]} --genome-build=GRCh38 --output "${merged_file}" 46 | fi 47 | -------------------------------------------------------------------------------- /generate_transcript_data/gene_info.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [[ -z ${EMAIL} ]]; then 4 | echo "You need to set the 'EMAIL' shell variable (used for NCBI API calls)" 5 | exit 1 # non-zero status so callers using 'set -e' stop here 6 | fi 7 | 8 | BASE_DIR=$(dirname ${BASH_SOURCE[0]}) 9 | # Python scripts will import via generate_transcript_data 10 | export PYTHONPATH=${BASE_DIR}/.. 11 | CDOT_VERSION=$(${BASE_DIR}/cdot_json.py --version) 12 | REFSEQ_DIR=${REFSEQ_DIR:-H_sapiens} 13 | SPECIES=${SPECIES:-Homo_sapiens} 14 | 15 | echo "Generating Gene Info for REFSEQ_DIR=${REFSEQ_DIR}, SPECIES=${SPECIES}" 16 | 17 | filename=${SPECIES}.gene_info.gz 18 | url=https://ftp.ncbi.nlm.nih.gov/refseq/${REFSEQ_DIR}/${filename} 19 | if [[ ! -e ${filename} ]]; then 20 | echo "Downloading ${url}" 21 | wget ${url} 22 | fi 23 | 24 | out_json=${SPECIES}.gene-info-${CDOT_VERSION}.json.gz 25 | if [[ ! -e ${out_json} ]]; then 26 | echo "Processing gene info file..."
27 | ${BASE_DIR}/cdot_gene_info.py --gene-info ${filename} --output ${out_json} --email ${EMAIL} 28 | fi 29 | -------------------------------------------------------------------------------- /generate_transcript_data/github_release_upload.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | if [[ -z ${CDOT_DATA_DIR} ]]; then 6 | echo "You need to set environment variable CDOT_DATA_DIR, pointing to where you ran 'all_transcripts.sh'" 7 | exit 1 8 | fi 9 | 10 | FULL_PATH_TO_SCRIPT="$(realpath "${BASH_SOURCE[-1]}")" 11 | BASE_DIR=$(dirname ${FULL_PATH_TO_SCRIPT}) 12 | 13 | # Python scripts will import via generate_transcript_data 14 | export PYTHONPATH=${BASE_DIR}/.. 15 | 16 | CDOT_DATA_VERSION=$(${BASE_DIR}/cdot_json.py --version) 17 | 18 | CDOT_RELEASE_NAME=data_v${CDOT_DATA_VERSION} 19 | echo "For the rest of the script to work, it assumes you have tagged + pushed a data release of ${CDOT_DATA_VERSION}" 20 | echo "then run: gh release create ${CDOT_RELEASE_NAME} --title=${CDOT_RELEASE_NAME} --notes 'release notes...'" 21 | 22 | gh release upload ${CDOT_RELEASE_NAME} \ 23 | ${CDOT_DATA_DIR}/ensembl/GRCh37/cdot-${CDOT_DATA_VERSION}.ensembl.grch37.json.gz \ 24 | ${CDOT_DATA_DIR}/ensembl/GRCh37/cdot-${CDOT_DATA_VERSION}.ensembl.Homo_sapiens.GRCh37.87.gtf.json.gz \ 25 | ${CDOT_DATA_DIR}/ensembl/GRCh38/cdot-${CDOT_DATA_VERSION}.ensembl.grch38.json.gz \ 26 | ${CDOT_DATA_DIR}/ensembl/GRCh38/cdot-${CDOT_DATA_VERSION}.ensembl.Homo_sapiens.GRCh38.110.gtf.json.gz \ 27 | ${CDOT_DATA_DIR}/ensembl/GRCh38/cdot-${CDOT_DATA_VERSION}.ensembl.Homo_sapiens.GRCh38.112.gtf.json.gz \ 28 | ${CDOT_DATA_DIR}/ensembl/T2T-CHM13v2.0/cdot-${CDOT_DATA_VERSION}.ensembl.T2T-CHM13v2.0.json.gz \ 29 | ${CDOT_DATA_DIR}/refseq/GRCh37/cdot-${CDOT_DATA_VERSION}.refseq.grch37.json.gz \ 30 | ${CDOT_DATA_DIR}/refseq/GRCh37/cdot-${CDOT_DATA_VERSION}.GCF_000001405.25_GRCh37.p13_genomic.105.20201022.gff.json.gz \ 31 | 
${CDOT_DATA_DIR}/refseq/GRCh37/cdot-${CDOT_DATA_VERSION}.GCF_000001405.25_GRCh37.p13_genomic.105.20220307.gff.json.gz \ 32 | ${CDOT_DATA_DIR}/refseq/GRCh38/cdot-${CDOT_DATA_VERSION}.refseq.grch38.json.gz \ 33 | ${CDOT_DATA_DIR}/refseq/GRCh38/cdot-${CDOT_DATA_VERSION}.GCF_000001405.40_GRCh38.p14_genomic.110.gff.json.gz \ 34 | ${CDOT_DATA_DIR}/refseq/GRCh38/cdot-${CDOT_DATA_VERSION}.GCF_000001405.40_GRCh38.p14_genomic.RS_2023_10.gff.json.gz \ 35 | ${CDOT_DATA_DIR}/refseq/T2T-CHM13v2.0/cdot-${CDOT_DATA_VERSION}.refseq.T2T-CHM13v2.0.json.gz 36 | 37 | -------------------------------------------------------------------------------- /generate_transcript_data/json_encoders.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | 4 | class SortedSetEncoder(json.JSONEncoder): 5 | """ Dump set as list, from: https://stackoverflow.com/a/8230505/295724 """ 6 | 7 | def default(self, obj): 8 | if isinstance(obj, set): 9 | return list(sorted(obj)) 10 | return json.JSONEncoder.default(self, obj) -------------------------------------------------------------------------------- /generate_transcript_data/json_schema_version.py: -------------------------------------------------------------------------------- 1 | # After 0.2.22 we split version into separate code (pip) and data schema versions 2 | # The cdot client will use its own major/minor to determine whether it can read these data files 3 | JSON_SCHEMA_VERSION = "0.2.28" 4 | -------------------------------------------------------------------------------- /generate_transcript_data/mus_musculus.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | FULL_PATH_TO_SCRIPT="$(realpath "${BASH_SOURCE[-1]}")" 6 | BASE_DIR=$(dirname ${FULL_PATH_TO_SCRIPT}) 7 | 8 | # Python scripts will import via generate_transcript_data 9 | export PYTHONPATH=${BASE_DIR}/.. 
10 | 11 | CDOT_VERSION=$(${BASE_DIR}/cdot_json.py --version) 12 | 13 | # This needs to be passed to called bash scripts, so they are invoked with "." to use these variables 14 | export REFSEQ_DIR=M_musculus 15 | export SPECIES=Mus_musculus 16 | export GENE_INFO_JSON=$(pwd)/${SPECIES}.gene-info-${CDOT_VERSION}.json.gz 17 | 18 | if [[ ! -e ${GENE_INFO_JSON} ]]; then 19 | ${BASE_DIR}/gene_info.sh 20 | fi 21 | 22 | echo "Gene summary variable = ${GENE_INFO_JSON}" 23 | 24 | # RefSeq 25 | mkdir -p refseq 26 | cd refseq 27 | 28 | mkdir -p GRCm38 29 | cd GRCm38 30 | ${BASE_DIR}/Mus_musculus/refseq_transcripts_grcm38.sh 31 | cd .. 32 | 33 | mkdir -p GRCm39 34 | cd GRCm39 35 | ${BASE_DIR}/Mus_musculus/refseq_transcripts_grcm39.sh 36 | cd .. 37 | -------------------------------------------------------------------------------- /generate_transcript_data/refseq_transcripts_chm13v2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | BASE_DIR=$(dirname ${BASH_SOURCE[0]}) 6 | CDOT_VERSION=$(${BASE_DIR}/cdot_json.py --version) 7 | GENOME_BUILD=T2T-CHM13v2.0 8 | 9 | if [[ -z ${GENE_INFO_JSON} ]]; then 10 | echo "You need to set environment variable GENE_INFO_JSON, pointing to the filename produced by cdot_gene_info.py" 11 | exit 1 12 | fi 13 | 14 | merge_args=() 15 | 16 | filename=GCF_009914755.1_T2T-CHM13v2.0_genomic.110.gff.gz 17 | url=https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/110/GCF_009914755.1_T2T-CHM13v2.0/GCF_009914755.1_T2T-CHM13v2.0_genomic.gff.gz 18 | cdot_file=cdot-${CDOT_VERSION}.$(basename $filename .gz).json.gz 19 | 20 | if [[ ! -e ${filename} ]]; then 21 | wget ${url} --output-document=${filename} 22 | fi 23 | if [[ ! 
-e ${cdot_file} ]]; then 24 | ${BASE_DIR}/cdot_json.py gff3_to_json "${filename}" --url "${url}" --genome-build=${GENOME_BUILD} --output "${cdot_file}" --gene-info-json="${GENE_INFO_JSON}" 25 | fi 26 | merge_args+=(${cdot_file}) 27 | 28 | 29 | filename=GCF_009914755.1_T2T-CHM13v2.0_genomic.RS_2023_03.gff.gz 30 | url=https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/GCF_009914755.1-RS_2023_03/GCF_009914755.1_T2T-CHM13v2.0_genomic.gff.gz 31 | cdot_file=cdot-${CDOT_VERSION}.$(basename $filename .gz).json.gz 32 | 33 | if [[ ! -e ${filename} ]]; then 34 | wget ${url} --output-document=${filename} 35 | fi 36 | if [[ ! -e ${cdot_file} ]]; then 37 | ${BASE_DIR}/cdot_json.py gff3_to_json "${filename}" --url "${url}" --genome-build=${GENOME_BUILD} --output "${cdot_file}" --gene-info-json="${GENE_INFO_JSON}" 38 | fi 39 | merge_args+=(${cdot_file}) 40 | 41 | filename=GCF_009914755.1_T2T-CHM13v2.0_genomic.RS_2023_10.gff.gz 42 | url=https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/GCF_009914755.1-RS_2023_10/GCF_009914755.1_T2T-CHM13v2.0_genomic.gff.gz 43 | cdot_file=cdot-${CDOT_VERSION}.$(basename $filename .gz).json.gz 44 | 45 | if [[ ! -e ${filename} ]]; then 46 | wget ${url} --output-document=${filename} 47 | fi 48 | if [[ ! -e ${cdot_file} ]]; then 49 | ${BASE_DIR}/cdot_json.py gff3_to_json "${filename}" --url "${url}" --genome-build=${GENOME_BUILD} --output "${cdot_file}" --gene-info-json="${GENE_INFO_JSON}" 50 | fi 51 | merge_args+=(${cdot_file}) 52 | 53 | 54 | filename=GCF_009914755.1_T2T-CHM13v2.0_genomic.RS_2024_08.gff.gz 55 | url=https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/GCF_009914755.1-RS_2024_08/GCF_009914755.1_T2T-CHM13v2.0_genomic.gff.gz 56 | cdot_file=cdot-${CDOT_VERSION}.$(basename $filename .gz).json.gz 57 | 58 | if [[ ! -e ${filename} ]]; then 59 | wget ${url} --output-document=${filename} 60 | fi 61 | if [[ ! 
-e ${cdot_file} ]]; then 62 | ${BASE_DIR}/cdot_json.py gff3_to_json "${filename}" --url "${url}" --genome-build=${GENOME_BUILD} --output "${cdot_file}" --gene-info-json="${GENE_INFO_JSON}" 63 | fi 64 | merge_args+=(${cdot_file}) 65 | 66 | 67 | merged_file="cdot-${CDOT_VERSION}.refseq.${GENOME_BUILD}.json.gz" 68 | if [[ ! -e ${merged_file} ]]; then 69 | echo "Creating ${merged_file}" 70 | ${BASE_DIR}/cdot_json.py merge_historical ${merge_args[@]} --genome-build=${GENOME_BUILD} --output "${merged_file}" 71 | fi -------------------------------------------------------------------------------- /generate_transcript_data/refseq_transcripts_grch37.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | BASE_DIR=$(dirname ${BASH_SOURCE[0]}) 6 | CDOT_VERSION=$(${BASE_DIR}/cdot_json.py --version) 7 | UTA_VERSION=20210129 8 | 9 | if [[ -z ${GENE_INFO_JSON} ]]; then 10 | echo "You need to set environment variable GENE_INFO_JSON, pointing to the filename produced by cdot_gene_info.py" 11 | exit 1 12 | fi 13 | 14 | if [[ -z ${UTA_TRANSCRIPTS} ]]; then 15 | echo "Not including UTA transcripts. Set environment variable UTA_TRANSCRIPTS=True to do so" 16 | else 17 | echo "Retrieving / storing UTA transcripts" 18 | fi 19 | 20 | # Having troubles with corrupted files downloading via FTP from NCBI via IPv6, http works ok 21 | # NOTE: RefSeq transcripts in GRCh37 before p13 did not have alignment gap information 22 | 23 | merge_args=() 24 | 25 | filename=ref_GRCh37.p5_top_level.gff3.gz 26 | url=http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/BUILD.37.3/GFF/${filename} 27 | cdot_file=cdot-${CDOT_VERSION}.$(basename $filename .gz).json.gz 28 | if [[ ! -e ${filename} ]]; then 29 | wget ${url} 30 | fi 31 | if [[ ! 
-e ${cdot_file} ]]; then 32 | ${BASE_DIR}/cdot_json.py gff3_to_json "${filename}" --url "${url}" --genome-build=GRCh37 --output "${cdot_file}" --gene-info-json="${GENE_INFO_JSON}" 33 | fi 34 | merge_args+=(${cdot_file}) 35 | 36 | 37 | filename=ref_GRCh37.p9_top_level.gff3.gz 38 | url=http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.103/GFF/${filename} 39 | cdot_file=cdot-${CDOT_VERSION}.$(basename $filename .gz).json.gz 40 | if [[ ! -e ${filename} ]]; then 41 | wget ${url} 42 | fi 43 | if [[ ! -e ${cdot_file} ]]; then 44 | ${BASE_DIR}/cdot_json.py gff3_to_json "${filename}" --url "${url}" --genome-build=GRCh37 --output "${cdot_file}" --gene-info-json="${GENE_INFO_JSON}" 45 | fi 46 | merge_args+=(${cdot_file}) 47 | 48 | 49 | filename=ref_GRCh37.p10_top_level.gff3.gz 50 | url=http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.104/GFF/${filename} 51 | cdot_file=cdot-${CDOT_VERSION}.$(basename $filename .gz).json.gz 52 | if [[ ! -e ${filename} ]]; then 53 | wget ${url} 54 | fi 55 | if [[ ! -e ${cdot_file} ]]; then 56 | ${BASE_DIR}/cdot_json.py gff3_to_json "${filename}" --url "${url}" --genome-build=GRCh37 --output "${cdot_file}" --gene-info-json="${GENE_INFO_JSON}" 57 | fi 58 | merge_args+=(${cdot_file}) 59 | 60 | 61 | if [[ ! -z ${UTA_TRANSCRIPTS} ]]; then 62 | # UTA transcripts have gaps, so they should overwrite the earlier refseq transcripts (without gaps) 63 | # But will be overwritten by newer (post p13) official transcripts 64 | cdot_file="cdot-${CDOT_VERSION}.uta_${UTA_VERSION}.GRCh37.json.gz" 65 | ${BASE_DIR}/uta_transcripts.sh ${UTA_VERSION} GRCh37 66 | merge_args+=(${cdot_file}) 67 | fi 68 | 69 | filename=ref_GRCh37.p13_top_level.gff3.gz 70 | url=http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.105/GFF/${filename} 71 | cdot_file=cdot-${CDOT_VERSION}.$(basename $filename .gz).json.gz 72 | if [[ ! 
-e ${filename} ]]; then 73 | wget ${url} 74 | fi 75 | if [[ ! -e ${cdot_file} ]]; then 76 | ${BASE_DIR}/cdot_json.py gff3_to_json "${filename}" --url "${url}" --genome-build=GRCh37 --output "${cdot_file}" --gene-info-json="${GENE_INFO_JSON}" 77 | fi 78 | merge_args+=(${cdot_file}) 79 | 80 | 81 | # These all have the same name, so rename them based on release ID 82 | for release in 105.20190906 105.20201022 105.20220307; do 83 | filename=GCF_000001405.25_GRCh37.p13_genomic.${release}.gff.gz 84 | url=https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/${release}/GCF_000001405.25_GRCh37.p13/GCF_000001405.25_GRCh37.p13_genomic.gff.gz 85 | cdot_file=cdot-${CDOT_VERSION}.$(basename $filename .gz).json.gz 86 | if [[ ! -e ${filename} ]]; then 87 | wget ${url} --output-document=${filename} 88 | fi 89 | if [[ ! -e ${cdot_file} ]]; then 90 | ${BASE_DIR}/cdot_json.py gff3_to_json "${filename}" --url "${url}" --genome-build=GRCh37 --output "${cdot_file}" --gene-info-json="${GENE_INFO_JSON}" 91 | fi 92 | merge_args+=(${cdot_file}) 93 | done 94 | 95 | merged_file="cdot-${CDOT_VERSION}.refseq.grch37.json.gz" 96 | if [[ ! 
-e ${merged_file} ]]; then 97 | echo "Creating ${merged_file}" 98 | ${BASE_DIR}/cdot_json.py merge_historical ${merge_args[@]} --genome-build=GRCh37 --output "${merged_file}" 99 | fi 100 | -------------------------------------------------------------------------------- /generate_transcript_data/refseq_transcripts_grch38.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | BASE_DIR=$(dirname ${BASH_SOURCE[0]}) 6 | CDOT_VERSION=$(${BASE_DIR}/cdot_json.py --version) 7 | UTA_VERSION=20210129 8 | 9 | if [[ -z ${GENE_INFO_JSON} ]]; then 10 | echo "You need to set environment variable GENE_INFO_JSON, pointing to the filename produced by cdot_gene_info.py" 11 | exit 1 12 | fi 13 | 14 | # Having troubles with corrupted files downloading via FTP from NCBI via IPv6, http works ok 15 | 16 | if [[ -z ${UTA_TRANSCRIPTS} ]]; then 17 | echo "Not including UTA transcripts. Set environment variable UTA_TRANSCRIPTS=True to do so" 18 | else 19 | echo "Retrieving / storing UTA transcripts" 20 | fi 21 | 22 | 23 | merge_args=() 24 | 25 | if [[ ! -z ${UTA_TRANSCRIPTS} ]]; then 26 | # All GRCh38 transcripts have alignments gaps, so use UTA first (and override with official releases) 27 | uta_cdot_file="cdot-${CDOT_VERSION}.uta_${UTA_VERSION}.GRCh38.json.gz" 28 | ${BASE_DIR}/uta_transcripts.sh ${UTA_VERSION} GRCh38 29 | merge_args+=(${uta_cdot_file}) 30 | fi 31 | 32 | if [[ -z ${GRCH38_REFSEQ_HISTORICAL} ]]; then 33 | echo "Not including RefSeq GRCh38 historical transcripts. Set env variable GRCH38_REFSEQ_HISTORICAL=True to do so" 34 | else 35 | echo "Adding RefSeq GRCh38 historical transcripts" 36 | # Historical - these are stored in separate files for annotation/alignments 37 | url=https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/GCF_000001405.40-RS_2023_03/RefSeq_historical_alignments/GCF_000001405.40-RS_2023_03_genomic.gff.gz 38 | annotation_filename=$(basename $url) 39 | if [[ ! 
-e ${annotation_filename} ]]; then 40 | wget ${url} --output-document=${annotation_filename} 41 | fi 42 | 43 | url=https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/GCF_000001405.40-RS_2023_03/RefSeq_historical_alignments/GCF_000001405.40-RS_2023_03_knownrefseq_alns.gff.gz 44 | alignments_filename=$(basename $url) 45 | if [[ ! -e ${alignments_filename} ]]; then 46 | wget ${url} --output-document=${alignments_filename} 47 | fi 48 | 49 | filename=GCF_000001405.40-RS_2023_03_combined_annotation_alignments.gff.gz 50 | cdot_file=cdot-${CDOT_VERSION}.$(basename $filename .gz).json.gz 51 | 52 | if [[ ! -e ${filename} ]]; then 53 | echo "Combining historical annotations and alignments..." 54 | cat ${annotation_filename} ${alignments_filename} > ${filename} 55 | fi 56 | if [[ ! -e ${cdot_file} ]]; then 57 | ${BASE_DIR}/cdot_json.py gff3_to_json "${filename}" --url "${url}" --genome-build=GRCh38 --output "${cdot_file}" --gene-info-json="${GENE_INFO_JSON}" --skip-missing-parents 58 | fi 59 | merge_args+=(${cdot_file}) 60 | fi 61 | 62 | filename=ref_GRCh38_top_level.gff3.gz 63 | url=http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.106/GFF/${filename} 64 | cdot_file=cdot-${CDOT_VERSION}.$(basename $filename .gz).json.gz 65 | 66 | if [[ ! -e ${filename} ]]; then 67 | wget ${url} 68 | fi 69 | if [[ ! -e ${cdot_file} ]]; then 70 | ${BASE_DIR}/cdot_json.py gff3_to_json "${filename}" --url "${url}" --genome-build=GRCh38 --output "${cdot_file}" --gene-info-json="${GENE_INFO_JSON}" 71 | fi 72 | merge_args+=(${cdot_file}) 73 | 74 | 75 | filename=ref_GRCh38.p2_top_level.gff3.gz 76 | url=http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.107/GFF/${filename} 77 | cdot_file=cdot-${CDOT_VERSION}.$(basename $filename .gz).json.gz 78 | 79 | if [[ ! -e ${filename} ]]; then 80 | wget ${url} 81 | fi 82 | if [[ ! 
-e ${cdot_file} ]]; then 83 | ${BASE_DIR}/cdot_json.py gff3_to_json "${filename}" --url "${url}" --genome-build=GRCh38 --output "${cdot_file}" --gene-info-json="${GENE_INFO_JSON}" 84 | fi 85 | merge_args+=(${cdot_file}) 86 | 87 | 88 | filename=ref_GRCh38.p7_top_level.gff3.gz 89 | url=http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.108/GFF/${filename} 90 | cdot_file=cdot-${CDOT_VERSION}.$(basename $filename .gz).json.gz 91 | 92 | if [[ ! -e ${filename} ]]; then 93 | wget ${url} 94 | fi 95 | if [[ ! -e ${cdot_file} ]]; then 96 | ${BASE_DIR}/cdot_json.py gff3_to_json "${filename}" --url "${url}" --genome-build=GRCh38 --output "${cdot_file}" --gene-info-json="${GENE_INFO_JSON}" 97 | fi 98 | merge_args+=(${cdot_file}) 99 | 100 | 101 | filename=ref_GRCh38.p12_top_level.gff3.gz 102 | url=http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.109/GFF/${filename} 103 | cdot_file=cdot-${CDOT_VERSION}.$(basename $filename .gz).json.gz 104 | 105 | if [[ ! -e ${filename} ]]; then 106 | wget ${url} 107 | fi 108 | if [[ ! -e ${cdot_file} ]]; then 109 | ${BASE_DIR}/cdot_json.py gff3_to_json "${filename}" --url "${url}" --genome-build=GRCh38 --output "${cdot_file}" --gene-info-json="${GENE_INFO_JSON}" 110 | fi 111 | merge_args+=(${cdot_file}) 112 | 113 | 114 | filename=GCF_000001405.38_GRCh38.p12_genomic.gff.gz 115 | url=https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109/GCF_000001405.38_GRCh38.p12/${filename} 116 | cdot_file=cdot-${CDOT_VERSION}.$(basename $filename .gz).json.gz 117 | 118 | if [[ ! -e ${filename} ]]; then 119 | wget ${url} 120 | fi 121 | if [[ ! 
-e ${cdot_file} ]]; then 122 | ${BASE_DIR}/cdot_json.py gff3_to_json "${filename}" --url "${url}" --genome-build=GRCh38 --output "${cdot_file}" --gene-info-json="${GENE_INFO_JSON}" 123 | fi 124 | merge_args+=(${cdot_file}) 125 | 126 | 127 | # 109.20211119 needs latest HTSeq (Feb 2022) or dies with quoting error 128 | for release in 109.20190607 109.20190905 109.20191205 109.20200228 109.20200522 109.20200815 109.20201120 109.20210226 109.20210514 109.20211119; do 129 | # These all have the same name, so rename them based on release ID 130 | filename=GCF_000001405.39_GRCh38.p13_genomic.${release}.gff.gz 131 | url=https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/${release}/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gff.gz 132 | cdot_file=cdot-${CDOT_VERSION}.$(basename $filename .gz).json.gz 133 | if [[ ! -e ${filename} ]]; then 134 | wget ${url} --output-document=${filename} 135 | fi 136 | if [[ ! -e ${cdot_file} ]]; then 137 | ${BASE_DIR}/cdot_json.py gff3_to_json "${filename}" --url "${url}" --genome-build=GRCh38 --output "${cdot_file}" --gene-info-json="${GENE_INFO_JSON}" 138 | fi 139 | merge_args+=(${cdot_file}) 140 | done 141 | 142 | # GRCh38.p14 143 | # These have same filenames (but in diff directory structures) so need to rename them 144 | filename=GCF_000001405.40_GRCh38.p14_genomic.110.gff.gz 145 | url=https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/110/GCF_000001405.40_GRCh38.p14/GCF_000001405.40_GRCh38.p14_genomic.gff.gz 146 | cdot_file=cdot-${CDOT_VERSION}.$(basename $filename .gz).json.gz 147 | 148 | if [[ ! -e ${filename} ]]; then 149 | wget ${url} --output-document=${filename} 150 | fi 151 | if [[ ! 
-e ${cdot_file} ]]; then 152 | ${BASE_DIR}/cdot_json.py gff3_to_json "${filename}" --url "${url}" --genome-build=GRCh38 --output "${cdot_file}" --gene-info-json="${GENE_INFO_JSON}" 153 | fi 154 | merge_args+=(${cdot_file}) 155 | 156 | ## Dated versions 157 | 158 | filename=GCF_000001405.40_GRCh38.p14_genomic.RS_2023_03.gff.gz 159 | url=https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/GCF_000001405.40-RS_2023_03/GCF_000001405.40_GRCh38.p14_genomic.gff.gz 160 | cdot_file=cdot-${CDOT_VERSION}.$(basename $filename .gz).json.gz 161 | 162 | if [[ ! -e ${filename} ]]; then 163 | wget ${url} --output-document=${filename} 164 | fi 165 | if [[ ! -e ${cdot_file} ]]; then 166 | ${BASE_DIR}/cdot_json.py gff3_to_json "${filename}" --url "${url}" --genome-build=GRCh38 --output "${cdot_file}" --gene-info-json="${GENE_INFO_JSON}" 167 | fi 168 | merge_args+=(${cdot_file}) 169 | 170 | 171 | filename=GCF_000001405.40_GRCh38.p14_genomic.RS_2023_10.gff.gz 172 | url=https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/GCF_000001405.40-RS_2023_10/GCF_000001405.40_GRCh38.p14_genomic.gff.gz 173 | cdot_file=cdot-${CDOT_VERSION}.$(basename $filename .gz).json.gz 174 | 175 | if [[ ! -e ${filename} ]]; then 176 | wget ${url} --output-document=${filename} 177 | fi 178 | if [[ ! -e ${cdot_file} ]]; then 179 | ${BASE_DIR}/cdot_json.py gff3_to_json "${filename}" --url "${url}" --genome-build=GRCh38 --output "${cdot_file}" --gene-info-json="${GENE_INFO_JSON}" 180 | fi 181 | merge_args+=(${cdot_file}) 182 | 183 | 184 | filename=GCF_000001405.40_GRCh38.p14_genomic.RS_2024_08.gff.gz 185 | url=https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/GCF_000001405.40-RS_2024_08/GCF_000001405.40_GRCh38.p14_genomic.gff.gz 186 | cdot_file=cdot-${CDOT_VERSION}.$(basename $filename .gz).json.gz 187 | 188 | if [[ ! -e ${filename} ]]; then 189 | wget ${url} --output-document=${filename} 190 | fi 191 | if [[ !
-e ${cdot_file} ]]; then 192 | ${BASE_DIR}/cdot_json.py gff3_to_json "${filename}" --url "${url}" --genome-build=GRCh38 --output "${cdot_file}" --gene-info-json="${GENE_INFO_JSON}" 193 | fi 194 | merge_args+=(${cdot_file}) 195 | 196 | 197 | merged_file="cdot-${CDOT_VERSION}.refseq.grch38.json.gz" 198 | if [[ ! -e ${merged_file} ]]; then 199 | echo "Creating ${merged_file}" 200 | ${BASE_DIR}/cdot_json.py merge_historical ${merge_args[@]} --genome-build=GRCh38 --output "${merged_file}" 201 | fi 202 | -------------------------------------------------------------------------------- /generate_transcript_data/requirements.txt: -------------------------------------------------------------------------------- 1 | biopython 2 | bioutils 3 | htseq 4 | ijson 5 | intervaltree 6 | lazy 7 | pyhgvs 8 | requests -------------------------------------------------------------------------------- /generate_transcript_data/uta_20210129_grch37.sql: -------------------------------------------------------------------------------- 1 | \copy (SELECT transcript.ac, string_agg(distinct transcript.hgnc, ',') as hgnc, 'http://www.ncbi.nlm.nih.gov/refseq/' as origin_url, 2 | string_agg(distinct aln_v.alt_ac::varchar, ',') as contig, 3 | string_agg(distinct aln_v.alt_strand::varchar, ',') as strand, 4 | transcript.cds_start_i, 5 | transcript.cds_end_i, 6 | string_agg(aln_v.alt_start_i::varchar, ',' order by aln_v.alt_exon_id) as exon_starts, 7 | string_agg(aln_v.alt_end_i::varchar, ',' order by aln_v.alt_exon_id) as exon_ends, 8 | string_agg(aln_v.cigar, ',' order by aln_v.alt_exon_id) as cigars, 9 | string_agg(distinct aa.pro_ac, ',' order by aa.pro_ac) as protein 10 | from uta_20210129.transcript transcript 11 | inner join uta_20210129.tx_exon_aln_v aln_v on (transcript.ac = aln_v.tx_ac AND alt_aln_method = 'splign') 12 | left outer join uta_20210129.associated_accessions aa on (transcript.ac = aa.tx_ac) 13 | WHERE aln_v.alt_ac in 14 | ('NC_000001.10', 'NC_000002.11', 'NC_000003.11', 
'NC_000004.11', 'NC_000005.9', 'NC_000006.11', 'NC_000007.13', 'NC_000008.10', 'NC_000009.11', 'NC_000010.10', 'NC_000011.9', 'NC_000012.11', 'NC_000013.10', 'NC_000014.8', 'NC_000015.9', 'NC_000016.9', 'NC_000017.10', 'NC_000018.9', 'NC_000019.9', 'NC_000020.10', 'NC_000021.8', 'NC_000022.10', 'NC_000023.10', 'NC_000024.9') 15 | group by transcript.ac) TO 'uta_20210129_grch37.csv' CSV HEADER; -------------------------------------------------------------------------------- /generate_transcript_data/uta_20210129_grch38.sql: -------------------------------------------------------------------------------- 1 | \copy (SELECT transcript.ac, string_agg(distinct transcript.hgnc, ',') as hgnc, 'http://www.ncbi.nlm.nih.gov/refseq/' as origin_url, 2 | string_agg(distinct aln_v.alt_ac::varchar, ',') as contig, 3 | string_agg(distinct aln_v.alt_strand::varchar, ',') as strand, 4 | transcript.cds_start_i, 5 | transcript.cds_end_i, 6 | string_agg(aln_v.alt_start_i::varchar, ',' order by aln_v.alt_exon_id) as exon_starts, 7 | string_agg(aln_v.alt_end_i::varchar, ',' order by aln_v.alt_exon_id) as exon_ends, 8 | string_agg(aln_v.cigar, ',' order by aln_v.alt_exon_id) as cigars, 9 | string_agg(distinct aa.pro_ac, ',' order by aa.pro_ac) as protein 10 | from uta_20210129.transcript transcript 11 | inner join uta_20210129.tx_exon_aln_v aln_v on (transcript.ac = aln_v.tx_ac AND alt_aln_method = 'splign') 12 | left outer join uta_20210129.associated_accessions aa on (transcript.ac = aa.tx_ac) 13 | WHERE aln_v.alt_ac in 14 | ('NC_000001.11', 'NC_000002.12', 'NC_000003.12', 'NC_000004.12', 'NC_000005.10', 'NC_000006.12', 'NC_000007.14', 'NC_000008.11', 'NC_000009.12', 'NC_000010.11', 'NC_000011.10', 'NC_000012.12', 'NC_000013.11', 'NC_000014.9', 'NC_000015.10', 'NC_000016.10', 'NC_000017.11', 'NC_000018.10', 'NC_000019.10', 'NC_000020.11', 'NC_000021.9', 'NC_000022.11', 'NC_000023.11', 'NC_000024.10') and transcript.origin_id not in (10, 11) 15 | group by transcript.ac) TO
'uta_20210129_grch38.csv' CSV HEADER; -------------------------------------------------------------------------------- /generate_transcript_data/uta_transcripts.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ "$#" -ne 2 ]; then 4 | echo "Usage ${BASH_SOURCE[0]} uta_version genome_build" 5 | exit 1; 6 | fi 7 | 8 | BASE_DIR=$(dirname ${BASH_SOURCE[0]}) 9 | UTA_BASE_URL=uta.biocommons.org # uta.invitae.com moved here 10 | CDOT_VERSION=$(${BASE_DIR}/cdot_json.py --version) 11 | UTA_VERSION=${1} 12 | GENOME_BUILD=${2} 13 | 14 | export PGPASSWORD=anonymous 15 | 16 | uta_csv_filename=uta_${UTA_VERSION}_${GENOME_BUILD,,}.csv 17 | if [[ ! -e ${uta_csv_filename} ]]; then 18 | SQL=${BASE_DIR}/uta_${UTA_VERSION}_${GENOME_BUILD,,}.sql # Lowercase filename 19 | 20 | # can't have newlines in \copy command 21 | cat ${SQL} | tr -s '\n' ' ' | psql -h ${UTA_BASE_URL} -U anonymous -d uta 22 | fi 23 | 24 | cdot_file="cdot-${CDOT_VERSION}.uta_${UTA_VERSION}.${GENOME_BUILD}.json.gz" 25 | if [[ ! 
# Helpers for tidying user-entered HGVS strings before they are parsed.
# Uses `re`, `string`, `Tuple` and `List`, which the notebook imports in its
# first code cell.

pattern_kind_no_colon = re.compile(r"(c|g|m|n|p)\.(\d+)")
pattern_kind_no_dot = re.compile(r":(c|g|m|n|p)(\d+)")
# HGNC gene symbol - https://www.biostars.org/p/60118/#65063
pattern_gene_symbol = re.compile(r"^[A-Z0-9-]+$|^C[0-9XY]+orf[0-9]+")


# Copy/pasted from pyhgvs
# The RefSeq standard for naming contigs/transcripts/proteins:
# http://www.ncbi.nlm.nih.gov/books/NBK21091/table/ch18.T.refseq_accession_numbers_and_mole/?report=objectonly  # nopep8
REFSEQ_PREFIXES = [
    ('AC_', 'genomic',
     'Complete genomic molecule, usually alternate assembly'),
    ('NC_', 'genomic',
     'Complete genomic molecule, usually reference assembly'),
    ('NG_', 'genomic', 'Incomplete genomic region'),
    ('NT_', 'genomic', 'Contig or scaffold, clone-based or WGS'),
    ('NW_', 'genomic', 'Contig or scaffold, primarily WGS'),
    ('NS_', 'genomic', 'Environmental sequence'),
    ('NZ_', 'genomic', 'Unfinished WGS'),
    ('NM_', 'mRNA', ''),
    ('NR_', 'RNA', ''),
    ('XM_', 'mRNA', 'Predicted model'),
    ('XR_', 'RNA', 'Predicted model'),
    ('AP_', 'Protein', 'Annotated on AC_ alternate assembly'),
    ('NP_', 'Protein', 'Associated with an NM_ or NC_ accession'),
    ('YP_', 'Protein', ''),
    ('XP_', 'Protein', 'Predicted model, associated with an XM_ accession'),
    ('ZP_', 'Protein', 'Predicted model, annotated on NZ_ genomic records'),
]

# Compiled once; each clean step below is a single substitution.
_NON_PRINTABLE = re.compile(f'[^{re.escape(string.printable)}]')
_WHITESPACE = re.compile(r"\s")
# Everything that is NOT part of the HGVS alphabet.  The previous pattern
# omitted ':' and '.', so it destroyed the "reference:kind.position" layout
# of every name; '+', '*', '?', ';', '[' and ']' are kept as well because
# HGVS uses them for intronic/UTR offsets, uncertainty and alleles.
_INVALID_CHARACTERS = re.compile(r"[^A-Za-z0-9_\-.:+*?;\[\]()>=]")
_REPEATED_COLONS = re.compile(r"::+")
_REPEATED_DOTS = re.compile(r"\.\.+")


def remove_non_printable_characters(hgvs_string):
    """Return *hgvs_string* without characters outside ``string.printable``."""
    return _NON_PRINTABLE.sub('', hgvs_string)


def remove_whitespace(hgvs_string):
    """Return *hgvs_string* with all whitespace removed.

    This would be covered in remove_invalid_characters but this gives a
    nicer message.
    """
    return _WHITESPACE.sub('', hgvs_string)


def remove_invalid_characters(hgvs_string):
    """Return *hgvs_string* with characters that HGVS never uses removed."""
    return _INVALID_CHARACTERS.sub('', hgvs_string)


def clean_kind(hgvs_string):
    """Placeholder: currently returns *hgvs_string* unchanged.

    Planned fixes for common typos:
      * ``c,`` -> ``c.``
      * ``;c`` -> ``:c`` (semicolon instead of colon)
    """
    return hgvs_string


def add_unmatched_brackets(hgvs_string):
    """Placeholder: currently returns *hgvs_string* unchanged."""
    return hgvs_string


def add_missing_colon(hgvs_string):
    """Placeholder: currently returns *hgvs_string* unchanged.

    Target inputs look like ``GLA c.`` (should become
    ``NM_001205293.2(CACNA1E):c.4165C>T`` style ``reference:kind``).
    """
    return hgvs_string


def remove_duplicates(hgvs_string):
    """Collapse runs of ':' to a single ':' and runs of '.' to a single '.'."""
    hgvs_string = _REPEATED_COLONS.sub(":", hgvs_string)
    hgvs_string = _REPEATED_DOTS.sub(".", hgvs_string)
    return hgvs_string


def fix_allele_case(allele_string):
    """Lower-case the edit keywords (del, delins, dup, ins, inv) in *allele_string*."""
    allele_keywords = [
        'del',
        'delins',
        'dup',
        'ins',
        'inv',
    ]
    for keyword in allele_keywords:
        # Case-insensitive match, replaced by the canonical lower-case form.
        allele_string = re.sub(keyword, keyword, allele_string, flags=re.IGNORECASE)
    return allele_string


# Clean steps applied (in insertion order) to the whole string by clean_hgvs().
# The key is the message reported when that step changed the string.
GLOBAL_CLEAN = {
    "remove_non_printable_characters": remove_non_printable_characters,
    "remove_whitespace": remove_whitespace,
    "remove_invalid_characters": remove_invalid_characters,
    "remove duplicates": remove_duplicates,
}

# Steps not wired in yet:
#   remove_gene_symbol (optional - for clingen and biocommons HGVS)
#   clean_kind, add_unmatched_brackets, add_missing_colon


test_hgvs = [
    "c.4165C>T",  # This should fail as it has no transcript/gene
    "CACNA1E:c.4165C>T'",  # gene name - it's resolution that is tricky here
    "CACNA1E c.4165C>T'",  # extra space, missing colon
    "CACNA1Ec.4165C>T'",  # missing colon
    "NM_001205293.2 :c.4165C>T'",  # whitespace
    "NM_001205293.2(CACNA1E):c.4165C>T'",  #
    "NM_001205293.2 :c.4165C>T'",  # whitespace
]


def clean_hgvs(original_hgvs_string) -> Tuple[str, List[str]]:
    """Run every GLOBAL_CLEAN step over *original_hgvs_string*.

    Returns:
        ``(cleaned_string, messages)`` where *messages* holds the GLOBAL_CLEAN
        key of each step that actually modified the string, in the order the
        steps ran.
    """
    hgvs_string = original_hgvs_string
    clean_messages = []

    for clean_method_desc, clean_hgvs_func in GLOBAL_CLEAN.items():
        cleaned_hgvs_string = clean_hgvs_func(hgvs_string)
        if cleaned_hgvs_string != hgvs_string:
            clean_messages.append(clean_method_desc)
            hgvs_string = cleaned_hgvs_string

    # TODO: split the result into reference / kind / allele and clean each part.
    return hgvs_string, clean_messages
31496384G>A\",\n", 215 | " \"NM_004245: :c.337G>T\", # Double colon\n", 216 | " \"NC_000017.10:g.21085664 G>C\", # Space after numbers\n", 217 | " \"NC_000023.10:g. 133547943G>A\", # Space after g.\n", 218 | " # Missing transcript underscore, Missing colon, Missing dot after g\n", 219 | " # Space between position and reference base\n", 220 | " \"NC000002.10g39139341 C>T\",\n", 221 | " # Unbalanced brackets\n", 222 | " \"NM_001754.5):c.557T>A\",\n", 223 | " \"(NM_004991.4:c.2577+4A>T\",\n", 224 | " # Good brackets HGVS (just testing gene symbol)\n", 225 | " \"NM_001754.5(RUNX1):c.1415T>C\",\n", 226 | " \"NM_032638:c.1126_1133DUP\", # Case\n", 227 | " \"NM_001754.5:557T>A\", # Missing \"c.\"\n", 228 | " \"NC_000007.13:117199563G>T\", # Missing \"g.\"\n", 229 | " ]\n", 230 | "\n", 231 | " for bad_hgvs in BAD_HGVS:\n", 232 | " try:\n", 233 | " HGVSName(bad_hgvs)\n", 234 | " self.fail(f\"Expected '{bad_hgvs}' to fail!\")\n", 235 | " except:\n", 236 | " pass # Expected\n", 237 | "\n", 238 | " fixed_hgvs = HGVSMatcher.clean_hgvs(bad_hgvs)[0]\n", 239 | " HGVSName(fixed_hgvs)\n", 240 | "\n", 241 | "\n" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": null, 247 | "id": "86c509cb", 248 | "metadata": {}, 249 | "outputs": [], 250 | "source": [] 251 | } 252 | ], 253 | "metadata": { 254 | "kernelspec": { 255 | "display_name": "Python 3 (ipykernel)", 256 | "language": "python", 257 | "name": "python3" 258 | }, 259 | "language_info": { 260 | "codemirror_mode": { 261 | "name": "ipython", 262 | "version": 3 263 | }, 264 | "file_extension": ".py", 265 | "mimetype": "text/x-python", 266 | "name": "python", 267 | "nbconvert_exporter": "python", 268 | "pygments_lexer": "ipython3", 269 | "version": "3.10.6" 270 | } 271 | }, 272 | "nbformat": 4, 273 | "nbformat_minor": 5 274 | } 275 | -------------------------------------------------------------------------------- /paper/clean_hgvs_search_csvs.py: 
# Patterns that pull the quoted HGVS string out of a search-log "details"
# message; tried in this order, first match wins.
_DETAILS_PATTERNS = (
    re.compile(r"'(.+)' calculated"),
    re.compile(r"'(.+)' = type"),
    re.compile(r"'(.+)' returned"),
)


def get_combined_df():
    """Read ``<server>_search_hgvs.csv`` for each server and concatenate them.

    Each file's header row is skipped and its columns are named ``date`` and
    ``details``; a ``server`` column records which file a row came from.
    """
    servers = ["shariant", "vg-aws", "vg3_upgrade"]

    df_list = []
    for server in servers:
        filename = f"{server}_search_hgvs.csv"
        server_df = pd.read_csv(filename, names=["date", "details"], skiprows=1)
        server_df["server"] = server
        df_list.append(server_df)

    return pd.concat(df_list)


def add_hgvs_column(df):
    """Add an ``hgvs`` column to *df* (in place) extracted from ``details``.

    Rows whose details match none of the known message formats get an empty
    string and are reported on stdout.
    """
    hgvs_list = []
    for details in df["details"].values:
        for pattern in _DETAILS_PATTERNS:
            if m := pattern.match(details):
                hgvs_list.append(m.group(1))
                break
        else:
            print(f"No match for '{details}'")
            hgvs_list.append("")

    df["hgvs"] = hgvs_list


def can_resolve(genome, factory, hgvs_c):
    """Return True if pyhgvs can resolve *hgvs_c* using GRCh37 or GRCh38 transcripts.

    GRCh37 transcripts are tried first, then GRCh38.  Any failure is printed
    and treated as "cannot resolve" - this is a best-effort survey, so the
    broad ``except Exception`` is deliberate.
    """
    transcript_getters = (
        factory.get_transcript_grch37,  # 37
        factory.get_transcript_grch38,  # 38
    )
    for get_transcript in transcript_getters:
        try:
            pyhgvs.parse_hgvs_name(hgvs_c, genome, get_transcript=get_transcript)
            return True
        except Exception as e:
            print(e)
    return False


def add_hgvs_validation_columns(df):
    """Add ``valid_hgvs`` and ``can_resolve`` boolean columns to *df* in place.

    ``valid_hgvs`` is True when ``pyhgvs.HGVSName`` accepts the string;
    ``can_resolve`` is only attempted for valid names.
    """
    genome = FastaFile("/data/annotation/fasta/GCF_000001405.25_GRCh37.p13_genomic.fna.gz")
    factory = JSONPyHGVSTranscriptFactory(["/home/dlawrence/Downloads/cdot-0.2.12.refseq.grch37_grch38.json.gz",
                                           "/home/dlawrence/Downloads/cdot-0.2.12.ensembl.grch37_grch38.json.gz"])

    valid_hgvs_list = []
    can_resolve_list = []
    for hgvs_c in df["hgvs"].values:
        resolve_ok = False
        try:
            pyhgvs.HGVSName(hgvs_c)
        except Exception:
            # Was a bare ``except:``, which also swallowed KeyboardInterrupt
            # and made this long-running loop impossible to stop with Ctrl-C.
            valid_hgvs = False
        else:
            valid_hgvs = True
            resolve_ok = can_resolve(genome, factory, hgvs_c)

        valid_hgvs_list.append(valid_hgvs)
        can_resolve_list.append(resolve_ok)

    df["valid_hgvs"] = valid_hgvs_list
    df["can_resolve"] = can_resolve_list


def split_df_chunks(data_df, chunk_size):
    """Split *data_df* into consecutive slices of at most *chunk_size* rows.

    Works on anything that supports ``len()`` and slicing (DataFrame, list).
    The final chunk holds the remainder and may be shorter.

    Raises:
        ValueError: if *chunk_size* is not a positive integer.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    return [data_df[start:start + chunk_size]
            for start in range(0, len(data_df), chunk_size)]


def main():
    """Without arguments: combine the server logs and write 500-row chunk CSVs.

    With a filename argument: validate that chunk and write ``validate_<filename>``.
    """
    if len(sys.argv) == 1:
        print("main")
        df = get_combined_df()
        add_hgvs_column(df)
        for i, chunk in enumerate(split_df_chunks(df, 500)):
            filename = f"hgvs_search_{i}.csv"
            print(f"writing {filename}")
            chunk.to_csv(filename)
    else:
        filename = sys.argv[1]
        print(f"Processing {filename}")
        df = pd.read_csv(filename)
        add_hgvs_validation_columns(df)
        df.to_csv(f"validate_{filename}")


if __name__ == "__main__":
    main()
def main():
    """Load the combined HGVS search CSV and select the rows that failed to resolve.

    Expects exactly one command-line argument: the path to
    ``hgvs_searches_combined.csv``.  Prints a usage line to stderr and exits
    with status 1 otherwise.

    NOTE: the script is unfinished - the unresolved rows, genome and
    transcript factory are prepared but not used further yet.
    """
    # argv[0] is the script itself, so one real argument means a length of 2.
    if len(sys.argv) != 2:
        sys.stderr.write(f"Usage {sys.argv[0]} hgvs_searches_combined.csv\n")
        sys.exit(1)

    filename = sys.argv[1]
    df = pd.read_csv(filename)

    # Element-wise comparison.  ``df["can_resolve"] is False`` is an identity
    # test on the Series object: always the scalar False, so ``df[False]``
    # raised KeyError instead of filtering.
    non_resolve_mask = df["can_resolve"].eq(False)
    hgvs_errors_df = df[non_resolve_mask]

    genome = FastaFile("/data/annotation/fasta/GCF_000001405.25_GRCh37.p13_genomic.fna.gz")
    factory = JSONPyHGVSTranscriptFactory(["/home/dlawrence/Downloads/cdot-0.2.12.refseq.grch37_grch38.json.gz",
                                           "/home/dlawrence/Downloads/cdot-0.2.12.ensembl.grch37_grch38.json.gz"])
def handle_args(argv=None):
    """Parse the benchmark's command-line options.

    Args:
        argv: argument list to parse; ``None`` (the default) parses
            ``sys.argv[1:]`` as before.

    Returns:
        The ``argparse.Namespace``.  Exits via ``parser.error`` when no data
        provider option (uta / rest / rest-insecure / ensembl-tark / json)
        was given.
    """
    parser = ArgumentParser(description='Benchmark cdot')
    parser.add_argument("--debug", action="store_true")
    parser.add_argument("hgvs_file")
    group = parser.add_mutually_exclusive_group()
    group.add_argument('--uta', action='store_true')
    group.add_argument('--rest', action='store_true')
    group.add_argument('--rest-insecure', action='store_true')
    group.add_argument('--ensembl-tark', action='store_true')
    parser.add_argument('--json', help='JSON file')
    parser.add_argument('--fasta', help='Fasta file for local sequences')
    args = parser.parse_args(argv)
    if not any([args.uta, args.rest, args.rest_insecure, args.ensembl_tark, args.json]):
        parser.error("You need to specify at least one of 'uta', 'rest', 'rest-insecure', 'ensembl-tark', 'json'")
    return args


def _read_hgvs_pairs(hgvs_file):
    """Return the ``[g_hgvs, c_hgvs]`` pairs from a whitespace-separated file.

    Blank lines are skipped (a trailing newline used to crash the benchmark
    loop when the pair was unpacked); any other line must have two columns.
    """
    pairs = []
    with open(hgvs_file) as f:
        for line_number, line in enumerate(f, start=1):
            columns = line.split()
            if not columns:
                continue
            if len(columns) != 2:
                raise ValueError(f"{hgvs_file}:{line_number}: expected 2 columns (g.HGVS, c.HGVS), "
                                 f"got {len(columns)}")
            pairs.append(columns)
    return pairs


def main():
    """Convert each c./n. HGVS to g. HGVS with the chosen provider and report accuracy and timing."""
    args = handle_args()
    if args.debug:
        logging.basicConfig(level=logging.DEBUG)

    hgvs_g_c_list = _read_hgvs_pairs(args.hgvs_file)

    total = len(hgvs_g_c_list)
    logging.debug(f"Using {total} test records")

    seqfetcher = None
    if args.fasta:
        logging.debug("Using fasta: %s", args.fasta)
        seqfetcher = FastaSeqFetcher(args.fasta)

    # NOTE: --json is outside the mutually exclusive group and is checked
    # before --ensembl-tark, so it wins if both are supplied.
    if args.uta:
        hdp = hgvs.dataproviders.uta.connect()
    elif args.rest:
        hdp = RESTDataProvider(seqfetcher=seqfetcher)  # Uses API server at cdot.cc
    elif args.rest_insecure:
        hdp = RESTDataProvider(secure=False, seqfetcher=seqfetcher)
    elif args.json:
        hdp = JSONDataProvider([args.json], seqfetcher=seqfetcher)
    elif args.ensembl_tark:
        # Tark doesn't provide genomes so it needs a genome one...
        if args.fasta:
            fasta_files = [args.fasta]
        else:
            fasta_files = None
        seqfetcher = EnsemblTarkSeqFetcher(fasta_files=fasta_files)
        hdp = EnsemblTarkDataProvider(seqfetcher=seqfetcher)
    else:
        raise ValueError("Unknown data provider method!")

    logging.debug("Starting benchmark...")
    am = AssemblyMapper(hdp,
                        assembly_name='GRCh38',
                        alt_aln_method='splign', replace_reference=True)

    hp = hgvs.parser.Parser()

    run_times = []
    correct = 0
    incorrect = 0
    no_data = 0
    errors = 0
    total_start = time.time()

    def _show_stats():
        # Reads the running counters from main()'s scope at call time.
        df = pd.DataFrame(run_times)
        print(df.describe().T)
        print(f"Correct: {correct}, incorrect: {incorrect}, no data: {no_data}, errors: {errors}")

    last_notification = time.time()
    for hgvs_g, hgvs_c in hgvs_g_c_list:
        logging.debug("c.HGVS: %s", hgvs_c)

        start = time.time()
        # Progress report at most every 5 seconds.
        if start - last_notification > 5:
            last_notification = start
            _show_stats()
            print("-" * 50)

        try:
            var_c = hp.parse_hgvs_variant(hgvs_c)
            if ":c." in hgvs_c:
                converted_hgvs_g = str(am.c_to_g(var_c))
            else:
                converted_hgvs_g = str(am.n_to_g(var_c))
        except HGVSDataNotAvailableError as dne:
            logging.warning(dne)
            no_data += 1
            continue
        except HGVSInvalidVariantError as ive:
            print(f"{hgvs_c}: {ive}")
            incorrect += 1
            continue
        except Exception as e:
            # Benchmark boundary: count and keep going rather than abort the run.
            logging.error(e)
            errors += 1
            continue

        if converted_hgvs_g == hgvs_g:
            correct += 1
        else:
            incorrect += 1
            print(f"{hgvs_c}: '{hgvs_g}' != '{converted_hgvs_g}' (actual)")
            continue

        # We only keep times for correct data
        end = time.time()
        time_taken = end - start
        run_times.append(time_taken)

    _show_stats()
    total_end = time.time()
    total_time = total_end - total_start
    num_per_second = 1 / total_time * total
    print(f"{total} in {total_time} = {num_per_second} per second")


if __name__ == '__main__':
    main()

"""

How to make RefSeq test files:
--------------------------------

* Get a subset of rows from ClinVar VCF
    * zgrep "^#" clinvar.vcf.gz > header.txt
    * zgrep -v "^#" clinvar.vcf.gz | shuf -n 1000 > clinvar_1k_records.vcf
    * cat header.txt clinvar_1k_records.vcf | gzip > clinvar_1k.vcf.gz

* Annotate the VCF to get MANE transcript (via --pick)

    vep -i clinvar_1k.vcf.gz -o clinvar_1k.vep_annotated.vcf.gz --cache --dir /data/annotation/VEP/vep_cache --fasta /data/annotation/fasta/GCF_000001405.39_GRCh38.p13_genomic.fna.gz --assembly GRCh38 --offline --use_given_ref --vcf --compress_output gzip --force_overwrite --pick --no_escape --hgvs --refseq --buffer_size 1000

* Extract out the g.HGVS and c.HGVS

    def cyvcf2_header_types(cyvcf2_reader):
        header_types = defaultdict(dict)
        for h in cyvcf2_reader.header_iter():
            info = h.info()
            h_id = info.get("ID")
            if h_id:  # Not much use w/o this
                header_types[h.type][h_id] = info
        return header_types


    reader = Reader("./clinvar_1k.vcf.gz")
    header_types = cyvcf2_header_types(reader)
    description = header_types["INFO"]["CSQ"]["description"]
    description = description.replace('"', '')  # Strip double quotes

    match = "Format: "
    columns_str = description[description.rfind(match) + len(match):]
    vep_columns = columns_str.split("|")

    hgvs = []
    for v in reader:
        csq = v.INFO.get("CSQ")
        td = dict(zip(vep_columns, csq.split("|")))
        g_hgvs = v.INFO.get("CLNHGVS")
        c_hgvs = td.get("HGVSc")
        if g_hgvs and c_hgvs:
            hgvs.append((g_hgvs, c_hgvs))


--------------------------
How to make Ensembl files
--------------------------

* Import ClinVar subset into VariantGrid
* As admin, on VCF page click "Populate ClinGen Alleles"
* Should have enough with both ClinVar and MANE

    def get_38_ghgvs(cga):
        for ga in cga.api_response["genomicAlleles"]:
            if ga["referenceGenome"] == 'GRCh38':
                for h in ga["hgvs"]:
                    if h.startswith("NC_"):
                        return h
        return None

    def get_ensembl_mane(cga):
        for ta in cga.api_response["transcriptAlleles"]:
            if mane := ta.get("MANE"):
                if nt := mane.get("nucleotide"):
                    if e := nt.get("Ensembl"):
                        if h := e.get("hgvs"):
                            return h
        return None

    g_and_c = []

    clingen_qs = ClinGenAllele.objects.filter(Q(api_response__icontains='ClinVarAlleles') & Q(api_response__icontains='MANE'))
    for cga in clingen_qs:
        g_hgvs = get_38_ghgvs(cga)
        c_hgvs = get_ensembl_mane(cga)
        if g_hgvs and c_hgvs:
            g_and_c.append((g_hgvs, c_hgvs))

    with open("/tmp/transcripts.txt", "wt") as f:
        for x in g_and_c:
            f.write("\t".join(x) + "\n")


"""
class MockGenomeError(Exception):
    """Raised when a requested region is not present in the mock genome."""
    pass


class MockSequence(object):
    """A sequence string wrapper; ``str()`` gives the bases, unary ``-`` the reverse complement."""

    def __init__(self, sequence):
        self.sequence = sequence

    def __neg__(self):
        """Return reverse complement sequence."""
        return MockSequence(revcomp(self.sequence))

    def __str__(self):
        return self.sequence

    def __repr__(self):
        return 'MockSequence("%s")' % self.sequence


class MockChromosome(object):
    """A named chromosome that forwards index/slice lookups to its genome."""

    def __init__(self, name, genome=None):
        self.name = name
        self.genome = genome

    def __getitem__(self, n):
        """Return sequence from region [start, end)

        Coordinates are 0-based, end-exclusive."""
        if isinstance(n, slice):
            return self.genome.get_seq(self.name, n.start, n.stop)
        else:
            # A single position is the one-base region [n, n+1).
            return self.genome.get_seq(self.name, n, n+1)

    def __repr__(self):
        return 'MockChromosome("%s")' % (self.name)


class MockGenome(object):
    def __init__(self, lookup=None, filename=None, db_filename=None,
                 default_seq=None):
        """
        A mock genome object that provides a pygr compatible interface.

        lookup: a dict mapping (chrom, start, end) to seq, or an iterable of
            ((chrom, start, end), seq) pairs, that defines a lookup table
            for genome sequence requests.
        filename: a stream or filename containing a lookup table.
        db_filename: a fasta file to use for genome sequence requests.  All
            requests are recorded and can be written to a lookup table file
            using the `write` method.
        default_seq: if given, this base will always be returned if
            region is unavailable.
        """
        if lookup is None:
            lookup = {}
        elif not isinstance(lookup, dict):
            # The documented "list of pairs" form; a dict is kept as-is so
            # the caller's object is still the one that gets updated.
            lookup = dict(lookup)

        self._chroms = {}
        self._lookup = lookup
        self._genome = None
        self._default_seq = default_seq
        # Always defined: get_seq() puts it in its error message, which used
        # to raise AttributeError when neither file argument was given.
        self._source_filename = None

        if db_filename:
            # Use a real genome database.
            if SequenceFileDB is None:
                raise ValueError('pyfaidx is not available.')
            self._genome = SequenceFileDB(db_filename)
            self._source_filename = db_filename
        elif filename:
            # Read genome sequence from lookup table.
            self.read(filename)
            self._source_filename = filename

    def __contains__(self, chrom):
        """Return True if genome contains chromosome."""
        return chrom in (self._genome or self._chroms)

    def __getitem__(self, chrom):
        """Return a chromosome by its name."""
        if chrom not in self._chroms:
            self._chroms[chrom] = MockChromosome(chrom, self)
        return self._chroms[chrom]

    def get_seq(self, chrom, start, end):
        """Return a sequence by chromosome name and region [start, end).

        Coordinates are 0-based, end-exclusive.

        Raises MockGenomeError when the region is not in the lookup table
        and no default_seq was configured.
        """
        if self._genome:
            # Get sequence from real genome object and save result.
            seq = self._genome[chrom][start:end]
            self._lookup[(chrom, start, end)] = str(seq)
            return seq

        # Use lookup table to fetch genome sequence.
        try:
            return MockSequence(self._lookup[(chrom, start, end)])
        except KeyError:
            if self._default_seq:
                # Generate default sequence (a plain str, as before) by
                # repeating default_seq to the requested length.
                return ''.join(itertools.islice(
                    itertools.cycle(self._default_seq),
                    None, end - start))
            raise MockGenomeError(
                'Sequence not in test data: %s:%d-%d source: %s' %
                (chrom, start, end, self._source_filename))

    def read(self, filename):
        """Read a sequence lookup table from a file.

        filename: a filename string or file stream.

        Each line is ``chrom<TAB>start<TAB>end<TAB>seq``.
        """
        if not hasattr(filename, 'read'):
            with open(filename) as infile:
                return self.read(infile)

        for line in filename:
            # Strip only the line ending: a plain rstrip() would also eat the
            # tab before an empty sequence column.
            line = line.rstrip('\r\n')
            if not line:
                continue
            chrom, start, end, seq = line.split('\t')
            self._lookup[(chrom, int(start), int(end))] = seq
            # _lookup is keyed by (chrom, start, end) tuples, so the old
            # ``chrom not in self._lookup`` test was always true.
            if chrom not in self._chroms:
                self._chroms[chrom] = MockChromosome(chrom, self)

    def write(self, filename):
        """Write a sequence lookup table to file."""
        if not hasattr(filename, 'write'):
            with open(filename, 'w') as out:
                return self.write(out)

        for (chrom, start, end), seq in self._lookup.items():
            filename.write('\t'.join(map(str, [chrom, start, end, seq])) + '\n')


class MockGenomeTestFile(MockGenome):
    """MockGenome that can regenerate its lookup file from a real fasta.

    With ``create_data=True`` and a ``db_filename``, any existing lookup file
    is removed and every query is appended to it; otherwise the lookup file
    is only read.
    """

    def __init__(self, lookup=None, filename=None, db_filename=None,
                 default_seq=None, create_data=False):
        if not create_data:
            db_filename = None
        super(MockGenomeTestFile, self).__init__(
            lookup=lookup, db_filename=db_filename,
            filename=filename,
            default_seq=default_seq)

        self._filename = filename
        self._create_data = (db_filename is not None)

        if self._create_data and os.path.exists(filename):
            # Clear output file when creating data.
            os.remove(filename)

    def get_seq(self, chrom, start, end):
        seq = super(MockGenomeTestFile, self).get_seq(chrom, start, end)

        # Save each query in append mode.
        if self._create_data:
            with open(self._filename, 'a') as out:
                out.write('\t'.join(map(str, [chrom, start, end, seq])) + '\n')
        return seq
end_i=None): 13 | seq = self.transcripts.get(ac) 14 | if seq is None: 15 | raise HGVSDataNotAvailableError() 16 | if start_i is None: 17 | start_i = 0 18 | if end_i is None: 19 | end_i = len(seq) 20 | return seq[start_i:end_i] 21 | 22 | -------------------------------------------------------------------------------- /tests/test_data/cdot.ensembl.grch38.json: -------------------------------------------------------------------------------- 1 | { 2 | "transcripts": { 3 | "ENST00000617537.5": { 4 | "biotype": [ 5 | "mRNA", 6 | "protein_coding" 7 | ], 8 | "gene_name": "AOAH", 9 | "gene_version": "ENSG00000136250.12", 10 | "genome_builds": { 11 | "GRCh38": { 12 | "cds_end": 36724148, 13 | "cds_start": 36513251, 14 | "contig": "NC_000007.14", 15 | "exons": [ 16 | [ 17 | 36512940, 18 | 36513380, 19 | 20, 20 | 1946, 21 | 2385, 22 | null 23 | ], 24 | [ 25 | 36522038, 26 | 36522115, 27 | 19, 28 | 1869, 29 | 1945, 30 | null 31 | ], 32 | [ 33 | 36530417, 34 | 36530514, 35 | 18, 36 | 1772, 37 | 1868, 38 | null 39 | ], 40 | [ 41 | 36532146, 42 | 36532206, 43 | 17, 44 | 1712, 45 | 1771, 46 | null 47 | ], 48 | [ 49 | 36532285, 50 | 36532344, 51 | 16, 52 | 1653, 53 | 1711, 54 | null 55 | ], 56 | [ 57 | 36540318, 58 | 36540491, 59 | 15, 60 | 1480, 61 | 1652, 62 | null 63 | ], 64 | [ 65 | 36548611, 66 | 36548686, 67 | 14, 68 | 1405, 69 | 1479, 70 | null 71 | ], 72 | [ 73 | 36549438, 74 | 36549475, 75 | 13, 76 | 1368, 77 | 1404, 78 | null 79 | ], 80 | [ 81 | 36576573, 82 | 36576656, 83 | 12, 84 | 1285, 85 | 1367, 86 | null 87 | ], 88 | [ 89 | 36594338, 90 | 36594430, 91 | 11, 92 | 1193, 93 | 1284, 94 | null 95 | ], 96 | [ 97 | 36616379, 98 | 36616474, 99 | 10, 100 | 1098, 101 | 1192, 102 | null 103 | ], 104 | [ 105 | 36618296, 106 | 36618345, 107 | 9, 108 | 1049, 109 | 1097, 110 | null 111 | ], 112 | [ 113 | 36620780, 114 | 36620829, 115 | 8, 116 | 1000, 117 | 1048, 118 | null 119 | ], 120 | [ 121 | 36621709, 122 | 36621780, 123 | 7, 124 | 929, 125 | 999, 126 | null 127 | ], 128 | [ 
129 | 36623189, 130 | 36623250, 131 | 6, 132 | 868, 133 | 928, 134 | null 135 | ], 136 | [ 137 | 36632035, 138 | 36632106, 139 | 5, 140 | 797, 141 | 867, 142 | null 143 | ], 144 | [ 145 | 36637850, 146 | 36637910, 147 | 4, 148 | 737, 149 | 796, 150 | null 151 | ], 152 | [ 153 | 36659165, 154 | 36659265, 155 | 3, 156 | 637, 157 | 736, 158 | null 159 | ], 160 | [ 161 | 36673942, 162 | 36674009, 163 | 2, 164 | 570, 165 | 636, 166 | null 167 | ], 168 | [ 169 | 36686698, 170 | 36686794, 171 | 1, 172 | 474, 173 | 569, 174 | null 175 | ], 176 | [ 177 | 36724021, 178 | 36724494, 179 | 0, 180 | 1, 181 | 473, 182 | null 183 | ] 184 | ], 185 | "strand": "-", 186 | "tag": "CCDS,basic,Ensembl_canonical,GENCODE Primary,MANE_Select", 187 | "url": "ftp://ftp.ensembl.org/pub/release-112/gtf/homo_sapiens/Homo_sapiens.GRCh38.112.gtf.gz" 188 | } 189 | }, 190 | "id": "ENST00000617537.5", 191 | "protein": "ENSP00000483783.1", 192 | "start_codon": 346, 193 | "stop_codon": 2074 194 | } 195 | }, 196 | "cdot_version": "0.2.26", 197 | "genome_builds": [ 198 | "GRCh38" 199 | ] 200 | } -------------------------------------------------------------------------------- /tests/test_data/cdot.refseq.grch37.json: -------------------------------------------------------------------------------- 1 | { 2 | "transcripts": { 3 | "NM_001637.3": { 4 | "start_codon": 401, 5 | "stop_codon": 2129, 6 | "id": "NM_001637.3", 7 | "gene_version": "313", 8 | "gene_name": "AOAH", 9 | "biotype": [ 10 | "protein_coding" 11 | ], 12 | "protein": "NP_001628.1", 13 | "genome_builds": { 14 | "GRCh37": { 15 | "cds_end": 36763753, 16 | "cds_start": 36552857, 17 | "contig": "NC_000007.13", 18 | "exons": [ 19 | [ 20 | 36552548, 21 | 36552986, 22 | 20, 23 | 2001, 24 | 2440, 25 | "M196 I1 M61 I1 M181" 26 | ], 27 | [ 28 | 36561644, 29 | 36561721, 30 | 19, 31 | 1924, 32 | 2000, 33 | null 34 | ], 35 | [ 36 | 36570023, 37 | 36570120, 38 | 18, 39 | 1827, 40 | 1923, 41 | null 42 | ], 43 | [ 44 | 36571752, 45 | 36571812, 46 | 17, 47 | 
1767, 48 | 1826, 49 | null 50 | ], 51 | [ 52 | 36571891, 53 | 36571950, 54 | 16, 55 | 1708, 56 | 1766, 57 | null 58 | ], 59 | [ 60 | 36579924, 61 | 36580097, 62 | 15, 63 | 1535, 64 | 1707, 65 | null 66 | ], 67 | [ 68 | 36588217, 69 | 36588292, 70 | 14, 71 | 1460, 72 | 1534, 73 | null 74 | ], 75 | [ 76 | 36589044, 77 | 36589081, 78 | 13, 79 | 1423, 80 | 1459, 81 | null 82 | ], 83 | [ 84 | 36616179, 85 | 36616262, 86 | 12, 87 | 1340, 88 | 1422, 89 | null 90 | ], 91 | [ 92 | 36633944, 93 | 36634036, 94 | 11, 95 | 1248, 96 | 1339, 97 | null 98 | ], 99 | [ 100 | 36655985, 101 | 36656080, 102 | 10, 103 | 1153, 104 | 1247, 105 | null 106 | ], 107 | [ 108 | 36657902, 109 | 36657951, 110 | 9, 111 | 1104, 112 | 1152, 113 | null 114 | ], 115 | [ 116 | 36660386, 117 | 36660435, 118 | 8, 119 | 1055, 120 | 1103, 121 | null 122 | ], 123 | [ 124 | 36661315, 125 | 36661386, 126 | 7, 127 | 984, 128 | 1054, 129 | null 130 | ], 131 | [ 132 | 36662795, 133 | 36662856, 134 | 6, 135 | 923, 136 | 983, 137 | null 138 | ], 139 | [ 140 | 36671641, 141 | 36671712, 142 | 5, 143 | 852, 144 | 922, 145 | null 146 | ], 147 | [ 148 | 36677456, 149 | 36677516, 150 | 4, 151 | 792, 152 | 851, 153 | null 154 | ], 155 | [ 156 | 36698770, 157 | 36698870, 158 | 3, 159 | 692, 160 | 791, 161 | null 162 | ], 163 | [ 164 | 36713547, 165 | 36713614, 166 | 2, 167 | 625, 168 | 691, 169 | null 170 | ], 171 | [ 172 | 36726303, 173 | 36726399, 174 | 1, 175 | 529, 176 | 624, 177 | null 178 | ], 179 | [ 180 | 36763626, 181 | 36764154, 182 | 0, 183 | 1, 184 | 528, 185 | null 186 | ] 187 | ], 188 | "start": 36552548, 189 | "stop": 36764154, 190 | "strand": "-", 191 | "url": "http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.105/GFF/ref_GRCh37.p13_top_level.gff3.gz" 192 | } 193 | } 194 | }, 195 | "NR_023343.1": { 196 | "id": "NR_023343.1", 197 | "cdot": "0.2.12", 198 | "hgnc": "34016", 199 | "biotype": [ 200 | "non_coding" 201 | ], 202 | "gene_name": "RNU4ATAC", 203 | 
"genome_builds": { 204 | "GRCh37": { 205 | "url": "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/105.20220307/GCF_000001405.25_GRCh37.p13/GCF_000001405.25_GRCh37.p13_genomic.gff.gz", 206 | "exons": [ 207 | [ 208 | 122288455, 209 | 122288585, 210 | 0, 211 | 1, 212 | 130, 213 | null 214 | ] 215 | ], 216 | "contig": "NC_000002.11", 217 | "strand": "+" 218 | } 219 | } 220 | } 221 | }, 222 | "genes": { 223 | "GATA2": { 224 | "aliases": "DCML, IMD21, MONOMAC, NFE1B", 225 | "biotype": "protein_coding", 226 | "description": "GATA binding protein 2", 227 | "gene_symbol": "GATA2", 228 | "hgnc": "4171", 229 | "map_location": "3q21.3", 230 | "summary": "This gene encodes a member of the GATA family of zinc-finger transcription factors that are named for the consensus nucleotide sequence they bind in the promoter regions of target genes. The encoded protein plays an essential role in regulating transcription of genes involved in the development and proliferation of hematopoietic and endocrine cell lineages. 
Alternative splicing results in multiple transcript variants.[provided by RefSeq, Mar 2009]", 231 | "url": "https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/annotation_releases/110/GCF_000001405.40_GRCh38.p14/GCF_000001405.40_GRCh38.p14_genomic.gff.gz" 232 | } 233 | }, 234 | "cdot_version": "0.2.10", 235 | "genome_builds": [ 236 | "GRCh37" 237 | ] 238 | } -------------------------------------------------------------------------------- /tests/test_data/clinvar_hgvs/clinvar_hgvs_010.tsv: -------------------------------------------------------------------------------- 1 | NC_000002.12:g.189003410G>A NM_000090.4:c.2554-1G>A 2 | NC_000002.12:g.73572910G>A NM_015120.4:c.11036G>A 3 | NC_000003.12:g.36996633_36996634delinsTT NM_001354619.1:c.-593_-592delinsTT 4 | NC_000003.12:g.58149866C>T NM_001164317.2:c.6201C>T 5 | NC_000006.12:g.52082460T>C NM_138694.4:c.213A>G 6 | NC_000007.14:g.98906228C>T NM_001244580.1:c.1088C>T 7 | NC_000010.11:g.120905027T>C NM_018117.12:c.3193+216T>C 8 | NC_000012.12:g.8854218G>A NM_144670.6:c.2681G>A 9 | NC_000016.10:g.81869317C>T NM_002661.5:c.564+19C>T 10 | NC_000017.11:g.58734198A>G NM_058216.3:c.1107A>G 11 | -------------------------------------------------------------------------------- /tests/test_data/clinvar_hgvs/clinvar_hgvs_050.tsv: -------------------------------------------------------------------------------- 1 | NC_000001.11:g.237617406G>A NM_001035.3:c.5836G>A 2 | NC_000001.11:g.33021456_33021458del NM_001625.4:c.336_338del 3 | NC_000001.11:g.52383874T>C NM_004153.4:c.1819A>G 4 | NC_000001.11:g.52397773C>T NM_004153.4:c.314G>A 5 | NC_000001.11:g.53213501A>C NM_000098.3:c.1883A>C 6 | NC_000001.11:g.94047046C>T NM_000350.3:c.2791G>A 7 | NC_000002.12:g.169275142G>A NM_004525.3:c.1869C>T 8 | NC_000002.12:g.178538451C>G NM_001267550.2:c.99289+89G>C 9 | NC_000002.12:g.178608202G>T NM_001267550.2:c.52681C>A 10 | NC_000002.12:g.178741896G>A NM_001267550.2:c.11337C>T 11 | NC_000002.12:g.214781051A>T NM_000465.4:c.823T>A 12 | 
NC_000002.12:g.219490515_219490517del NM_005876.5:c.9028_9030del 13 | NC_000002.12:g.46909034C>T NM_001171511.2:c.93-1065G>A 14 | NC_000002.12:g.47806494A>C NM_001281493.1:c.2938A>C 15 | NC_000002.12:g.47813341T>C NM_001190274.2:c.2120A>G 16 | NC_000002.12:g.85888897C>A NM_003896.4:c.9G>T 17 | NC_000003.12:g.123752448G>C NM_001321309.2:c.-155-12447C>G 18 | NC_000005.10:g.10255709T>C NM_012073.5:c.332-246T>C 19 | NC_000005.10:g.112841537T>A NM_000038.6:c.5943T>A 20 | NC_000005.10:g.139026767A>G NM_022464.5:c.645+34T>C 21 | NC_000005.10:g.149027638G>A NM_024577.4:c.2094C>T 22 | NC_000005.10:g.35873480C>T NM_002185.5:c.538C>T 23 | NC_000007.14:g.16089456T>C NM_001101426.4:c.*2239A>G 24 | NC_000007.14:g.97852358C>T NM_001673.5:c.1587G>A 25 | NC_000008.11:g.18062326C>A NM_177924.5:c.601G>T 26 | NC_000008.11:g.89980722C>T NM_002485.5:c.480+12G>A 27 | NC_000009.12:g.37784879T>G NM_016042.4:c.166A>C 28 | NC_000009.12:g.69035905C>T NM_000144.5:c.123C>T 29 | NC_000010.11:g.87925523T>C NM_000314.8:c.175T>C 30 | NC_000011.10:g.112045279del NR_164072.1:n.1167+49del 31 | NC_000011.10:g.118312837C>T NM_000733.4:c.323C>T 32 | NC_000011.10:g.17427125C>T NR_147094.2:n.2212G>A 33 | NC_000011.10:g.64809875del NM_130804.2:c.237del 34 | NC_000012.12:g.120737861C>T NM_000017.4:c.497C>T 35 | NC_000013.11:g.27920121C>T NM_000209.4:c.-18C>T 36 | NC_000014.9:g.30879438C>T NM_004086.3:c.389C>T 37 | NC_000016.10:g.89770196G>A NM_000135.4:c.2286C>T 38 | NC_000017.11:g.43045725C>T NR_027676.2:n.5722G>A 39 | NC_000017.11:g.43094198del NR_027676.2:n.1510del 40 | NC_000017.11:g.59031925C>T NM_015294.6:c.1919G>A 41 | NC_000017.11:g.80058782T>C NM_017950.4:c.1318-76T>C 42 | NC_000018.10:g.58343058C>G NM_001144967.3:c.1530C>G 43 | NC_000018.10:g.70176380_70176383del NM_173630.4:c.1476+295_1476+298del 44 | NC_000019.10:g.12897438G>A NM_000159.4:c.1082+10G>A 45 | NC_000021.9:g.42477365G>A NM_080860.4:c.653C>T 46 | NC_000022.11:g.17110140A>G NM_001289905.1:c.*320A>G 47 | NC_000023.11:g.154767342C>T 
NM_001363.5:c.600C>T 48 | NC_000023.11:g.19355691T>C NM_000284.4:c.765T>C 49 | NC_000023.11:g.45059450G>A NR_111960.1:n.1555G>A 50 | NC_000023.11:g.74742052G>A NM_001008537.3:c.2505C>T 51 | -------------------------------------------------------------------------------- /tests/test_data/clinvar_hgvs/clinvar_hgvs_100.tsv: -------------------------------------------------------------------------------- 1 | NC_000001.11:g.12007126G>A NM_001127660.1:c.1946G>A 2 | NC_000001.11:g.21860220G>A NM_005529.7:c.4971C>T 3 | NC_000001.11:g.237643408G>C NM_001035.3:c.7303G>C 4 | NC_000001.11:g.23808165T>C NM_000191.3:c.720A>G 5 | NC_000001.11:g.241517292C>A NM_000143.4:c.157G>T 6 | NC_000001.11:g.94111501G>A NM_000350.3:c.239C>T 7 | NC_000002.12:g.113220093C>T NM_003466.4:c.1275G>A 8 | NC_000002.12:g.144399194T>C NM_014795.4:c.1993A>G 9 | NC_000002.12:g.15286830T>C NM_015909.4:c.5138+243A>G 10 | NC_000002.12:g.178592272G>A NM_001267550.2:c.59632C>T 11 | NC_000002.12:g.178715710C>T NM_001267550.2:c.25704G>A 12 | NC_000002.12:g.214745103C>G NM_000465.4:c.1867G>C 13 | NC_000002.12:g.47803493G>C NM_001281493.1:c.2340G>C 14 | NC_000002.12:g.73432288G>A NM_015120.4:c.1432G>A 15 | NC_000002.12:g.73453022T>C NM_015120.4:c.6498T>C 16 | NC_000002.12:g.85343265G>T NM_017750.4:c.1810C>A 17 | NC_000002.12:g.85888897C>A NM_003896.4:c.9G>T 18 | NC_000003.12:g.15644611A>G NM_001281723.3:c.695A>G 19 | NC_000003.12:g.158691443T>C NR_164500.1:n.2195T>C 20 | NC_000003.12:g.43598588T>A NM_018075.5:c.416A>T 21 | NC_000003.12:g.49099390C>T NM_005051.3:c.1568G>A 22 | NC_000003.12:g.49099430C>T NM_005051.3:c.1528G>A 23 | NC_000003.12:g.52402867A>G NM_004656.4:c.1895T>C 24 | NC_000003.12:g.69118837G>T NM_001304418.3:c.1518C>A 25 | NC_000004.12:g.112646986A>G NM_016648.4:c.552+31A>G 26 | NC_000004.12:g.113355165A>G NM_001148.6:c.6547A>G 27 | NC_000004.12:g.43030609T>A NM_001080476.2:c.*69T>A 28 | NC_000004.12:g.52038248C>A NM_000232.5:c.12G>T 29 | NC_000004.12:g.83273598C>T NM_015697.8:c.590G>A 30 | 
NC_000004.12:g.88268819C>T NM_152542.5:c.629G>A 31 | NC_000005.10:g.113064054C>G NM_001085377.2:c.2143G>C 32 | NC_000005.10:g.126550263C>A NM_001182.5:c.1348G>T 33 | NC_000005.10:g.179126029C>T NM_014244.5:c.2719G>A 34 | NC_000005.10:g.79051354T>C NM_013391.3:c.678A>G 35 | NC_000005.10:g.83539495C>T NM_004385.5:c.6492C>T 36 | NC_000006.12:g.129280072C>T NM_001079823.2:c.2462C>T 37 | NC_000006.12:g.129353296C>A NM_001079823.2:c.4656C>A 38 | NC_000006.12:g.52079909A>G NM_138694.4:c.381T>C 39 | NC_000006.12:g.52082460T>C NM_138694.4:c.213A>G 40 | NC_000007.14:g.22945636C>G NM_032581.4:c.1519G>C 41 | NC_000007.14:g.5999116G>C NM_000535.7:c.697C>G 42 | NC_000007.14:g.93146872C>A NM_152703.5:c.-779+11G>T 43 | NC_000008.11:g.89980722C>T NM_002485.5:c.480+12G>A 44 | NC_000009.12:g.2717979G>A NM_133497.4:c.240G>A 45 | NC_000009.12:g.34648800G>A NM_001258332.1:c.399G>A 46 | NC_000009.12:g.37745681G>A NM_014907.3:c.3649G>A 47 | NC_000009.12:g.69035905C>T NM_000144.5:c.123C>T 48 | NC_000009.12:g.92045993A>G NM_006415.4:c.1136+6T>C 49 | NC_000010.11:g.110784352G>A NM_001134363.3:c.1349G>A 50 | NC_000010.11:g.110799819A>G NM_001134363.3:c.1701A>G 51 | NC_000010.11:g.111013593T>C NR_136749.1:n.2936T>C 52 | NC_000011.10:g.112045279del NR_164072.1:n.1167+49del 53 | NC_000011.10:g.118503818del NM_001197104.2:c.7926del 54 | NC_000011.10:g.119026031A>G NM_001164279.2:c.701T>C 55 | NC_000011.10:g.1752849G>A NM_001909.5:c.*654C>T 56 | NC_000011.10:g.47342611del NM_000256.3:c.1595del 57 | NC_000011.10:g.5226575del NM_000518.5:c.315+2del 58 | NC_000011.10:g.61445957A>G NM_017841.4:c.387A>G 59 | NC_000013.11:g.32332371_32332377delinsTACTTCAG NM_000059.3:c.893_899delinsTACTTCAG 60 | NC_000013.11:g.32337627T>C NM_000059.3:c.3272T>C 61 | NC_000013.11:g.32339462G>C NM_000059.3:c.5107G>C 62 | NC_000014.9:g.23432514G>C NM_000257.4:c.503-8C>G 63 | NC_000014.9:g.45176022C>G NM_020937.4:c.3268C>G 64 | NC_000014.9:g.67766373A>G NM_015346.4:c.5865T>C 65 | NC_000014.9:g.89980550C>G 
NM_018319.4:c.802C>G 66 | NC_000014.9:g.92006041T>C NM_004239.4:c.1935A>G 67 | NC_000015.10:g.90754814G>A NM_000057.4:c.963G>A 68 | NC_000015.10:g.92985680G>T NM_001271.4:c.3413+7G>T 69 | NC_000016.10:g.1352026C>T NM_032520.5:c.52+9C>T 70 | NC_000016.10:g.173193A>G NM_000517.6:c.164A>G 71 | NC_000016.10:g.2046301C>G NM_002528.7:c.181G>C 72 | NC_000016.10:g.2048728T>C NM_000548.5:c.113T>C 73 | NC_000016.10:g.2081776T>C NM_000548.5:c.3792T>C 74 | NC_000016.10:g.2109256C>T NM_000296.4:c.5911G>A 75 | NC_000016.10:g.30993204G>T NM_052874.5:c.712C>A 76 | NC_000017.11:g.41769530G>A NM_002230.4:c.356C>T 77 | NC_000017.11:g.43094198del NR_027676.2:n.1510del 78 | NC_000017.11:g.50356597T>C NM_022167.4:c.1569T>C 79 | NC_000017.11:g.59031925C>T NM_015294.6:c.1919G>A 80 | NC_000017.11:g.65557863T>A NM_004655.4:c.758A>T 81 | NC_000017.11:g.65557982_65557984del NM_004655.4:c.639_641del 82 | NC_000018.10:g.46639718G>A NM_144612.6:c.409C>T 83 | NC_000018.10:g.70176380_70176383del NM_173630.4:c.1476+295_1476+298del 84 | NC_000019.10:g.13298566C>T NM_001127222.2:c.3067G>A 85 | NC_000019.10:g.48969229T>A NM_002103.5:c.*59A>T 86 | NC_000019.10:g.51353233dup NM_001014763.1:c.551dup 87 | NC_000019.10:g.53904356T>C NM_002739.5:c.1657-279T>C 88 | NC_000019.10:g.54191771C>T NM_001077446.4:c.294C>T 89 | NC_000019.10:g.54193391G>A NM_001077446.4:c.*29G>A 90 | NC_000020.11:g.10658702_10658703insA NM_000214.3:c.459_460insT 91 | NC_000020.11:g.10673008T>C NM_000214.3:c.82-2A>G 92 | NC_000020.11:g.31831073G>A NM_033118.4:c.1356G>A 93 | NC_000020.11:g.35504820_35504831del NM_007186.6:c.6451_6462del 94 | NC_000020.11:g.63494913G>A NM_001958.5:c.513C>T 95 | NC_000021.9:g.32678630del NM_003895.3:c.1627+15del 96 | NC_000021.9:g.34370822G>T NM_172201.1:c.344G>T 97 | NC_000021.9:g.45468322G>A NM_130445.4:c.187G>A 98 | NC_000022.11:g.17110140A>G NM_001289905.1:c.*320A>G 99 | NC_000023.11:g.22221720T>C NM_000444.6:c.1876T>C 100 | NC_000023.11:g.41343808_41343809dup NR_126093.1:n.1696_1697dup 101 | 
-------------------------------------------------------------------------------- /tests/test_data/clinvar_hgvs/clinvar_hgvs_ensembl_100.tsv: -------------------------------------------------------------------------------- 1 | NC_000003.12:g.52999279_53001028del ENST00000394752.8:c.-130-31769_-130-30020del 2 | NC_000004.12:g.121364109_121368709del ENST00000394427.3:c.340+11600_340+16200del 3 | NC_000017.11:g.31163377_31164732del ENST00000358273.9:c.479+1_479+1356del 4 | NC_000019.10:g.12536733_12537889del ENST00000339282.12:c.4-9193_4-8037del 5 | NC_000007.14:g.43891176_43895702del ENST00000453200.6:c.15-7886_15-3360del 6 | NC_000009.12:g.354334_355793del ENST00000432829.7:c.1680-13684_1680-12225del 7 | NC_000011.10:g.88639541_88647384del ENST00000305447.5:c.1147+5785_1147+13628del 8 | NC_000023.11:g.32626321_32627544del ENST00000357033.9:c.1332-13088_1332-11865del 9 | NC_000005.10:g.112840128_112843000del ENST00000257430.9:c.4534_7406del 10 | NC_000005.10:g.140855987_140858640del ENST00000504120.4:c.2394+67303_2394+69956del 11 | NC_000007.14:g.88958724_88966741del ENST00000333190.5:c.108+198640_108+206657del 12 | NC_000017.11:g.17710076_17713445del ENST00000353383.6:c.-148-13952_-148-10583del 13 | NC_000005.10:g.40779130_40785601del ENST00000397128.7:c.128-8015_128-1544del 14 | NC_000005.10:g.90686004_90688051del ENST00000405460.9:c.6490+9_6491-1810del 15 | NC_000001.11:g.3373437_3380436del ENST00000270722.10:c.439-11715_439-4716del 16 | NC_000001.11:g.7621941_7624940del ENST00000303635.12:c.511-18459_511-15460del 17 | NC_000002.12:g.178560066_178564516del ENST00000589042.5:c.81618_86068del 18 | NC_000002.12:g.141316191_141320434del ENST00000389484.8:c.344-65793_344-61550del 19 | NC_000001.11:g.33008231_33013268del ENST00000672715.1:c.636_*4953del 20 | NC_000002.12:g.50041090_50048923del ENST00000401669.7:c.4128+4348_4128+12181del 21 | NC_000002.12:g.60458961_60461239del ENST00000642384.2:c.1675_*1445del 22 | NC_000023.11:g.119927096_119928345del 
ENST00000371410.5:c.1073+1672_1074-1701del 23 | NC_000023.11:g.154026923_154030670del ENST00000303391.11:c.1158_*3444del 24 | NC_000003.12:g.192587419_192594481del ENST00000445105.7:c.13+132700_13+139762del 25 | NC_000004.12:g.996520_998294del ENST00000514224.2:c.300-4092_300-2318del 26 | NC_000006.12:g.1610446_1613897del ENST00000645831.2:c.1_*1790del 27 | NC_000003.12:g.37938391_37944758del ENST00000273179.10:c.80-8666_80-2299del 28 | NC_000008.11:g.3929021_3931750del ENST00000635120.2:c.818+66153_818+68882del 29 | NC_000010.11:g.26710075_26713224del ENST00000376215.10:c.467+307_467+3456del 30 | NC_000015.10:g.82641775_82645527del ENST00000684509.1:c.-98+1610_-98+5362del 31 | NC_000018.10:g.58268755_58269877del ENST00000400345.8:c.297+16701_297+17823del 32 | NC_000018.10:g.69541905_69550035del ENST00000382713.10:c.67-22582_67-14452del 33 | NC_000001.11:g.245473613_245478917del ENST00000407071.7:c.1166+53868_1166+59172del 34 | NC_000001.11:g.245867700_245869699del ENST00000490107.6:c.814-5812_814-3813del 35 | NC_000002.12:g.17581968_17584091del ENST00000295156.9:c.-5-10102_-5-7979del 36 | NC_000002.12:g.211906513_211910431del ENST00000342788.9:c.421+36999_421+40917del 37 | NC_000007.14:g.108418131_108425883del ENST00000379028.8:c.-331-26289_-331-18537del 38 | NC_000008.11:g.50314668_50318252del ENST00000642720.2:c.-27-79544_-27-75960del 39 | NC_000010.11:g.113086890_113087964del ENST00000355995.9:c.552+46764_552+47838del 40 | NC_000019.10:g.47839728_47843005del ENST00000221996.12:c.661_*3038del 41 | NC_000012.12:g.2136470_2143758del ENST00000399603.6:c.477+16040_477+23328del 42 | NC_000002.12:g.197705407_197707188dup ENST00000282276.8:c.2_*1dup 43 | NC_000012.12:g.70286084_70288034del ENST00000229195.8:c.48+7810_48+9760del 44 | NC_000011.10:g.2699445_2700825del ENST00000155840.12:c.1514+37364_1514+38744del 45 | NC_000012.12:g.99608908_99614995del ENST00000683438.2:c.1272+40072_1272+46159del 46 | NC_000012.12:g.61875937_61878789del 
ENST00000416284.8:c.-1-11362_-1-8510del 47 | NC_000021.9:g.33355546_33357370del ENST00000270139.8:c.1671_*1821del 48 | NC_000012.12:g.25495743_25501512del ENST00000458174.7:c.*22+2227_*22+7996del 49 | NC_000016.10:g.68767553_68771266del ENST00000261769.10:c.163+29142_164-30404del 50 | NC_000006.12:g.151297100_151300476del ENST00000402676.7:c.163-8647_163-5271del 51 | NC_000010.11:g.51447874_51451689del ENST00000373980.11:c.479-19849_479-16034del 52 | NC_000010.11:g.86194845_86199292del ENST00000327946.12:c.520+7072_520+11519del 53 | NC_000010.11:g.87948970_87950261del ENST00000371953.8:c.493-3148_493-1857del 54 | NC_000013.11:g.32398162_32399672del ENST00000380152.8:c.9649_*902del 55 | NC_000013.11:g.38805150_38809282del ENST00000280481.9:c.6019+20342_6019+24474del 56 | NC_000013.11:g.77000458_77002517del ENST00000377453.9:c.566_*1548del 57 | NC_000009.12:g.36259405_36266486del ENST00000396594.8:c.51+10408_52-10008del 58 | NC_000016.10:g.89919802_89920805del ENST00000555147.2:c.544_*593del 59 | NC_000018.10:g.10913665_10919991del ENST00000674853.1:c.287-8763_287-2437del 60 | NC_000007.14:g.117559509G>T ENST00000003084.11:c.1438G>T 61 | NC_000004.12:g.186083673G>A ENST00000296795.8:c.1987G>A 62 | NC_000017.11:g.32531952_32539872del ENST00000318217.10:c.2865-44956_2865-37036del 63 | NC_000023.11:g.17533722_17538533del ENST00000676302.1:c.566-154020_566-149209del 64 | NC_000021.9:g.33550233_33554247del ENST00000356577.10:c.1002_5016del 65 | NC_000011.10:g.119052306C>G ENST00000617285.5:c.1111G>C 66 | NC_000001.11:g.209786745del ENST00000367021.8:c.*1678del 67 | NC_000001.11:g.220213740G>A ENST00000358951.7:c.304+116C>T 68 | NC_000001.11:g.226875528C>T ENST00000366783.8:c.-43C>T 69 | NC_000002.12:g.113129758T>G ENST00000409930.4:c.205+94T>G 70 | NC_000002.12:g.127058848A>G ENST00000316724.10:c.1002+163T>C 71 | NC_000002.12:g.241190354T>G ENST00000674324.2:c.108+183T>G 72 | NC_000003.12:g.31624774del ENST00000295770.4:c.1728-140del 73 | NC_000003.12:g.38751968T>A 
ENST00000449082.3:c.1755+251A>T 74 | NC_000003.12:g.53105415_53105419del ENST00000296292.8:c.957+258_957+262del 75 | NC_000003.12:g.160381874dup ENST00000326448.12:c.38-144dup 76 | NC_000005.10:g.78964145A>C ENST00000264914.10:c.690+271T>G 77 | NC_000005.10:g.123386676T>A ENST00000306467.10:c.1431-9A>T 78 | NC_000006.12:g.13306845G>A ENST00000379300.8:c.666-318C>T 79 | NC_000006.12:g.33181375C>G ENST00000341947.7:c.1120-205G>C 80 | NC_000006.12:g.38803495G>A ENST00000327475.11:c.3034+184G>A 81 | NC_000007.14:g.135626418dup ENST00000285968.11:c.4793+57dup 82 | NC_000009.12:g.95508385_95508387dup ENST00000437951.6:c.199-1768_199-1766dup 83 | NC_000009.12:g.104782004T>C ENST00000374736.8:c.*2311A>G 84 | NC_000009.12:g.134823305T>G ENST00000371817.8:c.4645-111T>G 85 | NC_000010.11:g.5001841T>G ENST00000380753.9:c.85-160A>C 86 | NC_000003.12:g.52403787T>G ENST00000460680.6:c.1358A>C 87 | NC_000003.12:g.55470273T>C ENST00000264634.9:c.962A>G 88 | NC_000003.12:g.56593711A>G ENST00000394672.8:c.1289A>G 89 | NC_000003.12:g.66381521C>T ENST00000273261.8:c.2728G>A 90 | NC_000003.12:g.69959257T>C ENST00000352241.9:c.1032-16T>C 91 | NC_000003.12:g.93905835del ENST00000394236.9:c.550del 92 | NC_000011.10:g.68761568_68761570delinsA ENST00000265641.10:c.1993_1995delinsT 93 | NC_000011.10:g.72108616C>T ENST00000541899.3:c.468C>T 94 | NC_000010.11:g.30336592T>C ENST00000263063.9:c.780+211A>G 95 | NC_000010.11:g.87715935A>G ENST00000456849.2:c.865+92A>G 96 | NC_000012.12:g.25205716A>T ENST00000256078.10:c.*4200T>A 97 | NC_000012.12:g.57581372A>C ENST00000455537.7:c.2756-43A>C 98 | NC_000014.9:g.45181610G>A ENST00000267430.10:c.4318-27G>A 99 | NC_000014.9:g.63950166G>A ENST00000555002.6:c.590+160G>A 100 | NC_000014.9:g.67724398T>G ENST00000551171.6:c.69-75T>G 101 | -------------------------------------------------------------------------------- /tests/test_data/clinvar_hgvs/clinvar_hgvs_ensembl_50.tsv: -------------------------------------------------------------------------------- 1 
| NC_000003.12:g.52999279_53001028del ENST00000394752.8:c.-130-31769_-130-30020del 2 | NC_000004.12:g.121364109_121368709del ENST00000394427.3:c.340+11600_340+16200del 3 | NC_000017.11:g.31163377_31164732del ENST00000358273.9:c.479+1_479+1356del 4 | NC_000019.10:g.12536733_12537889del ENST00000339282.12:c.4-9193_4-8037del 5 | NC_000007.14:g.43891176_43895702del ENST00000453200.6:c.15-7886_15-3360del 6 | NC_000009.12:g.354334_355793del ENST00000432829.7:c.1680-13684_1680-12225del 7 | NC_000011.10:g.88639541_88647384del ENST00000305447.5:c.1147+5785_1147+13628del 8 | NC_000023.11:g.32626321_32627544del ENST00000357033.9:c.1332-13088_1332-11865del 9 | NC_000005.10:g.112840128_112843000del ENST00000257430.9:c.4534_7406del 10 | NC_000005.10:g.140855987_140858640del ENST00000504120.4:c.2394+67303_2394+69956del 11 | NC_000007.14:g.88958724_88966741del ENST00000333190.5:c.108+198640_108+206657del 12 | NC_000017.11:g.17710076_17713445del ENST00000353383.6:c.-148-13952_-148-10583del 13 | NC_000005.10:g.40779130_40785601del ENST00000397128.7:c.128-8015_128-1544del 14 | NC_000005.10:g.90686004_90688051del ENST00000405460.9:c.6490+9_6491-1810del 15 | NC_000001.11:g.3373437_3380436del ENST00000270722.10:c.439-11715_439-4716del 16 | NC_000001.11:g.7621941_7624940del ENST00000303635.12:c.511-18459_511-15460del 17 | NC_000002.12:g.178560066_178564516del ENST00000589042.5:c.81618_86068del 18 | NC_000002.12:g.141316191_141320434del ENST00000389484.8:c.344-65793_344-61550del 19 | NC_000001.11:g.33008231_33013268del ENST00000672715.1:c.636_*4953del 20 | NC_000002.12:g.50041090_50048923del ENST00000401669.7:c.4128+4348_4128+12181del 21 | NC_000002.12:g.60458961_60461239del ENST00000642384.2:c.1675_*1445del 22 | NC_000023.11:g.119927096_119928345del ENST00000371410.5:c.1073+1672_1074-1701del 23 | NC_000023.11:g.154026923_154030670del ENST00000303391.11:c.1158_*3444del 24 | NC_000003.12:g.192587419_192594481del ENST00000445105.7:c.13+132700_13+139762del 25 | 
NC_000004.12:g.996520_998294del ENST00000514224.2:c.300-4092_300-2318del 26 | NC_000006.12:g.1610446_1613897del ENST00000645831.2:c.1_*1790del 27 | NC_000003.12:g.37938391_37944758del ENST00000273179.10:c.80-8666_80-2299del 28 | NC_000008.11:g.3929021_3931750del ENST00000635120.2:c.818+66153_818+68882del 29 | NC_000010.11:g.26710075_26713224del ENST00000376215.10:c.467+307_467+3456del 30 | NC_000015.10:g.82641775_82645527del ENST00000684509.1:c.-98+1610_-98+5362del 31 | NC_000018.10:g.58268755_58269877del ENST00000400345.8:c.297+16701_297+17823del 32 | NC_000018.10:g.69541905_69550035del ENST00000382713.10:c.67-22582_67-14452del 33 | NC_000001.11:g.245473613_245478917del ENST00000407071.7:c.1166+53868_1166+59172del 34 | NC_000001.11:g.245867700_245869699del ENST00000490107.6:c.814-5812_814-3813del 35 | NC_000002.12:g.17581968_17584091del ENST00000295156.9:c.-5-10102_-5-7979del 36 | NC_000002.12:g.211906513_211910431del ENST00000342788.9:c.421+36999_421+40917del 37 | NC_000007.14:g.108418131_108425883del ENST00000379028.8:c.-331-26289_-331-18537del 38 | NC_000008.11:g.50314668_50318252del ENST00000642720.2:c.-27-79544_-27-75960del 39 | NC_000010.11:g.113086890_113087964del ENST00000355995.9:c.552+46764_552+47838del 40 | NC_000019.10:g.47839728_47843005del ENST00000221996.12:c.661_*3038del 41 | NC_000012.12:g.2136470_2143758del ENST00000399603.6:c.477+16040_477+23328del 42 | NC_000002.12:g.197705407_197707188dup ENST00000282276.8:c.2_*1dup 43 | NC_000012.12:g.70286084_70288034del ENST00000229195.8:c.48+7810_48+9760del 44 | NC_000011.10:g.2699445_2700825del ENST00000155840.12:c.1514+37364_1514+38744del 45 | NC_000012.12:g.99608908_99614995del ENST00000683438.2:c.1272+40072_1272+46159del 46 | NC_000012.12:g.61875937_61878789del ENST00000416284.8:c.-1-11362_-1-8510del 47 | NC_000021.9:g.33355546_33357370del ENST00000270139.8:c.1671_*1821del 48 | NC_000012.12:g.25495743_25501512del ENST00000458174.7:c.*22+2227_*22+7996del 49 | NC_000016.10:g.68767553_68771266del 
ENST00000261769.10:c.163+29142_164-30404del 50 | NC_000006.12:g.151297100_151300476del ENST00000402676.7:c.163-8647_163-5271del 51 | -------------------------------------------------------------------------------- /tests/test_data/ensembl_tark/transcript/assembly_name=GRCh38&expand=transcript_release_set&loc_end=36530514&loc_region=7&loc_start=36530417&page=6.json: -------------------------------------------------------------------------------- 1 | { 2 | "count": 53, 3 | "next": null, 4 | "previous": "http://tark.ensembl.org/api/transcript/?assembly_name=GRCh38&expand=transcript_release_set&loc_end=36530514&loc_region=7&loc_start=36530417&page=5", 5 | "results": [ 6 | { 7 | "stable_id": "XM_011515338", 8 | "stable_id_version": 3, 9 | "assembly": "GRCh38", 10 | "loc_start": 36514653, 11 | "loc_end": 36724494, 12 | "loc_strand": -1, 13 | "loc_region": "7", 14 | "loc_checksum": "3A0C94FD0C19832E867F7DD408729ED9FA0CC577", 15 | "exon_set_checksum": "C755149B187679201C78733FD72856459A1B9984", 16 | "transcript_checksum": "802BD7AF1B237E57A3C4A0C653C3772D2174199E", 17 | "sequence": "9B29344586871C26D51D42E2750778B9F0AED323", 18 | "biotype": "predicted_protein_coding", 19 | "three_prime_utr_start": 36514670, 20 | "three_prime_utr_end": 36514653, 21 | "three_prime_utr_seq": "AAACCACTGTTGAGATGG", 22 | "three_prime_utr_checksum": "361C329D180C3DD63FD9036DF88F00192E50514F", 23 | "five_prime_utr_start": 36724494, 24 | "five_prime_utr_end": 36724149, 25 | "five_prime_utr_seq": "ACTGAGCCAGGGAGCACGGAAGTTGTGCCACTGTGCAACTTGGGTTTTCTTTATCCTGCAGTCTTTACCTCAGCAGAACCGCACACCACAGACTCCCTCCAGCTCTTTGTGTGTGGCTCTCTCAGGGTCCAACAAGAGCAAGCTGTGGGTCTGTGAGTGTTTATGTGTGCTTTTATTCACTTCACACTTATTGAAAAGTGTGTATGTGAGAGGGTGGGGTGTGTGTGTCAAAGAGAGTGAGGAAGAGAAGGAGAGAGAGATCAATTGATTCTGCAGCCTCAGCTCCAGCATCCCTCAGTTGGGAGCTTCCAAAGCCGGGTGATCACTTGGGGTGCATAGCTCGGAG", 26 | "five_prime_utr_checksum": "60878BA89AD674207714116AF20DB12EE2B653C0", 27 | "transcript_release_set": [ 28 | { 29 | "assembly": "GRCh38", 30 | "shortname": 
"110_20220707", 31 | "description": "Refseq Homo sapiens Annotation Release 110.20220707", 32 | "release_date": "2022-07-07", 33 | "source": "RefSeq" 34 | }, 35 | { 36 | "assembly": "GRCh38", 37 | "shortname": "GCF_000001405_20230320", 38 | "description": "Refseq Homo sapiens Annotation Release GCF_000001405.20230320", 39 | "release_date": "2023-03-20", 40 | "source": "RefSeq" 41 | }, 42 | { 43 | "assembly": "GRCh38", 44 | "shortname": "GCF_000001405_20231007", 45 | "description": "Refseq Homo sapiens Annotation Release GCF_000001405.20231007", 46 | "release_date": "2023-10-07", 47 | "source": "RefSeq" 48 | }, 49 | { 50 | "assembly": "GRCh38", 51 | "shortname": "GCF_000001405_20240827", 52 | "description": "Refseq Homo sapiens Annotation Release GCF_000001405.20240827", 53 | "release_date": "2024-08-24", 54 | "source": "RefSeq" 55 | } 56 | ] 57 | }, 58 | { 59 | "stable_id": "XM_011515339", 60 | "stable_id_version": 3, 61 | "assembly": "GRCh38", 62 | "loc_start": 36528175, 63 | "loc_end": 36724494, 64 | "loc_strand": -1, 65 | "loc_region": "7", 66 | "loc_checksum": "8DE122E93958674611C3E40852B0EA2C3257B3DD", 67 | "exon_set_checksum": "266DDEFF8992E3D25D19156E1DACE37F5B59C197", 68 | "transcript_checksum": "F3930BD6833EE5CE890A7F6F5A36D74BD2606ED2", 69 | "sequence": "2FEE2ADE3B6333A89337680BC9737760EF94EE4A", 70 | "biotype": "predicted_protein_coding", 71 | "three_prime_utr_start": 36528307, 72 | "three_prime_utr_end": 36528175, 73 | "three_prime_utr_seq": "ACTGAACATTCTACATCAACGTGGGAGAAAGCTCCTAGGCATTTCTCCATGCTTGGCGATTCCCATCAACAGTTCAAGGAACTTCCTGTTTTGTAATCTCCTGCTATTTGTTTAAAATAAATGTGAAGATCTA", 74 | "three_prime_utr_checksum": "32FCDE7C368E86B254F7A1EF1D5837213DA6C6A8", 75 | "five_prime_utr_start": 36724494, 76 | "five_prime_utr_end": 36724149, 77 | "five_prime_utr_seq": 
"ACTGAGCCAGGGAGCACGGAAGTTGTGCCACTGTGCAACTTGGGTTTTCTTTATCCTGCAGTCTTTACCTCAGCAGAACCGCACACCACAGACTCCCTCCAGCTCTTTGTGTGTGGCTCTCTCAGGGTCCAACAAGAGCAAGCTGTGGGTCTGTGAGTGTTTATGTGTGCTTTTATTCACTTCACACTTATTGAAAAGTGTGTATGTGAGAGGGTGGGGTGTGTGTGTCAAAGAGAGTGAGGAAGAGAAGGAGAGAGAGATCAATTGATTCTGCAGCCTCAGCTCCAGCATCCCTCAGTTGGGAGCTTCCAAAGCCGGGTGATCACTTGGGGTGCATAGCTCGGAG", 78 | "five_prime_utr_checksum": "60878BA89AD674207714116AF20DB12EE2B653C0", 79 | "transcript_release_set": [ 80 | { 81 | "assembly": "GRCh38", 82 | "shortname": "110_20220707", 83 | "description": "Refseq Homo sapiens Annotation Release 110.20220707", 84 | "release_date": "2022-07-07", 85 | "source": "RefSeq" 86 | }, 87 | { 88 | "assembly": "GRCh38", 89 | "shortname": "GCF_000001405_20230320", 90 | "description": "Refseq Homo sapiens Annotation Release GCF_000001405.20230320", 91 | "release_date": "2023-03-20", 92 | "source": "RefSeq" 93 | }, 94 | { 95 | "assembly": "GRCh38", 96 | "shortname": "GCF_000001405_20231007", 97 | "description": "Refseq Homo sapiens Annotation Release GCF_000001405.20231007", 98 | "release_date": "2023-10-07", 99 | "source": "RefSeq" 100 | }, 101 | { 102 | "assembly": "GRCh38", 103 | "shortname": "GCF_000001405_20240827", 104 | "description": "Refseq Homo sapiens Annotation Release GCF_000001405.20240827", 105 | "release_date": "2024-08-24", 106 | "source": "RefSeq" 107 | } 108 | ] 109 | }, 110 | { 111 | "stable_id": "XM_011515340", 112 | "stable_id_version": 3, 113 | "assembly": "GRCh38", 114 | "loc_start": 36528175, 115 | "loc_end": 36724494, 116 | "loc_strand": -1, 117 | "loc_region": "7", 118 | "loc_checksum": "8DE122E93958674611C3E40852B0EA2C3257B3DD", 119 | "exon_set_checksum": "459BB82C3050AB0D64DC509D36F0AB1DC55E4B79", 120 | "transcript_checksum": "B01C1023FAF579C23978532BDA07596F571E3589", 121 | "sequence": "2529B9E4932091E20EF3C74F7495067255A1CEC6", 122 | "biotype": "predicted_protein_coding", 123 | "three_prime_utr_start": 36529217, 124 | "three_prime_utr_end": 36528175, 125 | 
"three_prime_utr_seq": "TCACTAGCTCTGTCATTTATTATCTCTGGGACTGCAGGCAATTCTCTGAATATTTTTAAACTCCCTTTCCAGCCCTGGAAATGGGGATACCATATCTTCCTAACAAATCTCTGTGATGATTCTATGAAATAATGTGTCATCCTCCTAAATGTTATGCCTGGTACACAGAAAAGGTACCATTTCCTTACACCCTTGCTACTCAATGTATGCTTTTGGATAAGCAAAATCAACACCACCCAGGAGCTGGTTATAAATGCAGAATTCCAGGCCCTACTCCAGACCTACCTAATCAGAACCTGCATTCAAACACAACCCACAGGTCATCTCTATGCCCATTAAAATTTTGTAAATGGTTTGTGAAGCTGAGATGGGAGGATCGCTCAAGTCCAGGAGGTCAAGACCAGCCTGGGCAACATTGCAAGACCCCATCTCTAAAAACAAAACATCTTTTTTTATTAGCCAGGCATGGTGGTGCATGCTTGTAGTCCCAGCTACTCTGGAGGTTGAGATGGGGGGATCACTTGAGCCTGGGTGGTTGAGGCTGCAGTGAGTTGTGATCACGTCACTCAACTCCAGCCTGGTTGACAGAGAGAGACCTCATCTCTATAAAAATAAAAATAAAGTTTAATAAATGGAGCTCTATAATGCCTCAAGATTAAGAGCTGGGTCAGTCATCAGATTTATAAGTCCTGCTATGTGCCAGGAGCTATGAAGGTGCTGGGAACATAATCGTCAACAAAGCAGAAGAGTCCTTATCTCTGTGGAGGATATAGAGAGAAGAGCTTTATTCTTGTCTGTCCAGAATTCCACTGGTGAACTGATATGCTTGGAAGCAGGGCCTCATCAGCCCTTTCTTGTCCTCACAGCAATAATAAACACAGTGAAAAAGCAAAAAGTTGAATTAAGCTGAACTGAACATTCTACATCAACGTGGGAGAAAGCTCCTAGGCATTTCTCCATGCTTGGCGATTCCCATCAACAGTTCAAGGAACTTCCTGTTTTGTAATCTCCTGCTATTTGTTTAAAATAAATGTGAAGATCTA", 126 | "three_prime_utr_checksum": "48FEEDED5670D4AFBEAC79CFA54306DD9ECDAFE1", 127 | "five_prime_utr_start": 36724494, 128 | "five_prime_utr_end": 36724149, 129 | "five_prime_utr_seq": "ACTGAGCCAGGGAGCACGGAAGTTGTGCCACTGTGCAACTTGGGTTTTCTTTATCCTGCAGTCTTTACCTCAGCAGAACCGCACACCACAGACTCCCTCCAGCTCTTTGTGTGTGGCTCTCTCAGGGTCCAACAAGAGCAAGCTGTGGGTCTGTGAGTGTTTATGTGTGCTTTTATTCACTTCACACTTATTGAAAAGTGTGTATGTGAGAGGGTGGGGTGTGTGTGTCAAAGAGAGTGAGGAAGAGAAGGAGAGAGAGATCAATTGATTCTGCAGCCTCAGCTCCAGCATCCCTCAGTTGGGAGCTTCCAAAGCCGGGTGATCACTTGGGGTGCATAGCTCGGAG", 130 | "five_prime_utr_checksum": "60878BA89AD674207714116AF20DB12EE2B653C0", 131 | "transcript_release_set": [ 132 | { 133 | "assembly": "GRCh38", 134 | "shortname": "110_20220707", 135 | "description": "Refseq Homo sapiens Annotation Release 110.20220707", 136 | "release_date": "2022-07-07", 137 | "source": "RefSeq" 138 | }, 139 | { 140 | "assembly": "GRCh38", 141 | "shortname": 
"GCF_000001405_20230320", 142 | "description": "Refseq Homo sapiens Annotation Release GCF_000001405.20230320", 143 | "release_date": "2023-03-20", 144 | "source": "RefSeq" 145 | }, 146 | { 147 | "assembly": "GRCh38", 148 | "shortname": "GCF_000001405_20231007", 149 | "description": "Refseq Homo sapiens Annotation Release GCF_000001405.20231007", 150 | "release_date": "2023-10-07", 151 | "source": "RefSeq" 152 | }, 153 | { 154 | "assembly": "GRCh38", 155 | "shortname": "GCF_000001405_20240827", 156 | "description": "Refseq Homo sapiens Annotation Release GCF_000001405.20240827", 157 | "release_date": "2024-08-24", 158 | "source": "RefSeq" 159 | } 160 | ] 161 | } 162 | ] 163 | } -------------------------------------------------------------------------------- /tests/test_data/ensembl_test.GRCh38.111.gtf: -------------------------------------------------------------------------------- 1 | 1 ensembl_havana gene 65419 71585 . + . gene_id "ENSG00000186092"; gene_version "7"; gene_name "OR4F5"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; 2 | 1 havana transcript 65419 71585 . + . gene_id "ENSG00000186092"; gene_version "7"; transcript_id "ENST00000641515"; transcript_version "2"; gene_name "OR4F5"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "OR4F5-201"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30547"; tag "basic"; tag "Ensembl_canonical"; tag "MANE_Select"; 3 | 1 havana exon 65419 65433 . + . gene_id "ENSG00000186092"; gene_version "7"; transcript_id "ENST00000641515"; transcript_version "2"; exon_number "1"; gene_name "OR4F5"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "OR4F5-201"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30547"; exon_id "ENSE00003812156"; exon_version "1"; tag "basic"; tag "Ensembl_canonical"; tag "MANE_Select"; 4 | 1 havana exon 65520 65573 . + . 
gene_id "ENSG00000186092"; gene_version "7"; transcript_id "ENST00000641515"; transcript_version "2"; exon_number "2"; gene_name "OR4F5"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "OR4F5-201"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30547"; exon_id "ENSE00003813641"; exon_version "1"; tag "basic"; tag "Ensembl_canonical"; tag "MANE_Select"; 5 | 1 havana CDS 65565 65573 . + 0 gene_id "ENSG00000186092"; gene_version "7"; transcript_id "ENST00000641515"; transcript_version "2"; exon_number "2"; gene_name "OR4F5"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "OR4F5-201"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30547"; protein_id "ENSP00000493376"; protein_version "2"; tag "basic"; tag "Ensembl_canonical"; tag "MANE_Select"; 6 | 1 havana start_codon 65565 65567 . + 0 gene_id "ENSG00000186092"; gene_version "7"; transcript_id "ENST00000641515"; transcript_version "2"; exon_number "2"; gene_name "OR4F5"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "OR4F5-201"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30547"; tag "basic"; tag "Ensembl_canonical"; tag "MANE_Select"; 7 | 1 havana exon 69037 71585 . + . gene_id "ENSG00000186092"; gene_version "7"; transcript_id "ENST00000641515"; transcript_version "2"; exon_number "3"; gene_name "OR4F5"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "OR4F5-201"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30547"; exon_id "ENSE00003813949"; exon_version "1"; tag "basic"; tag "Ensembl_canonical"; tag "MANE_Select"; 8 | 1 havana CDS 69037 70005 . 
+ 0 gene_id "ENSG00000186092"; gene_version "7"; transcript_id "ENST00000641515"; transcript_version "2"; exon_number "3"; gene_name "OR4F5"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "OR4F5-201"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30547"; protein_id "ENSP00000493376"; protein_version "2"; tag "basic"; tag "Ensembl_canonical"; tag "MANE_Select"; 9 | 1 havana stop_codon 70006 70008 . + 0 gene_id "ENSG00000186092"; gene_version "7"; transcript_id "ENST00000641515"; transcript_version "2"; exon_number "3"; gene_name "OR4F5"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "OR4F5-201"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30547"; tag "basic"; tag "Ensembl_canonical"; tag "MANE_Select"; 10 | 1 havana five_prime_utr 65419 65433 . + . gene_id "ENSG00000186092"; gene_version "7"; transcript_id "ENST00000641515"; transcript_version "2"; gene_name "OR4F5"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "OR4F5-201"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30547"; tag "basic"; tag "Ensembl_canonical"; tag "MANE_Select"; 11 | 1 havana five_prime_utr 65520 65564 . + . gene_id "ENSG00000186092"; gene_version "7"; transcript_id "ENST00000641515"; transcript_version "2"; gene_name "OR4F5"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "OR4F5-201"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30547"; tag "basic"; tag "Ensembl_canonical"; tag "MANE_Select"; 12 | 1 havana three_prime_utr 70009 71585 . + . 
gene_id "ENSG00000186092"; gene_version "7"; transcript_id "ENST00000641515"; transcript_version "2"; gene_name "OR4F5"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "OR4F5-201"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30547"; tag "basic"; tag "Ensembl_canonical"; tag "MANE_Select"; 13 | MT insdc gene 8295 8364 . + . gene_id "ENSG00000210156"; gene_version "1"; gene_name "MT-TK"; gene_source "insdc"; gene_biotype "Mt_tRNA"; 14 | MT insdc transcript 8295 8364 . + . gene_id "ENSG00000210156"; gene_version "1"; transcript_id "ENST00000387421"; transcript_version "1"; gene_name "MT-TK"; gene_source "insdc"; gene_biotype "Mt_tRNA"; transcript_name "MT-TK-201"; transcript_source "insdc"; transcript_biotype "Mt_tRNA"; tag "basic"; tag "Ensembl_canonical"; transcript_support_level "NA"; 15 | MT insdc exon 8295 8364 . + . gene_id "ENSG00000210156"; gene_version "1"; transcript_id "ENST00000387421"; transcript_version "1"; exon_number "1"; gene_name "MT-TK"; gene_source "insdc"; gene_biotype "Mt_tRNA"; transcript_name "MT-TK-201"; transcript_source "insdc"; transcript_biotype "Mt_tRNA"; exon_id "ENSE00001544484"; exon_version "1"; tag "basic"; tag "Ensembl_canonical"; transcript_support_level "NA"; 16 | -------------------------------------------------------------------------------- /tests/test_data/grch37.genome: -------------------------------------------------------------------------------- 1 | NC_000007.13 36561661 36561662 C 2 | -------------------------------------------------------------------------------- /tests/test_data/hg19_chrY_300kb_genes.gtf: -------------------------------------------------------------------------------- 1 | # From iGenomes hg19 UCSC genes.gtf 2 | chrY stdin exon 244668 245252 . - . gene_id "PPP2R3B"; transcript_id "NM_013239"; exon_number "1"; exon_id "NM_013239.1"; gene_name "PPP2R3B"; 3 | chrY stdin CDS 245105 245252 . 
- 1 gene_id "PPP2R3B"; transcript_id "NM_013239"; exon_number "1"; exon_id "NM_013239.1"; gene_name "PPP2R3B"; 4 | chrY stdin exon 249339 249445 . - . gene_id "PPP2R3B"; transcript_id "NM_013239"; exon_number "2"; exon_id "NM_013239.2"; gene_name "PPP2R3B"; 5 | chrY stdin CDS 249339 249445 . - 0 gene_id "PPP2R3B"; transcript_id "NM_013239"; exon_number "2"; exon_id "NM_013239.2"; gene_name "PPP2R3B"; 6 | chrY stdin exon 249513 249631 . - . gene_id "PPP2R3B"; transcript_id "NM_013239"; exon_number "3"; exon_id "NM_013239.3"; gene_name "PPP2R3B"; 7 | chrY stdin CDS 249513 249631 . - 2 gene_id "PPP2R3B"; transcript_id "NM_013239"; exon_number "3"; exon_id "NM_013239.3"; gene_name "PPP2R3B"; 8 | chrY stdin exon 251500 251675 . - . gene_id "PPP2R3B"; transcript_id "NM_013239"; exon_number "4"; exon_id "NM_013239.4"; gene_name "PPP2R3B"; 9 | chrY stdin CDS 251500 251675 . - 1 gene_id "PPP2R3B"; transcript_id "NM_013239"; exon_number "4"; exon_id "NM_013239.4"; gene_name "PPP2R3B"; 10 | chrY stdin exon 252042 252131 . - . gene_id "PPP2R3B"; transcript_id "NM_013239"; exon_number "5"; exon_id "NM_013239.5"; gene_name "PPP2R3B"; 11 | chrY stdin CDS 252042 252131 . - 1 gene_id "PPP2R3B"; transcript_id "NM_013239"; exon_number "5"; exon_id "NM_013239.5"; gene_name "PPP2R3B"; 12 | chrY stdin exon 252618 252666 . - . gene_id "PPP2R3B"; transcript_id "NM_013239"; exon_number "6"; exon_id "NM_013239.6"; gene_name "PPP2R3B"; 13 | chrY stdin CDS 252618 252666 . - 2 gene_id "PPP2R3B"; transcript_id "NM_013239"; exon_number "6"; exon_id "NM_013239.6"; gene_name "PPP2R3B"; 14 | chrY stdin exon 256251 256407 . - . gene_id "PPP2R3B"; transcript_id "NM_013239"; exon_number "7"; exon_id "NM_013239.7"; gene_name "PPP2R3B"; 15 | chrY stdin CDS 256251 256407 . - 0 gene_id "PPP2R3B"; transcript_id "NM_013239"; exon_number "7"; exon_id "NM_013239.7"; gene_name "PPP2R3B"; 16 | chrY stdin exon 256909 256995 . - . 
gene_id "PPP2R3B"; transcript_id "NM_013239"; exon_number "8"; exon_id "NM_013239.8"; gene_name "PPP2R3B"; 17 | chrY stdin CDS 256909 256995 . - 0 gene_id "PPP2R3B"; transcript_id "NM_013239"; exon_number "8"; exon_id "NM_013239.8"; gene_name "PPP2R3B"; 18 | chrY stdin exon 257436 257510 . - . gene_id "PPP2R3B"; transcript_id "NM_013239"; exon_number "9"; exon_id "NM_013239.9"; gene_name "PPP2R3B"; 19 | chrY stdin CDS 257436 257510 . - 0 gene_id "PPP2R3B"; transcript_id "NM_013239"; exon_number "9"; exon_id "NM_013239.9"; gene_name "PPP2R3B"; 20 | chrY stdin exon 257969 258071 . - . gene_id "PPP2R3B"; transcript_id "NM_013239"; exon_number "10"; exon_id "NM_013239.10"; gene_name "PPP2R3B"; 21 | chrY stdin CDS 257969 258071 . - 1 gene_id "PPP2R3B"; transcript_id "NM_013239"; exon_number "10"; exon_id "NM_013239.10"; gene_name "PPP2R3B"; 22 | chrY stdin exon 258325 258428 . - . gene_id "PPP2R3B"; transcript_id "NM_013239"; exon_number "11"; exon_id "NM_013239.11"; gene_name "PPP2R3B"; 23 | chrY stdin CDS 258325 258428 . - 0 gene_id "PPP2R3B"; transcript_id "NM_013239"; exon_number "11"; exon_id "NM_013239.11"; gene_name "PPP2R3B"; 24 | chrY stdin exon 272140 272325 . - . gene_id "PPP2R3B"; transcript_id "NM_013239"; exon_number "12"; exon_id "NM_013239.12"; gene_name "PPP2R3B"; 25 | chrY stdin CDS 272140 272325 . - 0 gene_id "PPP2R3B"; transcript_id "NM_013239"; exon_number "12"; exon_id "NM_013239.12"; gene_name "PPP2R3B"; 26 | chrY stdin exon 297103 297690 . - . gene_id "PPP2R3B"; transcript_id "NM_013239"; exon_number "13"; exon_id "NM_013239.13"; gene_name "PPP2R3B"; 27 | chrY stdin CDS 297103 297426 . - 0 gene_id "PPP2R3B"; transcript_id "NM_013239"; exon_number "13"; exon_id "NM_013239.13"; gene_name "PPP2R3B"; 28 | chrY stdin start_codon 297424 297426 . - 0 gene_id "PPP2R3B"; transcript_id "NM_013239"; exon_number "1"; exon_id "NM_013239.1"; gene_name "PPP2R3B"; 29 | chrY stdin stop_codon 245102 245104 . 
- 0 gene_id "PPP2R3B"; transcript_id "NM_013239"; exon_number "1"; exon_id "NM_013239.1"; gene_name "PPP2R3B"; 30 | chrY stdin exon 231385 232054 . + . gene_id "PPP2R3B-AS1"; transcript_id "NR_027231"; exon_number "1"; exon_id "NR_027231.1"; gene_name "PPP2R3B-AS1"; 31 | chrY stdin exon 231385 232054 . + . gene_id "PPP2R3B-AS1"; transcript_id "NR_027232"; exon_number "1"; exon_id "NR_027232.1"; gene_name "PPP2R3B-AS1"; 32 | chrY stdin exon 148061 148351 . + . gene_id "PLCXD1"; transcript_id "NM_018390_2"; exon_number "1"; exon_id "NM_018390_2.1"; gene_name "PLCXD1"; 33 | chrY stdin exon 150834 150981 . + . gene_id "PLCXD1"; transcript_id "NM_018390_2"; exon_number "2"; exon_id "NM_018390_2.2"; gene_name "PLCXD1"; 34 | chrY stdin CDS 150855 150981 . + 0 gene_id "PLCXD1"; transcript_id "NM_018390_2"; exon_number "2"; exon_id "NM_018390_2.2"; gene_name "PLCXD1"; 35 | chrY stdin exon 155400 155536 . + . gene_id "PLCXD1"; transcript_id "NM_018390_2"; exon_number "3"; exon_id "NM_018390_2.3"; gene_name "PLCXD1"; 36 | chrY stdin CDS 155400 155536 . + 2 gene_id "PLCXD1"; transcript_id "NM_018390_2"; exon_number "3"; exon_id "NM_018390_2.3"; gene_name "PLCXD1"; 37 | chrY stdin exon 157315 157443 . + . gene_id "PLCXD1"; transcript_id "NM_018390_2"; exon_number "4"; exon_id "NM_018390_2.4"; gene_name "PLCXD1"; 38 | chrY stdin CDS 157315 157443 . + 0 gene_id "PLCXD1"; transcript_id "NM_018390_2"; exon_number "4"; exon_id "NM_018390_2.4"; gene_name "PLCXD1"; 39 | chrY stdin exon 158166 158321 . + . gene_id "PLCXD1"; transcript_id "NM_018390_2"; exon_number "5"; exon_id "NM_018390_2.5"; gene_name "PLCXD1"; 40 | chrY stdin CDS 158166 158321 . + 0 gene_id "PLCXD1"; transcript_id "NM_018390_2"; exon_number "5"; exon_id "NM_018390_2.5"; gene_name "PLCXD1"; 41 | chrY stdin exon 159702 159885 . + . gene_id "PLCXD1"; transcript_id "NM_018390_2"; exon_number "6"; exon_id "NM_018390_2.6"; gene_name "PLCXD1"; 42 | chrY stdin CDS 159702 159885 . 
+ 0 gene_id "PLCXD1"; transcript_id "NM_018390_2"; exon_number "6"; exon_id "NM_018390_2.6"; gene_name "PLCXD1"; 43 | chrY stdin exon 165764 170022 . + . gene_id "PLCXD1"; transcript_id "NM_018390_2"; exon_number "7"; exon_id "NM_018390_2.7"; gene_name "PLCXD1"; 44 | chrY stdin CDS 165764 165999 . + 2 gene_id "PLCXD1"; transcript_id "NM_018390_2"; exon_number "7"; exon_id "NM_018390_2.7"; gene_name "PLCXD1"; 45 | chrY stdin start_codon 150855 150857 . + 0 gene_id "PLCXD1"; transcript_id "NM_018390_2"; exon_number "1"; exon_id "NM_018390_2.1"; gene_name "PLCXD1"; 46 | chrY stdin stop_codon 166000 166002 . + 0 gene_id "PLCXD1"; transcript_id "NM_018390_2"; exon_number "1"; exon_id "NM_018390_2.1"; gene_name "PLCXD1"; 47 | chrY stdin exon 142991 143061 . + . gene_id "PLCXD1"; transcript_id "NR_028057_2"; exon_number "1"; exon_id "NR_028057_2.1"; gene_name "PLCXD1"; 48 | chrY stdin exon 148149 148351 . + . gene_id "PLCXD1"; transcript_id "NR_028057_2"; exon_number "2"; exon_id "NR_028057_2.2"; gene_name "PLCXD1"; 49 | chrY stdin exon 150834 150981 . + . gene_id "PLCXD1"; transcript_id "NR_028057_2"; exon_number "3"; exon_id "NR_028057_2.3"; gene_name "PLCXD1"; 50 | chrY stdin exon 155400 155536 . + . gene_id "PLCXD1"; transcript_id "NR_028057_2"; exon_number "4"; exon_id "NR_028057_2.4"; gene_name "PLCXD1"; 51 | chrY stdin exon 157315 157443 . + . gene_id "PLCXD1"; transcript_id "NR_028057_2"; exon_number "5"; exon_id "NR_028057_2.5"; gene_name "PLCXD1"; 52 | chrY stdin exon 158166 158321 . + . gene_id "PLCXD1"; transcript_id "NR_028057_2"; exon_number "6"; exon_id "NR_028057_2.6"; gene_name "PLCXD1"; 53 | chrY stdin exon 159702 159885 . + . gene_id "PLCXD1"; transcript_id "NR_028057_2"; exon_number "7"; exon_id "NR_028057_2.7"; gene_name "PLCXD1"; 54 | chrY stdin exon 165764 166059 . + . gene_id "PLCXD1"; transcript_id "NR_028057_2"; exon_number "8"; exon_id "NR_028057_2.8"; gene_name "PLCXD1"; 55 | chrY stdin exon 169260 170022 . + . 
gene_id "PLCXD1"; transcript_id "NR_028057_2"; exon_number "9"; exon_id "NR_028057_2.9"; gene_name "PLCXD1"; 56 | -------------------------------------------------------------------------------- /tests/test_data/transcript_sequences.json: -------------------------------------------------------------------------------- 1 | { 2 | "ENST00000617537.5": "ACTGAGCCAGGGAGCACGGAAGTTGTGCCACTGTGCAACTTGGGTTTTCTTTATCCTGCAGTCTTTACCTCAGCAGAACCGCACACCACAGACTCCCTCCAGCTCTTTGTGTGTGGCTCTCTCAGGGTCCAACAAGAGCAAGCTGTGGGTCTGTGAGTGTTTATGTGTGCTTTTATTCACTTCACACTTATTGAAAAGTGTGTATGTGAGAGGGTGGGGTGTGTGTGTCAAAGAGAGTGAGGAAGAGAAGGAGAGAGAGATCAATTGATTCTGCAGCCTCAGCTCCAGCATCCCTCAGTTGGGAGCTTCCAAAGCCGGGTGATCACTTGGGGTGCATAGCTCGGAGATGCAGTCCCCCTGGAAAATCCTTACGGTGGCGCCTCTATTCTTGCTCCTGTCTCTTCAGTCCTCGGCCTCTCCAGCCAACGATGACCAGTCCAGGCCCAGCCTCTCGAATGGGCACACCTGTGTAGGGTGTGTGCTGGTGGTGTCTGTAATAGAACAGCTTGCTCAAGTTCACAACTCGACGGTCCAGGCCTCGATGGAGAGACTGTGCAGCTACCTGCCTGAAAAACTGTTCTTGAAAACCACCTGCTATTTAGTCATTGACAAGTTTGGATCAGACATCATAAAACTGCTTAGCGCAGATATGAATGCTGATGTGGTATGTCACACTCTGGAGTTTTGTAAACAGAACACTGGCCAACCATTGTGTCATCTCTACCCTCTTCCCAAGGAGACATGGAAATTTACACTACAGAAGGCAAGACAAATTGTCAAGAAGTCCCCGATTCTGAAATATTCTAGAAGTGGTTCTGACATTTGTTCACTCCCGGTTTTGGCCAAGATCTGCCAGAAAATTAAATTAGCTATGGAACAGTCTGTGCCATTCAAAGATGTGGATTCAGACAAATACAGCGTTTTCCCAACACTGCGGGGCTATCACTGGCGGGGGAGAGACTGTAATGACAGCGACGAGTCAGTGTACCCAGGTAGAAGGCCGAACAACTGGGATGTCCATCAGGATTCAAACTGTAATGGCATTTGGGGTGTCGATCCAAAAGATGGAGTTCCATATGAGAAGAAATTCTGTGAAGGTTCACAGCCCAGGGGAATCATTTTGCTGGGAGACTCAGCTGGGGCTCATTTTCACATCTCTCCTGAATGGATCACAGCGTCGCAGATGTCTTTGAACTCTTTCATCAATCTACCAACAGCCCTTACCAACGAGCTTGACTGGCCCCAACTCTCTGGTGCTACAGGATTTCTGGACTCCACTGTTGGAATTAAAGAAAAATCTATTTACCTTCGCTTATGGAAAAGAAACCACTGTAATCACAGGGACTACCAGAATATTTCAAGAAATGGTGCATCTTCCCGAAACCTGAAGAAATTTATAGAAAGCTTGTCTAGAAACAAGGTGTTGGACTATCCCGCCATCGTTATATATGCCATGATTGGAAATGATGTCTGCAGTGGGAAGAGTGACCCAGTCCCAGCCATGACCACTCCTGAGAAACTCTACTCCAACGTCATGCAGACTCTGAAGCATCTAAATTCCCACCTGCCCAATGGCAGCCATGTTATTTTGTATGGCTTACCAGATGGAACCTTTCTCTGGGATAATTTGCACAACAGATATCATCCT
CTCGGCCAGCTAAATAAAGACATGACCTATGCGCAGTTGTACTCCTTCCTGAACTGCCTCCAGGTCAGCCCCTGCCACGGCTGGATGTCTTCCAACAAGACGTTGCGGACTCTCACTTCAGAGAGAGCAGAGCAACTCTCCAACACACTGAAAAAAATTGCAGCCAGTGAGAAATTTACAAACTTCAATCTTTTCTACATGGATTTTGCCTTCCATGAAATCATACAGGAGTGGCAGAAGAGAGGCGGACAGCCCTGGCAGCTCATCGAGCCCGTGGATGGATTCCACCCCAACGAGGTGGCTTTGCTGTTGTTGGCGGATCATTTCTGGAAAAAGGTGCAGCTCCAGTGGCCCCAAATCCTGGGAAAGGAGAATCCGTTCAACCCCCAGATTAAACAGGTGTTTGGAGACCAAGGCGGGCACTGAGCCTCTCAGGAGCATGCACCCCTGGGGAGCACAGGGAGGCAGAGGCTTGGGTAAACTCATTCCACAAACCCTATGGGGGCTGCCACGTCACAGGCCCAAAGGACTCTTCTTCAGCAGCATCTTTGCAAAATGTCTTTCTCTCAATGAAGAGCATATCTGGACGACTGTGCAATGCTGTGTGCTCCCGGGATCAGTAACCCTTCCGCTGTTCCTGAAATAACCTTTCATAAAGTGCTTTGGGTGCCATTCCAAACAAGAGAGTATCTGTGCCCTTTACAGCTAATTGTTCTAAAAGGAGTTTCTAAAAACAC", 3 | "NM_001637.3": "AACAGATCAGTTCCGGCAAGCCTCGAGGCTCACGGGGTTTATGCACACTAACTTCACTGAGCCAGGGAGCACGGAAGTTGTGCCACTGTGCAACTTGGGTTTTCTTTATCCTGCAGTCTTTACCTCAGCAGAACCGCACACCACAGACTCCCTCCAGCTCTTTGTGTGTGGCTCTCTCAGGGTCCAACAAGAGCAAGCTGTGGGTCTGTGAGTGTTTATGTGTGCTTTTATTCACTTCACACTTATTGAAAAGTGTGTATGTGAGAGGGTGGGGTGTGTGTGTCAAAGAGAGTGAGGAAGAGAAGGAGAGAGAGATCAATTGATTCTGCAGCCTCAGCTCCAGCATCCCTCAGTTGGGAGCTTCCAAAGCCGGGTGATCACTTGGGGTGCATAGCTCGGAGATGCAGTCCCCCTGGAAAATCCTTACGGTGGCGCCTCTATTCTTGCTCCTGTCTCTTCAGTCCTCGGCCTCTCCAGCCAACGATGACCAGTCCAGGCCCAGCCTCTCGAATGGGCACACCTGTGTAGGGTGTGTGCTGGTGGTGTCTGTAATAGAACAGCTTGCTCAAGTTCACAACTCGACGGTCCAGGCCTCGATGGAGAGACTGTGCAGCTACCTGCCTGAAAAACTGTTCTTGAAAACCACCTGCTATTTAGTCATTGACAAGTTTGGATCAGACATCATAAAACTGCTTAGCGCAGATATGAATGCTGATGTGGTATGTCACACTCTGGAGTTTTGTAAACAGAACACTGGCCAACCATTGTGTCATCTCTACCCTCTTCCCAAGGAGACATGGAAATTTACACTACAGAAGGCAAGACAAATTGTCAAGAAGTCCCCGATTCTGAAATATTCTAGAAGTGGTTCTGACATTTGTTCACTCCCGGTTTTGGCCAAGATCTGCCAGAAAATTAAATTAGCTATGGAACAGTCTGTGCCATTCAAAGATGTGGATTCAGACAAATACAGCGTTTTCCCAACACTGCGGGGCTATCACTGGCGGGGGAGAGACTGTAATGACAGCGACGAGTCAGTGTACCCAGGTAGAAGGCCGAACAACTGGGATGTCCATCAGGATTCAAACTGTAATGGCATTTGGGGTGTCGATCCAAAAGATGGAGTTCCATATGAGAAGAAATTCTGTGAAGGTTCACAGCCCAGGGGAATCATTTTGCTGGGAGACTCAGCTGGGGCTCATTTTCACATCTCTCCTGAATGGATCACAGCGTCGCAGAT
GTCTTTGAACTCTTTCATCAATCTACCAACAGCCCTTACCAACGAGCTTGACTGGCCCCAACTCTCTGGTGCTACAGGATTTCTGGACTCCACTGTTGGAATTAAAGAAAAATCTATTTACCTTCGCTTATGGAAAAGAAACCACTGTAATCACAGGGACTACCAGAATATTTCAAGAAATGGTGCATCTTCCCGAAACCTGAAGAAATTTATAGAAAGCTTGTCTAGAAACAAGGTGTTGGACTATCCCGCCATCGTTATATATGCCATGATTGGAAATGATGTCTGCAGTGGGAAGAGTGACCCAGTCCCAGCCATGACCACTCCTGAGAAACTCTACTCCAACGTCATGCAGACTCTGAAGCATCTAAATTCCCACCTGCCCAATGGCAGCCATGTTATTTTGTATGGCTTACCAGATGGAACCTTTCTCTGGGATAATTTGCACAACAGATATCATCCTCTCGGCCAGCTAAATAAAGACATGACCTATGCGCAGTTGTACTCCTTCCTGAACTGCCTCCAGGTCAGCCCCTGCCACGGCTGGATGTCTTCCAACAAGACGTTGCGGACTCTCACTTCAGAGAGAGCAGAGCAACTCTCCAACACACTGAAAAAAATTGCAGCCAGTGAGAAATTTACAAACTTCAATCTTTTCTACATGGATTTTGCCTTCCATGAAATCATACAGGAGTGGCAGAAGAGAGGCGGACAGCCCTGGCAGCTCATCGAGCCCGTGGATGGATTCCACCCCAACGAGGTGGCTTTGCTGTTGTTGGCGGATCATTTCTGGAAAAAGGTGCAGCTCCAGTGGCCCCAAATCCTGGGAAAGGAGAATCCGTTCAACCCCCAGATTAAACAGGTGTTTGGAGACCAAGGCGGGCACTGAGCCTCTCAGGAGCATGCACCCCTGGGGAGCACAGGGAGGCAGAGGCTTGGGTAAACTCATTCCACAAACCCTATGGGGGCTGCCACGTCACAGGCCCAAAGGACTCTTCTTCAGCAGCATCTTTGCAAAATGTCTTTCTCTCAATGAAGAGCATATCTGGACGACTGTGCAATGCTGTGTGCTCCCGGGATCAGTAACCCTTCCGCTGTTCCTGAAATAACCTTTCATAAAGTGCTTTGGGTGCCATTCCAAACAAGAGAGTATCTGTGCCCTTTACAGCTAATTGTTCTAAAAGGAGTTTCTAAAAACAC" 4 | } -------------------------------------------------------------------------------- /tests/test_gff_parsers.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | from inspect import getsourcefile 4 | import unittest 5 | from generate_transcript_data.gff_parser import GTFParser, GFF3Parser 6 | 7 | 8 | class Test(unittest.TestCase): 9 | this_file_dir = os.path.dirname(os.path.abspath(getsourcefile(lambda: 0))) 10 | test_data_dir = os.path.join(this_file_dir, "test_data") 11 | ENSEMBL_104_GTF_FILENAME = os.path.join(test_data_dir, "ensembl_test.GRCh38.104.gtf") 12 | ENSEMBL_111_GTF_FILENAME = os.path.join(test_data_dir, "ensembl_test.GRCh38.111.gtf") 13 | # Older RefSeq, before Genbank => GenBank changed 14 | REFSEQ_GFF3_FILENAME_2021 
= os.path.join(test_data_dir, "refseq_test.GRCh38.p13_genomic.109.20210514.gff") 15 | # Newer RefSeq, after Genbank => GenBank changed 16 | REFSEQ_GFF3_FILENAME_2023 = os.path.join(test_data_dir, "refseq_test.GRCh38.p14_genomic.RS_2023_03.gff") 17 | REFSEQ_GFF3_FILENAME_GRCH37_MT = os.path.join(test_data_dir, "refseq_grch37_mt.gff") 18 | REFSEQ_GFF3_FILENAME_GRCH38_MT = os.path.join(test_data_dir, "refseq_grch38.p14_mt.gff") 19 | UCSC_GTF_FILENAME = os.path.join(test_data_dir, "hg19_chrY_300kb_genes.gtf") 20 | FAKE_URL = "http://fake.url" 21 | 22 | FAKE_MT_TRANSCRIPTS = [ 23 | "fake-rna-ATP6", "fake-rna-ATP8", "fake-rna-COX1", "fake-rna-COX2", "fake-rna-COX3", "fake-rna-CYTB", 24 | "fake-rna-ND1", "fake-rna-ND2", "fake-rna-ND3", "fake-rna-ND4", "fake-rna-ND4L", "fake-rna-ND5", "fake-rna-ND6" 25 | ] 26 | 27 | def _test_exon_length(self, transcripts, genome_build, transcript_id, expected_length): 28 | transcript = transcripts[transcript_id] 29 | exons = transcript["genome_builds"][genome_build]["exons"] 30 | length = sum([exon[1] - exon[0] for exon in exons]) 31 | self.assertEqual(expected_length, length, "%s exons sum" % transcript_id) 32 | 33 | def test_ucsc_gtf(self): 34 | genome_build = "GRCh37" 35 | parser = GTFParser(self.UCSC_GTF_FILENAME, genome_build, self.FAKE_URL) 36 | _, transcripts = parser.get_genes_and_transcripts() 37 | self._test_exon_length(transcripts, genome_build, "NM_013239", 2426) 38 | 39 | def test_ensembl_gtf(self): 40 | genome_build = "GRCh38" 41 | parser = GTFParser(self.ENSEMBL_104_GTF_FILENAME, genome_build, self.FAKE_URL) 42 | genes, transcripts = parser.get_genes_and_transcripts() 43 | self._test_exon_length(transcripts, genome_build, "ENST00000357654.9", 7088) 44 | 45 | # Ensure that geneID was inserted with a version 46 | expected_gene_version = "ENSG00000012048.23" 47 | 48 | transcript = transcripts["ENST00000357654.9"] 49 | transcript_gene_version = transcript["gene_version"] 50 | self.assertEqual(expected_gene_version,
transcript_gene_version, "Transcript gene has version") 51 | 52 | self.assertTrue(expected_gene_version in genes, f"{expected_gene_version=} in genes") 53 | 54 | protein = transcript.get("protein") 55 | self.assertEqual(protein, "ENSP00000350283.3") 56 | 57 | def test_refseq_gff3_2021(self): 58 | genome_build = "GRCh38" 59 | parser = GFF3Parser(self.REFSEQ_GFF3_FILENAME_2021, genome_build, self.FAKE_URL) 60 | _, transcripts = parser.get_genes_and_transcripts() 61 | self._test_exon_length(transcripts, genome_build, "NM_007294.4", 7088) 62 | 63 | transcript = transcripts["NM_015120.4"] 64 | protein = transcript.get("protein") 65 | self.assertEqual(protein, "NP_055935.4") 66 | 67 | def test_refseq_gff3_2023(self): 68 | genome_build = "GRCh38" 69 | parser = GFF3Parser(self.REFSEQ_GFF3_FILENAME_2023, genome_build, self.FAKE_URL) 70 | _, transcripts = parser.get_genes_and_transcripts() 71 | self._test_exon_length(transcripts, genome_build, "NM_007294.4", 7088) 72 | 73 | transcript = transcripts["NM_015120.4"] 74 | protein = transcript.get("protein") 75 | self.assertEqual(protein, "NP_055935.4") 76 | 77 | def test_exons_in_genomic_order(self): 78 | genome_build = "GRCh38" 79 | parser = GTFParser(self.ENSEMBL_104_GTF_FILENAME, genome_build, self.FAKE_URL) 80 | _, transcripts = parser.get_genes_and_transcripts() 81 | transcript = transcripts["ENST00000357654.9"] 82 | exons = transcript["genome_builds"][genome_build]["exons"] 83 | first_exon = exons[0] 84 | last_exon = exons[-1] 85 | self.assertGreater(last_exon[0], first_exon[0]) 86 | 87 | parser = GFF3Parser(self.REFSEQ_GFF3_FILENAME_2021, genome_build, self.FAKE_URL) 88 | _, transcripts = parser.get_genes_and_transcripts() 89 | transcript = transcripts["NM_007294.4"] 90 | self.assertEqual(transcript.get("hgnc"), "1100", f"{transcript} has HGNC:1100") 91 | exons = transcript["genome_builds"][genome_build]["exons"] 92 | first_exon = exons[0] 93 | last_exon = exons[-1] 94 | self.assertGreater(last_exon[0], first_exon[0]) 95 
| 96 | parser = GFF3Parser(self.REFSEQ_GFF3_FILENAME_2023, genome_build, self.FAKE_URL) 97 | _, transcripts = parser.get_genes_and_transcripts() 98 | transcript = transcripts["NM_007294.4"] 99 | self.assertEqual(transcript.get("hgnc"), "1100", f"{transcript} has HGNC:1100") 100 | exons = transcript["genome_builds"][genome_build]["exons"] 101 | first_exon = exons[0] 102 | last_exon = exons[-1] 103 | self.assertGreater(last_exon[0], first_exon[0]) 104 | 105 | def test_ensembl_gtf_tags(self): 106 | genome_build = "GRCh38" 107 | parser = GTFParser(self.ENSEMBL_111_GTF_FILENAME, genome_build, self.FAKE_URL) 108 | genes, transcripts = parser.get_genes_and_transcripts() 109 | transcript = transcripts["ENST00000641515.2"] 110 | tag = transcript["genome_builds"][genome_build].get("tag") 111 | self.assertIn("MANE_Select", tag) 112 | 113 | def test_chrom_contig_conversion(self): 114 | genome_build = "GRCh38" 115 | parser = GTFParser(self.ENSEMBL_111_GTF_FILENAME, genome_build, self.FAKE_URL) 116 | _, transcripts = parser.get_genes_and_transcripts() 117 | transcript = transcripts["ENST00000641515.2"] 118 | contig = transcript["genome_builds"][genome_build].get("contig") 119 | self.assertEqual(contig, "NC_000001.11") 120 | 121 | def test_ncrna_gene(self): 122 | """ We were incorrectly missing ncRNA gene info @see https://github.com/SACGF/cdot/issues/72 """ 123 | genome_build = "GRCh38" 124 | parser = GTFParser(self.ENSEMBL_111_GTF_FILENAME, genome_build, self.FAKE_URL) 125 | genes, transcripts = parser.get_genes_and_transcripts() 126 | gene = genes["ENSG00000210156"] 127 | gene_symbol = gene["gene_symbol"] 128 | self.assertEqual(gene_symbol, "MT-TK") 129 | 130 | def _test_mito(self, filename, genome_build): 131 | parser = GFF3Parser(filename, genome_build, self.FAKE_URL) 132 | genes, transcripts = parser.get_genes_and_transcripts() 133 | 134 | for transcript_accession in self.FAKE_MT_TRANSCRIPTS: 135 | self.assertIn(transcript_accession, transcripts) 136 | 137 | transcript = 
transcripts["fake-rna-ATP6"] 138 | exons = transcript["genome_builds"][genome_build]["exons"] 139 | first_exon = exons[0] 140 | self.assertEqual(first_exon[0], 8526) 141 | self.assertEqual(first_exon[1], 9207) 142 | 143 | def test_mito_mrna(self): 144 | """ Need to make fake MT transcripts for RefSeq @see https://github.com/SACGF/cdot/issues/72 """ 145 | self._test_mito(self.REFSEQ_GFF3_FILENAME_GRCH38_MT, "GRCh38") 146 | 147 | def test_mito_no_mrna(self): 148 | """ Need to make fake MT transcripts for RefSeq @see https://github.com/SACGF/cdot/issues/72 """ 149 | self._test_mito(self.REFSEQ_GFF3_FILENAME_GRCH37_MT, "GRCh37") 150 | -------------------------------------------------------------------------------- /tests/test_json_data_provider_ensembl.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | from abc import ABC, abstractmethod 4 | from inspect import getsourcefile 5 | from os.path import abspath 6 | 7 | import hgvs 8 | from hgvs.assemblymapper import AssemblyMapper 9 | from hgvs.dataproviders.seqfetcher import SeqFetcher 10 | from hgvs.exceptions import HGVSDataNotAvailableError 11 | 12 | from cdot.hgvs.dataproviders import ChainedSeqFetcher 13 | from cdot.hgvs.dataproviders.json_data_provider import JSONDataProvider 14 | from tests.mock_seqfetcher import MockSeqFetcher 15 | from tests.mock_ensembl_tark import MockEnsemblTarkDataProvider 16 | 17 | 18 | class AbstractEnsemblTestCase(unittest.TestCase, ABC): 19 | @classmethod 20 | def setUpClass(cls): 21 | """ Subclasses need to override this """ 22 | raise unittest.SkipTest 23 | 24 | def test_transcript(self): 25 | am = AssemblyMapper(self.json_data_provider, 26 | assembly_name='GRCh38', alt_aln_method='splign', replace_reference=True) 27 | HGVS_C_TO_G = [ 28 | ('ENST00000617537.5:c.1582G>A', 'NC_000007.14:g.36522056C>T'), 29 | ] 30 | 31 | hp = hgvs.parser.Parser() 32 | for hgvs_c, expected_hgvs_g in HGVS_C_TO_G: 33 | var_c = 
hp.parse_hgvs_variant(hgvs_c) 34 | var_g = am.c_to_g(var_c) 35 | self.assertEqual(str(var_g), expected_hgvs_g) 36 | 37 | def test_get_tx_for_gene(self): 38 | found = False 39 | expected_transcript = "ENST00000617537.5" 40 | for tx_data in self.json_data_provider.get_tx_for_gene("AOAH"): 41 | print(tx_data) 42 | if tx_data["tx_ac"] == expected_transcript: 43 | found = True 44 | self.assertEqual(tx_data["alt_ac"], "NC_000007.14") 45 | continue 46 | self.assertTrue(found) 47 | 48 | def test_get_tx_for_region(self): 49 | found = False 50 | expected_transcript = "ENST00000617537.5" 51 | # Exonic coordinate 52 | for tx_data in self.json_data_provider.get_tx_for_region("NC_000007.14", "splign", 36530416, 36530514): 53 | if tx_data["tx_ac"] == expected_transcript: 54 | found = True 55 | self.assertEqual(tx_data["alt_strand"], -1) 56 | self.assertEqual(tx_data["start_i"], 36512940) 57 | self.assertEqual(tx_data["end_i"], 36724494) 58 | break 59 | 60 | self.assertTrue(found) 61 | 62 | def test_get_pro_ac_for_tx_ac(self): 63 | pro_ac = self.json_data_provider.get_pro_ac_for_tx_ac("ENST00000617537.5") 64 | self.assertEqual(pro_ac, "ENSP00000483783.1") 65 | 66 | def test_get_tx_info(self): 67 | # We only have data for GRCh38 but none for 37 68 | 69 | # Make sure 37 fails 70 | with self.assertRaises(HGVSDataNotAvailableError): 71 | tx_info = self.json_data_provider.get_tx_info("ENST00000617537.5", "NC_000007.13", "splign") 72 | 73 | # Make sure 38 works 74 | tx_info = self.json_data_provider.get_tx_info("ENST00000617537.5", "NC_000007.14", "splign") 75 | print(tx_info) 76 | 77 | 78 | class JsonDataProviderTestCase(AbstractEnsemblTestCase): 79 | @classmethod 80 | def setUpClass(cls): 81 | this_file_dir = os.path.dirname(abspath(getsourcefile(lambda: 0))) 82 | # parent_dir = os.path.dirname(this_file_dir) 83 | test_json_file = os.path.join(this_file_dir, "test_data/cdot.ensembl.grch38.json") 84 | test_transcripts_file = os.path.join(this_file_dir, 
"test_data/transcript_sequences.json") 85 | mock_seqfetcher = MockSeqFetcher(test_transcripts_file) 86 | seqfetcher = ChainedSeqFetcher(mock_seqfetcher, SeqFetcher()) 87 | cls.json_data_provider = JSONDataProvider([test_json_file], seqfetcher=seqfetcher) 88 | 89 | 90 | class EnsemblTarkDataProviderTestCase(AbstractEnsemblTestCase): 91 | @classmethod 92 | def setUpClass(cls): 93 | cls.json_data_provider = MockEnsemblTarkDataProvider() 94 | 95 | 96 | 97 | if __name__ == '__main__': 98 | unittest.main() 99 | -------------------------------------------------------------------------------- /tests/test_json_data_provider_refseq.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | from inspect import getsourcefile 4 | from os.path import abspath 5 | 6 | import hgvs 7 | from hgvs.assemblymapper import AssemblyMapper 8 | from hgvs.dataproviders.seqfetcher import SeqFetcher 9 | from hgvs.exceptions import HGVSDataNotAvailableError 10 | 11 | from cdot.hgvs.dataproviders import ChainedSeqFetcher 12 | from cdot.hgvs.dataproviders.json_data_provider import JSONDataProvider 13 | from tests.mock_seqfetcher import MockSeqFetcher 14 | 15 | 16 | class TestJSONDataProvider(unittest.TestCase): 17 | @classmethod 18 | def setUpClass(cls): 19 | this_file_dir = os.path.dirname(abspath(getsourcefile(lambda: 0))) 20 | # parent_dir = os.path.dirname(this_file_dir) 21 | test_json_file = os.path.join(this_file_dir, "test_data/cdot.refseq.grch37.json") 22 | test_transcripts_file = os.path.join(this_file_dir, "test_data/transcript_sequences.json") 23 | mock_seqfetcher = MockSeqFetcher(test_transcripts_file) 24 | seqfetcher = ChainedSeqFetcher(mock_seqfetcher, SeqFetcher()) 25 | cls.json_data_provider = JSONDataProvider([test_json_file], seqfetcher=seqfetcher) 26 | 27 | def test_transcript(self): 28 | am = AssemblyMapper(self.json_data_provider, 29 | assembly_name='GRCh37', alt_aln_method='splign', replace_reference=True) 30 | 
HGVS_C_TO_G = [ 31 | ('NM_001637.3:c.1582G>A', 'NC_000007.13:g.36561662C>T'), 32 | ] 33 | 34 | hp = hgvs.parser.Parser() 35 | for hgvs_c, expected_hgvs_g in HGVS_C_TO_G: 36 | var_c = hp.parse_hgvs_variant(hgvs_c) 37 | var_g = am.c_to_g(var_c) 38 | self.assertEqual(str(var_g), expected_hgvs_g) 39 | 40 | def test_get_tx_for_gene(self): 41 | found = False 42 | expected_transcript = "NM_001637.3" 43 | for tx_data in self.json_data_provider.get_tx_for_gene("AOAH"): 44 | print(tx_data) 45 | if tx_data["tx_ac"] == expected_transcript: 46 | found = True 47 | self.assertEqual(tx_data["alt_ac"], "NC_000007.13") 48 | continue 49 | self.assertTrue(found) 50 | 51 | def test_get_tx_for_region(self): 52 | found = False 53 | expected_transcript = "NM_001637.3" 54 | # Exonic coordinate 55 | for tx_data in self.json_data_provider.get_tx_for_region("NC_000007.13", "splign", 36570024, 36570025): 56 | if tx_data["tx_ac"] == expected_transcript: 57 | found = True 58 | self.assertEqual(tx_data["alt_strand"], -1) 59 | self.assertEqual(tx_data["start_i"], 36552548) 60 | self.assertEqual(tx_data["end_i"], 36764154) 61 | continue 62 | self.assertTrue(found) 63 | 64 | def test_get_tx_for_region_intron(self): 65 | """ Test case for https://github.com/SACGF/cdot/issues/38 """ 66 | found = False 67 | expected_transcript = "NM_001637.3" 68 | # Coordinate below is intronic 69 | for tx_data in self.json_data_provider.get_tx_for_region("NC_000007.13", "splign", 36743533, 36745648): 70 | if tx_data["tx_ac"] == expected_transcript: 71 | found = True 72 | self.assertEqual(tx_data["alt_strand"], -1) 73 | self.assertEqual(tx_data["start_i"], 36552548) 74 | self.assertEqual(tx_data["end_i"], 36764154) 75 | continue 76 | self.assertTrue(found) 77 | 78 | 79 | def test_get_pro_ac_for_tx_ac(self): 80 | pro_ac = self.json_data_provider.get_pro_ac_for_tx_ac("NM_001637.3") 81 | self.assertEqual(pro_ac, "NP_001628.1") 82 | 83 | def test_get_gene_info(self): 84 | gene_info = 
self.json_data_provider.get_gene_info("GATA2") 85 | summary = gene_info.pop("summary") 86 | self.assertTrue("zinc-finger transcription factors" in summary) 87 | expected = { 88 | "hgnc": "GATA2", 89 | "maploc": "3q21.3", 90 | "descr": "GATA binding protein 2", 91 | "aliases": "{DCML,IMD21,MONOMAC,NFE1B}", 92 | "added": None, 93 | } 94 | self.assertEqual(gene_info, expected) 95 | 96 | def test_get_tx_info(self): 97 | # We only have data for GRCh37 but none for 38 98 | 99 | # Make sure 37 works 100 | tx_info = self.json_data_provider.get_tx_info("NM_001637.3", "NC_000007.13", "splign") 101 | print(tx_info) 102 | 103 | # Make sure 38 fails 104 | with self.assertRaises(HGVSDataNotAvailableError): 105 | tx_info = self.json_data_provider.get_tx_info("NM_001637.3", "NC_000007.14", "splign") 106 | 107 | 108 | if __name__ == '__main__': 109 | unittest.main() 110 | -------------------------------------------------------------------------------- /tests/test_pyhgvs.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | from inspect import getsourcefile 4 | from os.path import abspath 5 | 6 | import pyhgvs 7 | 8 | from cdot.pyhgvs.pyhgvs_transcript import JSONPyHGVSTranscriptFactory, is_sacgf_pyhgvs_fork 9 | from .genome import MockGenomeTestFile 10 | 11 | 12 | class TestPyHGVS(unittest.TestCase): 13 | def test_transcript(self): 14 | this_file_dir = os.path.dirname(abspath(getsourcefile(lambda: 0))) 15 | test_json_file = os.path.join(this_file_dir, "test_data/cdot.refseq.grch37.json") 16 | factory = JSONPyHGVSTranscriptFactory([test_json_file]) 17 | 18 | HGVS_C_TO_G = [ 19 | ('NM_001637.3:c.1582G>A', 'NC_000007.13:g.36561662C>T'), 20 | ] 21 | 22 | genome = MockGenomeTestFile( 23 | db_filename='grch37.fa', 24 | filename=os.path.join(this_file_dir, 'test_data/grch37.genome'), 25 | create_data=False) 26 | 27 | sacgf_pyhgvs_fork = is_sacgf_pyhgvs_fork() 28 | 29 | def get_transcript(transcript_id): 30 | return 
factory.get_transcript_grch37(transcript_id, sacgf_pyhgvs_fork=sacgf_pyhgvs_fork) 31 | 32 | for hgvs_c, expected_hgvs_g in HGVS_C_TO_G: 33 | result = pyhgvs.parse_hgvs_name(hgvs_c, genome, get_transcript=get_transcript) 34 | name = pyhgvs.HGVSName(expected_hgvs_g) 35 | expected = (name.chrom, name.start, name.ref_allele, name.alt_allele) 36 | self.assertEqual(result, expected) 37 | 38 | def test_non_coding_transcript(self): 39 | this_file_dir = os.path.dirname(abspath(getsourcefile(lambda: 0))) 40 | test_json_file = os.path.join(this_file_dir, "test_data/cdot.refseq.grch37.json") 41 | factory = JSONPyHGVSTranscriptFactory([test_json_file]) 42 | 43 | genome = MockGenomeTestFile( 44 | db_filename='grch37.fa', 45 | filename=os.path.join(this_file_dir, 'test_data/grch37.genome'), 46 | create_data=False) 47 | 48 | transcript_id = "NR_023343.1" 49 | sacgf_pyhgvs_fork = is_sacgf_pyhgvs_fork() 50 | pyhgvs_transcript = factory.get_transcript_grch37(transcript_id, sacgf_pyhgvs_fork=sacgf_pyhgvs_fork) 51 | self.assertFalse(pyhgvs_transcript.is_coding, f"Transcript {transcript_id} is non-coding") 52 | 53 | 54 | if __name__ == '__main__': 55 | unittest.main() 56 | -------------------------------------------------------------------------------- /tests/test_uta_conversion.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from generate_transcript_data.cdot_json import _cigar_to_gap_and_length 3 | 4 | 5 | class UTAConversionTestCase(unittest.TestCase): 6 | def test_cigar_to_gap_and_length(self): 7 | cigar = '194=1D60=1D184=' 8 | expected_gap = 'M194 I1 M60 I1 M184' 9 | 10 | gap, exon_length = _cigar_to_gap_and_length(cigar) 11 | self.assertEqual(gap, expected_gap) 12 | 13 | def test_cigar_full_match(self): 14 | """ Should return None as perfect match """ 15 | cigar = '194=' 16 | expected_gap = None 17 | 18 | gap, exon_length = _cigar_to_gap_and_length(cigar) 19 | self.assertEqual(gap, expected_gap) 20 | 21 | def 
test_cigar_merged_matches(self): 22 | cigar = '194=100=' 23 | expected_gap = None 24 | 25 | gap, exon_length = _cigar_to_gap_and_length(cigar) 26 | self.assertEqual(gap, expected_gap) 27 | 28 | def test_cigar_mismatch(self): 29 | cigar = '195=1X1D430=' # X will become match and should merge w/first 30 | expected_gap = "M196 I1 M430" 31 | 32 | gap, exon_length = _cigar_to_gap_and_length(cigar) 33 | self.assertEqual(gap, expected_gap) 34 | 35 | def test_cigar_deletion_exon_length(self): 36 | cigar = '100=50I100=' # 100 match, 50 insertion (in ref, del in transcript), 100 match = 200 exon length 37 | 38 | _, exon_length = _cigar_to_gap_and_length(cigar) 39 | self.assertEqual(exon_length, 200) 40 | 41 | 42 | if __name__ == '__main__': 43 | unittest.main() 44 | --------------------------------------------------------------------------------