├── .gitignore
├── .idea
├── .gitignore
├── cdot.iml
├── inspectionProfiles
│ ├── Project_Default.xml
│ └── profiles_settings.xml
├── misc.xml
├── modules.xml
└── vcs.xml
├── CHANGELOG.md
├── LICENSE
├── README.md
├── cdot
├── __init__.py
├── data_release.py
├── hgvs
│ ├── __init__.py
│ └── dataproviders
│ │ ├── __init__.py
│ │ ├── ensembl_tark_data_provider.py
│ │ ├── fasta_seqfetcher.py
│ │ ├── json_data_provider.py
│ │ └── seqfetcher.py
└── pyhgvs
│ ├── __init__.py
│ └── pyhgvs_transcript.py
├── generate_transcript_data
├── Mus_musculus
│ ├── refseq_transcripts_grcm38.sh
│ └── refseq_transcripts_grcm39.sh
├── Snakefile
├── __init__.py
├── all_transcripts.sh
├── cdot_gene_info.py
├── cdot_json.py
├── cdot_transcripts.yaml
├── ensembl_transcripts_chm13v2.sh
├── ensembl_transcripts_grch37.sh
├── ensembl_transcripts_grch38.sh
├── gene_info.sh
├── gff_parser.py
├── github_release_upload.sh
├── json_encoders.py
├── json_schema_version.py
├── mus_musculus.sh
├── refseq_transcripts_chm13v2.sh
├── refseq_transcripts_grch37.sh
├── refseq_transcripts_grch38.sh
├── requirements.txt
├── uta_20210129_grch37.sql
├── uta_20210129_grch38.sql
└── uta_transcripts.sh
├── paper
├── HGVS cleaning.ipynb
├── clean_hgvs_search_csvs.py
├── combine_csv.py
└── investigate_fails.py
├── pyproject.toml
├── setup.cfg
└── tests
├── __init__.py
├── benchmark_hgvs.py
├── genome.py
├── mock_ensembl_tark.py
├── mock_seqfetcher.py
├── test_data
├── cdot.ensembl.grch38.json
├── cdot.refseq.grch37.json
├── clinvar_hgvs
│ ├── clinvar_hgvs_010.tsv
│ ├── clinvar_hgvs_050.tsv
│ ├── clinvar_hgvs_100.tsv
│ ├── clinvar_hgvs_500.tsv
│ ├── clinvar_hgvs_ensembl.tsv
│ ├── clinvar_hgvs_ensembl_100.tsv
│ ├── clinvar_hgvs_ensembl_50.tsv
│ └── clinvar_hgvs_ensembl_500.tsv
├── ensembl_tark
│ └── transcript
│ │ ├── assembly_name=GRCh38&expand=transcript_release_set&loc_end=36530514&loc_region=7&loc_start=36530417&page=2.json
│ │ ├── assembly_name=GRCh38&expand=transcript_release_set&loc_end=36530514&loc_region=7&loc_start=36530417&page=3.json
│ │ ├── assembly_name=GRCh38&expand=transcript_release_set&loc_end=36530514&loc_region=7&loc_start=36530417&page=4.json
│ │ ├── assembly_name=GRCh38&expand=transcript_release_set&loc_end=36530514&loc_region=7&loc_start=36530417&page=5.json
│ │ ├── assembly_name=GRCh38&expand=transcript_release_set&loc_end=36530514&loc_region=7&loc_start=36530417&page=6.json
│ │ ├── assembly_name=GRCh38&expand=transcript_release_set&loc_end=36530514&loc_region=7&loc_start=36530417.json
│ │ ├── search
│ │ └── identifier_field=AOAH&expand=exons,genes,sequence.json
│ │ └── stable_id=ENST00000617537&stable_id_version=5&expand_all=true.json
├── ensembl_test.GRCh38.104.gtf
├── ensembl_test.GRCh38.111.gtf
├── grch37.genome
├── hg19_chrY_300kb_genes.gtf
├── refseq_grch37_mt.gff
├── refseq_grch38.p14_mt.gff
├── refseq_test.GRCh38.p13_genomic.109.20210514.gff
├── refseq_test.GRCh38.p14_genomic.RS_2023_03.gff
└── transcript_sequences.json
├── test_gff_parsers.py
├── test_json_data_provider_ensembl.py
├── test_json_data_provider_refseq.py
├── test_pyhgvs.py
└── test_uta_conversion.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 |
--------------------------------------------------------------------------------
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | /workspace.xml
2 | misc.xml
3 |
4 |
--------------------------------------------------------------------------------
/.idea/cdot.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/Project_Default.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
16 |
17 |
18 |
19 |
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | ## [0.2.26] 2024-08-15
2 |
3 | Bumped version to 0.2.26 to catch up with data release. Only new client functionality is #81 'data_release' helper functions
4 |
5 | All other changes in this release were for data (and contained in data_v0.2.26)
6 |
7 | ### Added
8 |
9 | - #81 New 'data_release' code eg 'get_latest_combo_file_urls' that looks on GitHub to find latest data
10 | - New GFFs: RefSeq RS_2023_10, Ensembl 111, 112
11 | - #79 - RefSeq MT transcripts
12 | - #66 - We now store 'Note' field (thanks holtgrewe for suggestion)
13 | - Added requirements.txt for 'generate_transcript_data' sections
- client / JSON data schema version compatibility check
15 |
16 | ### Changed
17 |
18 | - #56 - Fix occasional UTA duplicated exons
19 | - #57 - Correctly handle retrieving genomic position and dealing w/indels in GFF (thanks ltnetcase for reporting)
20 | - #60 - Fix for missing protein IDs due to Genbank / GenBank (thanks holtgrewe)
21 | - #64 - Split code/data versions. json.gz are now labelled according to data schema version (thanks holtgrewe)
22 | - Renamed 'CHM13v2.0' to 'T2T-CHM13v2.0' so it could work with biocommons bioutils
23 | - #72 - Correctly handle ncRNA_gene genes (thanks holtgrewe for reporting)
24 | - #73 - HGNC ID was missing for some chrMT genes in Ensembl
25 |
26 | ## [0.2.21] - 2023-08-14
27 |
28 | ### Changed
29 |
30 | - #45 - FastaSeqFetcher - fix alignment gaps properly
31 | - #52 - Added transcripts from Ensembl 110 GRCh38 release
32 | - #53 - UTA to cdot transcript start/end conversion issue
33 |
34 | ## [0.2.20] - 2023-07-10
35 |
36 | ### Changed
37 |
38 | - #50 - Biotype was missing in Ensembl transcripts
39 |
40 | ## [0.2.19] - 2023-07-06
41 |
42 | ### Changed
43 |
44 | - #49 - MT not converted to contigs correctly (GRCh37/Ensembl only) #49
45 | - Removed accidental logging
46 |
47 | ## [0.2.18] - 2023-07-05
48 |
49 | ### Added
50 |
51 | - #44 - Support for mouse transcripts (Mus Musculus GRCm38 and GRCm39)
52 | - #47 - Implement HGVS DataProvider get_alignments_for_region
53 |
54 | ### Changed
55 |
56 | - #45 - FastaSeqFetcher - handle deletions correctly (had swapped HGVS cigar projections around)
57 | - #46 - HGVS DataProvider get_tx_info should properly handle alt_ac and alt_aln_method
58 |
59 | ## [0.2.17] - 2023-05-08
60 |
61 | ### Added
62 |
63 | - #42 - Ensembl T2T CHM13v2.0
64 |
65 | ### Changed
66 |
67 | - #43 - Contigs not converted to accession numbers properly (this was breaking local Biocommons HGVS conversion using 0.2.16 data)
68 |
69 | ## [0.2.16] - 2023-04-12
70 |
71 | ### Added
72 |
73 | - Added historical release 110 (2022-04-12) for T2T CHM13v2.0
74 | - Added latest GRCh38.p14 release (2023-03-21)
75 |
76 | ## [0.2.15] - 2023-04-03
77 |
78 | ### Added
79 |
80 | - Support for T2T CHM13v2.0
81 |
82 | ## [0.2.14] - 2023-03-21
83 |
84 | ### Added
85 |
86 | - #39 - Fasta file SeqFetcher implementation
87 | - Add Ensembl 109 GTF
88 |
89 | ### Changed
90 |
91 | - #38 - Differing implementation of get_tx_for_region to hgvs one (reported by Manuel Holtgrewe)
92 | - #35 - Tags (ie MANE Select / RefSeq select etc) should be genome build specific
93 | - #34 - Stick to PyHGVS conventions, throw ValueError: transcript is required on missing transcript
94 |
95 | ## [0.2.13] - 2023-02-23
96 |
97 | ### Changed
98 |
99 | - Fix for #25 - Pyhgvs data conversion - non-coding transcripts have bad cds start/end conversion
100 | - Fix for #32 - Signature of get_pyhgvs_data consistent for all return statements
101 |
102 | ## [0.2.12] - 2022-12-08
103 |
104 | ### Added
105 |
106 | - #30 - We now store "tag" attributes (eg "MANE Select", "RefSeq Select")
107 | - Switch to using Ensembl GFF3 (so we can get tags out)
108 | - Add Ensembl 108 GFF3
109 |
110 | ### Changed
111 |
112 | - Fix for #25 - GeneInfo currently fails for some records
113 | - Fix for #27 - Change URL for missing RefSeq GFFs
114 |
115 | ## [0.2.11] - 2022-09-27
116 |
117 | ### Added
118 |
119 | - Now support all methods (get_gene_info, get_tx_for_gene, get_tx_for_region) for REST
120 | - Add Ensembl 107 GTF
121 |
122 | ### Changed
123 |
124 | - Ensembl gene info was missing "description"
125 |
126 | ## [0.2.10] - 2022-09-19
127 |
128 | ### Added
129 |
130 | - [Implement get_gene_info](https://github.com/SACGF/cdot/issues/20) - For local JSON data only
131 |
132 | ### Changed
133 |
134 | - Fixed issue [#23 UTA transcripts for PyHGVS](https://github.com/SACGF/cdot/issues/23)
135 |
136 | ## [0.2.9] - 2022-09-01
137 |
138 | ### Changed
139 |
140 | - [BugFix for get_tx_for_region](https://github.com/SACGF/cdot/issues/22)
141 |
142 |
143 | ## [0.2.8] - 2022-08-29
144 |
145 | ### Added
146 |
147 | - [Implemented get_pro_ac_for_tx_ac](https://github.com/SACGF/cdot/issues/14) (c_to_p can now generate p.HGVS)
148 | - [Implemented get_tx_for_region](https://github.com/SACGF/cdot/issues/18) for local JSON data only
149 |
150 | ## [0.2.7] - 2022-05-19
151 |
152 | ### Added
153 |
154 | - Add transcripts from latest RefSeq GRCh37 (105) and RefSeq GRCh38 (110)
155 |
156 | ### Changed
157 |
158 | - Fixed default arguments bug where PyHGVS only worked on SACGF fork
159 | - gtf_to_json now goes straight to cdot format (without intermediary PyReference format)
160 | - UTA is not included in generation scripts by default, to enable, set environment variable UTA_TRANSCRIPTS=True
161 | - Handle mismatches in UTA CIGAR alignments (convert to match (no indels) as GFF format has no support for mismatch)
162 |
163 | ## [0.2.6] - 2022-05-19
164 |
165 | ### Changed
166 |
167 | - Fixed issue [Ensembl contigs g_to_c](https://github.com/SACGF/cdot/issues/9) - Ensembl JSON was using chrom names ie "17" instead of "NC_000017.11" for contig
168 |
169 | ## [0.2.5] - 2022-04-14
170 |
171 | ### Changed
172 |
173 | - PyHGVS conversion fix - non-coding cds_start/cds_end is set to start/end (not None)
174 |
175 | ## [0.2.4] - 2022-04-13
176 |
177 | ### Added
178 |
179 | - Latest RefSeq (110) and Ensembl (106) transcripts
180 |
181 | ### Changed
182 |
183 | - Fixed bug where all UTA transcripts were '-' strand
184 | - Add "other_chroms" to combined historical file
185 |
186 | ## [0.2.3] - 2022-03-29
187 |
188 | ### Changed
189 |
190 | - Fixed bug where HGNC not extracted properly from Ensembl GTFs
191 | - Gene information is now included by default (only adds 5%)
192 | - Clean artifacts from UTA data
193 | - Support for [SACGF PyHGVS fork](https://github.com/SACGF/hgvs) (which adds alignment gap support)
194 |
195 | ## [0.2.2] - 2022-03-03
196 |
197 | ### Added
198 |
199 | - Support for HTTPS (bought SSL certificate for REST server)
200 |
201 | ## [0.2.1] - 2022-03-03
202 |
203 | ### Added
204 |
205 | - [Download/Convert UTA transcripts](https://github.com/SACGF/cdot/issues/1)
206 | - [REST client](https://github.com/SACGF/cdot/issues/4) for [REST Service](https://github.com/SACGF/cdot_rest/)
207 |
208 | ### Changed
209 |
210 | - [JSON format changed](https://github.com/SACGF/cdot/issues/2), separating common/build specific coordinates. This is so a transcript can contain data for multiple builds.
211 | - [Use ijson to reduce RAM usage](https://github.com/SACGF/cdot/issues/7) - uses iterator vs loading all JSON into RAM
212 |
213 | ## [0.1.1] - 2022-01-19
214 |
215 | ### Added
216 |
217 | - Initial commit
218 |
219 | [unreleased]: https://github.com/SACGF/cdot/compare/v0.2.26...HEAD
220 | [0.2.26]: https://github.com/SACGF/cdot/compare/v0.2.21...v0.2.26
221 | [0.2.21]: https://github.com/SACGF/cdot/compare/v0.2.20...v0.2.21
222 | [0.2.20]: https://github.com/SACGF/cdot/compare/v0.2.19...v0.2.20
223 | [0.2.19]: https://github.com/SACGF/cdot/compare/v0.2.18...v0.2.19
224 | [0.2.18]: https://github.com/SACGF/cdot/compare/v0.2.17...v0.2.18
225 | [0.2.17]: https://github.com/SACGF/cdot/compare/v0.2.16...v0.2.17
226 | [0.2.16]: https://github.com/SACGF/cdot/compare/v0.2.15...v0.2.16
227 | [0.2.15]: https://github.com/SACGF/cdot/compare/v0.2.14...v0.2.15
228 | [0.2.14]: https://github.com/SACGF/cdot/compare/v0.2.13...v0.2.14
229 | [0.2.13]: https://github.com/SACGF/cdot/compare/v0.2.12...v0.2.13
230 | [0.2.12]: https://github.com/SACGF/cdot/compare/v0.2.11...v0.2.12
231 | [0.2.11]: https://github.com/SACGF/cdot/compare/v0.2.10...v0.2.11
232 | [0.2.10]: https://github.com/SACGF/cdot/compare/v0.2.9...v0.2.10
233 | [0.2.9]: https://github.com/SACGF/cdot/compare/v0.2.8...v0.2.9
234 | [0.2.8]: https://github.com/SACGF/cdot/compare/v0.2.7...v0.2.8
235 | [0.2.7]: https://github.com/SACGF/cdot/compare/v0.2.6...v0.2.7
236 | [0.2.6]: https://github.com/SACGF/cdot/compare/v0.2.5...v0.2.6
237 | [0.2.5]: https://github.com/SACGF/cdot/compare/v0.2.4...v0.2.5
238 | [0.2.4]: https://github.com/SACGF/cdot/compare/v0.2.3...v0.2.4
239 | [0.2.3]: https://github.com/SACGF/cdot/compare/v0.2.2...v0.2.3
240 | [0.2.2]: https://github.com/SACGF/cdot/compare/v0.2.1...v0.2.2
241 | [0.2.1]: https://github.com/SACGF/cdot/compare/v0.1.1...v0.2.1
242 | [0.1.1]: https://github.com/SACGF/cdot/releases/tag/v0.1.1
243 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 SACGF
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # cdot
2 |
3 | [](https://pypi.org/project/cdot/) [](https://pypi.org/project/cdot/) [](https://zenodo.org/doi/10.5281/zenodo.13324621)
4 |
5 |
6 | cdot provides transcripts for the 2 most popular Python [HGVS](http://varnomen.hgvs.org/) libraries.
7 |
8 | It works by:
9 |
10 | * Converting RefSeq/Ensembl GTFs to JSON
* Providing loaders for the HGVS libraries, via JSON.gz files, or REST API via [cdot_rest](https://github.com/SACGF/cdot_rest)
12 |
13 | We currently support 1.58 million transcript/genome alignments (vs ~141k in UTA v.20210129)
14 |
15 | ## New
16 |
17 | See [changelog](https://github.com/SACGF/cdot/blob/main/CHANGELOG.md)
18 |
19 | 2024-08-15:
20 |
21 | * 'data_release' helper code
22 | * Many minor updates to data (see changelog)
23 |
24 | 2023-07-05:
25 | * BioCommons HGVS DataProvider fixes
26 | * Support for mouse transcripts (Mus Musculus GRCm38 and GRCm39)
27 |
28 | 2023-04-03:
29 | * #41 - Support for T2T CHM13v2.0 [example code](https://github.com/SACGF/cdot/wiki/Biocommons-T2T-CHM13v2.0-example-code)
30 |
31 | ## Install
32 |
33 | ```
34 | pip install cdot
35 | ```
36 |
37 | ## Examples
38 |
39 | [Biocommons HGVS](https://github.com/biocommons/hgvs) example:
40 |
41 | ```
42 | import hgvs
43 | from hgvs.assemblymapper import AssemblyMapper
44 | from cdot.hgvs.dataproviders import JSONDataProvider, RESTDataProvider
45 |
46 | hdp = RESTDataProvider() # Uses API server at cdot.cc
47 | # hdp = JSONDataProvider(["./cdot-0.2.14.refseq.grch37.json.gz"]) # Uses local JSON file
48 |
49 | am = AssemblyMapper(hdp,
50 | assembly_name='GRCh37',
51 | alt_aln_method='splign', replace_reference=True)
52 |
53 | hp = hgvs.parser.Parser()
54 | var_c = hp.parse_hgvs_variant('NM_001637.3:c.1582G>A')
55 | am.c_to_g(var_c)
56 | ```
57 |
58 | [more Biocommons examples](https://github.com/SACGF/cdot/wiki/Biocommons-HGVS-example-code):
59 |
60 | [PyHGVS](https://github.com/counsyl/hgvs) example:
61 |
62 | ```
63 | import pyhgvs
64 | from pysam.libcfaidx import FastaFile
65 | from cdot.pyhgvs.pyhgvs_transcript import JSONPyHGVSTranscriptFactory, RESTPyHGVSTranscriptFactory
66 |
67 | genome = FastaFile("/data/annotation/fasta/GCF_000001405.25_GRCh37.p13_genomic.fna.gz")
68 | factory = RESTPyHGVSTranscriptFactory()
69 | # factory = JSONPyHGVSTranscriptFactory(["./cdot-0.2.14.refseq.grch37.json.gz"]) # Uses local JSON file
70 | pyhgvs.parse_hgvs_name('NM_001637.3:c.1582G>A', genome, get_transcript=factory.get_transcript_grch37)
71 | ```
72 |
73 | [more PyHGVS examples](https://github.com/SACGF/cdot/wiki/PyHGVS-example-code):
74 |
75 | ## Q. What's the performance like?
76 |
77 | * UTA public DB: 1-1.5 seconds / transcript
78 | * cdot REST service: 10/second
79 | * cdot JSON.gz: 500-1k/second
80 |
81 | ## Q. Where can I download the JSON.gz files?
82 |
83 | [Download from GitHub releases](https://github.com/SACGF/cdot/releases) - RefSeq (37/38) - 72M, Ensembl (37/38) 61M
84 |
85 | Details on what the files contain [here](https://github.com/SACGF/cdot/wiki/GitHub-release-file-details)
86 |
87 | ## Q. How does this compare to Universal Transcript Archive?
88 |
89 | Both projects have similar goals of providing transcripts for loading HGVS, but they approach it from different ways
90 |
91 | * UTA aligns sequences, then stores coordinates in an SQL database.
* cdot converts existing Ensembl/RefSeq GTFs into JSON
93 |
94 | See [wiki for more details](https://github.com/SACGF/cdot/wiki/cdot-vs-UTA)
95 |
96 | ## Q. How do you store transcripts in JSON?
97 |
98 | See [wiki page](https://github.com/SACGF/cdot/wiki/Transcript-JSON-format) for the format.
99 |
We think a standard for JSON gene/transcript information would be a great thing, and we are keen to collaborate to make it happen!
101 |
102 | ## Q. What does cdot stand for?
103 |
104 | cdot, pronounced "see dot" is a play on HGVS coding sequence ```:c.``` But if you want a backronym, it's "Complete Dict Of Transcripts"
105 |
106 | This was developed for the [Australian Genomics](https://www.australiangenomics.org.au/) [Shariant](https://shariant.org.au/) project, due to the need to load historical HGVS from lab archives.
107 |
--------------------------------------------------------------------------------
/cdot/__init__.py:
--------------------------------------------------------------------------------
1 | __version__ = "0.2.26"
2 | # Data version is kept in generate_transcript_version.json_schema_version
3 |
def get_data_schema_int(version: str) -> int:
    """ Return an int which increments upon breaking changes - ie anything other than patch """
    major, minor, _patch = version.split(".")
    return int(major) * 1000 + int(minor)
8 |
--------------------------------------------------------------------------------
/cdot/data_release.py:
--------------------------------------------------------------------------------
1 | import re
2 | import requests
3 | import cdot
4 |
5 | from cdot import get_data_schema_int
6 |
7 |
def get_latest_data_release_tag_name():
    """ Tag name (eg 'data_v0.2.25') of the newest compatible data release, or None """
    release = get_latest_data_release()
    return release.get('tag_name')
11 |
12 | def _get_version_from_tag_name(tag_name, data_version=False):
13 | """ Returns None if doesn't match required prefix """
14 | release_prefix = "v"
15 | if data_version:
16 | release_prefix = "data_" + release_prefix
17 |
18 | if not tag_name.startswith(release_prefix):
19 | return None
20 | return tag_name.lstrip(release_prefix)
21 |
22 |
def get_latest_data_release():
    """ Return the GitHub release JSON for the newest data release whose data schema
        matches this client's, or {} if none is found. """
    client_data_schema = get_data_schema_int(cdot.__version__)

    url = "https://api.github.com/repos/SACGF/cdot/releases"
    json_data = requests.get(url).json()
    for release in json_data:
        # Should look like 'v0.2.25' for code or 'data_v0.2.25' for data
        # We require a data version
        data_version = _get_version_from_tag_name(release['tag_name'], data_version=True)
        if data_version is None:
            continue

        if get_data_schema_int(data_version) == client_data_schema:
            return release
    return {}
41 |
def get_latest_combo_file_urls(annotation_consortia, genome_builds):
    """ Download URLs from the latest compatible data release, filtered by annotation
        consortium (refseq/ensembl) and genome build. Matching is case-insensitive. """
    # lower case everything to be case insensitive
    wanted_consortia = {ac.lower() for ac in annotation_consortia}
    wanted_builds = {gb.lower() for gb in genome_builds}
    filename_pattern = re.compile(r"cdot-(\d+\.\d+\.\d+)\.(refseq|ensembl)\.(.+)\.json\.gz")

    latest_data_release = get_latest_data_release()
    if not latest_data_release:
        return []

    file_urls = []
    for asset in latest_data_release["assets"]:
        browser_download_url = asset["browser_download_url"]
        filename = browser_download_url.rsplit("/")[-1]
        m = filename_pattern.match(filename)
        if m:
            _version, annotation_consortium, genome_build = m.groups()
            if annotation_consortium.lower() in wanted_consortia and genome_build.lower() in wanted_builds:
                file_urls.append(browser_download_url)
    return file_urls
--------------------------------------------------------------------------------
/cdot/hgvs/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SACGF/cdot/ddeb78d58731dd4136689360d0fce4a8a91af87d/cdot/hgvs/__init__.py
--------------------------------------------------------------------------------
/cdot/hgvs/dataproviders/__init__.py:
--------------------------------------------------------------------------------
1 | from .fasta_seqfetcher import *
2 | from .json_data_provider import *
3 | from .seqfetcher import *
4 |
--------------------------------------------------------------------------------
/cdot/hgvs/dataproviders/fasta_seqfetcher.py:
--------------------------------------------------------------------------------
1 | import abc
2 | import re
3 |
4 | from pysam.libcfaidx import FastaFile
5 | from hgvs.dataproviders.interface import Interface
6 | from hgvs.exceptions import HGVSDataNotAvailableError
7 | from bioutils.sequences import reverse_complement
8 |
9 | from cdot.hgvs.dataproviders.seqfetcher import AbstractTranscriptSeqFetcher, PrefixSeqFetcher
10 |
11 |
class GenomeFastaSeqFetcher:
    """ Serves genomic contig sequence directly from one or more local Fasta files """

    def __init__(self, *args):
        """ args: Fasta filenames; every contig in each file becomes fetchable """
        self.source = "Local Fasta file reference"
        self.contig_fastas = {}
        for fasta_filename in args:
            fasta_file = FastaFile(fasta_filename)
            self.contig_fastas.update({contig: fasta_file for contig in fasta_file.references})

        if not self.contig_fastas:
            raise ValueError("Need to provide at least one of fasta file as argument")

    def fetch_seq(self, ac, start_i=None, end_i=None):
        """ Return uppercased sequence for contig 'ac' (interbase start_i/end_i) """
        fasta_file = self.contig_fastas.get(ac)  # Contig
        if fasta_file is None:
            raise HGVSDataNotAvailableError(f"Accession '{ac}' not in fasta contigs")
        return fasta_file.fetch(ac, start_i, end_i).upper()
29 |
30 |
class ExonsFromGenomeFastaSeqFetcher(AbstractTranscriptSeqFetcher):
    """ This produces artificial transcript sequences by pasting together exons from the genome
        It is possible that this does not exactly match the transcript sequences - USE AT OWN RISK! """
    def __init__(self, *args, cache=True):
        """ args: one or more genome Fasta filenames (opened via pysam FastaFile).
            cache: whether built transcript sequences are kept in transcript_cache. """
        self.cache = cache
        self.transcript_cache = {}  # transcript accession -> built sequence
        self.hdp = None # Set when passed to data provider (via set_data_provider)
        self.source = "Transcript Exons using Genome Fasta file reference"
        self.contig_fastas = {}  # contig accession -> FastaFile that contains it
        # HGVS-style cigar ops: '=' match, 'D' deletion, 'I' insertion, 'X' mismatch
        self.cigar_pattern = re.compile(r"(\d+)([=DIX])")
        for fasta_filename in args:
            fasta_file = FastaFile(fasta_filename)
            for contig in fasta_file.references:
                self.contig_fastas[contig] = fasta_file

        if not self.contig_fastas:
            raise ValueError("Need to provide at least one of fasta file as argument")
        # NOTE(review): 'cache' is passed positionally after *args - if the parent's
        # 'cache' is keyword-only this appends it to args instead; confirm against
        # AbstractTranscriptSeqFetcher whether 'cache=cache' was intended.
        super().__init__(*args, cache)

    def get_mapping_options(self, ac):
        # Delegates to the HGVS data provider; requires set_data_provider to have been called
        return self.hdp.get_tx_mapping_options(ac)

    def _get_transcript_seq(self, ac):
        """ Build transcript sequence for 'ac' using the first mapping option whose genomic
            contig is present in our Fasta files.

            Raises HGVSDataNotAvailableError if the transcript is unknown, or is known but
            no provided Fasta contains a contig it aligns to. """
        possible_contigs = set()
        for tx_mo in self.get_mapping_options(ac):
            alt_ac = tx_mo["alt_ac"]
            possible_contigs.add(alt_ac)
            if alt_ac in self.contig_fastas:
                return self._fetch_seq_from_fasta(ac, alt_ac, tx_mo["alt_aln_method"])

        msg = f"Failed to fetch {ac} from {self.source}. "
        if possible_contigs:
            # Transcript exists but aligns only to contigs we don't have Fasta for
            possible_contigs = sorted(possible_contigs)
            raise HGVSDataNotAvailableError(f"{msg} No Fasta provided with contigs: {possible_contigs}")
        raise HGVSDataNotAvailableError(f"{msg} Transcript '{ac}' not found.")

    def _fetch_seq_from_fasta(self, ac, alt_ac, alt_aln_method):
        """ Paste transcript 'ac' together from its exon alignments against contig 'alt_ac',
            padding alignment gaps (and any unaligned transcript start) with 'N'.

            Raises ValueError if the assembled length disagrees with the exon coordinates. """
        fasta_file = self.contig_fastas[alt_ac]

        exons = self.hdp.get_tx_exons(ac, alt_ac, alt_aln_method)
        exon_sequences = []
        expected_transcript_length = 0
        sorted_exons = list(sorted(exons, key=lambda ex: ex["ord"]))
        first_exon = sorted_exons[0]
        transcript_start_offset = first_exon["tx_start_i"]  # HGVS/UTA starts w/0
        # Alignment may not cover the transcript start - pad with 'N's so coordinates line up
        if transcript_start_offset:
            exon_sequences.append("N" * transcript_start_offset)
            expected_transcript_length += transcript_start_offset

        for exon in sorted_exons:
            exon_seq = fasta_file.fetch(alt_ac, exon["alt_start_i"], exon["alt_end_i"])
            exon_seq = exon_seq.upper()

            exon_seq_list = []
            start = 0
            # We are using HGVS cigar
            for (length_str, op) in self.cigar_pattern.findall(exon["cigar"]):
                length = int(length_str)
                if op == 'D':  # Deletion in reference vs transcript
                    exon_seq_list.append("N" * length)
                    # Don't increment start (as we didn't move along genomic exon)
                elif op == 'I':  # Insertion in reference vs transcript
                    # Leave out of exon_seq
                    start += length  # We do increment through genomic sequence though
                else:  # match/mismatch
                    exon_seq_list.append(exon_seq[start:start+length])
                    start += length

            exon_seq = "".join(exon_seq_list)
            # Exon coordinates are genomic (+ strand); flip for - strand transcripts
            if exon["alt_strand"] == -1:
                exon_seq = reverse_complement(exon_seq)
            exon_sequences.append(exon_seq)
            expected_transcript_length += exon["tx_end_i"] - exon["tx_start_i"]

        transcript_sequence = "".join(exon_sequences)
        # Sanity check: cigar-driven assembly must match the transcript exon spans
        if len(transcript_sequence) != expected_transcript_length:
            raise ValueError(f"Error creating {ac} sequence from genome fasta ({alt_ac}): "
                             f"{expected_transcript_length=} != {len(transcript_sequence)=}")
        return transcript_sequence
110 |
111 |
112 |
class FastaSeqFetcher(PrefixSeqFetcher):
    """ Re-implementing using above - deprecated use

        Routes 'NC_' (genomic contig) accessions to GenomeFastaSeqFetcher and everything
        else to ExonsFromGenomeFastaSeqFetcher built from the same Fasta files. """

    def __init__(self, *args, cache=True):
        # Bug fix: 'cache' was previously hard-coded to True here, silently
        # ignoring the caller's cache=False
        default_seqfetcher = ExonsFromGenomeFastaSeqFetcher(*args, cache=cache)

        super().__init__(default_seqfetcher=default_seqfetcher)
        self.prefix_seqfetchers.update({
            "NC_": GenomeFastaSeqFetcher(*args),
        })
123 |
--------------------------------------------------------------------------------
/cdot/hgvs/dataproviders/seqfetcher.py:
--------------------------------------------------------------------------------
1 | import abc
2 |
3 | from more_itertools import all_equal
4 | from hgvs.dataproviders.interface import Interface
5 | from hgvs.exceptions import HGVSDataNotAvailableError
6 |
7 |
class PrefixSeqFetcher:
    """ Dispatches fetch_seq calls to different SeqFetchers keyed on accession prefix,
        with an optional fallback for accessions that match no prefix """

    def __init__(self, default_seqfetcher=None):
        self.default_seqfetcher = default_seqfetcher
        self.prefix_seqfetchers = {}

    def add_seqfetcher(self, prefix, seqfetcher):
        """ Register a seqfetcher for accessions starting with 'prefix' """
        self.prefix_seqfetchers[prefix] = seqfetcher

    @property
    def all_seqfetchers(self):
        """ All registered seqfetchers, with the default (if any) last """
        fetchers = list(self.prefix_seqfetchers.values())
        if self.default_seqfetcher:
            fetchers.append(self.default_seqfetcher)
        return fetchers

    def set_data_provider(self, hdp: Interface):
        # Not every seqfetcher accepts a data provider - skip those that don't
        for seqfetcher in self.all_seqfetchers:
            try:
                seqfetcher.set_data_provider(hdp)
            except AttributeError:
                pass

    def fetch_seq(self, ac, start_i=None, end_i=None):
        """ Fetch from the first seqfetcher whose prefix matches 'ac', else the default """
        for prefix, seqfetcher in self.prefix_seqfetchers.items():
            if ac.startswith(prefix):
                return seqfetcher.fetch_seq(ac, start_i=start_i, end_i=end_i)

        if self.default_seqfetcher:
            return self.default_seqfetcher.fetch_seq(ac, start_i=start_i, end_i=end_i)

        known_prefixes = ','.join(self.prefix_seqfetchers.keys())
        msg = f"Couldn't handle '{ac}', must match known prefixes: '{known_prefixes}'. No default set"
        raise HGVSDataNotAvailableError(msg)
40 |
41 |
class MultiSeqFetcher(abc.ABC):
    """ Abstract base for seqfetchers wrapping several underlying SeqFetcher instances. """

    def __init__(self, *args):
        self.seqfetchers = list(args)

    def set_data_provider(self, hdp: Interface):
        for sf in self.seqfetchers:
            try:
                sf.set_data_provider(hdp)
            except AttributeError:
                pass  # Not all seqfetchers accept a data provider

    @abc.abstractmethod
    def fetch_seq(self, ac, start_i=None, end_i=None):
        pass

    @property
    def source(self):
        # This needs to execute after set_data_provider is called
        return ", ".join(sf.source for sf in self.seqfetchers)
61 |
62 |
63 |
class ChainedSeqFetcher(MultiSeqFetcher):
    """ Tries each wrapped SeqFetcher in order, returning the first successful result.

        A fetcher failing with HGVSDataNotAvailableError causes fall-through to the
        next one; if all fail, HGVSDataNotAvailableError is raised with every
        collected exception.

        This is useful if you want to use FastaSeqFetcher (below) as a fallback if SeqFetcher fails

        seqfetcher = ChainedSeqFetcher(SeqFetcher(), FastaSeqFetcher(fasta_filename))
    """
    def fetch_seq(self, ac, start_i=None, end_i=None):
        failures = []
        for seqfetcher in self.seqfetchers:
            try:
                return seqfetcher.fetch_seq(ac, start_i=start_i, end_i=end_i)
            except HGVSDataNotAvailableError as exc:
                failures.append(exc)

        # Nothing succeeded - surface every underlying failure
        raise HGVSDataNotAvailableError(failures)
81 |
82 |
class VerifyMultipleSeqFetcher(MultiSeqFetcher):
    """ Queries every wrapped SeqFetcher and requires that ALL SUCCEED AND ARE IDENTICAL
        - otherwise it fails with HGVSDataNotAvailableError

        This is useful for eg verifying that RefSeq transcript sequences agree with the genome
    """
    def fetch_seq(self, ac, start_i=None, end_i=None):
        sequence_by_source = {}
        failures = []
        for seqfetcher in self.seqfetchers:
            try:
                sequence_by_source[seqfetcher.source] = seqfetcher.fetch_seq(ac, start_i=start_i, end_i=end_i)
            except HGVSDataNotAvailableError as exc:
                failures.append(exc)
        if failures:
            # Any failure at all means we can't verify agreement
            raise HGVSDataNotAvailableError(failures)

        sequences = list(sequence_by_source.values())
        if not all_equal(sequences):
            raise HGVSDataNotAvailableError(f"Inconsistent sequences for '{ac}'")
        return sequences[0]
105 |
106 |
class AlwaysFailSeqFetcher:
    """ A seqfetcher whose fetch_seq() always raises HGVSDataNotAvailableError
        with the configured message (useful to force callers onto other sources). """

    def __init__(self, message):
        self.message = message
        self.source = str(self.__class__.__name__)

    def fetch_seq(self, ac, start_i=None, end_i=None):
        raise HGVSDataNotAvailableError(self.message)
114 |
115 |
116 |
class AbstractTranscriptSeqFetcher:
    """ Base for seqfetchers that build transcript sequences themselves.

        Subclasses implement _get_transcript_seq(); a data provider must be attached
        via set_data_provider() before fetch_seq() can be used.
    """
    def __init__(self, *args, cache=True):
        self.cache = cache
        self.transcript_cache = {}
        self.hdp = None  # Set when passed to data provider (via set_data_provider)

    @abc.abstractmethod
    def _get_transcript_seq(self, ac):
        pass

    def get_transcript_seq(self, ac):
        seq = self.transcript_cache.get(ac)
        if not seq:
            seq = self._get_transcript_seq(ac)
            if self.cache:
                self.transcript_cache[ac] = seq
        return seq

    def set_data_provider(self, hdp: Interface):
        self.hdp = hdp

    def fetch_seq(self, ac, start_i=None, end_i=None):
        if self.hdp is None:
            raise HGVSDataNotAvailableError(f"{self}: You need to set set_data_provider() before calling fetch_seq()")

        seq = self.get_transcript_seq(ac)
        # None bounds mean "from the start" / "to the end" of the transcript
        start = 0 if start_i is None else start_i
        end = len(seq) if end_i is None else end_i
        return seq[start:end]
148 |
--------------------------------------------------------------------------------
/cdot/pyhgvs/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SACGF/cdot/ddeb78d58731dd4136689360d0fce4a8a91af87d/cdot/pyhgvs/__init__.py
--------------------------------------------------------------------------------
/cdot/pyhgvs/pyhgvs_transcript.py:
--------------------------------------------------------------------------------
1 | import abc
2 | import gzip
3 | import json
4 | from importlib import metadata
5 | from typing import Dict, Tuple
6 |
7 | import requests
8 |
9 | from pyhgvs.utils import make_transcript
10 |
11 |
class AbstractPyHGVSTranscriptFactory(abc.ABC):
    """ Builds PyHGVS Transcript objects from cdot transcript JSON records.

        Subclasses provide _get_transcript() to look up the raw cdot JSON.
    """

    def __init__(self):
        pass

    @abc.abstractmethod
    def _get_transcript(self, transcript_id):
        pass

    def get_transcript_grch37(self, transcript_id, sacgf_pyhgvs_fork=False):
        return self.get_transcript(transcript_id, "GRCh37", sacgf_pyhgvs_fork=sacgf_pyhgvs_fork)

    def get_transcript_grch38(self, transcript_id, sacgf_pyhgvs_fork=False):
        return self.get_transcript(transcript_id, "GRCh38", sacgf_pyhgvs_fork=sacgf_pyhgvs_fork)

    def get_transcript(self, transcript_id, genome_build, sacgf_pyhgvs_fork=False):
        """ Returns a PyHGVS Transcript, or None if the transcript/build is unknown. """
        pyhgvs_data = self.get_pyhgvs_data(transcript_id, genome_build, sacgf_pyhgvs_fork=sacgf_pyhgvs_fork)
        if not pyhgvs_data:
            return None
        return make_transcript(pyhgvs_data)

    def get_pyhgvs_data(self, transcript_id, genome_build, sacgf_pyhgvs_fork=False) -> Dict:
        """ Convert a cdot JSON record into the dict shape PyHGVS' make_transcript expects.
            Returns {} when the transcript has no data for genome_build. """
        record = self._get_transcript(transcript_id) or {}
        build_coords = record.get("genome_builds", {}).get(genome_build)
        if build_coords is None:
            return {}

        exons = build_coords['exons']
        tx_start = exons[0][0]
        tx_end = exons[-1][1]

        pyhgvs_data = {
            "id": record["id"],
            "chrom": build_coords['contig'],
            "start": tx_start,
            "end": tx_end,
            "strand": build_coords["strand"],
            # PyHGVS has cds_start/cds_end equal end (so CDS length is 0) if non-coding
            "cds_start": build_coords.get('cds_start', tx_end),
            "cds_end": build_coords.get('cds_end', tx_end),
            "gene_name": record['gene_name'],
        }

        if sacgf_pyhgvs_fork:
            # The fork keeps cDNA match data; drop the 3rd element (exon_number)
            exons = [e[:2] + e[3:] for e in exons]
            pyhgvs_data["cdna_match"] = exons
            pyhgvs_data["start_codon_transcript_pos"] = record.get("start_codon")
            pyhgvs_data["stop_codon_transcript_pos"] = record.get("stop_codon")
            if other_chroms := build_coords.get("other_chroms"):
                pyhgvs_data["other_chroms"] = other_chroms
        else:
            # Standard PyHGVS - only keep start/end
            exons = [e[:2] for e in exons]

        pyhgvs_data["exons"] = exons
        return pyhgvs_data
69 |
70 |
class PyHGVSTranscriptFactory(AbstractPyHGVSTranscriptFactory):
    """ Looks up transcripts from an in-memory dict of cdot transcript JSON records. """

    def __init__(self, transcripts):
        super().__init__()
        self.transcripts = transcripts

    def _get_transcript(self, transcript_id):
        # None for unknown transcripts
        return self.transcripts.get(transcript_id)
78 |
79 |
class JSONPyHGVSTranscriptFactory(PyHGVSTranscriptFactory):
    """ Loads cdot transcripts from JSON files or open file objects.

        file_or_filename_list: iterable of filenames (".gz" suffix read via gzip)
        and/or file-like objects. Transcripts from later files overwrite earlier
        ones with the same accession.
    """
    def __init__(self, file_or_filename_list):
        transcripts = {}
        for file_or_filename in file_or_filename_list:
            if isinstance(file_or_filename, str):
                # We opened the file, so we close it (previously leaked the handle)
                opener = gzip.open if file_or_filename.endswith(".gz") else open
                with opener(file_or_filename) as f:
                    data = json.load(f)
            else:
                # Caller-supplied file object: read it but leave closing to the caller
                data = json.load(file_or_filename)
            transcripts.update(data["transcripts"])
        super().__init__(transcripts=transcripts)
94 |
95 |
class RESTPyHGVSTranscriptFactory(AbstractPyHGVSTranscriptFactory):
    """ Fetches cdot transcript JSON from a cdot REST server (default https://cdot.cc).

        Responses are cached in memory, including misses (404s stored as None) so a
        bad accession is only requested once per factory instance.
    """

    def _get_transcript(self, transcript_id):
        # We store None for 404 on REST
        if transcript_id in self.transcripts:
            return self.transcripts[transcript_id]

        transcript_url = self.url + "/transcript/" + transcript_id
        response = requests.get(transcript_url)
        if response.ok:
            # headers.get() returns None when Content-Type is absent; default to ""
            # so the membership test can't raise TypeError
            if 'application/json' in response.headers.get('Content-Type', ''):
                transcript = response.json()
            else:
                raise ValueError("Non-json response received for '%s' - are you behind a firewall?" % transcript_url)
        else:
            transcript = None
        self.transcripts[transcript_id] = transcript
        return transcript

    def __init__(self, url=None, secure=True):
        super().__init__()
        if url is None:
            if secure:
                url = "https://cdot.cc"
            else:
                url = "http://cdot.cc"
        self.url = url
        self.transcripts = {}
124 |
125 |
def is_sacgf_pyhgvs_fork():
    """ True if the installed pyhgvs version is >= 0.12.0 (the SACGF fork). """
    required_version = (0, 12, 0)  # Bumped version on 24 Nov 2021 - has mito and cDNA_match fixes
    installed_version = tuple(int(part) for part in metadata.version("pyhgvs").split("."))
    return installed_version >= required_version
130 |
131 |
132 | # Changes from old loading:
133 |
# cdot has no cds_start/end if non-coding
135 | # PyHGVS expects cds_start/cds_end be equal to end/end for non-coding transcripts (so coding length ie end-start = 0)
136 | # cds_start = transcript_data.get("cds_start", end)
137 | # cds_end = transcript_data.get("cds_end", end)
138 |
139 |
140 | # VG loader also expects biotype to be comma sep, now is list
141 |
--------------------------------------------------------------------------------
/generate_transcript_data/Mus_musculus/refseq_transcripts_grcm38.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Downloads the RefSeq GRCm38 (mouse) GFF and converts it to cdot JSON.
# Requires GENE_INFO_JSON (from cdot_gene_info.py) in the environment.

set -e

BASE_DIR=$(dirname $(dirname ${BASH_SOURCE[0]}))
CDOT_VERSION=$(${BASE_DIR}/cdot_json.py --version)

if [[ -z ${GENE_INFO_JSON} ]]; then
  echo "You need to set environment variable GENE_INFO_JSON, pointing to the filename produced by cdot_gene_info.py"
  exit 1
fi

filename=GCF_000001635.26_GRCm38.p6_genomic.gff.gz
url=https://ftp.ncbi.nlm.nih.gov/refseq/M_musculus/annotation_releases/108/GCF_000001635.26_GRCm38.p6/${filename}
cdot_file=cdot-${CDOT_VERSION}.$(basename $filename .gz).json.gz

# Skip download / conversion steps whose outputs already exist
if [[ ! -e ${filename} ]]; then
  wget ${url}
fi
if [[ ! -e ${cdot_file} ]]; then
  ${BASE_DIR}/cdot_json.py gff3_to_json "${filename}" --no-contig-conversion --url "${url}" --genome-build=GRCm38 --output "${cdot_file}" --gene-info-json="${GENE_INFO_JSON}"
fi
--------------------------------------------------------------------------------
/generate_transcript_data/Mus_musculus/refseq_transcripts_grcm39.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Downloads the RefSeq GRCm39 (mouse) GFF and converts it to cdot JSON.
# Requires GENE_INFO_JSON (from cdot_gene_info.py) in the environment.

set -e

BASE_DIR=$(dirname $(dirname ${BASH_SOURCE[0]}))
CDOT_VERSION=$(${BASE_DIR}/cdot_json.py --version)

if [[ -z ${GENE_INFO_JSON} ]]; then
  echo "You need to set environment variable GENE_INFO_JSON, pointing to the filename produced by cdot_gene_info.py"
  exit 1
fi


filename=GCF_000001635.27_GRCm39_genomic.gff.gz
url=https://ftp.ncbi.nlm.nih.gov/refseq/M_musculus/annotation_releases/109/GCF_000001635.27_GRCm39/${filename}
cdot_file=cdot-${CDOT_VERSION}.$(basename $filename .gz).json.gz
# Skip download / conversion steps whose outputs already exist
if [[ ! -e ${filename} ]]; then
  wget ${url}
fi
if [[ ! -e ${cdot_file} ]]; then
  ${BASE_DIR}/cdot_json.py gff3_to_json "${filename}" --no-contig-conversion --url "${url}" --genome-build=GRCm39 --output "${cdot_file}" --gene-info-json="${GENE_INFO_JSON}"
fi
23 |
--------------------------------------------------------------------------------
/generate_transcript_data/Snakefile:
--------------------------------------------------------------------------------
import sys
import subprocess
from datetime import datetime

# Per-consortium / per-build source URLs (GFF/GTF releases) live in the YAML config
configfile: os.path.join(workflow.basedir, "cdot_transcripts.yaml")

cdot_json = os.path.join(workflow.basedir, "cdot_json.py")
cdot_dir = os.path.dirname(workflow.basedir)
# Ask cdot_json.py for its data version; PYTHONPATH set so it can import the local cdot package
cdot_output_raw = subprocess.check_output(f"{sys.executable} {cdot_json} --version", shell=True, env={"PYTHONPATH": cdot_dir})
cdot_data_version = cdot_output_raw.decode().strip()

# Name it based on date as it may vary
today = datetime.now().date().isoformat()
gene_info_download_filename = os.path.join("downloads/gene_info", f"Homo_sapiens.gene_info.{today}.gz")
gene_info_json_filename = f"Homo_sapiens.gene-info-{cdot_data_version}.json.gz"

# Flatten the config: every {name: url} pair across all consortia and builds
all_urls = {}
annotation_consortium_list = []
genome_build_list = []
for annotation_consortium, builds in config["config"].items():
    annotation_consortium_list.append(annotation_consortium)
    for gb, urls_dict in builds.items():
        genome_build_list.append(gb)  # NOTE(review): builds shared by both consortia appear twice here
        all_urls.update(urls_dict)
25 |
26 |
def get_url_from_name(wildcards):
    """ Snakemake params function: look up the download URL for a {name} wildcard. """
    name = wildcards.name
    return all_urls[name]
29 |
30 |
def get_urls(wildcards):
    # NOTE(review): looks like leftover debugging code - `urls` is never defined in
    # this function (or visibly at module scope), so calling this raises NameError.
    # No rule in this file appears to reference it; confirm before fixing or removing.
    print(f"get_urls")
    for key, value in wildcards.items():
        print(f"{key}={value}")

    return urls
37 |
38 |
def get_cdot_command(wildcards):
    """ Snakemake params function: pick the cdot_json.py subcommand for a source file.

        gffs can end with 'gff.gz' or 'gff3.gz', gtfs always end with 'gtf.gz'
    """
    source_url = all_urls[wildcards.name]
    if source_url.endswith(".gtf.gz"):
        return "gtf_to_json"
    return "gff3_to_json"
44 |
45 |
def get_build_input_files(wildcards):
    """ Snakemake input function: per-release cdot JSON files feeding the merge rule. """
    build_urls = config["config"][wildcards.annotation_consortium][wildcards.genome_build]
    prefix = f"{wildcards.annotation_consortium}/{wildcards.genome_build}/cdot-{cdot_data_version}"
    return expand(prefix + "-{name}.json.gz", name=build_urls)
50 |
51 |
rule all:
    # Default target: the processed gene info JSON plus one merged cdot JSON
    # per (annotation_consortium, genome_build) combination
    input:
        gene_info_json_filename,
        expand("{annotation_consortium}/cdot-" + cdot_data_version + "-{annotation_consortium}-{genome_build}.json.gz",
               annotation_consortium=annotation_consortium_list, # ["RefSeq", "Ensembl"],
               genome_build=genome_build_list)
58 |
59 |
rule cdot_merge_historical_json:
    # Merges multiple files together for 1 build
    # (each input is one historical GFF/GTF release already converted to cdot JSON)
    output:
        "{annotation_consortium}/cdot-" + cdot_data_version + "-{annotation_consortium}-{genome_build}.json.gz"
    input:
        get_build_input_files,
    shell:
        """
        PYTHONPATH={cdot_dir} \
        {cdot_json} \
        merge_historical \
        {input} \
        --genome-build={wildcards.genome_build} \
        --output {output}
        """
75 |
76 |
rule cdot_gff_json:
    # Individual GFF
    # Converts one downloaded GFF/GTF into a cdot JSON file for its release
    input:
        gene_info_json=gene_info_json_filename,
        gff_file="downloads/{name}.gz"
    output:
        protected("{annotation_consortium}/{genome_build}/cdot-" + cdot_data_version + "-{name}.json.gz")
    params:
        url=get_url_from_name,
        cdot_command=get_cdot_command  # gtf_to_json or gff3_to_json, chosen from the URL suffix
    shell:
        """
        PYTHONPATH={cdot_dir} \
        {cdot_json} \
        {params.cdot_command} \
        "{input.gff_file}" \
        --url "{params.url}" \
        --genome-build="{wildcards.genome_build}" \
        --output "{output}" \
        --gene-info-json="{input.gene_info_json}"
        """
98 |
99 |
rule download_gff_files:
    # --fail makes curl exit non-zero on HTTP errors so snakemake notices failures
    threads: 4 # We seem to sometimes get failures/booted if too many simultaneous connections to site
    output:
        # Don't re-download if snakemake script changes
        protected("downloads/{name}.gz")
    params:
        url=lambda wildcards: all_urls[wildcards.name]
    shell:
        "curl --fail --show-error -o {output} {params.url}"
109 |
110 |
rule process_gene_info_json:
    # Runs cdot_gene_info.py over the downloaded gene_info file (fetches Entrez
    # gene summaries) and writes the versioned gene-info JSON used by GFF conversion
    input:
        gene_info_download_filename
    output:
        protected(gene_info_json_filename)
    shell:
        """
        PYTHONPATH={cdot_dir} \
        "{workflow.basedir}/cdot_gene_info.py" \
        --gene-info {input} \
        --output {output} \
        --email cdot@cdot.cc
        """
124 |
125 |
rule download_gene_info:
    # Fetches NCBI's current Homo_sapiens.gene_info.gz (output name is dated, see top of file)
    output:
        protected(gene_info_download_filename)
    shell:
        "curl --fail --show-error -o {output} https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/Homo_sapiens.gene_info.gz"
131 |
132 |
--------------------------------------------------------------------------------
/generate_transcript_data/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SACGF/cdot/ddeb78d58731dd4136689360d0fce4a8a91af87d/generate_transcript_data/__init__.py
--------------------------------------------------------------------------------
/generate_transcript_data/all_transcripts.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Top-level driver: generates cdot transcript JSON for RefSeq and Ensembl across
# GRCh37, GRCh38 and T2T-CHM13v2.0, then combines GRCh37+GRCh38 per consortium.
# Output is written under ./refseq/ and ./ensembl/ relative to the current directory.

set -e

FULL_PATH_TO_SCRIPT="$(realpath "${BASH_SOURCE[-1]}")"
BASE_DIR=$(dirname ${FULL_PATH_TO_SCRIPT})

# Python scripts will import via generate_transcript_data
export PYTHONPATH=${BASE_DIR}/..

CDOT_DATA_VERSION=$(${BASE_DIR}/cdot_json.py --version)

echo "Generating all transcripts for cdot data version ${CDOT_DATA_VERSION}"

# This needs to be passed to called bash scripts, so they are invoked with "." to use these variables
export GENE_INFO_JSON=$(pwd)/Homo_sapiens.gene-info-${CDOT_DATA_VERSION}.json.gz

# Build the gene info JSON once if it's not already there (used by all per-build scripts)
if [[ ! -e ${GENE_INFO_JSON} ]]; then
  ${BASE_DIR}/gene_info.sh
fi

echo "Gene summary variable = ${GENE_INFO_JSON}"

# RefSeq - each per-build script downloads its GFFs and writes cdot JSON into the cwd
mkdir -p refseq
cd refseq

mkdir -p GRCh37
cd GRCh37
${BASE_DIR}/refseq_transcripts_grch37.sh
cd ..

mkdir -p GRCh38
cd GRCh38
${BASE_DIR}/refseq_transcripts_grch38.sh
cd ..

mkdir -p T2T-CHM13v2.0
cd T2T-CHM13v2.0
${BASE_DIR}/refseq_transcripts_chm13v2.sh
cd ..

# Combine genome builds (we're in refseq dir)
REFSEQ_COMBO=cdot-${CDOT_DATA_VERSION}.refseq.grch37_grch38.json.gz
if [[ ! -e ${REFSEQ_COMBO} ]]; then
  ${BASE_DIR}/cdot_json.py combine_builds \
    --grch37 GRCh37/cdot-${CDOT_DATA_VERSION}.refseq.grch37.json.gz \
    --grch38 GRCh38/cdot-${CDOT_DATA_VERSION}.refseq.grch38.json.gz \
    --output ${REFSEQ_COMBO}
fi

cd ..

# Ensembl - same layout as RefSeq above
mkdir -p ensembl
cd ensembl

mkdir -p GRCh37
cd GRCh37
${BASE_DIR}/ensembl_transcripts_grch37.sh
cd ..

mkdir -p GRCh38
cd GRCh38
${BASE_DIR}/ensembl_transcripts_grch38.sh
cd ..

mkdir -p T2T-CHM13v2.0
cd T2T-CHM13v2.0
${BASE_DIR}/ensembl_transcripts_chm13v2.sh
cd ..


# Combine genome builds (we're in ensembl dir)
ENSEMBL_COMBO=cdot-${CDOT_DATA_VERSION}.ensembl.grch37_grch38.json.gz
if [[ ! -e ${ENSEMBL_COMBO} ]]; then
  ${BASE_DIR}/cdot_json.py combine_builds \
    --grch37 GRCh37/cdot-${CDOT_DATA_VERSION}.ensembl.grch37.json.gz \
    --grch38 GRCh38/cdot-${CDOT_DATA_VERSION}.ensembl.grch38.json.gz \
    --output ${ENSEMBL_COMBO}
fi

cd ..
84 |
--------------------------------------------------------------------------------
/generate_transcript_data/cdot_gene_info.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import csv
4 | import gzip
5 | import json
6 | import logging
7 | import os
8 | from argparse import ArgumentParser
9 | from datetime import datetime
10 | from typing import Iterable, Iterator, List, TypeVar
11 |
12 | import cdot
13 | from Bio import Entrez
14 | from json_encoders import SortedSetEncoder
15 |
# Generic element type used in batch_iterator's annotations
T = TypeVar("T")
17 |
18 |
def handle_args():
    """ Parse command-line arguments (Entrez email, gene_info input, output filename). """
    parser = ArgumentParser(description='cdot Gene Info retrieval')
    parser.add_argument('--email', required=True, help='Entrez email')
    parser.add_argument('--gene-info', required=True, help='refseq gene info file')
    parser.add_argument('--output', required=True, help='output filename')
    return parser.parse_args()
27 |
28 |
def batch_iterator(iterable: Iterable[T], batch_size: int = 10) -> Iterator[List[T]]:
    """ Yield successive lists of up to batch_size items from iterable;
        the final batch may be shorter. """
    pending: List[T] = []
    for item in iterable:
        pending.append(item)
        if len(pending) >= batch_size:
            yield pending
            pending = []
    if pending:  # Flush the (possibly short) final batch
        yield pending
38 |
39 |
def _get_entrez_gene_summary(id_list):
    """ Fetch Entrez gene DocumentSummary records for a batch of gene IDs.

        Retries up to 3 times (remote NCBI calls can fail transiently); returns
        None if every attempt fails - callers must be prepared for that.
    """
    num_attempts = 3
    for _ in range(num_attempts):
        try:
            request = Entrez.epost("gene", id=",".join(id_list))
            result = Entrez.read(request)
            web_env = result["WebEnv"]
            query_key = result["QueryKey"]
            data = Entrez.esummary(db="gene", webenv=web_env, query_key=query_key)
            document = Entrez.read(data, ignore_errors=True, validate=False)  # Need recent BioPython
            return document["DocumentSummarySet"]["DocumentSummary"]
        except Exception as e:  # Broad by design: best-effort retry against a remote service
            logging.warning(e)
            logging.warning("Trying again...")
    return None  # All attempts failed (previously fell off the end with an implicit None)
53 |
def iter_entrez_ids(reader):
    """ Yield GeneID for rows with an official (nomenclature authority) symbol. """
    return (row['GeneID'] for row in reader
            if row["Symbol_from_nomenclature_authority"] != '-')
58 |
def main():
    """ Read the (gzipped) NCBI gene_info file, fetch Entrez gene summaries in
        batches, and write a gzipped JSON of per-gene info keyed by Entrez gene ID
        (symbol, map location, description, aliases, summary) with provenance fields. """
    args = handle_args()
    Entrez.email = args.email # Stop warning message
    start_date = datetime.now().isoformat()

    # 10k limit of return data from NCBI
    # NCBI_BATCH_SIZE = 10000
    NCBI_BATCH_SIZE = 1000

    gene_info = {}
    with gzip.open(args.gene_info, "rt") as f:
        reader = csv.DictReader(f, dialect='excel-tab')

        for entrez_ids in batch_iterator(iter_entrez_ids(reader), batch_size=NCBI_BATCH_SIZE):
            # We should really store it under the gene Id so dupe symbols don't wipe
            # NOTE(review): _get_entrez_gene_summary returns None when all retries
            # fail, which would make this loop raise TypeError - confirm intended.
            for gene_summary in _get_entrez_gene_summary(entrez_ids):
                gene_id = gene_summary.attributes["uid"]
                if error := gene_summary.get("error"):
                    logging.warning("Skipping '%s' error: %s", gene_id, error)
                    continue

                gene_info[gene_id] = {
                    "gene_symbol": gene_summary["NomenclatureSymbol"],
                    "map_location": gene_summary["MapLocation"],
                    # Already have description for RefSeq but not Ensembl (will just overwrite)
                    "description": gene_summary["NomenclatureName"],
                    # "added": record["date_name_changed"],
                    "aliases": gene_summary["OtherAliases"],
                    "summary": gene_summary["Summary"],
                }

    print(f"Processed {len(gene_info)} records")

    if gene_info:
        with gzip.open(args.output, 'wt') as outfile:
            # Provenance: cdot version, when the API was queried, and the input file's date
            gene_info_file_dt = datetime.fromtimestamp(os.stat(args.gene_info).st_ctime)

            data = {
                "cdot_version": cdot.__version__,
                "api_retrieval_date": start_date,
                "gene_info_date": gene_info_file_dt.isoformat(),
                "gene_info": gene_info,
            }
            json.dump(data, outfile, cls=SortedSetEncoder, sort_keys=True) # Sort so diffs work
103 |
104 |
105 | if __name__ == '__main__':
106 | main()
107 |
--------------------------------------------------------------------------------
/generate_transcript_data/cdot_transcripts.yaml:
--------------------------------------------------------------------------------
1 | config:
2 | Ensembl:
3 | # For Ensembl - we have to use GTFs as the GFF3s don't have protein versions in them
4 | GRCh37:
      #v81 (points to 75) and earlier are GTFs that don't have transcript versions - just skip them
6 | #82 is first GFF3 for GRCh37
7 | #83 has no data
8 | #84 is 82 again
9 | #86 is 85 again
10 | Homo_sapiens_GRCh37_Ensembl_82.gtf: "https://ftp.ensembl.org/pub/grch37/release-82/gtf/homo_sapiens/Homo_sapiens.GRCh37.82.gtf.gz"
11 | Homo_sapiens_GRCh37_Ensembl_85.gtf: "https://ftp.ensembl.org/pub/grch37/release-85/gtf/homo_sapiens/Homo_sapiens.GRCh37.85.gtf.gz"
12 | Homo_sapiens_GRCh37_Ensembl_87.gtf: "https://ftp.ensembl.org/pub/grch37/release-87/gtf/homo_sapiens/Homo_sapiens.GRCh37.87.gtf.gz"
13 |
14 | GRCh38:
15 | Homo_sapiens_GRCh38_Ensembl_81.gtf: "https://ftp.ensembl.org/pub/release-81/gtf/homo_sapiens/Homo_sapiens.GRCh38.81.gtf.gz"
16 | Homo_sapiens_GRCh38_Ensembl_82.gtf: "https://ftp.ensembl.org/pub/release-82/gtf/homo_sapiens/Homo_sapiens.GRCh38.82.gtf.gz"
17 | Homo_sapiens_GRCh38_Ensembl_83.gtf: "https://ftp.ensembl.org/pub/release-83/gtf/homo_sapiens/Homo_sapiens.GRCh38.83.gtf.gz"
18 | Homo_sapiens_GRCh38_Ensembl_84.gtf: "https://ftp.ensembl.org/pub/release-84/gtf/homo_sapiens/Homo_sapiens.GRCh38.84.gtf.gz"
19 | Homo_sapiens_GRCh38_Ensembl_85.gtf: "https://ftp.ensembl.org/pub/release-85/gtf/homo_sapiens/Homo_sapiens.GRCh38.85.gtf.gz"
20 | Homo_sapiens_GRCh38_Ensembl_86.gtf: "https://ftp.ensembl.org/pub/release-86/gtf/homo_sapiens/Homo_sapiens.GRCh38.86.gtf.gz"
21 | Homo_sapiens_GRCh38_Ensembl_87.gtf: "https://ftp.ensembl.org/pub/release-87/gtf/homo_sapiens/Homo_sapiens.GRCh38.87.gtf.gz"
22 | Homo_sapiens_GRCh38_Ensembl_88.gtf: "https://ftp.ensembl.org/pub/release-88/gtf/homo_sapiens/Homo_sapiens.GRCh38.88.gtf.gz"
23 | Homo_sapiens_GRCh38_Ensembl_89.gtf: "https://ftp.ensembl.org/pub/release-89/gtf/homo_sapiens/Homo_sapiens.GRCh38.89.gtf.gz"
24 | Homo_sapiens_GRCh38_Ensembl_90.gtf: "https://ftp.ensembl.org/pub/release-90/gtf/homo_sapiens/Homo_sapiens.GRCh38.90.gtf.gz"
25 | Homo_sapiens_GRCh38_Ensembl_91.gtf: "https://ftp.ensembl.org/pub/release-91/gtf/homo_sapiens/Homo_sapiens.GRCh38.91.gtf.gz"
26 | Homo_sapiens_GRCh38_Ensembl_92.gtf: "https://ftp.ensembl.org/pub/release-92/gtf/homo_sapiens/Homo_sapiens.GRCh38.92.gtf.gz"
27 | Homo_sapiens_GRCh38_Ensembl_93.gtf: "https://ftp.ensembl.org/pub/release-93/gtf/homo_sapiens/Homo_sapiens.GRCh38.93.gtf.gz"
28 | Homo_sapiens_GRCh38_Ensembl_94.gtf: "https://ftp.ensembl.org/pub/release-94/gtf/homo_sapiens/Homo_sapiens.GRCh38.94.gtf.gz"
29 | Homo_sapiens_GRCh38_Ensembl_95.gtf: "https://ftp.ensembl.org/pub/release-95/gtf/homo_sapiens/Homo_sapiens.GRCh38.95.gtf.gz"
30 | Homo_sapiens_GRCh38_Ensembl_96.gtf: "https://ftp.ensembl.org/pub/release-96/gtf/homo_sapiens/Homo_sapiens.GRCh38.96.gtf.gz"
31 | Homo_sapiens_GRCh38_Ensembl_97.gtf: "https://ftp.ensembl.org/pub/release-97/gtf/homo_sapiens/Homo_sapiens.GRCh38.97.gtf.gz"
32 | Homo_sapiens_GRCh38_Ensembl_98.gtf: "https://ftp.ensembl.org/pub/release-98/gtf/homo_sapiens/Homo_sapiens.GRCh38.98.gtf.gz"
33 | Homo_sapiens_GRCh38_Ensembl_99.gtf: "https://ftp.ensembl.org/pub/release-99/gtf/homo_sapiens/Homo_sapiens.GRCh38.99.gtf.gz"
34 | Homo_sapiens_GRCh38_Ensembl_100.gtf: "https://ftp.ensembl.org/pub/release-100/gtf/homo_sapiens/Homo_sapiens.GRCh38.100.gtf.gz"
35 | Homo_sapiens_GRCh38_Ensembl_101.gtf: "https://ftp.ensembl.org/pub/release-101/gtf/homo_sapiens/Homo_sapiens.GRCh38.101.gtf.gz"
36 | Homo_sapiens_GRCh38_Ensembl_102.gtf: "https://ftp.ensembl.org/pub/release-102/gtf/homo_sapiens/Homo_sapiens.GRCh38.102.gtf.gz"
37 | Homo_sapiens_GRCh38_Ensembl_103.gtf: "https://ftp.ensembl.org/pub/release-103/gtf/homo_sapiens/Homo_sapiens.GRCh38.103.gtf.gz"
38 | Homo_sapiens_GRCh38_Ensembl_104.gtf: "https://ftp.ensembl.org/pub/release-104/gtf/homo_sapiens/Homo_sapiens.GRCh38.104.gtf.gz"
39 | Homo_sapiens_GRCh38_Ensembl_105.gtf: "https://ftp.ensembl.org/pub/release-105/gtf/homo_sapiens/Homo_sapiens.GRCh38.105.gtf.gz"
40 | Homo_sapiens_GRCh38_Ensembl_106.gtf: "https://ftp.ensembl.org/pub/release-106/gtf/homo_sapiens/Homo_sapiens.GRCh38.106.gtf.gz"
41 | Homo_sapiens_GRCh38_Ensembl_107.gtf: "https://ftp.ensembl.org/pub/release-107/gtf/homo_sapiens/Homo_sapiens.GRCh38.107.gtf.gz"
42 | Homo_sapiens_GRCh38_Ensembl_108.gtf: "https://ftp.ensembl.org/pub/release-108/gtf/homo_sapiens/Homo_sapiens.GRCh38.108.gtf.gz"
43 | Homo_sapiens_GRCh38_Ensembl_109.gtf: "https://ftp.ensembl.org/pub/release-109/gtf/homo_sapiens/Homo_sapiens.GRCh38.109.gtf.gz"
44 | Homo_sapiens_GRCh38_Ensembl_110.gtf: "https://ftp.ensembl.org/pub/release-110/gtf/homo_sapiens/Homo_sapiens.GRCh38.110.gtf.gz"
45 | Homo_sapiens_GRCh38_Ensembl_111.gtf: "https://ftp.ensembl.org/pub/release-111/gtf/homo_sapiens/Homo_sapiens.GRCh38.111.gtf.gz"
46 | Homo_sapiens_GRCh38_Ensembl_112.gtf: "https://ftp.ensembl.org/pub/release-112/gtf/homo_sapiens/Homo_sapiens.GRCh38.112.gtf.gz"
47 | Homo_sapiens_GRCh38_Ensembl_113.gtf: "https://ftp.ensembl.org/pub/release-113/gtf/homo_sapiens/Homo_sapiens.GRCh38.113.gtf.gz"
48 | Homo_sapiens_GRCh38_Ensembl_114.gtf: "https://ftp.ensembl.org/pub/release-114/gtf/homo_sapiens/Homo_sapiens.GRCh38.114.gtf.gz"
49 | # Gives me a 403 - reported to Ensembl
50 | # Homo_sapiens_GRCh38_Ensembl_115.gtf: "https://ftp.ensembl.org/pub/release-115/gtf/homo_sapiens/Homo_sapiens.GRCh38.115.gtf.gz"
51 |
52 | T2T-CHM13v2.0:
53 | Homo_sapiens_T2T-CHM13v2.0_Ensembl_2022_06.gtf: "https://ftp.ensembl.org/pub/rapid-release/species/Homo_sapiens/GCA_009914755.4/ensembl/geneset/2022_06/Homo_sapiens-GCA_009914755.4-2022_06-genes.gtf.gz"
54 | Homo_sapiens_T2T-CHM13v2.0_Ensembl_2022_07.gtf: "https://ftp.ensembl.org/pub/rapid-release/species/Homo_sapiens/GCA_009914755.4/ensembl/geneset/2022_07/Homo_sapiens-GCA_009914755.4-2022_07-genes.gtf.gz"
55 | RefSeq:
56 | GRCh37:
57 | Homo_sapiens_GRCh37_RefSeq_p5.gff3: "https://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/BUILD.37.3/GFF/ref_GRCh37.p5_top_level.gff3.gz"
58 | Homo_sapiens_GRCh37_RefSeq_103.gff3: "https://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.103/GFF/ref_GRCh37.p9_top_level.gff3.gz"
59 | Homo_sapiens_GRCh37_RefSeq_104.gff3: "https://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.104/GFF/ref_GRCh37.p10_top_level.gff3.gz"
60 | Homo_sapiens_GRCh37_RefSeq_105.20190906.gff: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/105.20190906/GCF_000001405.25_GRCh37.p13/GCF_000001405.25_GRCh37.p13_genomic.gff.gz"
61 | Homo_sapiens_GRCh37_RefSeq_105.20201022.gff: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/105.20201022/GCF_000001405.25_GRCh37.p13/GCF_000001405.25_GRCh37.p13_genomic.gff.gz"
62 | Homo_sapiens_GRCh37_RefSeq_105.20220307.gff: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/105.20220307/GCF_000001405.25_GRCh37.p13/GCF_000001405.25_GRCh37.p13_genomic.gff.gz"
63 | Homo_sapiens_GRCh37_RS_2024_09: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/GCF_000001405.25-RS_2024_09/GCF_000001405.25_GRCh37.p13_genomic.gff.gz"
64 |
65 | GRCh38:
66 | Homo_sapiens_GRCh38_RefSeq_106.gff3: "https://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.106/GFF/ref_GRCh38_top_level.gff3.gz"
      # NOTE: the 107 entry below reuses the ANNOTATION_RELEASE.106 URL - verify;
      # likely a copy-paste error (an ANNOTATION_RELEASE.107 archive also exists at NCBI)
      Homo_sapiens_GRCh38_RefSeq_107.gff3: "https://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.106/GFF/ref_GRCh38_top_level.gff3.gz"
68 | Homo_sapiens_GRCh38_RefSeq_108.gff3: "https://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.108/GFF/ref_GRCh38.p7_top_level.gff3.gz"
69 | Homo_sapiens_GRCh38_RefSeq_109.gff3: "https://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.109/GFF/ref_GRCh38.p12_top_level.gff3.gz"
70 | # The date on this 109 version is 2020-2024 (after the other 109s below), not sure what's going on
71 | Homo_sapiens_GRCh38_RefSeq_109.GCF.gff: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109/GCF_000001405.38_GRCh38.p12/GCF_000001405.38_GRCh38.p12_genomic.gff.gz"
72 | Homo_sapiens_GRCh38_RefSeq_109.20190607.gff: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20190607/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gff.gz"
73 | Homo_sapiens_GRCh38_RefSeq_109.20190905.gff: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20190905/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gff.gz"
74 | Homo_sapiens_GRCh38_RefSeq_109.20191205.gff: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20191205/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gff.gz"
75 | Homo_sapiens_GRCh38_RefSeq_109.20200228.gff: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20200228/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gff.gz"
76 | Homo_sapiens_GRCh38_RefSeq_109.20200522.gff: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20200522/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gff.gz"
77 | Homo_sapiens_GRCh38_RefSeq_109.20200815.gff: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20200815/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gff.gz"
78 | Homo_sapiens_GRCh38_RefSeq_109.20201120.gff: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20201120/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gff.gz"
79 | Homo_sapiens_GRCh38_RefSeq_109.20210226.gff: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20210226/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gff.gz"
80 | Homo_sapiens_GRCh38_RefSeq_109.20210514.gff: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20210514/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gff.gz"
81 | Homo_sapiens_GRCh38_RefSeq_109.20211119.gff: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20211119/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gff.gz"
82 | Homo_sapiens_GRCh38_RefSeq_110.gff: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/110/GCF_000001405.40_GRCh38.p14/GCF_000001405.40_GRCh38.p14_genomic.gff.gz"
83 | Homo_sapiens_GRCh38_RefSeq_RS_2023_03.gff: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/GCF_000001405.40-RS_2023_03/GCF_000001405.40_GRCh38.p14_genomic.gff.gz"
84 | Homo_sapiens_GRCh38_RefSeq_RS_2023_10.gff: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/GCF_000001405.40-RS_2023_10/GCF_000001405.40_GRCh38.p14_genomic.gff.gz"
85 | Homo_sapiens_GRCh38_RefSeq_RS_2024_08.gff: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/GCF_000001405.40-RS_2024_08/GCF_000001405.40_GRCh38.p14_genomic.gff.gz"
86 |
87 | T2T-CHM13v2.0:
88 | Homo_sapiens_T2T-CHM13v2.0_RefSeq_110.gff: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/110/GCF_009914755.1_T2T-CHM13v2.0/GCF_009914755.1_T2T-CHM13v2.0_genomic.gff.gz"
89 | Homo_sapiens_T2T-CHM13v2.0_RefSeq_RS_2023_03.gff: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/GCF_009914755.1-RS_2023_03/GCF_009914755.1_T2T-CHM13v2.0_genomic.gff.gz"
90 | Homo_sapiens_T2T-CHM13v2.0_RefSeq_RS_2023_10.gff: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/GCF_009914755.1-RS_2023_10/GCF_009914755.1_T2T-CHM13v2.0_genomic.gff.gz"
91 | Homo_sapiens_T2T-CHM13v2.0_RefSeq_RS_2024_08.gff: "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/GCF_009914755.1-RS_2024_08/GCF_009914755.1_T2T-CHM13v2.0_genomic.gff.gz"
92 |
--------------------------------------------------------------------------------
/generate_transcript_data/ensembl_transcripts_chm13v2.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#
# Build cdot JSON transcript files for the Ensembl rapid-release annotation of
# T2T-CHM13v2.0, then merge the per-release files into one historical file.
# Requires GENE_INFO_JSON (produced by cdot_gene_info.py) in the environment.

set -e

BASE_DIR=$(dirname ${BASH_SOURCE[0]})
CDOT_VERSION=$(${BASE_DIR}/cdot_json.py --version)
GENOME_BUILD=T2T-CHM13v2.0

if [[ -z ${GENE_INFO_JSON} ]]; then
  echo "You need to set environment variable GENE_INFO_JSON, pointing to the filename produced by cdot_gene_info.py"
  exit 1
fi

merge_args=()
for release in 2022_06 2022_07; do
  filename=Homo_sapiens-GCA_009914755.4-${release}-genes.gtf.gz
  url=https://ftp.ensembl.org/pub/rapid-release/species/Homo_sapiens/GCA_009914755.4/ensembl/geneset/${release}/${filename}
  cdot_file=cdot-${CDOT_VERSION}.ensembl.$(basename ${filename} .gz).json.gz

  # Remote basename equals ${filename}, so a plain wget saves under that name
  if [[ ! -e ${filename} ]]; then
    wget ${url}
  fi
  if [[ ! -e ${cdot_file} ]]; then
    ${BASE_DIR}/cdot_json.py gtf_to_json "${filename}" --url "${url}" --genome-build=${GENOME_BUILD} --output "${cdot_file}" --gene-info-json="${GENE_INFO_JSON}"
  fi
  merge_args+=(${cdot_file})
done

# Merge in release order so newer releases overwrite older transcript versions
merged_file="cdot-${CDOT_VERSION}.ensembl.${GENOME_BUILD}.json.gz"
if [[ ! -e ${merged_file} ]]; then
  ${BASE_DIR}/cdot_json.py merge_historical ${merge_args[@]} --genome-build=${GENOME_BUILD} --output "${merged_file}"
fi
--------------------------------------------------------------------------------
/generate_transcript_data/ensembl_transcripts_grch37.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#
# Build cdot JSON transcript files for Ensembl GRCh37 releases and merge them
# (oldest -> newest, so later releases overwrite older transcript versions).
# Requires GENE_INFO_JSON (produced by cdot_gene_info.py) in the environment.

set -e

BASE_DIR=$(dirname ${BASH_SOURCE[0]})
CDOT_VERSION=$(${BASE_DIR}/cdot_json.py --version)

if [[ -z ${GENE_INFO_JSON} ]]; then
  echo "You need to set environment variable GENE_INFO_JSON, pointing to the filename produced by cdot_gene_info.py"
  exit 1
fi

# v81 (points to 75) and earlier are GTFs that don't have transcript versions - just skip them

#82 is first GFF3 for GRCh37
#83 has no data
#84 is 82 again
#86 is 85 again
merge_args=()
for release in 82 85 87; do
  # Switched to using GTFs as they contain protein version
  filename=Homo_sapiens.GRCh37.${release}.gtf.gz
  url=ftp://ftp.ensembl.org/pub/grch37/release-${release}/gtf/homo_sapiens/${filename}
  cdot_file=cdot-${CDOT_VERSION}.ensembl.$(basename ${filename} .gz).json.gz
  if [[ ! -e ${filename} ]]; then
    wget ${url}
  fi
  if [[ ! -e ${cdot_file} ]]; then
    ${BASE_DIR}/cdot_json.py gtf_to_json "${filename}" --url "${url}" --genome-build=GRCh37 --output "${cdot_file}" --gene-info-json="${GENE_INFO_JSON}"
  fi
  merge_args+=(${cdot_file})
done

merged_file="cdot-${CDOT_VERSION}.ensembl.grch37.json.gz"
if [[ ! -e ${merged_file} ]]; then
  ${BASE_DIR}/cdot_json.py merge_historical ${merge_args[@]} --genome-build=GRCh37 --output "${merged_file}"
fi
--------------------------------------------------------------------------------
/generate_transcript_data/ensembl_transcripts_grch38.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#
# Build cdot JSON transcript files for Ensembl GRCh38 releases and merge them
# (oldest -> newest, so later releases overwrite older transcript versions).
# Requires GENE_INFO_JSON (produced by cdot_gene_info.py) in the environment.

set -e

BASE_DIR=$(dirname ${BASH_SOURCE[0]})
CDOT_VERSION=$(${BASE_DIR}/cdot_json.py --version)

if [[ -z ${GENE_INFO_JSON} ]]; then
  echo "You need to set environment variable GENE_INFO_JSON, pointing to the filename produced by cdot_gene_info.py"
  exit 1
fi

# Releases 76-80 are skipped as those GTFs don't have transcript versions.
# 81 is the first GFF3 for GRCh38, but we use GTFs as they contain the protein
# version while Ensembl GFF3s do not (required for c_to_p).
merge_args=()
for release in 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112; do
  filename=Homo_sapiens.GRCh38.${release}.gtf.gz
  url=ftp://ftp.ensembl.org/pub/release-${release}/gtf/homo_sapiens/${filename}
  cdot_file=cdot-${CDOT_VERSION}.ensembl.$(basename ${filename} .gz).json.gz

  # Remote basename equals ${filename}, so a plain wget saves under that name
  if [[ ! -e ${filename} ]]; then
    wget ${url}
  fi
  if [[ ! -e ${cdot_file} ]]; then
    ${BASE_DIR}/cdot_json.py gtf_to_json "${filename}" --url "${url}" --genome-build=GRCh38 --output "${cdot_file}" --gene-info-json="${GENE_INFO_JSON}"
  fi
  merge_args+=(${cdot_file})
done

merged_file="cdot-${CDOT_VERSION}.ensembl.grch38.json.gz"
if [[ ! -e ${merged_file} ]]; then
  ${BASE_DIR}/cdot_json.py merge_historical ${merge_args[@]} --genome-build=GRCh38 --output "${merged_file}"
fi
--------------------------------------------------------------------------------
/generate_transcript_data/gene_info.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#
# Download NCBI gene_info for a species and convert it to a cdot gene-info JSON
# file. EMAIL must be set (used for NCBI API calls); REFSEQ_DIR and SPECIES
# default to human and may be overridden (e.g. by mus_musculus.sh).

if [[ -z ${EMAIL} ]]; then
  echo "You need to set the 'EMAIL' shell variable (used for NCBI API calls)"
  # Plain 'exit' would return the status of the echo (0) and let callers such
  # as mus_musculus.sh continue without gene info - exit non-zero explicitly.
  exit 1
fi

BASE_DIR=$(dirname ${BASH_SOURCE[0]})
# Python scripts will import via generate_transcript_data
export PYTHONPATH=${BASE_DIR}/..
CDOT_VERSION=$(${BASE_DIR}/cdot_json.py --version)
REFSEQ_DIR=${REFSEQ_DIR:-H_sapiens}
SPECIES=${SPECIES:-Homo_sapiens}

echo "Generating Gene Info for REFSEQ_DIR=${REFSEQ_DIR}, SPECIES=${SPECIES}"

filename=${SPECIES}.gene_info.gz
url=https://ftp.ncbi.nlm.nih.gov/refseq/${REFSEQ_DIR}/${filename}
if [[ ! -e ${filename} ]]; then
  echo "Downloading ${url}"
  wget ${url}
fi

out_json=${SPECIES}.gene-info-${CDOT_VERSION}.json.gz
if [[ ! -e ${out_json} ]]; then
  echo "Processing gene info file..."
  ${BASE_DIR}/cdot_gene_info.py --gene-info ${filename} --output ${out_json} --email ${EMAIL}
fi
--------------------------------------------------------------------------------
/generate_transcript_data/github_release_upload.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#
# Upload the generated cdot data files (produced by all_transcripts.sh under
# CDOT_DATA_DIR) as assets of the GitHub release for this data version.

set -e

if [[ -z ${CDOT_DATA_DIR} ]]; then
    echo "You need to set environment variable CDOT_DATA_DIR, pointing to where you ran 'all_transcripts.sh'"
    exit 1
fi

FULL_PATH_TO_SCRIPT="$(realpath "${BASH_SOURCE[-1]}")"
BASE_DIR=$(dirname "${FULL_PATH_TO_SCRIPT}")

# Python scripts will import via generate_transcript_data
export PYTHONPATH=${BASE_DIR}/..

CDOT_DATA_VERSION=$("${BASE_DIR}/cdot_json.py" --version)

CDOT_RELEASE_NAME=data_v${CDOT_DATA_VERSION}
echo "For the rest of the script to work, it assumes you have tagged + pushed a data release of ${CDOT_DATA_VERSION}"
echo "then run: gh release create ${CDOT_RELEASE_NAME} --title=${CDOT_RELEASE_NAME} --notes 'release notes...'"

# Collect assets in a quoted array so paths survive word splitting intact
release_assets=(
    "${CDOT_DATA_DIR}/ensembl/GRCh37/cdot-${CDOT_DATA_VERSION}.ensembl.grch37.json.gz"
    "${CDOT_DATA_DIR}/ensembl/GRCh37/cdot-${CDOT_DATA_VERSION}.ensembl.Homo_sapiens.GRCh37.87.gtf.json.gz"
    "${CDOT_DATA_DIR}/ensembl/GRCh38/cdot-${CDOT_DATA_VERSION}.ensembl.grch38.json.gz"
    "${CDOT_DATA_DIR}/ensembl/GRCh38/cdot-${CDOT_DATA_VERSION}.ensembl.Homo_sapiens.GRCh38.110.gtf.json.gz"
    "${CDOT_DATA_DIR}/ensembl/GRCh38/cdot-${CDOT_DATA_VERSION}.ensembl.Homo_sapiens.GRCh38.112.gtf.json.gz"
    "${CDOT_DATA_DIR}/ensembl/T2T-CHM13v2.0/cdot-${CDOT_DATA_VERSION}.ensembl.T2T-CHM13v2.0.json.gz"
    "${CDOT_DATA_DIR}/refseq/GRCh37/cdot-${CDOT_DATA_VERSION}.refseq.grch37.json.gz"
    "${CDOT_DATA_DIR}/refseq/GRCh37/cdot-${CDOT_DATA_VERSION}.GCF_000001405.25_GRCh37.p13_genomic.105.20201022.gff.json.gz"
    "${CDOT_DATA_DIR}/refseq/GRCh37/cdot-${CDOT_DATA_VERSION}.GCF_000001405.25_GRCh37.p13_genomic.105.20220307.gff.json.gz"
    "${CDOT_DATA_DIR}/refseq/GRCh38/cdot-${CDOT_DATA_VERSION}.refseq.grch38.json.gz"
    "${CDOT_DATA_DIR}/refseq/GRCh38/cdot-${CDOT_DATA_VERSION}.GCF_000001405.40_GRCh38.p14_genomic.110.gff.json.gz"
    "${CDOT_DATA_DIR}/refseq/GRCh38/cdot-${CDOT_DATA_VERSION}.GCF_000001405.40_GRCh38.p14_genomic.RS_2023_10.gff.json.gz"
    "${CDOT_DATA_DIR}/refseq/T2T-CHM13v2.0/cdot-${CDOT_DATA_VERSION}.refseq.T2T-CHM13v2.0.json.gz"
)

gh release upload "${CDOT_RELEASE_NAME}" "${release_assets[@]}"
--------------------------------------------------------------------------------
/generate_transcript_data/json_encoders.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 |
class SortedSetEncoder(json.JSONEncoder):
    """JSON encoder that serializes sets as sorted lists.

    Based on https://stackoverflow.com/a/8230505/295724 - sets are not JSON
    serializable, so emit them as lists with a deterministic (sorted) order.
    Any other unknown type falls through to the base class, which raises
    TypeError as usual.
    """

    def default(self, obj):
        if isinstance(obj, set):
            # sorted() already returns a list - no need to wrap in list()
            return sorted(obj)
        return super().default(obj)
--------------------------------------------------------------------------------
/generate_transcript_data/json_schema_version.py:
--------------------------------------------------------------------------------
# After 0.2.22 we split version into separate code (pip) and data schema versions
# The cdot client will use its own major/minor to determine whether it can read these data files
# Data schema version string ("major.minor.patch") reported by cdot_json.py --version.
JSON_SCHEMA_VERSION = "0.2.28"
--------------------------------------------------------------------------------
/generate_transcript_data/mus_musculus.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#
# Generate cdot gene info and RefSeq transcript data for Mus musculus
# (GRCm38 + GRCm39), each genome build in its own sub-directory.

set -e

FULL_PATH_TO_SCRIPT="$(realpath "${BASH_SOURCE[-1]}")"
BASE_DIR=$(dirname ${FULL_PATH_TO_SCRIPT})

# Python scripts will import via generate_transcript_data
export PYTHONPATH=${BASE_DIR}/..

CDOT_VERSION=$(${BASE_DIR}/cdot_json.py --version)

# Exported so the gene info / transcript scripts invoked below can read them
export REFSEQ_DIR=M_musculus
export SPECIES=Mus_musculus
export GENE_INFO_JSON=$(pwd)/${SPECIES}.gene-info-${CDOT_VERSION}.json.gz

if [[ ! -e ${GENE_INFO_JSON} ]]; then
    ${BASE_DIR}/gene_info.sh
fi

echo "Gene summary variable = ${GENE_INFO_JSON}"

# Run $2 (a per-build transcript script) inside refseq/$1, creating it if needed.
# The subshell keeps the directory change local to each build.
run_refseq_build() {
    local genome_build=$1
    local build_script=$2
    mkdir -p refseq/${genome_build}
    (
        cd refseq/${genome_build}
        ${BASE_DIR}/Mus_musculus/${build_script}
    )
}

run_refseq_build GRCm38 refseq_transcripts_grcm38.sh
run_refseq_build GRCm39 refseq_transcripts_grcm39.sh
--------------------------------------------------------------------------------
/generate_transcript_data/refseq_transcripts_chm13v2.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#
# Build the cdot RefSeq transcript file for T2T-CHM13v2.0: convert each NCBI
# annotation release to JSON and merge oldest -> newest so later releases
# overwrite older transcript versions. Requires GENE_INFO_JSON in environment.

set -e

BASE_DIR=$(dirname ${BASH_SOURCE[0]})
CDOT_VERSION=$(${BASE_DIR}/cdot_json.py --version)
GENOME_BUILD=T2T-CHM13v2.0

if [[ -z ${GENE_INFO_JSON} ]]; then
  echo "You need to set environment variable GENE_INFO_JSON, pointing to the filename produced by cdot_gene_info.py"
  exit 1
fi

merge_args=()

# All releases share the same remote basename, so store each locally renamed by
# release ID. The FTP directory layout changed with the dated "RS_*" releases.
for release in 110 RS_2023_03 RS_2023_10 RS_2024_08; do
  if [[ ${release} == RS_* ]]; then
    release_dir=GCF_009914755.1-${release}
  else
    release_dir=${release}/GCF_009914755.1_T2T-CHM13v2.0
  fi
  filename=GCF_009914755.1_T2T-CHM13v2.0_genomic.${release}.gff.gz
  url=https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/${release_dir}/GCF_009914755.1_T2T-CHM13v2.0_genomic.gff.gz
  cdot_file=cdot-${CDOT_VERSION}.$(basename ${filename} .gz).json.gz

  if [[ ! -e ${filename} ]]; then
    wget ${url} --output-document=${filename}
  fi
  if [[ ! -e ${cdot_file} ]]; then
    ${BASE_DIR}/cdot_json.py gff3_to_json "${filename}" --url "${url}" --genome-build=${GENOME_BUILD} --output "${cdot_file}" --gene-info-json="${GENE_INFO_JSON}"
  fi
  merge_args+=(${cdot_file})
done

merged_file="cdot-${CDOT_VERSION}.refseq.${GENOME_BUILD}.json.gz"
if [[ ! -e ${merged_file} ]]; then
  echo "Creating ${merged_file}"
  ${BASE_DIR}/cdot_json.py merge_historical ${merge_args[@]} --genome-build=${GENOME_BUILD} --output "${merged_file}"
fi
--------------------------------------------------------------------------------
/generate_transcript_data/refseq_transcripts_grch37.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#
# Build the cdot RefSeq GRCh37 transcript file: convert each historical NCBI
# annotation release to JSON and merge oldest -> newest (later releases
# overwrite older transcript versions). Requires GENE_INFO_JSON; optionally
# includes UTA transcripts when UTA_TRANSCRIPTS is set.

set -e

BASE_DIR=$(dirname ${BASH_SOURCE[0]})
CDOT_VERSION=$(${BASE_DIR}/cdot_json.py --version)
UTA_VERSION=20210129

if [[ -z ${GENE_INFO_JSON} ]]; then
  echo "You need to set environment variable GENE_INFO_JSON, pointing to the filename produced by cdot_gene_info.py"
  exit 1
fi

if [[ -z ${UTA_TRANSCRIPTS} ]]; then
  echo "Not including UTA transcripts. Set environment variable UTA_TRANSCRIPTS=True to do so"
else
  echo "Retrieving / storing UTA transcripts"
fi

# Having troubles with corrupted files downloading via FTP from NCBI via IPv6, http works ok
# NOTE: RefSeq transcripts in GRCh37 before p13 did not have alignment gap information

merge_args=()

# Download $2 as local file $1 (unless present), convert to cdot JSON (unless
# present) and queue the JSON for the final merge. Call order == merge order.
fetch_and_convert() {
  local filename=$1
  local url=$2
  local cdot_file=cdot-${CDOT_VERSION}.$(basename ${filename} .gz).json.gz
  if [[ ! -e ${filename} ]]; then
    wget ${url} --output-document=${filename}
  fi
  if [[ ! -e ${cdot_file} ]]; then
    ${BASE_DIR}/cdot_json.py gff3_to_json "${filename}" --url "${url}" --genome-build=GRCh37 --output "${cdot_file}" --gene-info-json="${GENE_INFO_JSON}"
  fi
  merge_args+=(${cdot_file})
}

fetch_and_convert ref_GRCh37.p5_top_level.gff3.gz \
  http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/BUILD.37.3/GFF/ref_GRCh37.p5_top_level.gff3.gz
fetch_and_convert ref_GRCh37.p9_top_level.gff3.gz \
  http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.103/GFF/ref_GRCh37.p9_top_level.gff3.gz
fetch_and_convert ref_GRCh37.p10_top_level.gff3.gz \
  http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.104/GFF/ref_GRCh37.p10_top_level.gff3.gz

if [[ ! -z ${UTA_TRANSCRIPTS} ]]; then
  # UTA transcripts have gaps, so they should overwrite the earlier refseq transcripts (without gaps)
  # But will be overwritten by newer (post p13) official transcripts
  cdot_file="cdot-${CDOT_VERSION}.uta_${UTA_VERSION}.GRCh37.json.gz"
  ${BASE_DIR}/uta_transcripts.sh ${UTA_VERSION} GRCh37
  merge_args+=(${cdot_file})
fi

fetch_and_convert ref_GRCh37.p13_top_level.gff3.gz \
  http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.105/GFF/ref_GRCh37.p13_top_level.gff3.gz

# These all have the same remote name, so store them locally renamed by release ID
for release in 105.20190906 105.20201022 105.20220307; do
  fetch_and_convert GCF_000001405.25_GRCh37.p13_genomic.${release}.gff.gz \
    https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/${release}/GCF_000001405.25_GRCh37.p13/GCF_000001405.25_GRCh37.p13_genomic.gff.gz
done

merged_file="cdot-${CDOT_VERSION}.refseq.grch37.json.gz"
if [[ ! -e ${merged_file} ]]; then
  echo "Creating ${merged_file}"
  ${BASE_DIR}/cdot_json.py merge_historical ${merge_args[@]} --genome-build=GRCh37 --output "${merged_file}"
fi
--------------------------------------------------------------------------------
/generate_transcript_data/refseq_transcripts_grch38.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#
# Build the cdot RefSeq GRCh38 transcript file: convert each historical NCBI
# annotation release to JSON and merge oldest -> newest (later releases
# overwrite older transcript versions). Requires GENE_INFO_JSON; optionally
# prepends UTA transcripts (UTA_TRANSCRIPTS) and the RefSeq historical
# annotation/alignment set (GRCH38_REFSEQ_HISTORICAL).

set -e

BASE_DIR=$(dirname ${BASH_SOURCE[0]})
CDOT_VERSION=$(${BASE_DIR}/cdot_json.py --version)
UTA_VERSION=20210129

if [[ -z ${GENE_INFO_JSON} ]]; then
  echo "You need to set environment variable GENE_INFO_JSON, pointing to the filename produced by cdot_gene_info.py"
  exit 1
fi

# Having troubles with corrupted files downloading via FTP from NCBI via IPv6, http works ok

if [[ -z ${UTA_TRANSCRIPTS} ]]; then
  echo "Not including UTA transcripts. Set environment variable UTA_TRANSCRIPTS=True to do so"
else
  echo "Retrieving / storing UTA transcripts"
fi

merge_args=()

# Download $2 as local file $1, unless it already exists
download() {
  if [[ ! -e $1 ]]; then
    wget $2 --output-document=$1
  fi
}

# Convert GFF $1 (sourced from URL $2) to cdot JSON unless already done, then
# queue it for the final merge. Extra args are passed through to gff3_to_json.
# Call order == merge order.
convert_and_queue() {
  local filename=$1
  local url=$2
  shift 2
  local cdot_file=cdot-${CDOT_VERSION}.$(basename ${filename} .gz).json.gz
  if [[ ! -e ${cdot_file} ]]; then
    ${BASE_DIR}/cdot_json.py gff3_to_json "${filename}" --url "${url}" --genome-build=GRCh38 --output "${cdot_file}" --gene-info-json="${GENE_INFO_JSON}" "$@"
  fi
  merge_args+=(${cdot_file})
}

# Convenience wrapper: download then convert/queue
fetch_and_convert() {
  download $1 $2
  convert_and_queue "$@"
}

if [[ ! -z ${UTA_TRANSCRIPTS} ]]; then
  # All GRCh38 transcripts have alignments gaps, so use UTA first (and override with official releases)
  uta_cdot_file="cdot-${CDOT_VERSION}.uta_${UTA_VERSION}.GRCh38.json.gz"
  ${BASE_DIR}/uta_transcripts.sh ${UTA_VERSION} GRCh38
  merge_args+=(${uta_cdot_file})
fi

if [[ -z ${GRCH38_REFSEQ_HISTORICAL} ]]; then
  echo "Not including RefSeq GRCh38 historical transcripts. Set env variable GRCH38_REFSEQ_HISTORICAL=True to do so"
else
  echo "Adding RefSeq GRCh38 historical transcripts"
  # Historical - these are stored in separate files for annotation/alignments
  url=https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/GCF_000001405.40-RS_2023_03/RefSeq_historical_alignments/GCF_000001405.40-RS_2023_03_genomic.gff.gz
  annotation_filename=$(basename $url)
  download ${annotation_filename} ${url}

  url=https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/GCF_000001405.40-RS_2023_03/RefSeq_historical_alignments/GCF_000001405.40-RS_2023_03_knownrefseq_alns.gff.gz
  alignments_filename=$(basename $url)
  download ${alignments_filename} ${url}

  filename=GCF_000001405.40-RS_2023_03_combined_annotation_alignments.gff.gz
  if [[ ! -e ${filename} ]]; then
    echo "Combining historical annotations and alignments..."
    cat ${annotation_filename} ${alignments_filename} > ${filename}
  fi
  # Alignments in the combined file lack the usual parent records, hence --skip-missing-parents
  convert_and_queue ${filename} ${url} --skip-missing-parents
fi

# Archived "old refseq" releases 106-109 (unique remote names)
fetch_and_convert ref_GRCh38_top_level.gff3.gz \
  http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.106/GFF/ref_GRCh38_top_level.gff3.gz
fetch_and_convert ref_GRCh38.p2_top_level.gff3.gz \
  http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.107/GFF/ref_GRCh38.p2_top_level.gff3.gz
fetch_and_convert ref_GRCh38.p7_top_level.gff3.gz \
  http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.108/GFF/ref_GRCh38.p7_top_level.gff3.gz
fetch_and_convert ref_GRCh38.p12_top_level.gff3.gz \
  http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.109/GFF/ref_GRCh38.p12_top_level.gff3.gz
fetch_and_convert GCF_000001405.38_GRCh38.p12_genomic.gff.gz \
  https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109/GCF_000001405.38_GRCh38.p12/GCF_000001405.38_GRCh38.p12_genomic.gff.gz

# GRCh38.p13 dated releases: same remote name, so rename locally by release ID
# 109.20211119 needs latest HTSeq (Feb 2022) or dies with quoting error
for release in 109.20190607 109.20190905 109.20191205 109.20200228 109.20200522 109.20200815 109.20201120 109.20210226 109.20210514 109.20211119; do
  fetch_and_convert GCF_000001405.39_GRCh38.p13_genomic.${release}.gff.gz \
    https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/${release}/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gff.gz
done

# GRCh38.p14: release 110 plus the dated RS_* releases (same remote name again).
# NOTE: the original script misnamed the RS_2023_03 output "cdot-...ensembl..."
# in this RefSeq pipeline; the shared helper now names all outputs consistently.
fetch_and_convert GCF_000001405.40_GRCh38.p14_genomic.110.gff.gz \
  https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/110/GCF_000001405.40_GRCh38.p14/GCF_000001405.40_GRCh38.p14_genomic.gff.gz
for release in RS_2023_03 RS_2023_10 RS_2024_08; do
  fetch_and_convert GCF_000001405.40_GRCh38.p14_genomic.${release}.gff.gz \
    https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/GCF_000001405.40-${release}/GCF_000001405.40_GRCh38.p14_genomic.gff.gz
done

merged_file="cdot-${CDOT_VERSION}.refseq.grch38.json.gz"
if [[ ! -e ${merged_file} ]]; then
  echo "Creating ${merged_file}"
  ${BASE_DIR}/cdot_json.py merge_historical ${merge_args[@]} --genome-build=GRCh38 --output "${merged_file}"
fi
--------------------------------------------------------------------------------
/generate_transcript_data/requirements.txt:
--------------------------------------------------------------------------------
1 | biopython
2 | bioutils
3 | htseq
4 | ijson
5 | intervaltree
6 | lazy
7 | pyhgvs
8 | requests
--------------------------------------------------------------------------------
/generate_transcript_data/uta_20210129_grch37.sql:
--------------------------------------------------------------------------------
-- Export RefSeq transcript alignments against the GRCh37 primary-assembly
-- chromosomes (the NC_0000xx accessions listed below) from the UTA
-- uta_20210129 schema as CSV for cdot import. One row per transcript
-- accession; exon starts/ends/cigars are comma-joined in alt_exon_id order,
-- and only 'splign' alignments are used.
-- NOTE(review): psql normally requires the \copy meta-command on a single
-- line; this multi-line form assumes the invoking tooling joins it - confirm.
\copy (SELECT transcript.ac, string_agg(distinct transcript.hgnc, ',') as hgnc, 'http://www.ncbi.nlm.nih.gov/refseq/' as origin_url,
string_agg(distinct aln_v.alt_ac::varchar, ',') as contig,
string_agg(distinct aln_v.alt_strand::varchar, ',') as strand,
transcript.cds_start_i,
transcript.cds_end_i,
string_agg(aln_v.alt_start_i::varchar, ',' order by aln_v.alt_exon_id) as exon_starts,
string_agg(aln_v.alt_end_i::varchar, ',' order by aln_v.alt_exon_id) as exon_ends,
string_agg(aln_v.cigar, ',' order by aln_v.alt_exon_id) as cigars,
string_agg(distinct aa.pro_ac, ',' order by aa.pro_ac) as protein
from uta_20210129.transcript transcript
inner join uta_20210129.tx_exon_aln_v aln_v on (transcript.ac = aln_v.tx_ac AND alt_aln_method = 'splign')
left outer join uta_20210129.associated_accessions aa on (transcript.ac = aa.tx_ac)
WHERE aln_v.alt_ac in
('NC_000001.10', 'NC_000002.11', 'NC_000003.11', 'NC_000004.11', 'NC_000005.9', 'NC_000006.11', 'NC_000007.13', 'NC_000008.10', 'NC_000009.11', 'NC_000010.10', 'NC_000011.9', 'NC_000012.11', 'NC_000013.10', 'NC_000014.8', 'NC_000015.9', 'NC_000016.9', 'NC_000017.10', 'NC_000018.9', 'NC_000019.9', 'NC_000020.10', 'NC_000021.8', 'NC_000022.10', 'NC_000023.10', 'NC_000024.9')
group by transcript.ac) TO 'uta_20210129_grch37.csv' CSV HEADER;
--------------------------------------------------------------------------------
/generate_transcript_data/uta_20210129_grch38.sql:
--------------------------------------------------------------------------------
\copy (SELECT transcript.ac, string_agg(distinct transcript.hgnc, ',') as hgnc, 'http://www.ncbi.nlm.nih.gov/refseq/' as origin_url,
string_agg(distinct aln_v.alt_ac::varchar, ',') as contig,
string_agg(distinct aln_v.alt_strand::varchar, ',') as strand,
transcript.cds_start_i,
transcript.cds_end_i,
string_agg(aln_v.alt_start_i::varchar, ',' order by aln_v.alt_exon_id) as exon_starts,
string_agg(aln_v.alt_end_i::varchar, ',' order by aln_v.alt_exon_id) as exon_ends,
string_agg(aln_v.cigar, ',' order by aln_v.alt_exon_id) as cigars,
string_agg(distinct aa.pro_ac, ',' order by aa.pro_ac) as protein
from uta_20210129.transcript transcript
inner join uta_20210129.tx_exon_aln_v aln_v on (transcript.ac = aln_v.tx_ac AND alt_aln_method = 'splign')
left outer join uta_20210129.associated_accessions aa on (transcript.ac = aa.tx_ac)
WHERE aln_v.alt_ac in
('NC_000001.11', 'NC_000002.12', 'NC_000003.12', 'NC_000004.12', 'NC_000005.10', 'NC_000006.12', 'NC_000007.14', 'NC_000008.11', 'NC_000009.12', 'NC_000010.11', 'NC_000011.10', 'NC_000012.12', 'NC_000013.11', 'NC_000014.9', 'NC_000015.10', 'NC_000016.10', 'NC_000017.11', 'NC_000018.10', 'NC_000019.10', 'NC_000020.11', 'NC_000021.9', 'NC_000022.11', 'NC_000023.11', 'NC_000024.10') and transcript.origin_id not in (10, 11)
group by transcript.ac) TO 'uta_20210129_grch38.csv' CSV HEADER;
--------------------------------------------------------------------------------
/generate_transcript_data/uta_transcripts.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Download RefSeq transcript alignments from the public UTA Postgres server
# (as CSV via a \copy query) and convert them into a cdot JSON file.
#
# Usage: uta_transcripts.sh uta_version genome_build

if [ "$#" -ne 2 ]; then
  echo "Usage ${BASH_SOURCE[0]} uta_version genome_build"
  exit 1;
fi

BASE_DIR=$(dirname "${BASH_SOURCE[0]}")
UTA_BASE_URL=uta.biocommons.org # uta.invitae.com moved here
CDOT_VERSION=$("${BASE_DIR}/cdot_json.py" --version)
UTA_VERSION=${1}
GENOME_BUILD=${2}

# The public UTA server uses the "anonymous" user/password
export PGPASSWORD=anonymous

uta_csv_filename=uta_${UTA_VERSION}_${GENOME_BUILD,,}.csv
if [[ ! -e ${uta_csv_filename} ]]; then
  SQL=${BASE_DIR}/uta_${UTA_VERSION}_${GENOME_BUILD,,}.sql # Lowercase filename
  if [[ ! -e ${SQL} ]]; then
    echo "No SQL file '${SQL}' for that UTA version / genome build" >&2
    exit 1
  fi

  # can't have newlines in \copy command
  tr -s '\n' ' ' < "${SQL}" | psql -h "${UTA_BASE_URL}" -U anonymous -d uta
fi

cdot_file="cdot-${CDOT_VERSION}.uta_${UTA_VERSION}.${GENOME_BUILD}.json.gz"
if [[ ! -e ${cdot_file} ]]; then
  POSTGRES_URL=postgresql://${UTA_BASE_URL}/uta_${UTA_VERSION}
  "${BASE_DIR}/cdot_json.py" uta_to_json "${uta_csv_filename}" --url "${POSTGRES_URL}" --output "${cdot_file}" --genome-build="${GENOME_BUILD}"
fi
29 |
--------------------------------------------------------------------------------
/paper/HGVS cleaning.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "id": "bb4a4052",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "import re\n",
11 | "import string\n",
12 | "from typing import Tuple, List\n",
13 | "import pandas as pd\n",
14 | "\n",
15 | "\n",
16 | "df = pd.read_csv(\"./hgvs_searches.csv\")\n",
17 | "non_resolve_mask = df[\"can_resolve\"] == False\n",
18 | "hgvs_errors_df = df[non_resolve_mask]\n",
19 | "\n",
20 | "hgvs_errors_df = hgvs_errors_df.sort_values(\"hgvs\")\n",
21 | " \n",
22 | "# dropping ALL duplicate values\n",
23 | "hgvs_errors_df.drop_duplicates(subset=\"hgvs\",\n",
24 | " keep=False, inplace=True)\n",
25 | "\n",
26 | "hgvs_errors_df.to_csv(\"hgvs_errors_uniq.csv\")\n",
27 | "\n"
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": null,
33 | "id": "a11dfb04",
34 | "metadata": {},
35 | "outputs": [],
36 | "source": [
37 | "\n",
38 | "hgvs_errors_df"
39 | ]
40 | },
41 | {
42 | "cell_type": "code",
43 | "execution_count": null,
44 | "id": "8d610f24",
45 | "metadata": {},
46 | "outputs": [],
47 | "source": [
48 | "\n",
49 | "\n",
50 | "\n",
51 | "pattern_kind_no_colon = re.compile(r\"(c|g|m|n|p)\\.(\\d+)\")\n",
52 | "pattern_kind_no_dot = re.compile(r\":(c|g|m|n|p)(\\d+)\")\n",
53 | "pattern_gene_symbol = re.compile(r\"^[A-Z0-9-]+$|^C[0-9XY]+orf[0-9]+\") # HGNC gene symbol - https://www.biostars.org/p/60118/#65063\n",
54 | "\n",
55 | "\n",
56 | "# Copy/pasted from pyhgvs\n",
57 | "# The RefSeq standard for naming contigs/transcripts/proteins:\n",
58 | "# http://www.ncbi.nlm.nih.gov/books/NBK21091/table/ch18.T.refseq_accession_numbers_and_mole/?report=objectonly # nopep8\n",
59 | "REFSEQ_PREFIXES = [\n",
60 | " ('AC_', 'genomic',\n",
61 | " 'Complete genomic molecule, usually alternate assembly'),\n",
62 | " ('NC_', 'genomic',\n",
63 | " 'Complete genomic molecule, usually reference assembly'),\n",
64 | " ('NG_', 'genomic', 'Incomplete genomic region'),\n",
65 | " ('NT_', 'genomic', 'Contig or scaffold, clone-based or WGS'),\n",
66 | " ('NW_', 'genomic', 'Contig or scaffold, primarily WGS'),\n",
67 | " ('NS_', 'genomic', 'Environmental sequence'),\n",
68 | " ('NZ_', 'genomic', 'Unfinished WGS'),\n",
69 | " ('NM_', 'mRNA', ''),\n",
70 | " ('NR_', 'RNA', ''),\n",
71 | " ('XM_', 'mRNA', 'Predicted model'),\n",
72 | " ('XR_', 'RNA', 'Predicted model'),\n",
73 | " ('AP_', 'Protein', 'Annotated on AC_ alternate assembly'),\n",
74 | " ('NP_', 'Protein', 'Associated with an NM_ or NC_ accession'),\n",
75 | " ('YP_', 'Protein', ''),\n",
76 | " ('XP_', 'Protein', 'Predicted model, associated with an XM_ accession'),\n",
77 | " ('ZP_', 'Protein', 'Predicted model, annotated on NZ_ genomic records'),\n",
78 | "]\n",
79 | "\n",
80 | "\n",
81 | "\n",
82 | "\n",
83 | "def remove_non_printable_characters(hgvs_string):\n",
84 | " return re.sub(f'[^{re.escape(string.printable)}]', '', hgvs_string)\n",
85 | "\n",
86 | "def remove_whitespace(hgvs_string):\n",
87 | " \"\"\" This would be covered in remove_invalid_characters but this gives a nicer message \"\"\"\n",
88 | " return re.sub(\"\\s\", '', hgvs_string)\n",
89 | "\n",
90 | "def remove_invalid_characters(hgvs_string):\n",
91 | " return re.sub(\"[^A-Za-z0-9-_\\(\\)\\>=]\", '', hgvs_string)\n",
92 | "\n",
93 | "\n",
94 | "def clean_kind(hgvs_string):\n",
95 | " # Fix common typos\n",
96 | " \n",
97 | " # c, -> c. \n",
98 | " # ;c -> :c semicolon\n",
99 | " \n",
100 | " \n",
101 | " return hgvs_string\n",
102 | " \n",
103 | "\n",
104 | "def add_unmatched_brackets(hgvs_string):\n",
105 | " return hgvs_string\n",
106 | "\n",
107 | "def add_missing_colon(hgvs_string):\n",
108 | " # GLA c.\n",
109 | " # NM_001205293.2(CACNA1E):c.4165C>T'\n",
110 | " \n",
111 | " return hgvs_string\n",
112 | "\n",
113 | "def remove_duplicates(hgvs_string):\n",
114 | " hgvs_string = re.sub(\"::+\", \":\", hgvs_string)\n",
115 | " hgvs_string = re.sub(\"\\.\\.+\", \".\", hgvs_string)\n",
116 | " return hgvs_string\n",
117 | "\n",
118 | "\n",
119 | "def fix_allele_case(allele_string):\n",
120 | " allele_keyworks = [\n",
121 | " 'del',\n",
122 | " 'delins',\n",
123 | " 'dup',\n",
124 | " 'ins',\n",
125 | " 'inv',\n",
126 | " ]\n",
127 | " for ak in allele_keyworks:\n",
128 | " allele_string = re.sub(ak, ak, allele_string, flags=re.IGNORECASE)\n",
129 | " return allele_string\n",
130 | " \n",
131 | "\n",
132 | "GLOBAL_CLEAN = {\n",
133 | " \"remove_non_printable_characters\": remove_non_printable_characters,\n",
134 | " \"remove_whitespace\": remove_whitespace,\n",
135 | "    \"remove_invalid_characters\": remove_invalid_characters,\n",
136 | " \"remove duplicates\": remove_duplicates,\n",
137 | "}\n",
138 | "\n",
139 | "\n",
140 | " # Optional - remove gene symbol - (for clingen and biocommons HGVS) \n",
141 | " # \"remove_gene_symbol\": remove_gene_symbol,\n",
142 | "\n",
143 | "# \"clean_kind\": clean_kind,\n",
144 | "# \"add_unmatched_brackets\": add_unmatched_brackets,\n",
145 | "# \"add_missing_colon\": add_missing_colon,\n",
146 | "\n",
147 | "\n",
148 | "test_hgvs = [\n",
149 | " \"c.4165C>T\", # This should fail as it has no transcript/gene\n",
150 | "    \"CACNA1E:c.4165C>T'\", # gene name - it's resolution that is tricky here\n",
151 | " \"CACNA1E c.4165C>T'\", # extra space, missing colon \n",
152 | " \"CACNA1Ec.4165C>T'\", # missing colon\n",
153 | " \"NM_001205293.2 :c.4165C>T'\", # whitespace\n",
154 | " \"NM_001205293.2(CACNA1E):c.4165C>T'\", # \n",
155 | " \"NM_001205293.2 :c.4165C>T'\", # whitespace\n",
156 | "]\n",
157 | "\n",
158 | "\n",
159 | "\n",
160 | "def clean_hgvs(original_hgvs_string) -> Tuple[str, List[str]]:\n",
161 | " hgvs_string = original_hgvs_string\n",
162 | " clean_messages = []\n",
163 | "\n",
164 | " for clean_method_desc, clean_hgvs_func in GLOBAL_CLEAN.items():\n",
165 | " cleaned_hgvs_string = clean_hgvs_func(hgvs_string) # hgvs_method)\n",
166 | " if cleaned_hgvs_string != hgvs_string:\n",
167 | " clean_messages.append(clean_method_desc)\n",
168 | " hgvs_string = cleaned_hgvs_string\n",
169 | "\n",
170 | "\n",
171 | " # Now we split it up into reference/kind/allele\n",
172 | " \n",
173 | " \n",
174 | " return hgvs_string, clean_messages\n",
175 | " "
176 | ]
177 | },
178 | {
179 | "cell_type": "code",
180 | "execution_count": null,
181 | "id": "d30b8846",
182 | "metadata": {},
183 | "outputs": [],
184 | "source": [
185 | "# I think we should first break it up into \n",
186 | "# reference / kind / allele\n",
187 | "# \n",
188 | "\n",
189 | "\n",
190 | "original_hgvs_string = \"GLA c.1277_1278delAA\"\n",
191 | "hgvs_string, clean_messages = clean_hgvs(original_hgvs_string)\n",
192 | "print(f\"{original_hgvs_string} -> {hgvs_string} \")\n",
193 | "for msg in clean_messages:\n",
194 | " print(msg)\n",
195 | " \n"
196 | ]
197 | },
198 | {
199 | "cell_type": "code",
200 | "execution_count": null,
201 | "id": "ba35d85d",
202 | "metadata": {},
203 | "outputs": [],
204 | "source": [
205 | "# This is from VariantGrid code\n",
206 | "\n",
207 | "\n",
208 | " BAD_HGVS = [\n",
209 | "    \"NM_000038.6;c.4332A>T\",  # Semicolon instead\n",
210 | " \"NM_205768 c.44A>G\", # Missing colon (no version)\n",
211 | " \"NM_005629.3:c1403A>C\", # Missing dot after kind\n",
212 | " \"NM_001101.4 c.95C>G\", # Missing colon\n",
213 | " \"NM_00380.3: c.648_649delGA\", # space after colon\n",
214 | " \"NC_000023.10:g. 31496384G>A\",\n",
215 | " \"NM_004245: :c.337G>T\", # Double colon\n",
216 | " \"NC_000017.10:g.21085664 G>C\", # Space after numbers\n",
217 | " \"NC_000023.10:g. 133547943G>A\", # Space after g.\n",
218 | " # Missing transcript underscore, Missing colon, Missing dot after g\n",
219 | " # Space between position and reference base\n",
220 | " \"NC000002.10g39139341 C>T\",\n",
221 | " # Unbalanced brackets\n",
222 | " \"NM_001754.5):c.557T>A\",\n",
223 | " \"(NM_004991.4:c.2577+4A>T\",\n",
224 | " # Good brackets HGVS (just testing gene symbol)\n",
225 | " \"NM_001754.5(RUNX1):c.1415T>C\",\n",
226 | " \"NM_032638:c.1126_1133DUP\", # Case\n",
227 | " \"NM_001754.5:557T>A\", # Missing \"c.\"\n",
228 | " \"NC_000007.13:117199563G>T\", # Missing \"g.\"\n",
229 | " ]\n",
230 | "\n",
231 | " for bad_hgvs in BAD_HGVS:\n",
232 | " try:\n",
233 | " HGVSName(bad_hgvs)\n",
234 | " self.fail(f\"Expected '{bad_hgvs}' to fail!\")\n",
235 | " except:\n",
236 | " pass # Expected\n",
237 | "\n",
238 | " fixed_hgvs = HGVSMatcher.clean_hgvs(bad_hgvs)[0]\n",
239 | " HGVSName(fixed_hgvs)\n",
240 | "\n",
241 | "\n"
242 | ]
243 | },
244 | {
245 | "cell_type": "code",
246 | "execution_count": null,
247 | "id": "86c509cb",
248 | "metadata": {},
249 | "outputs": [],
250 | "source": []
251 | }
252 | ],
253 | "metadata": {
254 | "kernelspec": {
255 | "display_name": "Python 3 (ipykernel)",
256 | "language": "python",
257 | "name": "python3"
258 | },
259 | "language_info": {
260 | "codemirror_mode": {
261 | "name": "ipython",
262 | "version": 3
263 | },
264 | "file_extension": ".py",
265 | "mimetype": "text/x-python",
266 | "name": "python",
267 | "nbconvert_exporter": "python",
268 | "pygments_lexer": "ipython3",
269 | "version": "3.10.6"
270 | }
271 | },
272 | "nbformat": 4,
273 | "nbformat_minor": 5
274 | }
275 |
--------------------------------------------------------------------------------
/paper/clean_hgvs_search_csvs.py:
--------------------------------------------------------------------------------
1 | #!/bin/env python3
2 |
3 | import math
4 | import re
5 | import sys
6 | import pandas as pd
7 | from pysam.libcfaidx import FastaFile
8 | import pyhgvs
9 | from cdot.pyhgvs.pyhgvs_transcript import JSONPyHGVSTranscriptFactory
10 |
11 |
def get_combined_df():
    """Load each server's HGVS search log CSV and concatenate them.

    Adds a "server" column so rows remain attributable after the concat.
    """
    frames = []
    for server in ("shariant", "vg-aws", "vg3_upgrade"):
        frame = pd.read_csv(f"{server}_search_hgvs.csv", names=["date", "details"], skiprows=1)
        frame["server"] = server
        frames.append(frame)

    return pd.concat(frames)
24 |
25 |
def add_hgvs_column(df):
    """Extract the quoted HGVS string from the "details" text into df["hgvs"].

    Rows whose details match none of the known log formats get "".
    """
    patterns = [
        re.compile(r"'(.+)' calculated"),
        re.compile(r"'(.+)' = type"),
        re.compile(r"'(.+)' returned"),
    ]

    extracted = []
    for details in df["details"].values:
        for pattern in patterns:
            if m := pattern.match(details):
                extracted.append(m.group(1))
                break
        else:
            print(f"No match for '{details}'")
            extracted.append("")

    df["hgvs"] = extracted
42 |
43 |
def can_resolve(genome, factory, hgvs_c):
    """Return True if hgvs_c resolves against GRCh37 or (failing that) GRCh38.

    Exceptions from failed attempts are printed, not raised.
    """
    transcript_getters = (factory.get_transcript_grch37,   # try 37 first
                          factory.get_transcript_grch38)   # then fall back to 38
    for get_transcript in transcript_getters:
        try:
            pyhgvs.parse_hgvs_name(hgvs_c, genome, get_transcript=get_transcript)
            return True
        except Exception as e:
            print(e)
    return False
57 |
58 |
def add_hgvs_validation_columns(df):
    """Add "valid_hgvs" (parses as HGVS) and "can_resolve" (maps to a build) columns.

    Uses hard-coded GRCh37 fasta + cdot JSON paths; a variant can only be
    resolved if it first parses, so can_resolve implies valid_hgvs.
    """
    genome = FastaFile("/data/annotation/fasta/GCF_000001405.25_GRCh37.p13_genomic.fna.gz")
    factory = JSONPyHGVSTranscriptFactory(["/home/dlawrence/Downloads/cdot-0.2.12.refseq.grch37_grch38.json.gz",
                                           "/home/dlawrence/Downloads/cdot-0.2.12.ensembl.grch37_grch38.json.gz"])

    valid_hgvs_list = []
    can_resolve_list = []
    for hgvs_c in df["hgvs"].values:
        # print(f"testing... {hgvs_c}")
        resolve_ok = False
        try:
            pyhgvs.HGVSName(hgvs_c)
            valid_hgvs = True
            resolve_ok = can_resolve(genome, factory, hgvs_c)
        except Exception:  # Was a bare except, which also swallowed KeyboardInterrupt
            valid_hgvs = False

        valid_hgvs_list.append(valid_hgvs)
        can_resolve_list.append(resolve_ok)

    df["valid_hgvs"] = valid_hgvs_list
    df["can_resolve"] = can_resolve_list
81 |
82 |
def split_df_chunks(data_df, chunk_size):
    """ From https://xhinker.medium.com/python-split-a-dataframe-to-a-chunk-list-fe80bf9d63be """
    # One positional slice per chunk_size step; the final slice is the
    # (possibly shorter) remainder, and an empty frame yields [].
    return [data_df[offset:offset + chunk_size]
            for offset in range(0, len(data_df), chunk_size)]
96 |
97 |
def main():
    """Two modes, selected by argv.

    No arguments: combine the per-server search logs, extract the HGVS
    column and split into 500-row "hgvs_search_<i>.csv" chunk files.
    One argument (a chunk filename): validate that chunk and write
    "validate_<filename>".
    """
    if len(sys.argv) == 1:
        print("main")
        df = get_combined_df()
        add_hgvs_column(df)
        for i, chunk in enumerate(split_df_chunks(df, 500)):
            filename = f"hgvs_search_{i}.csv"
            # Restored "{filename}" placeholders - these f-strings contained
            # the literal text "(unknown)" and printed/wrote garbage names
            print(f"writing {filename}")
            chunk.to_csv(filename)
    else:
        filename = sys.argv[1]
        print(f"Processing {filename}")
        df = pd.read_csv(filename)
        add_hgvs_validation_columns(df)
        df.to_csv(f"validate_{filename}")
114 |
# Script entry point
if __name__ == "__main__":
    main()
117 |
--------------------------------------------------------------------------------
/paper/combine_csv.py:
--------------------------------------------------------------------------------
#!/bin/env python3
"""Combine the per-chunk validate*.csv files into one date-sorted CSV."""

# These imports were missing, so the script died with NameError on "glob"/"pd"
import glob

import pandas as pd

columns = ['date', 'details', 'server', 'hgvs', 'valid_hgvs', 'can_resolve']
df_list = []
for filename in glob.glob("validate*.csv"):
    df = pd.read_csv(filename)
    df = df[columns]  # Keep (and order) only the expected columns
    df_list.append(df)

if df_list:
    # df_combined was never assigned in the original - concat the pieces
    df_combined = pd.concat(df_list)
    df_combined.sort_values("date").to_csv("hgvs_searches_combined.csv", index=False)
else:
    # pd.concat([]) raises - be explicit when there is nothing to combine
    print("No validate*.csv files found - nothing to combine")
10 |
11 |
12 |
--------------------------------------------------------------------------------
/paper/investigate_fails.py:
--------------------------------------------------------------------------------
1 | #!/bin/env python3
2 |
3 | import math
4 | import re
5 | import sys
6 | import pandas as pd
7 | from pysam.libcfaidx import FastaFile
8 | import pyhgvs
9 | from cdot.pyhgvs.pyhgvs_transcript import JSONPyHGVSTranscriptFactory
10 |
11 |
def main():
    """Load the combined search CSV and set up resources to investigate HGVS failures."""
    # argv[0] is the script itself - exactly one CSV argument is expected.
    # (The original tested "!= 1", which rejected correct usage and then
    # crashed with IndexError when run with no arguments.)
    if len(sys.argv) != 2:
        sys.stderr.write(f"Usage {sys.argv[0]} hgvs_searches_combined.csv\n")
        sys.exit(1)

    filename = sys.argv[1]
    df = pd.read_csv(filename)

    # Element-wise comparison; the original used `is False`, which compares
    # object identity on the Series and is always the scalar False.
    non_resolve_mask = df["can_resolve"] == False
    hgvs_errors_df = df[non_resolve_mask]

    genome = FastaFile("/data/annotation/fasta/GCF_000001405.25_GRCh37.p13_genomic.fna.gz")
    factory = JSONPyHGVSTranscriptFactory(["/home/dlawrence/Downloads/cdot-0.2.12.refseq.grch37_grch38.json.gz",
                                           "/home/dlawrence/Downloads/cdot-0.2.12.ensembl.grch37_grch38.json.gz"])
26 |
27 |
# Script entry point
if __name__ == "__main__":
    main()
30 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = [
3 | "setuptools>=42",
4 | "wheel"
5 | ]
6 | build-backend = "setuptools.build_meta"
7 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | name = cdot
3 | version = attr: cdot.__version__
4 | author = Dave Lawrence
5 | author_email = davmlaw@gmail.com
6 | description = Transcripts for HGVS libraries
7 | long_description = file: README.md
8 | long_description_content_type = text/markdown
9 | url = https://github.com/SACGF/cdot
10 | project_urls =
11 | Bug Tracker = https://github.com/SACGF/cdot/issues
12 | classifiers =
13 | Programming Language :: Python :: 3
14 | License :: OSI Approved :: MIT License
15 | Operating System :: OS Independent
16 |
17 | [options]
18 | package_dir =
19 | packages = find:
20 | python_requires = >=3.8
21 | install_requires =
22 | requests
23 | intervaltree
24 | more_itertools
25 | bioutils>=0.5.8
26 | lazy
27 |
28 | [options.packages.find]
29 | where =
30 | exclude=
31 | tests
32 | generate_transcript_data
33 |
34 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SACGF/cdot/ddeb78d58731dd4136689360d0fce4a8a91af87d/tests/__init__.py
--------------------------------------------------------------------------------
/tests/benchmark_hgvs.py:
--------------------------------------------------------------------------------
1 | #!/bin/env python3
2 |
3 | """
4 | See instructions at end of file on how to extract test HGVS from clinvar
5 | """
6 | import logging
7 | import time
8 | import pandas as pd
9 | from argparse import ArgumentParser
10 |
11 | import hgvs
12 | import hgvs.dataproviders.uta
13 | from hgvs.assemblymapper import AssemblyMapper
14 | from hgvs.exceptions import HGVSDataNotAvailableError, HGVSInvalidVariantError
15 |
16 | from cdot.hgvs.dataproviders import JSONDataProvider, RESTDataProvider, FastaSeqFetcher
17 | from cdot.hgvs.dataproviders.ensembl_tark_data_provider import EnsemblTarkDataProvider, EnsemblTarkSeqFetcher
18 |
19 |
def handle_args():
    """Parse benchmark command-line options.

    The four provider flags are mutually exclusive; at least one provider
    (including --json) must be given or we exit via parser.error().
    """
    parser = ArgumentParser(description='Benchmark cdot')
    parser.add_argument("--debug", action="store_true")
    parser.add_argument("hgvs_file")
    provider_group = parser.add_mutually_exclusive_group()
    for flag in ('--uta', '--rest', '--rest-insecure', '--ensembl-tark'):
        provider_group.add_argument(flag, action='store_true')
    parser.add_argument('--json', help='JSON file')
    parser.add_argument('--fasta', help='Fasta file for local sequences')
    args = parser.parse_args()
    provider_chosen = any([args.uta, args.rest, args.rest_insecure, args.ensembl_tark, args.json])
    if not provider_chosen:
        parser.error("You need to specify at least one of 'uta', 'rest', 'rest-insecure', 'ensembl-tark', 'json'")
    return args
35 |
36 |
def main():
    """Benchmark c./n.HGVS -> g.HGVS conversion with the selected data provider.

    Reads whitespace-separated (expected g.HGVS, c.HGVS) pairs from
    args.hgvs_file, converts each c.HGVS and compares against the expected
    g.HGVS, printing timing statistics and correct/incorrect/error counts.
    """
    args = handle_args()
    if args.debug:
        logging.basicConfig(level=logging.DEBUG)

    # Each input line is "<expected g.HGVS> <c.HGVS>" (split on whitespace)
    hgvs_g_c_list = []
    with open(args.hgvs_file) as f:
        for line in f:
            hgvs_g_c_list.append(line.split())

    total = len(hgvs_g_c_list)
    logging.debug(f"Using {total} test records")

    seqfetcher = None
    if args.fasta:
        if args.debug:
            logging.debug("Using fasta: %s", args.fasta)
        seqfetcher = FastaSeqFetcher(args.fasta)

    # Select the HGVS data provider implementation being benchmarked
    if args.uta:
        hdp = hgvs.dataproviders.uta.connect()
    elif args.rest:
        hdp = RESTDataProvider(seqfetcher=seqfetcher) # Uses API server at cdot.cc
    elif args.rest_insecure:
        hdp = RESTDataProvider(secure=False, seqfetcher=seqfetcher)
    elif args.json:
        hdp = JSONDataProvider([args.json], seqfetcher=seqfetcher)
    elif args.ensembl_tark:
        # Tark doesn't provide genomes so it needs a genome one...
        if args.fasta:
            fasta_files = [args.fasta]
        else:
            fasta_files = None
        seqfetcher = EnsemblTarkSeqFetcher(fasta_files=fasta_files)
        hdp = EnsemblTarkDataProvider(seqfetcher=seqfetcher)
    else:
        raise ValueError("Unknown data provider method!")

    if args.debug:
        logging.debug("Starting benchmark...")
    am = AssemblyMapper(hdp,
                        assembly_name='GRCh38',
                        alt_aln_method='splign', replace_reference=True)

    hp = hgvs.parser.Parser()

    run_times = []
    correct = 0
    incorrect = 0
    no_data = 0
    errors = 0
    total_start = time.time()

    def _show_stats():
        # Summarise per-record run times plus outcome counters so far
        df = pd.DataFrame(run_times)
        print(df.describe().T)
        print(f"Correct: {correct}, incorrect: {incorrect}, no data: {no_data}, errors: {errors}")

    last_notification = time.time()
    for hgvs_g, hgvs_c in hgvs_g_c_list:
        if args.debug:
            logging.debug("c.HGVS: %s", hgvs_c)

        start = time.time()
        # Print progress stats at most every 5 seconds
        if start - last_notification > 5:
            last_notification = start
            _show_stats()
            print("-" * 50)

        try:
            var_c = hp.parse_hgvs_variant(hgvs_c)
            # Coding variants use c_to_g; everything else is treated as n.
            if ":c." in hgvs_c:
                converted_hgvs_g = str(am.c_to_g(var_c))
            else:
                converted_hgvs_g = str(am.n_to_g(var_c))
        except HGVSDataNotAvailableError as dne:
            logging.warning(dne)
            no_data += 1
            continue
        except HGVSInvalidVariantError as ive:
            print(f"{hgvs_c}: {ive}")
            incorrect += 1
            continue
        except Exception as e:
            logging.error(e)
            errors += 1
            continue

        if converted_hgvs_g == hgvs_g:
            correct += 1
        else:
            incorrect += 1
            print(f"{hgvs_c}: '{hgvs_g}' != '{converted_hgvs_g}' (actual)")
            continue

        # We only keep times for correct data
        end = time.time()
        time_taken = end - start
        run_times.append(time_taken)

    _show_stats()
    total_end = time.time()
    total_time = total_end - total_start
    num_per_second = 1 / total_time * total
    print(f"{total} in {total_time} = {num_per_second} per second")
142 |
# Script entry point
if __name__ == '__main__':
    main()
145 |
146 | """
147 |
148 | How to make RefSeq test files:
149 | --------------------------------
150 |
151 | * Get a subset of rows from ClinVar VCF
152 | * zgrep "^#" clinvar.vcf.gz > header.txt
153 | * zgrep -v "^#" clinvar.vcf.gz | shuf -n 1000 > clinvar_1k_records.vcf
154 | * cat header.txt clinvar_1k_rows.vcf | gzip > clinvar_1k.vcf.gz
155 |
156 | * Annotate the VCF to get MANE transcript (via --pick)
157 |
158 | vep -i clinvar_1k.vcf.gz -o clinvar_1k.vep_annotated.vcf.gz --cache --dir /data/annotation/VEP/vep_cache --fasta /data/annotation/fasta/GCF_000001405.39_GRCh38.p13_genomic.fna.gz --assembly GRCh38 --offline --use_given_ref --vcf --compress_output gzip --force_overwrite --pick --no_escape --hgvs --refseq --buffer_size 1000
159 |
160 | * Extract out the g.HGVS and c.HGVS
161 |
162 | def cyvcf2_header_types(cyvcf2_reader):
163 | header_types = defaultdict(dict)
164 | for h in cyvcf2_reader.header_iter():
165 | info = h.info()
166 | h_id = info.get("ID")
167 | if h_id: # Not much use w/o this
168 | header_types[h.type][h_id] = info
169 | return header_types
170 |
171 |
172 | reader = Reader("./clinvar_1k.vcf.gz")
173 | header_types = cyvcf2_header_types(reader)
174 | description = header_types["INFO"]["CSQ"]["description"]
175 | description = description.replace('"', '') # Strip double quotes
176 |
177 | match = "Format: "
178 | columns_str = description[description.rfind(match) + len(match):]
179 | vep_columns = columns_str.split("|")
180 |
181 | hgvs = []
182 | for v in reader:
183 | csq = v.INFO.get("CSQ")
184 | td = dict(zip(vep_columns, csq.split("|")))
185 | g_hgvs = v.INFO.get("CLNHGVS")
186 | c_hgvs = td.get("HGVSc")
187 | if g_hgvs and c_hgvs:
188 | hgvs.append((g_hgvs, c_hgvs))
189 |
190 |
191 | --------------------------
192 | How to make Ensembl files
193 | --------------------------
194 |
195 | * Import ClinVar subset into VariantGrid
196 | * As admin, on VCF page click "Populate ClinGen Alleles"
197 | * Should have enough with both ClinVar and MANE
198 |
199 | def get_38_ghgvs(cga):
200 | for ga in cga.api_response["genomicAlleles"]:
201 | if ga["referenceGenome"] == 'GRCh38':
202 | for h in ga["hgvs"]:
203 | if h.startswith("NC_"):
204 | return h
205 | return None
206 |
207 | def get_ensembl_mane(cga):
208 | for ta in cga.api_response["transcriptAlleles"]:
209 | if mane := ta.get("MANE"):
210 | if nt := mane.get("nucleotide"):
211 | if e := nt.get("Ensembl"):
212 | if h := e.get("hgvs"):
213 | return h
214 | return None
215 |
216 | g_and_c = []
217 |
218 | clingen_qs = ClinGenAllele.objects.filter(Q(api_response__icontains='ClinVarAlleles') & Q(api_response__icontains='MANE'))
219 | for cga in clingen_qs:
220 | g_hgvs = get_38_ghgvs(cga)
221 | c_hgvs = get_ensembl_mane(cga)
222 | if g_hgvs and c_hgvs:
223 | g_and_c.append((g_hgvs, c_hgvs))
224 |
225 | with open("/tmp/transcripts.txt", "wt") as f:
226 | for x in g_and_c:
227 | f.write("\t".join(x) + "\n")
228 |
229 |
230 | """
231 |
--------------------------------------------------------------------------------
/tests/genome.py:
--------------------------------------------------------------------------------
1 | """
2 | From https://github.com/counsyl/hgvs
3 | """
4 |
5 | from __future__ import absolute_import
6 | from __future__ import unicode_literals
7 |
8 | import itertools
9 | import os
10 |
11 | try:
12 | # Original PyHGVS
13 | from pyhgvs.variants import revcomp
14 | except ImportError:
15 | # SACGF fork of PyHGVS
16 | from pyhgvs.models.variants import revcomp
17 |
18 |
19 | try:
20 | from pyfaidx import Genome as SequenceFileDB
21 | # Allow pyflakes to ignore redefinition in except clause.
22 | SequenceFileDB
23 | except ImportError:
24 | SequenceFileDB = None
25 |
26 |
class MockGenomeError(Exception):
    """Raised when a requested region is not present in the mock genome data."""
29 |
30 |
class MockSequence(object):
    """Minimal stand-in for a pygr sequence object, wrapping a raw string."""

    def __init__(self, sequence):
        self.sequence = sequence

    def __neg__(self):
        """Return reverse complement sequence."""
        return MockSequence(revcomp(self.sequence))

    def __repr__(self):
        return 'MockSequence("%s")' % self.sequence

    def __str__(self):
        return self.sequence
44 |
45 |
class MockChromosome(object):
    """A named chromosome handle that forwards region lookups to its genome."""

    def __init__(self, name, genome=None):
        self.name = name
        self.genome = genome

    def __getitem__(self, n):
        """Return sequence from region [start, end)

        Coordinates are 0-based, end-exclusive."""
        if not isinstance(n, slice):
            # Single-base lookup: fetch the one-base region [n, n + 1)
            return self.genome.get_seq(self.name, n, n + 1)
        return self.genome.get_seq(self.name, n.start, n.stop)

    def __repr__(self):
        return 'MockChromosome("%s")' % (self.name)
62 |
63 |
64 | class MockGenome(object):
65 | def __init__(self, lookup=None, filename=None, db_filename=None,
66 | default_seq=None):
67 | """
68 | A mock genome object that provides a pygr compatible interface.
69 |
70 | lookup: a list of ((chrom, start, end), seq) values that define
71 | a lookup table for genome sequence requests.
72 | filename: a stream or filename containing a lookup table.
73 | db_filename: a fasta file to use for genome sequence requests. All
74 | requests are recorded and can be writen to a lookup table file
75 | using the `write` method.
76 | default_seq: if given, this base will always be returned if
77 | region is unavailable.
78 | """
79 | self._chroms = {}
80 | self._lookup = lookup if lookup is not None else {}
81 | self._genome = None
82 | self._default_seq = default_seq
83 |
84 | if db_filename:
85 | # Use a real genome database.
86 | if SequenceFileDB is None:
87 | raise ValueError('pygr is not available.')
88 | self._genome = SequenceFileDB(db_filename)
89 | self._source_filename = db_filename
90 | elif filename:
91 | # Read genome sequence from lookup table.
92 | self.read(filename)
93 | self._source_filename = filename
94 |
95 | def __contains__(self, chrom):
96 | """Return True if genome contains chromosome."""
97 | return chrom in (self._genome or self._chroms)
98 |
99 | def __getitem__(self, chrom):
100 | """Return a chromosome by its name."""
101 | if chrom not in self._chroms:
102 | self._chroms[chrom] = MockChromosome(chrom, self)
103 | return self._chroms[chrom]
104 |
105 | def get_seq(self, chrom, start, end):
106 | """Return a sequence by chromosome name and region [start, end).
107 |
108 | Coordinates are 0-based, end-exclusive.
109 | """
110 | if self._genome:
111 | # Get sequence from real genome object and save result.
112 | seq = self._genome[chrom][start:end]
113 | self._lookup[(chrom, start, end)] = str(seq)
114 | return seq
115 | else:
116 | # Use lookup table to fetch genome sequence.
117 | try:
118 | return MockSequence(self._lookup[(chrom, start, end)])
119 | except KeyError:
120 | if self._default_seq:
121 | # Generate default sequence.
122 | return ''.join(itertools.islice(
123 | itertools.cycle(self._default_seq),
124 | None, end - start))
125 | else:
126 | raise MockGenomeError(
127 | 'Sequence not in test data: %s:%d-%d source: %s' %
128 | (chrom, start, end, self._source_filename))
129 |
130 | def read(self, filename):
131 | """Read a sequence lookup table from a file.
132 |
133 | filename: a filename string or file stream.
134 | """
135 | if hasattr(filename, 'read'):
136 | infile = filename
137 | else:
138 | with open(filename) as infile:
139 | return self.read(infile)
140 |
141 | for line in infile:
142 | tokens = line.rstrip().split('\t')
143 | chrom, start, end, seq = tokens
144 | self._lookup[(chrom, int(start), int(end))] = seq
145 | if chrom not in self._lookup:
146 | self._chroms[chrom] = MockChromosome(chrom, self)
147 |
148 | def write(self, filename):
149 | """Write a sequence lookup table to file."""
150 | if hasattr(filename, 'write'):
151 | out = filename
152 | else:
153 | with open(filename, 'w') as out:
154 | return self.write(out)
155 |
156 | for (chrom, start, end), seq in self._lookup.items():
157 | out.write('\t'.join(map(str, [chrom, start, end, seq])) + '\n')
158 |
159 |
class MockGenomeTestFile(MockGenome):
    """MockGenome that can record real-genome queries to a lookup-table file.

    With create_data=True (and a db_filename) every get_seq() call is
    appended to `filename`, so later runs can replay from the lookup table
    alone; with create_data=False the db_filename is ignored entirely.
    """
    def __init__(self, lookup=None, filename=None, db_filename=None,
                 default_seq=None, create_data=False):
        if not create_data:
            # Drop the real genome so the parent only uses the lookup file
            db_filename = None
        super(MockGenomeTestFile, self).__init__(
            lookup=lookup, db_filename=db_filename,
            filename=filename,
            default_seq=default_seq)

        self._filename = filename
        # True only when create_data was set AND a db_filename was supplied
        self._create_data = (db_filename is not None)

        if self._create_data and os.path.exists(filename):
            # Clear output file when creating data.
            os.remove(filename)

    def get_seq(self, chrom, start, end):
        """Fetch a region via the parent, recording the query when creating data."""
        seq = super(MockGenomeTestFile, self).get_seq(chrom, start, end)

        # Save each query in append mode.
        if self._create_data:
            with open(self._filename, 'a') as out:
                out.write('\t'.join(map(str, [chrom, start, end, seq])) + '\n')
        return seq
185 |
--------------------------------------------------------------------------------
/tests/mock_ensembl_tark.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os.path
3 | import re
4 | from inspect import getsourcefile
5 | from os.path import abspath
6 |
7 | from cdot.hgvs.dataproviders.ensembl_tark_data_provider import EnsemblTarkDataProvider
8 |
9 |
class MockEnsemblTarkDataProvider(EnsemblTarkDataProvider):
    """EnsemblTarkDataProvider that answers URL queries from local test-data files."""

    def __init__(self, assemblies: list[str] = None, mode=None, cache=None, seqfetcher=None):
        super().__init__(assemblies, mode, cache, seqfetcher)
        # Directory containing this file - test data lives beside it.
        self._this_file_dir = os.path.dirname(abspath(getsourcefile(lambda: 0)))

    def _get_from_url(self, url):
        """Return the JSON a real TARK request to *url* would produce.

        The URL is mapped onto a file under test_data/ensembl_tark:
        the path below base_url becomes directories, and the query string
        (sans leading '?') becomes "<params>.json".

        Raises:
            ValueError: if *url* is not under self.base_url.
            FileNotFoundError: if no test-data file exists for the request.
        """
        if not url.startswith(self.base_url):
            raise ValueError(f"{url} does not start with {self.base_url}")

        dirname = os.path.dirname(url)
        basename = os.path.basename(url)
        params = re.sub(r"^\?", "", basename)
        path = re.sub(f"^{self.base_url}/", "", dirname)
        filename = os.path.join(self._this_file_dir, "test_data", "ensembl_tark", path, f"{params}.json")
        if not os.path.exists(filename):
            # Bug fix: the message was an f-string with no placeholder, so the
            # missing path was never reported. Include it to aid debugging.
            raise FileNotFoundError(f"{filename} not found")

        with open(filename, "r") as f:
            return json.load(f)
30 |
31 |
32 |
33 |
--------------------------------------------------------------------------------
/tests/mock_seqfetcher.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | from hgvs.exceptions import HGVSDataNotAvailableError
4 |
5 |
class MockSeqFetcher:
    """Minimal SeqFetcher substitute backed by a JSON file of {accession: sequence}."""

    def __init__(self, filename):
        with open(filename) as f:
            self.transcripts = json.load(f)
        # Bug fix: the source string was an f-string with no placeholder, so
        # the backing filename was never reported. Mirror hgvs SeqFetcher.source.
        self.source = f"Mock: Local JSON file: {filename}"

    def fetch_seq(self, ac, start_i=None, end_i=None):
        """Return the sequence for accession *ac*, optionally sliced [start_i, end_i).

        Raises:
            HGVSDataNotAvailableError: if *ac* is not among the loaded transcripts.
        """
        seq = self.transcripts.get(ac)
        if seq is None:
            raise HGVSDataNotAvailableError()
        # Python slicing treats None bounds as "from the start" / "to the end",
        # so no explicit defaulting is needed.
        return seq[start_i:end_i]
21 |
22 |
--------------------------------------------------------------------------------
/tests/test_data/cdot.ensembl.grch38.json:
--------------------------------------------------------------------------------
1 | {
2 | "transcripts": {
3 | "ENST00000617537.5": {
4 | "biotype": [
5 | "mRNA",
6 | "protein_coding"
7 | ],
8 | "gene_name": "AOAH",
9 | "gene_version": "ENSG00000136250.12",
10 | "genome_builds": {
11 | "GRCh38": {
12 | "cds_end": 36724148,
13 | "cds_start": 36513251,
14 | "contig": "NC_000007.14",
15 | "exons": [
16 | [
17 | 36512940,
18 | 36513380,
19 | 20,
20 | 1946,
21 | 2385,
22 | null
23 | ],
24 | [
25 | 36522038,
26 | 36522115,
27 | 19,
28 | 1869,
29 | 1945,
30 | null
31 | ],
32 | [
33 | 36530417,
34 | 36530514,
35 | 18,
36 | 1772,
37 | 1868,
38 | null
39 | ],
40 | [
41 | 36532146,
42 | 36532206,
43 | 17,
44 | 1712,
45 | 1771,
46 | null
47 | ],
48 | [
49 | 36532285,
50 | 36532344,
51 | 16,
52 | 1653,
53 | 1711,
54 | null
55 | ],
56 | [
57 | 36540318,
58 | 36540491,
59 | 15,
60 | 1480,
61 | 1652,
62 | null
63 | ],
64 | [
65 | 36548611,
66 | 36548686,
67 | 14,
68 | 1405,
69 | 1479,
70 | null
71 | ],
72 | [
73 | 36549438,
74 | 36549475,
75 | 13,
76 | 1368,
77 | 1404,
78 | null
79 | ],
80 | [
81 | 36576573,
82 | 36576656,
83 | 12,
84 | 1285,
85 | 1367,
86 | null
87 | ],
88 | [
89 | 36594338,
90 | 36594430,
91 | 11,
92 | 1193,
93 | 1284,
94 | null
95 | ],
96 | [
97 | 36616379,
98 | 36616474,
99 | 10,
100 | 1098,
101 | 1192,
102 | null
103 | ],
104 | [
105 | 36618296,
106 | 36618345,
107 | 9,
108 | 1049,
109 | 1097,
110 | null
111 | ],
112 | [
113 | 36620780,
114 | 36620829,
115 | 8,
116 | 1000,
117 | 1048,
118 | null
119 | ],
120 | [
121 | 36621709,
122 | 36621780,
123 | 7,
124 | 929,
125 | 999,
126 | null
127 | ],
128 | [
129 | 36623189,
130 | 36623250,
131 | 6,
132 | 868,
133 | 928,
134 | null
135 | ],
136 | [
137 | 36632035,
138 | 36632106,
139 | 5,
140 | 797,
141 | 867,
142 | null
143 | ],
144 | [
145 | 36637850,
146 | 36637910,
147 | 4,
148 | 737,
149 | 796,
150 | null
151 | ],
152 | [
153 | 36659165,
154 | 36659265,
155 | 3,
156 | 637,
157 | 736,
158 | null
159 | ],
160 | [
161 | 36673942,
162 | 36674009,
163 | 2,
164 | 570,
165 | 636,
166 | null
167 | ],
168 | [
169 | 36686698,
170 | 36686794,
171 | 1,
172 | 474,
173 | 569,
174 | null
175 | ],
176 | [
177 | 36724021,
178 | 36724494,
179 | 0,
180 | 1,
181 | 473,
182 | null
183 | ]
184 | ],
185 | "strand": "-",
186 | "tag": "CCDS,basic,Ensembl_canonical,GENCODE Primary,MANE_Select",
187 | "url": "ftp://ftp.ensembl.org/pub/release-112/gtf/homo_sapiens/Homo_sapiens.GRCh38.112.gtf.gz"
188 | }
189 | },
190 | "id": "ENST00000617537.5",
191 | "protein": "ENSP00000483783.1",
192 | "start_codon": 346,
193 | "stop_codon": 2074
194 | }
195 | },
196 | "cdot_version": "0.2.26",
197 | "genome_builds": [
198 | "GRCh38"
199 | ]
200 | }
--------------------------------------------------------------------------------
/tests/test_data/cdot.refseq.grch37.json:
--------------------------------------------------------------------------------
1 | {
2 | "transcripts": {
3 | "NM_001637.3": {
4 | "start_codon": 401,
5 | "stop_codon": 2129,
6 | "id": "NM_001637.3",
7 | "gene_version": "313",
8 | "gene_name": "AOAH",
9 | "biotype": [
10 | "protein_coding"
11 | ],
12 | "protein": "NP_001628.1",
13 | "genome_builds": {
14 | "GRCh37": {
15 | "cds_end": 36763753,
16 | "cds_start": 36552857,
17 | "contig": "NC_000007.13",
18 | "exons": [
19 | [
20 | 36552548,
21 | 36552986,
22 | 20,
23 | 2001,
24 | 2440,
25 | "M196 I1 M61 I1 M181"
26 | ],
27 | [
28 | 36561644,
29 | 36561721,
30 | 19,
31 | 1924,
32 | 2000,
33 | null
34 | ],
35 | [
36 | 36570023,
37 | 36570120,
38 | 18,
39 | 1827,
40 | 1923,
41 | null
42 | ],
43 | [
44 | 36571752,
45 | 36571812,
46 | 17,
47 | 1767,
48 | 1826,
49 | null
50 | ],
51 | [
52 | 36571891,
53 | 36571950,
54 | 16,
55 | 1708,
56 | 1766,
57 | null
58 | ],
59 | [
60 | 36579924,
61 | 36580097,
62 | 15,
63 | 1535,
64 | 1707,
65 | null
66 | ],
67 | [
68 | 36588217,
69 | 36588292,
70 | 14,
71 | 1460,
72 | 1534,
73 | null
74 | ],
75 | [
76 | 36589044,
77 | 36589081,
78 | 13,
79 | 1423,
80 | 1459,
81 | null
82 | ],
83 | [
84 | 36616179,
85 | 36616262,
86 | 12,
87 | 1340,
88 | 1422,
89 | null
90 | ],
91 | [
92 | 36633944,
93 | 36634036,
94 | 11,
95 | 1248,
96 | 1339,
97 | null
98 | ],
99 | [
100 | 36655985,
101 | 36656080,
102 | 10,
103 | 1153,
104 | 1247,
105 | null
106 | ],
107 | [
108 | 36657902,
109 | 36657951,
110 | 9,
111 | 1104,
112 | 1152,
113 | null
114 | ],
115 | [
116 | 36660386,
117 | 36660435,
118 | 8,
119 | 1055,
120 | 1103,
121 | null
122 | ],
123 | [
124 | 36661315,
125 | 36661386,
126 | 7,
127 | 984,
128 | 1054,
129 | null
130 | ],
131 | [
132 | 36662795,
133 | 36662856,
134 | 6,
135 | 923,
136 | 983,
137 | null
138 | ],
139 | [
140 | 36671641,
141 | 36671712,
142 | 5,
143 | 852,
144 | 922,
145 | null
146 | ],
147 | [
148 | 36677456,
149 | 36677516,
150 | 4,
151 | 792,
152 | 851,
153 | null
154 | ],
155 | [
156 | 36698770,
157 | 36698870,
158 | 3,
159 | 692,
160 | 791,
161 | null
162 | ],
163 | [
164 | 36713547,
165 | 36713614,
166 | 2,
167 | 625,
168 | 691,
169 | null
170 | ],
171 | [
172 | 36726303,
173 | 36726399,
174 | 1,
175 | 529,
176 | 624,
177 | null
178 | ],
179 | [
180 | 36763626,
181 | 36764154,
182 | 0,
183 | 1,
184 | 528,
185 | null
186 | ]
187 | ],
188 | "start": 36552548,
189 | "stop": 36764154,
190 | "strand": "-",
191 | "url": "http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.105/GFF/ref_GRCh37.p13_top_level.gff3.gz"
192 | }
193 | }
194 | },
195 | "NR_023343.1": {
196 | "id": "NR_023343.1",
197 | "cdot": "0.2.12",
198 | "hgnc": "34016",
199 | "biotype": [
200 | "non_coding"
201 | ],
202 | "gene_name": "RNU4ATAC",
203 | "genome_builds": {
204 | "GRCh37": {
205 | "url": "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/105.20220307/GCF_000001405.25_GRCh37.p13/GCF_000001405.25_GRCh37.p13_genomic.gff.gz",
206 | "exons": [
207 | [
208 | 122288455,
209 | 122288585,
210 | 0,
211 | 1,
212 | 130,
213 | null
214 | ]
215 | ],
216 | "contig": "NC_000002.11",
217 | "strand": "+"
218 | }
219 | }
220 | }
221 | },
222 | "genes": {
223 | "GATA2": {
224 | "aliases": "DCML, IMD21, MONOMAC, NFE1B",
225 | "biotype": "protein_coding",
226 | "description": "GATA binding protein 2",
227 | "gene_symbol": "GATA2",
228 | "hgnc": "4171",
229 | "map_location": "3q21.3",
230 | "summary": "This gene encodes a member of the GATA family of zinc-finger transcription factors that are named for the consensus nucleotide sequence they bind in the promoter regions of target genes. The encoded protein plays an essential role in regulating transcription of genes involved in the development and proliferation of hematopoietic and endocrine cell lineages. Alternative splicing results in multiple transcript variants.[provided by RefSeq, Mar 2009]",
231 | "url": "https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/annotation_releases/110/GCF_000001405.40_GRCh38.p14/GCF_000001405.40_GRCh38.p14_genomic.gff.gz"
232 | }
233 | },
234 | "cdot_version": "0.2.10",
235 | "genome_builds": [
236 | "GRCh37"
237 | ]
238 | }
--------------------------------------------------------------------------------
/tests/test_data/clinvar_hgvs/clinvar_hgvs_010.tsv:
--------------------------------------------------------------------------------
1 | NC_000002.12:g.189003410G>A NM_000090.4:c.2554-1G>A
2 | NC_000002.12:g.73572910G>A NM_015120.4:c.11036G>A
3 | NC_000003.12:g.36996633_36996634delinsTT NM_001354619.1:c.-593_-592delinsTT
4 | NC_000003.12:g.58149866C>T NM_001164317.2:c.6201C>T
5 | NC_000006.12:g.52082460T>C NM_138694.4:c.213A>G
6 | NC_000007.14:g.98906228C>T NM_001244580.1:c.1088C>T
7 | NC_000010.11:g.120905027T>C NM_018117.12:c.3193+216T>C
8 | NC_000012.12:g.8854218G>A NM_144670.6:c.2681G>A
9 | NC_000016.10:g.81869317C>T NM_002661.5:c.564+19C>T
10 | NC_000017.11:g.58734198A>G NM_058216.3:c.1107A>G
11 |
--------------------------------------------------------------------------------
/tests/test_data/clinvar_hgvs/clinvar_hgvs_050.tsv:
--------------------------------------------------------------------------------
1 | NC_000001.11:g.237617406G>A NM_001035.3:c.5836G>A
2 | NC_000001.11:g.33021456_33021458del NM_001625.4:c.336_338del
3 | NC_000001.11:g.52383874T>C NM_004153.4:c.1819A>G
4 | NC_000001.11:g.52397773C>T NM_004153.4:c.314G>A
5 | NC_000001.11:g.53213501A>C NM_000098.3:c.1883A>C
6 | NC_000001.11:g.94047046C>T NM_000350.3:c.2791G>A
7 | NC_000002.12:g.169275142G>A NM_004525.3:c.1869C>T
8 | NC_000002.12:g.178538451C>G NM_001267550.2:c.99289+89G>C
9 | NC_000002.12:g.178608202G>T NM_001267550.2:c.52681C>A
10 | NC_000002.12:g.178741896G>A NM_001267550.2:c.11337C>T
11 | NC_000002.12:g.214781051A>T NM_000465.4:c.823T>A
12 | NC_000002.12:g.219490515_219490517del NM_005876.5:c.9028_9030del
13 | NC_000002.12:g.46909034C>T NM_001171511.2:c.93-1065G>A
14 | NC_000002.12:g.47806494A>C NM_001281493.1:c.2938A>C
15 | NC_000002.12:g.47813341T>C NM_001190274.2:c.2120A>G
16 | NC_000002.12:g.85888897C>A NM_003896.4:c.9G>T
17 | NC_000003.12:g.123752448G>C NM_001321309.2:c.-155-12447C>G
18 | NC_000005.10:g.10255709T>C NM_012073.5:c.332-246T>C
19 | NC_000005.10:g.112841537T>A NM_000038.6:c.5943T>A
20 | NC_000005.10:g.139026767A>G NM_022464.5:c.645+34T>C
21 | NC_000005.10:g.149027638G>A NM_024577.4:c.2094C>T
22 | NC_000005.10:g.35873480C>T NM_002185.5:c.538C>T
23 | NC_000007.14:g.16089456T>C NM_001101426.4:c.*2239A>G
24 | NC_000007.14:g.97852358C>T NM_001673.5:c.1587G>A
25 | NC_000008.11:g.18062326C>A NM_177924.5:c.601G>T
26 | NC_000008.11:g.89980722C>T NM_002485.5:c.480+12G>A
27 | NC_000009.12:g.37784879T>G NM_016042.4:c.166A>C
28 | NC_000009.12:g.69035905C>T NM_000144.5:c.123C>T
29 | NC_000010.11:g.87925523T>C NM_000314.8:c.175T>C
30 | NC_000011.10:g.112045279del NR_164072.1:n.1167+49del
31 | NC_000011.10:g.118312837C>T NM_000733.4:c.323C>T
32 | NC_000011.10:g.17427125C>T NR_147094.2:n.2212G>A
33 | NC_000011.10:g.64809875del NM_130804.2:c.237del
34 | NC_000012.12:g.120737861C>T NM_000017.4:c.497C>T
35 | NC_000013.11:g.27920121C>T NM_000209.4:c.-18C>T
36 | NC_000014.9:g.30879438C>T NM_004086.3:c.389C>T
37 | NC_000016.10:g.89770196G>A NM_000135.4:c.2286C>T
38 | NC_000017.11:g.43045725C>T NR_027676.2:n.5722G>A
39 | NC_000017.11:g.43094198del NR_027676.2:n.1510del
40 | NC_000017.11:g.59031925C>T NM_015294.6:c.1919G>A
41 | NC_000017.11:g.80058782T>C NM_017950.4:c.1318-76T>C
42 | NC_000018.10:g.58343058C>G NM_001144967.3:c.1530C>G
43 | NC_000018.10:g.70176380_70176383del NM_173630.4:c.1476+295_1476+298del
44 | NC_000019.10:g.12897438G>A NM_000159.4:c.1082+10G>A
45 | NC_000021.9:g.42477365G>A NM_080860.4:c.653C>T
46 | NC_000022.11:g.17110140A>G NM_001289905.1:c.*320A>G
47 | NC_000023.11:g.154767342C>T NM_001363.5:c.600C>T
48 | NC_000023.11:g.19355691T>C NM_000284.4:c.765T>C
49 | NC_000023.11:g.45059450G>A NR_111960.1:n.1555G>A
50 | NC_000023.11:g.74742052G>A NM_001008537.3:c.2505C>T
51 |
--------------------------------------------------------------------------------
/tests/test_data/clinvar_hgvs/clinvar_hgvs_100.tsv:
--------------------------------------------------------------------------------
1 | NC_000001.11:g.12007126G>A NM_001127660.1:c.1946G>A
2 | NC_000001.11:g.21860220G>A NM_005529.7:c.4971C>T
3 | NC_000001.11:g.237643408G>C NM_001035.3:c.7303G>C
4 | NC_000001.11:g.23808165T>C NM_000191.3:c.720A>G
5 | NC_000001.11:g.241517292C>A NM_000143.4:c.157G>T
6 | NC_000001.11:g.94111501G>A NM_000350.3:c.239C>T
7 | NC_000002.12:g.113220093C>T NM_003466.4:c.1275G>A
8 | NC_000002.12:g.144399194T>C NM_014795.4:c.1993A>G
9 | NC_000002.12:g.15286830T>C NM_015909.4:c.5138+243A>G
10 | NC_000002.12:g.178592272G>A NM_001267550.2:c.59632C>T
11 | NC_000002.12:g.178715710C>T NM_001267550.2:c.25704G>A
12 | NC_000002.12:g.214745103C>G NM_000465.4:c.1867G>C
13 | NC_000002.12:g.47803493G>C NM_001281493.1:c.2340G>C
14 | NC_000002.12:g.73432288G>A NM_015120.4:c.1432G>A
15 | NC_000002.12:g.73453022T>C NM_015120.4:c.6498T>C
16 | NC_000002.12:g.85343265G>T NM_017750.4:c.1810C>A
17 | NC_000002.12:g.85888897C>A NM_003896.4:c.9G>T
18 | NC_000003.12:g.15644611A>G NM_001281723.3:c.695A>G
19 | NC_000003.12:g.158691443T>C NR_164500.1:n.2195T>C
20 | NC_000003.12:g.43598588T>A NM_018075.5:c.416A>T
21 | NC_000003.12:g.49099390C>T NM_005051.3:c.1568G>A
22 | NC_000003.12:g.49099430C>T NM_005051.3:c.1528G>A
23 | NC_000003.12:g.52402867A>G NM_004656.4:c.1895T>C
24 | NC_000003.12:g.69118837G>T NM_001304418.3:c.1518C>A
25 | NC_000004.12:g.112646986A>G NM_016648.4:c.552+31A>G
26 | NC_000004.12:g.113355165A>G NM_001148.6:c.6547A>G
27 | NC_000004.12:g.43030609T>A NM_001080476.2:c.*69T>A
28 | NC_000004.12:g.52038248C>A NM_000232.5:c.12G>T
29 | NC_000004.12:g.83273598C>T NM_015697.8:c.590G>A
30 | NC_000004.12:g.88268819C>T NM_152542.5:c.629G>A
31 | NC_000005.10:g.113064054C>G NM_001085377.2:c.2143G>C
32 | NC_000005.10:g.126550263C>A NM_001182.5:c.1348G>T
33 | NC_000005.10:g.179126029C>T NM_014244.5:c.2719G>A
34 | NC_000005.10:g.79051354T>C NM_013391.3:c.678A>G
35 | NC_000005.10:g.83539495C>T NM_004385.5:c.6492C>T
36 | NC_000006.12:g.129280072C>T NM_001079823.2:c.2462C>T
37 | NC_000006.12:g.129353296C>A NM_001079823.2:c.4656C>A
38 | NC_000006.12:g.52079909A>G NM_138694.4:c.381T>C
39 | NC_000006.12:g.52082460T>C NM_138694.4:c.213A>G
40 | NC_000007.14:g.22945636C>G NM_032581.4:c.1519G>C
41 | NC_000007.14:g.5999116G>C NM_000535.7:c.697C>G
42 | NC_000007.14:g.93146872C>A NM_152703.5:c.-779+11G>T
43 | NC_000008.11:g.89980722C>T NM_002485.5:c.480+12G>A
44 | NC_000009.12:g.2717979G>A NM_133497.4:c.240G>A
45 | NC_000009.12:g.34648800G>A NM_001258332.1:c.399G>A
46 | NC_000009.12:g.37745681G>A NM_014907.3:c.3649G>A
47 | NC_000009.12:g.69035905C>T NM_000144.5:c.123C>T
48 | NC_000009.12:g.92045993A>G NM_006415.4:c.1136+6T>C
49 | NC_000010.11:g.110784352G>A NM_001134363.3:c.1349G>A
50 | NC_000010.11:g.110799819A>G NM_001134363.3:c.1701A>G
51 | NC_000010.11:g.111013593T>C NR_136749.1:n.2936T>C
52 | NC_000011.10:g.112045279del NR_164072.1:n.1167+49del
53 | NC_000011.10:g.118503818del NM_001197104.2:c.7926del
54 | NC_000011.10:g.119026031A>G NM_001164279.2:c.701T>C
55 | NC_000011.10:g.1752849G>A NM_001909.5:c.*654C>T
56 | NC_000011.10:g.47342611del NM_000256.3:c.1595del
57 | NC_000011.10:g.5226575del NM_000518.5:c.315+2del
58 | NC_000011.10:g.61445957A>G NM_017841.4:c.387A>G
59 | NC_000013.11:g.32332371_32332377delinsTACTTCAG NM_000059.3:c.893_899delinsTACTTCAG
60 | NC_000013.11:g.32337627T>C NM_000059.3:c.3272T>C
61 | NC_000013.11:g.32339462G>C NM_000059.3:c.5107G>C
62 | NC_000014.9:g.23432514G>C NM_000257.4:c.503-8C>G
63 | NC_000014.9:g.45176022C>G NM_020937.4:c.3268C>G
64 | NC_000014.9:g.67766373A>G NM_015346.4:c.5865T>C
65 | NC_000014.9:g.89980550C>G NM_018319.4:c.802C>G
66 | NC_000014.9:g.92006041T>C NM_004239.4:c.1935A>G
67 | NC_000015.10:g.90754814G>A NM_000057.4:c.963G>A
68 | NC_000015.10:g.92985680G>T NM_001271.4:c.3413+7G>T
69 | NC_000016.10:g.1352026C>T NM_032520.5:c.52+9C>T
70 | NC_000016.10:g.173193A>G NM_000517.6:c.164A>G
71 | NC_000016.10:g.2046301C>G NM_002528.7:c.181G>C
72 | NC_000016.10:g.2048728T>C NM_000548.5:c.113T>C
73 | NC_000016.10:g.2081776T>C NM_000548.5:c.3792T>C
74 | NC_000016.10:g.2109256C>T NM_000296.4:c.5911G>A
75 | NC_000016.10:g.30993204G>T NM_052874.5:c.712C>A
76 | NC_000017.11:g.41769530G>A NM_002230.4:c.356C>T
77 | NC_000017.11:g.43094198del NR_027676.2:n.1510del
78 | NC_000017.11:g.50356597T>C NM_022167.4:c.1569T>C
79 | NC_000017.11:g.59031925C>T NM_015294.6:c.1919G>A
80 | NC_000017.11:g.65557863T>A NM_004655.4:c.758A>T
81 | NC_000017.11:g.65557982_65557984del NM_004655.4:c.639_641del
82 | NC_000018.10:g.46639718G>A NM_144612.6:c.409C>T
83 | NC_000018.10:g.70176380_70176383del NM_173630.4:c.1476+295_1476+298del
84 | NC_000019.10:g.13298566C>T NM_001127222.2:c.3067G>A
85 | NC_000019.10:g.48969229T>A NM_002103.5:c.*59A>T
86 | NC_000019.10:g.51353233dup NM_001014763.1:c.551dup
87 | NC_000019.10:g.53904356T>C NM_002739.5:c.1657-279T>C
88 | NC_000019.10:g.54191771C>T NM_001077446.4:c.294C>T
89 | NC_000019.10:g.54193391G>A NM_001077446.4:c.*29G>A
90 | NC_000020.11:g.10658702_10658703insA NM_000214.3:c.459_460insT
91 | NC_000020.11:g.10673008T>C NM_000214.3:c.82-2A>G
92 | NC_000020.11:g.31831073G>A NM_033118.4:c.1356G>A
93 | NC_000020.11:g.35504820_35504831del NM_007186.6:c.6451_6462del
94 | NC_000020.11:g.63494913G>A NM_001958.5:c.513C>T
95 | NC_000021.9:g.32678630del NM_003895.3:c.1627+15del
96 | NC_000021.9:g.34370822G>T NM_172201.1:c.344G>T
97 | NC_000021.9:g.45468322G>A NM_130445.4:c.187G>A
98 | NC_000022.11:g.17110140A>G NM_001289905.1:c.*320A>G
99 | NC_000023.11:g.22221720T>C NM_000444.6:c.1876T>C
100 | NC_000023.11:g.41343808_41343809dup NR_126093.1:n.1696_1697dup
101 |
--------------------------------------------------------------------------------
/tests/test_data/clinvar_hgvs/clinvar_hgvs_ensembl_100.tsv:
--------------------------------------------------------------------------------
1 | NC_000003.12:g.52999279_53001028del ENST00000394752.8:c.-130-31769_-130-30020del
2 | NC_000004.12:g.121364109_121368709del ENST00000394427.3:c.340+11600_340+16200del
3 | NC_000017.11:g.31163377_31164732del ENST00000358273.9:c.479+1_479+1356del
4 | NC_000019.10:g.12536733_12537889del ENST00000339282.12:c.4-9193_4-8037del
5 | NC_000007.14:g.43891176_43895702del ENST00000453200.6:c.15-7886_15-3360del
6 | NC_000009.12:g.354334_355793del ENST00000432829.7:c.1680-13684_1680-12225del
7 | NC_000011.10:g.88639541_88647384del ENST00000305447.5:c.1147+5785_1147+13628del
8 | NC_000023.11:g.32626321_32627544del ENST00000357033.9:c.1332-13088_1332-11865del
9 | NC_000005.10:g.112840128_112843000del ENST00000257430.9:c.4534_7406del
10 | NC_000005.10:g.140855987_140858640del ENST00000504120.4:c.2394+67303_2394+69956del
11 | NC_000007.14:g.88958724_88966741del ENST00000333190.5:c.108+198640_108+206657del
12 | NC_000017.11:g.17710076_17713445del ENST00000353383.6:c.-148-13952_-148-10583del
13 | NC_000005.10:g.40779130_40785601del ENST00000397128.7:c.128-8015_128-1544del
14 | NC_000005.10:g.90686004_90688051del ENST00000405460.9:c.6490+9_6491-1810del
15 | NC_000001.11:g.3373437_3380436del ENST00000270722.10:c.439-11715_439-4716del
16 | NC_000001.11:g.7621941_7624940del ENST00000303635.12:c.511-18459_511-15460del
17 | NC_000002.12:g.178560066_178564516del ENST00000589042.5:c.81618_86068del
18 | NC_000002.12:g.141316191_141320434del ENST00000389484.8:c.344-65793_344-61550del
19 | NC_000001.11:g.33008231_33013268del ENST00000672715.1:c.636_*4953del
20 | NC_000002.12:g.50041090_50048923del ENST00000401669.7:c.4128+4348_4128+12181del
21 | NC_000002.12:g.60458961_60461239del ENST00000642384.2:c.1675_*1445del
22 | NC_000023.11:g.119927096_119928345del ENST00000371410.5:c.1073+1672_1074-1701del
23 | NC_000023.11:g.154026923_154030670del ENST00000303391.11:c.1158_*3444del
24 | NC_000003.12:g.192587419_192594481del ENST00000445105.7:c.13+132700_13+139762del
25 | NC_000004.12:g.996520_998294del ENST00000514224.2:c.300-4092_300-2318del
26 | NC_000006.12:g.1610446_1613897del ENST00000645831.2:c.1_*1790del
27 | NC_000003.12:g.37938391_37944758del ENST00000273179.10:c.80-8666_80-2299del
28 | NC_000008.11:g.3929021_3931750del ENST00000635120.2:c.818+66153_818+68882del
29 | NC_000010.11:g.26710075_26713224del ENST00000376215.10:c.467+307_467+3456del
30 | NC_000015.10:g.82641775_82645527del ENST00000684509.1:c.-98+1610_-98+5362del
31 | NC_000018.10:g.58268755_58269877del ENST00000400345.8:c.297+16701_297+17823del
32 | NC_000018.10:g.69541905_69550035del ENST00000382713.10:c.67-22582_67-14452del
33 | NC_000001.11:g.245473613_245478917del ENST00000407071.7:c.1166+53868_1166+59172del
34 | NC_000001.11:g.245867700_245869699del ENST00000490107.6:c.814-5812_814-3813del
35 | NC_000002.12:g.17581968_17584091del ENST00000295156.9:c.-5-10102_-5-7979del
36 | NC_000002.12:g.211906513_211910431del ENST00000342788.9:c.421+36999_421+40917del
37 | NC_000007.14:g.108418131_108425883del ENST00000379028.8:c.-331-26289_-331-18537del
38 | NC_000008.11:g.50314668_50318252del ENST00000642720.2:c.-27-79544_-27-75960del
39 | NC_000010.11:g.113086890_113087964del ENST00000355995.9:c.552+46764_552+47838del
40 | NC_000019.10:g.47839728_47843005del ENST00000221996.12:c.661_*3038del
41 | NC_000012.12:g.2136470_2143758del ENST00000399603.6:c.477+16040_477+23328del
42 | NC_000002.12:g.197705407_197707188dup ENST00000282276.8:c.2_*1dup
43 | NC_000012.12:g.70286084_70288034del ENST00000229195.8:c.48+7810_48+9760del
44 | NC_000011.10:g.2699445_2700825del ENST00000155840.12:c.1514+37364_1514+38744del
45 | NC_000012.12:g.99608908_99614995del ENST00000683438.2:c.1272+40072_1272+46159del
46 | NC_000012.12:g.61875937_61878789del ENST00000416284.8:c.-1-11362_-1-8510del
47 | NC_000021.9:g.33355546_33357370del ENST00000270139.8:c.1671_*1821del
48 | NC_000012.12:g.25495743_25501512del ENST00000458174.7:c.*22+2227_*22+7996del
49 | NC_000016.10:g.68767553_68771266del ENST00000261769.10:c.163+29142_164-30404del
50 | NC_000006.12:g.151297100_151300476del ENST00000402676.7:c.163-8647_163-5271del
51 | NC_000010.11:g.51447874_51451689del ENST00000373980.11:c.479-19849_479-16034del
52 | NC_000010.11:g.86194845_86199292del ENST00000327946.12:c.520+7072_520+11519del
53 | NC_000010.11:g.87948970_87950261del ENST00000371953.8:c.493-3148_493-1857del
54 | NC_000013.11:g.32398162_32399672del ENST00000380152.8:c.9649_*902del
55 | NC_000013.11:g.38805150_38809282del ENST00000280481.9:c.6019+20342_6019+24474del
56 | NC_000013.11:g.77000458_77002517del ENST00000377453.9:c.566_*1548del
57 | NC_000009.12:g.36259405_36266486del ENST00000396594.8:c.51+10408_52-10008del
58 | NC_000016.10:g.89919802_89920805del ENST00000555147.2:c.544_*593del
59 | NC_000018.10:g.10913665_10919991del ENST00000674853.1:c.287-8763_287-2437del
60 | NC_000007.14:g.117559509G>T ENST00000003084.11:c.1438G>T
61 | NC_000004.12:g.186083673G>A ENST00000296795.8:c.1987G>A
62 | NC_000017.11:g.32531952_32539872del ENST00000318217.10:c.2865-44956_2865-37036del
63 | NC_000023.11:g.17533722_17538533del ENST00000676302.1:c.566-154020_566-149209del
64 | NC_000021.9:g.33550233_33554247del ENST00000356577.10:c.1002_5016del
65 | NC_000011.10:g.119052306C>G ENST00000617285.5:c.1111G>C
66 | NC_000001.11:g.209786745del ENST00000367021.8:c.*1678del
67 | NC_000001.11:g.220213740G>A ENST00000358951.7:c.304+116C>T
68 | NC_000001.11:g.226875528C>T ENST00000366783.8:c.-43C>T
69 | NC_000002.12:g.113129758T>G ENST00000409930.4:c.205+94T>G
70 | NC_000002.12:g.127058848A>G ENST00000316724.10:c.1002+163T>C
71 | NC_000002.12:g.241190354T>G ENST00000674324.2:c.108+183T>G
72 | NC_000003.12:g.31624774del ENST00000295770.4:c.1728-140del
73 | NC_000003.12:g.38751968T>A ENST00000449082.3:c.1755+251A>T
74 | NC_000003.12:g.53105415_53105419del ENST00000296292.8:c.957+258_957+262del
75 | NC_000003.12:g.160381874dup ENST00000326448.12:c.38-144dup
76 | NC_000005.10:g.78964145A>C ENST00000264914.10:c.690+271T>G
77 | NC_000005.10:g.123386676T>A ENST00000306467.10:c.1431-9A>T
78 | NC_000006.12:g.13306845G>A ENST00000379300.8:c.666-318C>T
79 | NC_000006.12:g.33181375C>G ENST00000341947.7:c.1120-205G>C
80 | NC_000006.12:g.38803495G>A ENST00000327475.11:c.3034+184G>A
81 | NC_000007.14:g.135626418dup ENST00000285968.11:c.4793+57dup
82 | NC_000009.12:g.95508385_95508387dup ENST00000437951.6:c.199-1768_199-1766dup
83 | NC_000009.12:g.104782004T>C ENST00000374736.8:c.*2311A>G
84 | NC_000009.12:g.134823305T>G ENST00000371817.8:c.4645-111T>G
85 | NC_000010.11:g.5001841T>G ENST00000380753.9:c.85-160A>C
86 | NC_000003.12:g.52403787T>G ENST00000460680.6:c.1358A>C
87 | NC_000003.12:g.55470273T>C ENST00000264634.9:c.962A>G
88 | NC_000003.12:g.56593711A>G ENST00000394672.8:c.1289A>G
89 | NC_000003.12:g.66381521C>T ENST00000273261.8:c.2728G>A
90 | NC_000003.12:g.69959257T>C ENST00000352241.9:c.1032-16T>C
91 | NC_000003.12:g.93905835del ENST00000394236.9:c.550del
92 | NC_000011.10:g.68761568_68761570delinsA ENST00000265641.10:c.1993_1995delinsT
93 | NC_000011.10:g.72108616C>T ENST00000541899.3:c.468C>T
94 | NC_000010.11:g.30336592T>C ENST00000263063.9:c.780+211A>G
95 | NC_000010.11:g.87715935A>G ENST00000456849.2:c.865+92A>G
96 | NC_000012.12:g.25205716A>T ENST00000256078.10:c.*4200T>A
97 | NC_000012.12:g.57581372A>C ENST00000455537.7:c.2756-43A>C
98 | NC_000014.9:g.45181610G>A ENST00000267430.10:c.4318-27G>A
99 | NC_000014.9:g.63950166G>A ENST00000555002.6:c.590+160G>A
100 | NC_000014.9:g.67724398T>G ENST00000551171.6:c.69-75T>G
101 |
--------------------------------------------------------------------------------
/tests/test_data/clinvar_hgvs/clinvar_hgvs_ensembl_50.tsv:
--------------------------------------------------------------------------------
1 | NC_000003.12:g.52999279_53001028del ENST00000394752.8:c.-130-31769_-130-30020del
2 | NC_000004.12:g.121364109_121368709del ENST00000394427.3:c.340+11600_340+16200del
3 | NC_000017.11:g.31163377_31164732del ENST00000358273.9:c.479+1_479+1356del
4 | NC_000019.10:g.12536733_12537889del ENST00000339282.12:c.4-9193_4-8037del
5 | NC_000007.14:g.43891176_43895702del ENST00000453200.6:c.15-7886_15-3360del
6 | NC_000009.12:g.354334_355793del ENST00000432829.7:c.1680-13684_1680-12225del
7 | NC_000011.10:g.88639541_88647384del ENST00000305447.5:c.1147+5785_1147+13628del
8 | NC_000023.11:g.32626321_32627544del ENST00000357033.9:c.1332-13088_1332-11865del
9 | NC_000005.10:g.112840128_112843000del ENST00000257430.9:c.4534_7406del
10 | NC_000005.10:g.140855987_140858640del ENST00000504120.4:c.2394+67303_2394+69956del
11 | NC_000007.14:g.88958724_88966741del ENST00000333190.5:c.108+198640_108+206657del
12 | NC_000017.11:g.17710076_17713445del ENST00000353383.6:c.-148-13952_-148-10583del
13 | NC_000005.10:g.40779130_40785601del ENST00000397128.7:c.128-8015_128-1544del
14 | NC_000005.10:g.90686004_90688051del ENST00000405460.9:c.6490+9_6491-1810del
15 | NC_000001.11:g.3373437_3380436del ENST00000270722.10:c.439-11715_439-4716del
16 | NC_000001.11:g.7621941_7624940del ENST00000303635.12:c.511-18459_511-15460del
17 | NC_000002.12:g.178560066_178564516del ENST00000589042.5:c.81618_86068del
18 | NC_000002.12:g.141316191_141320434del ENST00000389484.8:c.344-65793_344-61550del
19 | NC_000001.11:g.33008231_33013268del ENST00000672715.1:c.636_*4953del
20 | NC_000002.12:g.50041090_50048923del ENST00000401669.7:c.4128+4348_4128+12181del
21 | NC_000002.12:g.60458961_60461239del ENST00000642384.2:c.1675_*1445del
22 | NC_000023.11:g.119927096_119928345del ENST00000371410.5:c.1073+1672_1074-1701del
23 | NC_000023.11:g.154026923_154030670del ENST00000303391.11:c.1158_*3444del
24 | NC_000003.12:g.192587419_192594481del ENST00000445105.7:c.13+132700_13+139762del
25 | NC_000004.12:g.996520_998294del ENST00000514224.2:c.300-4092_300-2318del
26 | NC_000006.12:g.1610446_1613897del ENST00000645831.2:c.1_*1790del
27 | NC_000003.12:g.37938391_37944758del ENST00000273179.10:c.80-8666_80-2299del
28 | NC_000008.11:g.3929021_3931750del ENST00000635120.2:c.818+66153_818+68882del
29 | NC_000010.11:g.26710075_26713224del ENST00000376215.10:c.467+307_467+3456del
30 | NC_000015.10:g.82641775_82645527del ENST00000684509.1:c.-98+1610_-98+5362del
31 | NC_000018.10:g.58268755_58269877del ENST00000400345.8:c.297+16701_297+17823del
32 | NC_000018.10:g.69541905_69550035del ENST00000382713.10:c.67-22582_67-14452del
33 | NC_000001.11:g.245473613_245478917del ENST00000407071.7:c.1166+53868_1166+59172del
34 | NC_000001.11:g.245867700_245869699del ENST00000490107.6:c.814-5812_814-3813del
35 | NC_000002.12:g.17581968_17584091del ENST00000295156.9:c.-5-10102_-5-7979del
36 | NC_000002.12:g.211906513_211910431del ENST00000342788.9:c.421+36999_421+40917del
37 | NC_000007.14:g.108418131_108425883del ENST00000379028.8:c.-331-26289_-331-18537del
38 | NC_000008.11:g.50314668_50318252del ENST00000642720.2:c.-27-79544_-27-75960del
39 | NC_000010.11:g.113086890_113087964del ENST00000355995.9:c.552+46764_552+47838del
40 | NC_000019.10:g.47839728_47843005del ENST00000221996.12:c.661_*3038del
41 | NC_000012.12:g.2136470_2143758del ENST00000399603.6:c.477+16040_477+23328del
42 | NC_000002.12:g.197705407_197707188dup ENST00000282276.8:c.2_*1dup
43 | NC_000012.12:g.70286084_70288034del ENST00000229195.8:c.48+7810_48+9760del
44 | NC_000011.10:g.2699445_2700825del ENST00000155840.12:c.1514+37364_1514+38744del
45 | NC_000012.12:g.99608908_99614995del ENST00000683438.2:c.1272+40072_1272+46159del
46 | NC_000012.12:g.61875937_61878789del ENST00000416284.8:c.-1-11362_-1-8510del
47 | NC_000021.9:g.33355546_33357370del ENST00000270139.8:c.1671_*1821del
48 | NC_000012.12:g.25495743_25501512del ENST00000458174.7:c.*22+2227_*22+7996del
49 | NC_000016.10:g.68767553_68771266del ENST00000261769.10:c.163+29142_164-30404del
50 | NC_000006.12:g.151297100_151300476del ENST00000402676.7:c.163-8647_163-5271del
51 |
--------------------------------------------------------------------------------
/tests/test_data/ensembl_tark/transcript/assembly_name=GRCh38&expand=transcript_release_set&loc_end=36530514&loc_region=7&loc_start=36530417&page=6.json:
--------------------------------------------------------------------------------
1 | {
2 | "count": 53,
3 | "next": null,
4 | "previous": "http://tark.ensembl.org/api/transcript/?assembly_name=GRCh38&expand=transcript_release_set&loc_end=36530514&loc_region=7&loc_start=36530417&page=5",
5 | "results": [
6 | {
7 | "stable_id": "XM_011515338",
8 | "stable_id_version": 3,
9 | "assembly": "GRCh38",
10 | "loc_start": 36514653,
11 | "loc_end": 36724494,
12 | "loc_strand": -1,
13 | "loc_region": "7",
14 | "loc_checksum": "3A0C94FD0C19832E867F7DD408729ED9FA0CC577",
15 | "exon_set_checksum": "C755149B187679201C78733FD72856459A1B9984",
16 | "transcript_checksum": "802BD7AF1B237E57A3C4A0C653C3772D2174199E",
17 | "sequence": "9B29344586871C26D51D42E2750778B9F0AED323",
18 | "biotype": "predicted_protein_coding",
19 | "three_prime_utr_start": 36514670,
20 | "three_prime_utr_end": 36514653,
21 | "three_prime_utr_seq": "AAACCACTGTTGAGATGG",
22 | "three_prime_utr_checksum": "361C329D180C3DD63FD9036DF88F00192E50514F",
23 | "five_prime_utr_start": 36724494,
24 | "five_prime_utr_end": 36724149,
25 | "five_prime_utr_seq": "ACTGAGCCAGGGAGCACGGAAGTTGTGCCACTGTGCAACTTGGGTTTTCTTTATCCTGCAGTCTTTACCTCAGCAGAACCGCACACCACAGACTCCCTCCAGCTCTTTGTGTGTGGCTCTCTCAGGGTCCAACAAGAGCAAGCTGTGGGTCTGTGAGTGTTTATGTGTGCTTTTATTCACTTCACACTTATTGAAAAGTGTGTATGTGAGAGGGTGGGGTGTGTGTGTCAAAGAGAGTGAGGAAGAGAAGGAGAGAGAGATCAATTGATTCTGCAGCCTCAGCTCCAGCATCCCTCAGTTGGGAGCTTCCAAAGCCGGGTGATCACTTGGGGTGCATAGCTCGGAG",
26 | "five_prime_utr_checksum": "60878BA89AD674207714116AF20DB12EE2B653C0",
27 | "transcript_release_set": [
28 | {
29 | "assembly": "GRCh38",
30 | "shortname": "110_20220707",
31 | "description": "Refseq Homo sapiens Annotation Release 110.20220707",
32 | "release_date": "2022-07-07",
33 | "source": "RefSeq"
34 | },
35 | {
36 | "assembly": "GRCh38",
37 | "shortname": "GCF_000001405_20230320",
38 | "description": "Refseq Homo sapiens Annotation Release GCF_000001405.20230320",
39 | "release_date": "2023-03-20",
40 | "source": "RefSeq"
41 | },
42 | {
43 | "assembly": "GRCh38",
44 | "shortname": "GCF_000001405_20231007",
45 | "description": "Refseq Homo sapiens Annotation Release GCF_000001405.20231007",
46 | "release_date": "2023-10-07",
47 | "source": "RefSeq"
48 | },
49 | {
50 | "assembly": "GRCh38",
51 | "shortname": "GCF_000001405_20240827",
52 | "description": "Refseq Homo sapiens Annotation Release GCF_000001405.20240827",
53 | "release_date": "2024-08-24",
54 | "source": "RefSeq"
55 | }
56 | ]
57 | },
58 | {
59 | "stable_id": "XM_011515339",
60 | "stable_id_version": 3,
61 | "assembly": "GRCh38",
62 | "loc_start": 36528175,
63 | "loc_end": 36724494,
64 | "loc_strand": -1,
65 | "loc_region": "7",
66 | "loc_checksum": "8DE122E93958674611C3E40852B0EA2C3257B3DD",
67 | "exon_set_checksum": "266DDEFF8992E3D25D19156E1DACE37F5B59C197",
68 | "transcript_checksum": "F3930BD6833EE5CE890A7F6F5A36D74BD2606ED2",
69 | "sequence": "2FEE2ADE3B6333A89337680BC9737760EF94EE4A",
70 | "biotype": "predicted_protein_coding",
71 | "three_prime_utr_start": 36528307,
72 | "three_prime_utr_end": 36528175,
73 | "three_prime_utr_seq": "ACTGAACATTCTACATCAACGTGGGAGAAAGCTCCTAGGCATTTCTCCATGCTTGGCGATTCCCATCAACAGTTCAAGGAACTTCCTGTTTTGTAATCTCCTGCTATTTGTTTAAAATAAATGTGAAGATCTA",
74 | "three_prime_utr_checksum": "32FCDE7C368E86B254F7A1EF1D5837213DA6C6A8",
75 | "five_prime_utr_start": 36724494,
76 | "five_prime_utr_end": 36724149,
77 | "five_prime_utr_seq": "ACTGAGCCAGGGAGCACGGAAGTTGTGCCACTGTGCAACTTGGGTTTTCTTTATCCTGCAGTCTTTACCTCAGCAGAACCGCACACCACAGACTCCCTCCAGCTCTTTGTGTGTGGCTCTCTCAGGGTCCAACAAGAGCAAGCTGTGGGTCTGTGAGTGTTTATGTGTGCTTTTATTCACTTCACACTTATTGAAAAGTGTGTATGTGAGAGGGTGGGGTGTGTGTGTCAAAGAGAGTGAGGAAGAGAAGGAGAGAGAGATCAATTGATTCTGCAGCCTCAGCTCCAGCATCCCTCAGTTGGGAGCTTCCAAAGCCGGGTGATCACTTGGGGTGCATAGCTCGGAG",
78 | "five_prime_utr_checksum": "60878BA89AD674207714116AF20DB12EE2B653C0",
79 | "transcript_release_set": [
80 | {
81 | "assembly": "GRCh38",
82 | "shortname": "110_20220707",
83 | "description": "Refseq Homo sapiens Annotation Release 110.20220707",
84 | "release_date": "2022-07-07",
85 | "source": "RefSeq"
86 | },
87 | {
88 | "assembly": "GRCh38",
89 | "shortname": "GCF_000001405_20230320",
90 | "description": "Refseq Homo sapiens Annotation Release GCF_000001405.20230320",
91 | "release_date": "2023-03-20",
92 | "source": "RefSeq"
93 | },
94 | {
95 | "assembly": "GRCh38",
96 | "shortname": "GCF_000001405_20231007",
97 | "description": "Refseq Homo sapiens Annotation Release GCF_000001405.20231007",
98 | "release_date": "2023-10-07",
99 | "source": "RefSeq"
100 | },
101 | {
102 | "assembly": "GRCh38",
103 | "shortname": "GCF_000001405_20240827",
104 | "description": "Refseq Homo sapiens Annotation Release GCF_000001405.20240827",
105 | "release_date": "2024-08-24",
106 | "source": "RefSeq"
107 | }
108 | ]
109 | },
110 | {
111 | "stable_id": "XM_011515340",
112 | "stable_id_version": 3,
113 | "assembly": "GRCh38",
114 | "loc_start": 36528175,
115 | "loc_end": 36724494,
116 | "loc_strand": -1,
117 | "loc_region": "7",
118 | "loc_checksum": "8DE122E93958674611C3E40852B0EA2C3257B3DD",
119 | "exon_set_checksum": "459BB82C3050AB0D64DC509D36F0AB1DC55E4B79",
120 | "transcript_checksum": "B01C1023FAF579C23978532BDA07596F571E3589",
121 | "sequence": "2529B9E4932091E20EF3C74F7495067255A1CEC6",
122 | "biotype": "predicted_protein_coding",
123 | "three_prime_utr_start": 36529217,
124 | "three_prime_utr_end": 36528175,
125 | "three_prime_utr_seq": "TCACTAGCTCTGTCATTTATTATCTCTGGGACTGCAGGCAATTCTCTGAATATTTTTAAACTCCCTTTCCAGCCCTGGAAATGGGGATACCATATCTTCCTAACAAATCTCTGTGATGATTCTATGAAATAATGTGTCATCCTCCTAAATGTTATGCCTGGTACACAGAAAAGGTACCATTTCCTTACACCCTTGCTACTCAATGTATGCTTTTGGATAAGCAAAATCAACACCACCCAGGAGCTGGTTATAAATGCAGAATTCCAGGCCCTACTCCAGACCTACCTAATCAGAACCTGCATTCAAACACAACCCACAGGTCATCTCTATGCCCATTAAAATTTTGTAAATGGTTTGTGAAGCTGAGATGGGAGGATCGCTCAAGTCCAGGAGGTCAAGACCAGCCTGGGCAACATTGCAAGACCCCATCTCTAAAAACAAAACATCTTTTTTTATTAGCCAGGCATGGTGGTGCATGCTTGTAGTCCCAGCTACTCTGGAGGTTGAGATGGGGGGATCACTTGAGCCTGGGTGGTTGAGGCTGCAGTGAGTTGTGATCACGTCACTCAACTCCAGCCTGGTTGACAGAGAGAGACCTCATCTCTATAAAAATAAAAATAAAGTTTAATAAATGGAGCTCTATAATGCCTCAAGATTAAGAGCTGGGTCAGTCATCAGATTTATAAGTCCTGCTATGTGCCAGGAGCTATGAAGGTGCTGGGAACATAATCGTCAACAAAGCAGAAGAGTCCTTATCTCTGTGGAGGATATAGAGAGAAGAGCTTTATTCTTGTCTGTCCAGAATTCCACTGGTGAACTGATATGCTTGGAAGCAGGGCCTCATCAGCCCTTTCTTGTCCTCACAGCAATAATAAACACAGTGAAAAAGCAAAAAGTTGAATTAAGCTGAACTGAACATTCTACATCAACGTGGGAGAAAGCTCCTAGGCATTTCTCCATGCTTGGCGATTCCCATCAACAGTTCAAGGAACTTCCTGTTTTGTAATCTCCTGCTATTTGTTTAAAATAAATGTGAAGATCTA",
126 | "three_prime_utr_checksum": "48FEEDED5670D4AFBEAC79CFA54306DD9ECDAFE1",
127 | "five_prime_utr_start": 36724494,
128 | "five_prime_utr_end": 36724149,
129 | "five_prime_utr_seq": "ACTGAGCCAGGGAGCACGGAAGTTGTGCCACTGTGCAACTTGGGTTTTCTTTATCCTGCAGTCTTTACCTCAGCAGAACCGCACACCACAGACTCCCTCCAGCTCTTTGTGTGTGGCTCTCTCAGGGTCCAACAAGAGCAAGCTGTGGGTCTGTGAGTGTTTATGTGTGCTTTTATTCACTTCACACTTATTGAAAAGTGTGTATGTGAGAGGGTGGGGTGTGTGTGTCAAAGAGAGTGAGGAAGAGAAGGAGAGAGAGATCAATTGATTCTGCAGCCTCAGCTCCAGCATCCCTCAGTTGGGAGCTTCCAAAGCCGGGTGATCACTTGGGGTGCATAGCTCGGAG",
130 | "five_prime_utr_checksum": "60878BA89AD674207714116AF20DB12EE2B653C0",
131 | "transcript_release_set": [
132 | {
133 | "assembly": "GRCh38",
134 | "shortname": "110_20220707",
135 | "description": "Refseq Homo sapiens Annotation Release 110.20220707",
136 | "release_date": "2022-07-07",
137 | "source": "RefSeq"
138 | },
139 | {
140 | "assembly": "GRCh38",
141 | "shortname": "GCF_000001405_20230320",
142 | "description": "Refseq Homo sapiens Annotation Release GCF_000001405.20230320",
143 | "release_date": "2023-03-20",
144 | "source": "RefSeq"
145 | },
146 | {
147 | "assembly": "GRCh38",
148 | "shortname": "GCF_000001405_20231007",
149 | "description": "Refseq Homo sapiens Annotation Release GCF_000001405.20231007",
150 | "release_date": "2023-10-07",
151 | "source": "RefSeq"
152 | },
153 | {
154 | "assembly": "GRCh38",
155 | "shortname": "GCF_000001405_20240827",
156 | "description": "Refseq Homo sapiens Annotation Release GCF_000001405.20240827",
157 | "release_date": "2024-08-24",
158 | "source": "RefSeq"
159 | }
160 | ]
161 | }
162 | ]
163 | }
--------------------------------------------------------------------------------
/tests/test_data/ensembl_test.GRCh38.111.gtf:
--------------------------------------------------------------------------------
1 | 1 ensembl_havana gene 65419 71585 . + . gene_id "ENSG00000186092"; gene_version "7"; gene_name "OR4F5"; gene_source "ensembl_havana"; gene_biotype "protein_coding";
2 | 1 havana transcript 65419 71585 . + . gene_id "ENSG00000186092"; gene_version "7"; transcript_id "ENST00000641515"; transcript_version "2"; gene_name "OR4F5"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "OR4F5-201"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30547"; tag "basic"; tag "Ensembl_canonical"; tag "MANE_Select";
3 | 1 havana exon 65419 65433 . + . gene_id "ENSG00000186092"; gene_version "7"; transcript_id "ENST00000641515"; transcript_version "2"; exon_number "1"; gene_name "OR4F5"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "OR4F5-201"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30547"; exon_id "ENSE00003812156"; exon_version "1"; tag "basic"; tag "Ensembl_canonical"; tag "MANE_Select";
4 | 1 havana exon 65520 65573 . + . gene_id "ENSG00000186092"; gene_version "7"; transcript_id "ENST00000641515"; transcript_version "2"; exon_number "2"; gene_name "OR4F5"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "OR4F5-201"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30547"; exon_id "ENSE00003813641"; exon_version "1"; tag "basic"; tag "Ensembl_canonical"; tag "MANE_Select";
5 | 1 havana CDS 65565 65573 . + 0 gene_id "ENSG00000186092"; gene_version "7"; transcript_id "ENST00000641515"; transcript_version "2"; exon_number "2"; gene_name "OR4F5"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "OR4F5-201"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30547"; protein_id "ENSP00000493376"; protein_version "2"; tag "basic"; tag "Ensembl_canonical"; tag "MANE_Select";
6 | 1 havana start_codon 65565 65567 . + 0 gene_id "ENSG00000186092"; gene_version "7"; transcript_id "ENST00000641515"; transcript_version "2"; exon_number "2"; gene_name "OR4F5"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "OR4F5-201"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30547"; tag "basic"; tag "Ensembl_canonical"; tag "MANE_Select";
7 | 1 havana exon 69037 71585 . + . gene_id "ENSG00000186092"; gene_version "7"; transcript_id "ENST00000641515"; transcript_version "2"; exon_number "3"; gene_name "OR4F5"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "OR4F5-201"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30547"; exon_id "ENSE00003813949"; exon_version "1"; tag "basic"; tag "Ensembl_canonical"; tag "MANE_Select";
8 | 1 havana CDS 69037 70005 . + 0 gene_id "ENSG00000186092"; gene_version "7"; transcript_id "ENST00000641515"; transcript_version "2"; exon_number "3"; gene_name "OR4F5"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "OR4F5-201"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30547"; protein_id "ENSP00000493376"; protein_version "2"; tag "basic"; tag "Ensembl_canonical"; tag "MANE_Select";
9 | 1 havana stop_codon 70006 70008 . + 0 gene_id "ENSG00000186092"; gene_version "7"; transcript_id "ENST00000641515"; transcript_version "2"; exon_number "3"; gene_name "OR4F5"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "OR4F5-201"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30547"; tag "basic"; tag "Ensembl_canonical"; tag "MANE_Select";
10 | 1 havana five_prime_utr 65419 65433 . + . gene_id "ENSG00000186092"; gene_version "7"; transcript_id "ENST00000641515"; transcript_version "2"; gene_name "OR4F5"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "OR4F5-201"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30547"; tag "basic"; tag "Ensembl_canonical"; tag "MANE_Select";
11 | 1 havana five_prime_utr 65520 65564 . + . gene_id "ENSG00000186092"; gene_version "7"; transcript_id "ENST00000641515"; transcript_version "2"; gene_name "OR4F5"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "OR4F5-201"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30547"; tag "basic"; tag "Ensembl_canonical"; tag "MANE_Select";
12 | 1 havana three_prime_utr 70009 71585 . + . gene_id "ENSG00000186092"; gene_version "7"; transcript_id "ENST00000641515"; transcript_version "2"; gene_name "OR4F5"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "OR4F5-201"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS30547"; tag "basic"; tag "Ensembl_canonical"; tag "MANE_Select";
13 | MT insdc gene 8295 8364 . + . gene_id "ENSG00000210156"; gene_version "1"; gene_name "MT-TK"; gene_source "insdc"; gene_biotype "Mt_tRNA";
14 | MT insdc transcript 8295 8364 . + . gene_id "ENSG00000210156"; gene_version "1"; transcript_id "ENST00000387421"; transcript_version "1"; gene_name "MT-TK"; gene_source "insdc"; gene_biotype "Mt_tRNA"; transcript_name "MT-TK-201"; transcript_source "insdc"; transcript_biotype "Mt_tRNA"; tag "basic"; tag "Ensembl_canonical"; transcript_support_level "NA";
15 | MT insdc exon 8295 8364 . + . gene_id "ENSG00000210156"; gene_version "1"; transcript_id "ENST00000387421"; transcript_version "1"; exon_number "1"; gene_name "MT-TK"; gene_source "insdc"; gene_biotype "Mt_tRNA"; transcript_name "MT-TK-201"; transcript_source "insdc"; transcript_biotype "Mt_tRNA"; exon_id "ENSE00001544484"; exon_version "1"; tag "basic"; tag "Ensembl_canonical"; transcript_support_level "NA";
16 |
--------------------------------------------------------------------------------
/tests/test_data/grch37.genome:
--------------------------------------------------------------------------------
1 | NC_000007.13 36561661 36561662 C
2 |
--------------------------------------------------------------------------------
/tests/test_data/hg19_chrY_300kb_genes.gtf:
--------------------------------------------------------------------------------
1 | # From iGenomes hg19 UCSC genes.gtf
2 | chrY stdin exon 244668 245252 . - . gene_id "PPP2R3B"; transcript_id "NM_013239"; exon_number "1"; exon_id "NM_013239.1"; gene_name "PPP2R3B";
3 | chrY stdin CDS 245105 245252 . - 1 gene_id "PPP2R3B"; transcript_id "NM_013239"; exon_number "1"; exon_id "NM_013239.1"; gene_name "PPP2R3B";
4 | chrY stdin exon 249339 249445 . - . gene_id "PPP2R3B"; transcript_id "NM_013239"; exon_number "2"; exon_id "NM_013239.2"; gene_name "PPP2R3B";
5 | chrY stdin CDS 249339 249445 . - 0 gene_id "PPP2R3B"; transcript_id "NM_013239"; exon_number "2"; exon_id "NM_013239.2"; gene_name "PPP2R3B";
6 | chrY stdin exon 249513 249631 . - . gene_id "PPP2R3B"; transcript_id "NM_013239"; exon_number "3"; exon_id "NM_013239.3"; gene_name "PPP2R3B";
7 | chrY stdin CDS 249513 249631 . - 2 gene_id "PPP2R3B"; transcript_id "NM_013239"; exon_number "3"; exon_id "NM_013239.3"; gene_name "PPP2R3B";
8 | chrY stdin exon 251500 251675 . - . gene_id "PPP2R3B"; transcript_id "NM_013239"; exon_number "4"; exon_id "NM_013239.4"; gene_name "PPP2R3B";
9 | chrY stdin CDS 251500 251675 . - 1 gene_id "PPP2R3B"; transcript_id "NM_013239"; exon_number "4"; exon_id "NM_013239.4"; gene_name "PPP2R3B";
10 | chrY stdin exon 252042 252131 . - . gene_id "PPP2R3B"; transcript_id "NM_013239"; exon_number "5"; exon_id "NM_013239.5"; gene_name "PPP2R3B";
11 | chrY stdin CDS 252042 252131 . - 1 gene_id "PPP2R3B"; transcript_id "NM_013239"; exon_number "5"; exon_id "NM_013239.5"; gene_name "PPP2R3B";
12 | chrY stdin exon 252618 252666 . - . gene_id "PPP2R3B"; transcript_id "NM_013239"; exon_number "6"; exon_id "NM_013239.6"; gene_name "PPP2R3B";
13 | chrY stdin CDS 252618 252666 . - 2 gene_id "PPP2R3B"; transcript_id "NM_013239"; exon_number "6"; exon_id "NM_013239.6"; gene_name "PPP2R3B";
14 | chrY stdin exon 256251 256407 . - . gene_id "PPP2R3B"; transcript_id "NM_013239"; exon_number "7"; exon_id "NM_013239.7"; gene_name "PPP2R3B";
15 | chrY stdin CDS 256251 256407 . - 0 gene_id "PPP2R3B"; transcript_id "NM_013239"; exon_number "7"; exon_id "NM_013239.7"; gene_name "PPP2R3B";
16 | chrY stdin exon 256909 256995 . - . gene_id "PPP2R3B"; transcript_id "NM_013239"; exon_number "8"; exon_id "NM_013239.8"; gene_name "PPP2R3B";
17 | chrY stdin CDS 256909 256995 . - 0 gene_id "PPP2R3B"; transcript_id "NM_013239"; exon_number "8"; exon_id "NM_013239.8"; gene_name "PPP2R3B";
18 | chrY stdin exon 257436 257510 . - . gene_id "PPP2R3B"; transcript_id "NM_013239"; exon_number "9"; exon_id "NM_013239.9"; gene_name "PPP2R3B";
19 | chrY stdin CDS 257436 257510 . - 0 gene_id "PPP2R3B"; transcript_id "NM_013239"; exon_number "9"; exon_id "NM_013239.9"; gene_name "PPP2R3B";
20 | chrY stdin exon 257969 258071 . - . gene_id "PPP2R3B"; transcript_id "NM_013239"; exon_number "10"; exon_id "NM_013239.10"; gene_name "PPP2R3B";
21 | chrY stdin CDS 257969 258071 . - 1 gene_id "PPP2R3B"; transcript_id "NM_013239"; exon_number "10"; exon_id "NM_013239.10"; gene_name "PPP2R3B";
22 | chrY stdin exon 258325 258428 . - . gene_id "PPP2R3B"; transcript_id "NM_013239"; exon_number "11"; exon_id "NM_013239.11"; gene_name "PPP2R3B";
23 | chrY stdin CDS 258325 258428 . - 0 gene_id "PPP2R3B"; transcript_id "NM_013239"; exon_number "11"; exon_id "NM_013239.11"; gene_name "PPP2R3B";
24 | chrY stdin exon 272140 272325 . - . gene_id "PPP2R3B"; transcript_id "NM_013239"; exon_number "12"; exon_id "NM_013239.12"; gene_name "PPP2R3B";
25 | chrY stdin CDS 272140 272325 . - 0 gene_id "PPP2R3B"; transcript_id "NM_013239"; exon_number "12"; exon_id "NM_013239.12"; gene_name "PPP2R3B";
26 | chrY stdin exon 297103 297690 . - . gene_id "PPP2R3B"; transcript_id "NM_013239"; exon_number "13"; exon_id "NM_013239.13"; gene_name "PPP2R3B";
27 | chrY stdin CDS 297103 297426 . - 0 gene_id "PPP2R3B"; transcript_id "NM_013239"; exon_number "13"; exon_id "NM_013239.13"; gene_name "PPP2R3B";
28 | chrY stdin start_codon 297424 297426 . - 0 gene_id "PPP2R3B"; transcript_id "NM_013239"; exon_number "1"; exon_id "NM_013239.1"; gene_name "PPP2R3B";
29 | chrY stdin stop_codon 245102 245104 . - 0 gene_id "PPP2R3B"; transcript_id "NM_013239"; exon_number "1"; exon_id "NM_013239.1"; gene_name "PPP2R3B";
30 | chrY stdin exon 231385 232054 . + . gene_id "PPP2R3B-AS1"; transcript_id "NR_027231"; exon_number "1"; exon_id "NR_027231.1"; gene_name "PPP2R3B-AS1";
31 | chrY stdin exon 231385 232054 . + . gene_id "PPP2R3B-AS1"; transcript_id "NR_027232"; exon_number "1"; exon_id "NR_027232.1"; gene_name "PPP2R3B-AS1";
32 | chrY stdin exon 148061 148351 . + . gene_id "PLCXD1"; transcript_id "NM_018390_2"; exon_number "1"; exon_id "NM_018390_2.1"; gene_name "PLCXD1";
33 | chrY stdin exon 150834 150981 . + . gene_id "PLCXD1"; transcript_id "NM_018390_2"; exon_number "2"; exon_id "NM_018390_2.2"; gene_name "PLCXD1";
34 | chrY stdin CDS 150855 150981 . + 0 gene_id "PLCXD1"; transcript_id "NM_018390_2"; exon_number "2"; exon_id "NM_018390_2.2"; gene_name "PLCXD1";
35 | chrY stdin exon 155400 155536 . + . gene_id "PLCXD1"; transcript_id "NM_018390_2"; exon_number "3"; exon_id "NM_018390_2.3"; gene_name "PLCXD1";
36 | chrY stdin CDS 155400 155536 . + 2 gene_id "PLCXD1"; transcript_id "NM_018390_2"; exon_number "3"; exon_id "NM_018390_2.3"; gene_name "PLCXD1";
37 | chrY stdin exon 157315 157443 . + . gene_id "PLCXD1"; transcript_id "NM_018390_2"; exon_number "4"; exon_id "NM_018390_2.4"; gene_name "PLCXD1";
38 | chrY stdin CDS 157315 157443 . + 0 gene_id "PLCXD1"; transcript_id "NM_018390_2"; exon_number "4"; exon_id "NM_018390_2.4"; gene_name "PLCXD1";
39 | chrY stdin exon 158166 158321 . + . gene_id "PLCXD1"; transcript_id "NM_018390_2"; exon_number "5"; exon_id "NM_018390_2.5"; gene_name "PLCXD1";
40 | chrY stdin CDS 158166 158321 . + 0 gene_id "PLCXD1"; transcript_id "NM_018390_2"; exon_number "5"; exon_id "NM_018390_2.5"; gene_name "PLCXD1";
41 | chrY stdin exon 159702 159885 . + . gene_id "PLCXD1"; transcript_id "NM_018390_2"; exon_number "6"; exon_id "NM_018390_2.6"; gene_name "PLCXD1";
42 | chrY stdin CDS 159702 159885 . + 0 gene_id "PLCXD1"; transcript_id "NM_018390_2"; exon_number "6"; exon_id "NM_018390_2.6"; gene_name "PLCXD1";
43 | chrY stdin exon 165764 170022 . + . gene_id "PLCXD1"; transcript_id "NM_018390_2"; exon_number "7"; exon_id "NM_018390_2.7"; gene_name "PLCXD1";
44 | chrY stdin CDS 165764 165999 . + 2 gene_id "PLCXD1"; transcript_id "NM_018390_2"; exon_number "7"; exon_id "NM_018390_2.7"; gene_name "PLCXD1";
45 | chrY stdin start_codon 150855 150857 . + 0 gene_id "PLCXD1"; transcript_id "NM_018390_2"; exon_number "1"; exon_id "NM_018390_2.1"; gene_name "PLCXD1";
46 | chrY stdin stop_codon 166000 166002 . + 0 gene_id "PLCXD1"; transcript_id "NM_018390_2"; exon_number "1"; exon_id "NM_018390_2.1"; gene_name "PLCXD1";
47 | chrY stdin exon 142991 143061 . + . gene_id "PLCXD1"; transcript_id "NR_028057_2"; exon_number "1"; exon_id "NR_028057_2.1"; gene_name "PLCXD1";
48 | chrY stdin exon 148149 148351 . + . gene_id "PLCXD1"; transcript_id "NR_028057_2"; exon_number "2"; exon_id "NR_028057_2.2"; gene_name "PLCXD1";
49 | chrY stdin exon 150834 150981 . + . gene_id "PLCXD1"; transcript_id "NR_028057_2"; exon_number "3"; exon_id "NR_028057_2.3"; gene_name "PLCXD1";
50 | chrY stdin exon 155400 155536 . + . gene_id "PLCXD1"; transcript_id "NR_028057_2"; exon_number "4"; exon_id "NR_028057_2.4"; gene_name "PLCXD1";
51 | chrY stdin exon 157315 157443 . + . gene_id "PLCXD1"; transcript_id "NR_028057_2"; exon_number "5"; exon_id "NR_028057_2.5"; gene_name "PLCXD1";
52 | chrY stdin exon 158166 158321 . + . gene_id "PLCXD1"; transcript_id "NR_028057_2"; exon_number "6"; exon_id "NR_028057_2.6"; gene_name "PLCXD1";
53 | chrY stdin exon 159702 159885 . + . gene_id "PLCXD1"; transcript_id "NR_028057_2"; exon_number "7"; exon_id "NR_028057_2.7"; gene_name "PLCXD1";
54 | chrY stdin exon 165764 166059 . + . gene_id "PLCXD1"; transcript_id "NR_028057_2"; exon_number "8"; exon_id "NR_028057_2.8"; gene_name "PLCXD1";
55 | chrY stdin exon 169260 170022 . + . gene_id "PLCXD1"; transcript_id "NR_028057_2"; exon_number "9"; exon_id "NR_028057_2.9"; gene_name "PLCXD1";
56 |
--------------------------------------------------------------------------------
/tests/test_data/transcript_sequences.json:
--------------------------------------------------------------------------------
1 | {
2 | "ENST00000617537.5": "ACTGAGCCAGGGAGCACGGAAGTTGTGCCACTGTGCAACTTGGGTTTTCTTTATCCTGCAGTCTTTACCTCAGCAGAACCGCACACCACAGACTCCCTCCAGCTCTTTGTGTGTGGCTCTCTCAGGGTCCAACAAGAGCAAGCTGTGGGTCTGTGAGTGTTTATGTGTGCTTTTATTCACTTCACACTTATTGAAAAGTGTGTATGTGAGAGGGTGGGGTGTGTGTGTCAAAGAGAGTGAGGAAGAGAAGGAGAGAGAGATCAATTGATTCTGCAGCCTCAGCTCCAGCATCCCTCAGTTGGGAGCTTCCAAAGCCGGGTGATCACTTGGGGTGCATAGCTCGGAGATGCAGTCCCCCTGGAAAATCCTTACGGTGGCGCCTCTATTCTTGCTCCTGTCTCTTCAGTCCTCGGCCTCTCCAGCCAACGATGACCAGTCCAGGCCCAGCCTCTCGAATGGGCACACCTGTGTAGGGTGTGTGCTGGTGGTGTCTGTAATAGAACAGCTTGCTCAAGTTCACAACTCGACGGTCCAGGCCTCGATGGAGAGACTGTGCAGCTACCTGCCTGAAAAACTGTTCTTGAAAACCACCTGCTATTTAGTCATTGACAAGTTTGGATCAGACATCATAAAACTGCTTAGCGCAGATATGAATGCTGATGTGGTATGTCACACTCTGGAGTTTTGTAAACAGAACACTGGCCAACCATTGTGTCATCTCTACCCTCTTCCCAAGGAGACATGGAAATTTACACTACAGAAGGCAAGACAAATTGTCAAGAAGTCCCCGATTCTGAAATATTCTAGAAGTGGTTCTGACATTTGTTCACTCCCGGTTTTGGCCAAGATCTGCCAGAAAATTAAATTAGCTATGGAACAGTCTGTGCCATTCAAAGATGTGGATTCAGACAAATACAGCGTTTTCCCAACACTGCGGGGCTATCACTGGCGGGGGAGAGACTGTAATGACAGCGACGAGTCAGTGTACCCAGGTAGAAGGCCGAACAACTGGGATGTCCATCAGGATTCAAACTGTAATGGCATTTGGGGTGTCGATCCAAAAGATGGAGTTCCATATGAGAAGAAATTCTGTGAAGGTTCACAGCCCAGGGGAATCATTTTGCTGGGAGACTCAGCTGGGGCTCATTTTCACATCTCTCCTGAATGGATCACAGCGTCGCAGATGTCTTTGAACTCTTTCATCAATCTACCAACAGCCCTTACCAACGAGCTTGACTGGCCCCAACTCTCTGGTGCTACAGGATTTCTGGACTCCACTGTTGGAATTAAAGAAAAATCTATTTACCTTCGCTTATGGAAAAGAAACCACTGTAATCACAGGGACTACCAGAATATTTCAAGAAATGGTGCATCTTCCCGAAACCTGAAGAAATTTATAGAAAGCTTGTCTAGAAACAAGGTGTTGGACTATCCCGCCATCGTTATATATGCCATGATTGGAAATGATGTCTGCAGTGGGAAGAGTGACCCAGTCCCAGCCATGACCACTCCTGAGAAACTCTACTCCAACGTCATGCAGACTCTGAAGCATCTAAATTCCCACCTGCCCAATGGCAGCCATGTTATTTTGTATGGCTTACCAGATGGAACCTTTCTCTGGGATAATTTGCACAACAGATATCATCCTCTCGGCCAGCTAAATAAAGACATGACCTATGCGCAGTTGTACTCCTTCCTGAACTGCCTCCAGGTCAGCCCCTGCCACGGCTGGATGTCTTCCAACAAGACGTTGCGGACTCTCACTTCAGAGAGAGCAGAGCAACTCTCCAACACACTGAAAAAAATTGCAGCCAGTGAGAAATTTACAAACTTCAATCTTTTCTACATGGATTTTGCCTTCCATGAAATCATACAGGAGTGGCAGAAGAGAGGCGGACAGCCCTGGCAGCTCATCGAGCCCGTGGATGGATTCCACCCCAACGAGGTGGCTTTGCTGTTGTTGGCGGATCATTT
CTGGAAAAAGGTGCAGCTCCAGTGGCCCCAAATCCTGGGAAAGGAGAATCCGTTCAACCCCCAGATTAAACAGGTGTTTGGAGACCAAGGCGGGCACTGAGCCTCTCAGGAGCATGCACCCCTGGGGAGCACAGGGAGGCAGAGGCTTGGGTAAACTCATTCCACAAACCCTATGGGGGCTGCCACGTCACAGGCCCAAAGGACTCTTCTTCAGCAGCATCTTTGCAAAATGTCTTTCTCTCAATGAAGAGCATATCTGGACGACTGTGCAATGCTGTGTGCTCCCGGGATCAGTAACCCTTCCGCTGTTCCTGAAATAACCTTTCATAAAGTGCTTTGGGTGCCATTCCAAACAAGAGAGTATCTGTGCCCTTTACAGCTAATTGTTCTAAAAGGAGTTTCTAAAAACAC",
3 | "NM_001637.3": "AACAGATCAGTTCCGGCAAGCCTCGAGGCTCACGGGGTTTATGCACACTAACTTCACTGAGCCAGGGAGCACGGAAGTTGTGCCACTGTGCAACTTGGGTTTTCTTTATCCTGCAGTCTTTACCTCAGCAGAACCGCACACCACAGACTCCCTCCAGCTCTTTGTGTGTGGCTCTCTCAGGGTCCAACAAGAGCAAGCTGTGGGTCTGTGAGTGTTTATGTGTGCTTTTATTCACTTCACACTTATTGAAAAGTGTGTATGTGAGAGGGTGGGGTGTGTGTGTCAAAGAGAGTGAGGAAGAGAAGGAGAGAGAGATCAATTGATTCTGCAGCCTCAGCTCCAGCATCCCTCAGTTGGGAGCTTCCAAAGCCGGGTGATCACTTGGGGTGCATAGCTCGGAGATGCAGTCCCCCTGGAAAATCCTTACGGTGGCGCCTCTATTCTTGCTCCTGTCTCTTCAGTCCTCGGCCTCTCCAGCCAACGATGACCAGTCCAGGCCCAGCCTCTCGAATGGGCACACCTGTGTAGGGTGTGTGCTGGTGGTGTCTGTAATAGAACAGCTTGCTCAAGTTCACAACTCGACGGTCCAGGCCTCGATGGAGAGACTGTGCAGCTACCTGCCTGAAAAACTGTTCTTGAAAACCACCTGCTATTTAGTCATTGACAAGTTTGGATCAGACATCATAAAACTGCTTAGCGCAGATATGAATGCTGATGTGGTATGTCACACTCTGGAGTTTTGTAAACAGAACACTGGCCAACCATTGTGTCATCTCTACCCTCTTCCCAAGGAGACATGGAAATTTACACTACAGAAGGCAAGACAAATTGTCAAGAAGTCCCCGATTCTGAAATATTCTAGAAGTGGTTCTGACATTTGTTCACTCCCGGTTTTGGCCAAGATCTGCCAGAAAATTAAATTAGCTATGGAACAGTCTGTGCCATTCAAAGATGTGGATTCAGACAAATACAGCGTTTTCCCAACACTGCGGGGCTATCACTGGCGGGGGAGAGACTGTAATGACAGCGACGAGTCAGTGTACCCAGGTAGAAGGCCGAACAACTGGGATGTCCATCAGGATTCAAACTGTAATGGCATTTGGGGTGTCGATCCAAAAGATGGAGTTCCATATGAGAAGAAATTCTGTGAAGGTTCACAGCCCAGGGGAATCATTTTGCTGGGAGACTCAGCTGGGGCTCATTTTCACATCTCTCCTGAATGGATCACAGCGTCGCAGATGTCTTTGAACTCTTTCATCAATCTACCAACAGCCCTTACCAACGAGCTTGACTGGCCCCAACTCTCTGGTGCTACAGGATTTCTGGACTCCACTGTTGGAATTAAAGAAAAATCTATTTACCTTCGCTTATGGAAAAGAAACCACTGTAATCACAGGGACTACCAGAATATTTCAAGAAATGGTGCATCTTCCCGAAACCTGAAGAAATTTATAGAAAGCTTGTCTAGAAACAAGGTGTTGGACTATCCCGCCATCGTTATATATGCCATGATTGGAAATGATGTCTGCAGTGGGAAGAGTGACCCAGTCCCAGCCATGACCACTCCTGAGAAACTCTACTCCAACGTCATGCAGACTCTGAAGCATCTAAATTCCCACCTGCCCAATGGCAGCCATGTTATTTTGTATGGCTTACCAGATGGAACCTTTCTCTGGGATAATTTGCACAACAGATATCATCCTCTCGGCCAGCTAAATAAAGACATGACCTATGCGCAGTTGTACTCCTTCCTGAACTGCCTCCAGGTCAGCCCCTGCCACGGCTGGATGTCTTCCAACAAGACGTTGCGGACTCTCACTTCAGAGAGAGCAGAGCAACTCTCCAACACACTGAAAAAAATTGCAGCCAGTGAGAAATTTACAAACTTCAATCTTTTCTACATGGATTTTGCCTTCCATGAAATCATACAGGAGTGGCAGAAGAGAGGCGGACAGCCCTGGCAGCTCATCGAGCCCGTGG
ATGGATTCCACCCCAACGAGGTGGCTTTGCTGTTGTTGGCGGATCATTTCTGGAAAAAGGTGCAGCTCCAGTGGCCCCAAATCCTGGGAAAGGAGAATCCGTTCAACCCCCAGATTAAACAGGTGTTTGGAGACCAAGGCGGGCACTGAGCCTCTCAGGAGCATGCACCCCTGGGGAGCACAGGGAGGCAGAGGCTTGGGTAAACTCATTCCACAAACCCTATGGGGGCTGCCACGTCACAGGCCCAAAGGACTCTTCTTCAGCAGCATCTTTGCAAAATGTCTTTCTCTCAATGAAGAGCATATCTGGACGACTGTGCAATGCTGTGTGCTCCCGGGATCAGTAACCCTTCCGCTGTTCCTGAAATAACCTTTCATAAAGTGCTTTGGGTGCCATTCCAAACAAGAGAGTATCTGTGCCCTTTACAGCTAATTGTTCTAAAAGGAGTTTCTAAAAACAC"
4 | }
--------------------------------------------------------------------------------
/tests/test_gff_parsers.py:
--------------------------------------------------------------------------------
1 |
2 | import os
3 | from inspect import getsourcefile
4 | import unittest
5 | from generate_transcript_data.gff_parser import GTFParser, GFF3Parser
6 |
7 |
class Test(unittest.TestCase):
    """ Tests for GTF/GFF3 parsing via GTFParser and GFF3Parser. """
    # Directory containing this test file, resolved without relying on __file__
    this_file_dir = os.path.dirname(os.path.abspath(getsourcefile(lambda: 0)))
    test_data_dir = os.path.join(this_file_dir, "test_data")
    ENSEMBL_104_GTF_FILENAME = os.path.join(test_data_dir, "ensembl_test.GRCh38.104.gtf")
    ENSEMBL_111_GTF_FILENAME = os.path.join(test_data_dir, "ensembl_test.GRCh38.111.gtf")
    # Older RefSeq, before Genbank => GenBank changed
    REFSEQ_GFF3_FILENAME_2021 = os.path.join(test_data_dir, "refseq_test.GRCh38.p13_genomic.109.20210514.gff")
    # Newer RefSeq, after Genbank => GenBank changed
    REFSEQ_GFF3_FILENAME_2023 = os.path.join(test_data_dir, "refseq_test.GRCh38.p14_genomic.RS_2023_03.gff")
    REFSEQ_GFF3_FILENAME_GRCH37_MT = os.path.join(test_data_dir, "refseq_grch37_mt.gff")
    REFSEQ_GFF3_FILENAME_GRCH38_MT = os.path.join(test_data_dir, "refseq_grch38.p14_mt.gff")
    UCSC_GTF_FILENAME = os.path.join(test_data_dir, "hg19_chrY_300kb_genes.gtf")
    # Placeholder URL passed to parsers - presumably recorded as provenance only,
    # never fetched by these tests (TODO confirm against parser implementation)
    FAKE_URL = "http://fake.url"

    # Transcript IDs expected from the mitochondrial GFF fixtures
    FAKE_MT_TRANSCRIPTS = [
        "fake-rna-ATP6", "fake-rna-ATP8", "fake-rna-COX1", "fake-rna-COX2", "fake-rna-COX3", "fake-rna-CYTB",
        "fake-rna-ND1", "fake-rna-ND2", "fake-rna-ND3", "fake-rna-ND4", "fake-rna-ND4L", "fake-rna-ND5", "fake-rna-ND6"
    ]
26 |
27 | def _test_exon_length(self, transcripts, genome_build, transcript_id, expected_length):
28 | transcript = transcripts[transcript_id]
29 | exons = transcript["genome_builds"][genome_build]["exons"]
30 | length = sum([exon[1] - exon[0] for exon in exons])
31 | self.assertEqual(expected_length, length, "%s exons sum" % transcript_id)
32 |
33 | def test_ucsc_gtf(self):
34 | genome_build = "GRCh37"
35 | parser = GTFParser(self.UCSC_GTF_FILENAME, genome_build, self.FAKE_URL)
36 | _, transcripts = parser.get_genes_and_transcripts()
37 | self._test_exon_length(transcripts, genome_build, "NM_013239", 2426)
38 |
39 | def test_ensembl_gtf(self):
40 | genome_build = "GRCh38"
41 | parser = GTFParser(self.ENSEMBL_104_GTF_FILENAME, genome_build, self.FAKE_URL)
42 | genes, transcripts = parser.get_genes_and_transcripts()
43 | self._test_exon_length(transcripts, genome_build, "ENST00000357654.9", 7088)
44 |
45 | # Ensure that geneID was inserted with a version
46 | expected_gene_version = "ENSG00000012048.23"
47 |
48 | transcript = transcripts["ENST00000357654.9"]
49 | transcript_gene_version = transcript["gene_version"]
50 | self.assertEqual(expected_gene_version, transcript_gene_version, "Transcript gene has version")
51 |
52 | self.assertTrue(expected_gene_version in genes, f"{expected_gene_version=} in genes")
53 |
54 | protein = transcript.get("protein")
55 | self.assertEqual(protein, "ENSP00000350283.3")
56 |
57 | def test_refseq_gff3_2021(self):
58 | genome_build = "GRCh38"
59 | parser = GFF3Parser(self.REFSEQ_GFF3_FILENAME_2021, genome_build, self.FAKE_URL)
60 | _, transcripts = parser.get_genes_and_transcripts()
61 | self._test_exon_length(transcripts, genome_build, "NM_007294.4", 7088)
62 |
63 | transcript = transcripts["NM_015120.4"]
64 | protein = transcript.get("protein")
65 | self.assertEqual(protein, "NP_055935.4")
66 |
67 | def test_refseq_gff3_2023(self):
68 | genome_build = "GRCh38"
69 | parser = GFF3Parser(self.REFSEQ_GFF3_FILENAME_2023, genome_build, self.FAKE_URL)
70 | _, transcripts = parser.get_genes_and_transcripts()
71 | self._test_exon_length(transcripts, genome_build, "NM_007294.4", 7088)
72 |
73 | transcript = transcripts["NM_015120.4"]
74 | protein = transcript.get("protein")
75 | self.assertEqual(protein, "NP_055935.4")
76 |
77 | def test_exons_in_genomic_order(self):
78 | genome_build = "GRCh38"
79 | parser = GTFParser(self.ENSEMBL_104_GTF_FILENAME, genome_build, self.FAKE_URL)
80 | _, transcripts = parser.get_genes_and_transcripts()
81 | transcript = transcripts["ENST00000357654.9"]
82 | exons = transcript["genome_builds"][genome_build]["exons"]
83 | first_exon = exons[0]
84 | last_exon = exons[-1]
85 | self.assertGreater(last_exon[0], first_exon[0])
86 |
87 | parser = GFF3Parser(self.REFSEQ_GFF3_FILENAME_2021, genome_build, self.FAKE_URL)
88 | _, transcripts = parser.get_genes_and_transcripts()
89 | transcript = transcripts["NM_007294.4"]
90 | self.assertEqual(transcript.get("hgnc"), "1100", f"{transcript} has HGNC:1100")
91 | exons = transcript["genome_builds"][genome_build]["exons"]
92 | first_exon = exons[0]
93 | last_exon = exons[-1]
94 | self.assertGreater(last_exon[0], first_exon[0])
95 |
96 | parser = GFF3Parser(self.REFSEQ_GFF3_FILENAME_2023, genome_build, self.FAKE_URL)
97 | _, transcripts = parser.get_genes_and_transcripts()
98 | transcript = transcripts["NM_007294.4"]
99 | self.assertEqual(transcript.get("hgnc"), "1100", f"{transcript} has HGNC:1100")
100 | exons = transcript["genome_builds"][genome_build]["exons"]
101 | first_exon = exons[0]
102 | last_exon = exons[-1]
103 | self.assertGreater(last_exon[0], first_exon[0])
104 |
105 | def test_ensembl_gtf_tags(self):
106 | genome_build = "GRCh38"
107 | parser = GTFParser(self.ENSEMBL_111_GTF_FILENAME, genome_build, self.FAKE_URL)
108 | genes, transcripts = parser.get_genes_and_transcripts()
109 | transcript = transcripts["ENST00000641515.2"]
110 | tag = transcript["genome_builds"][genome_build].get("tag")
111 | self.assertIn("MANE_Select", tag)
112 |
113 | def test_chrom_contig_conversion(self):
114 | genome_build = "GRCh38"
115 | parser = GTFParser(self.ENSEMBL_111_GTF_FILENAME, genome_build, self.FAKE_URL)
116 | _, transcripts = parser.get_genes_and_transcripts()
117 | transcript = transcripts["ENST00000641515.2"]
118 | contig = transcript["genome_builds"][genome_build].get("contig")
119 | self.assertEqual(contig, "NC_000001.11")
120 |
121 | def test_ncrna_gene(self):
122 | """ We were incorrectly missing ncRNA gene info @see https://github.com/SACGF/cdot/issues/72 """
123 | genome_build = "GRCh38"
124 | parser = GTFParser(self.ENSEMBL_111_GTF_FILENAME, genome_build, self.FAKE_URL)
125 | genes, transcripts = parser.get_genes_and_transcripts()
126 | gene = genes["ENSG00000210156"]
127 | gene_symbol = gene["gene_symbol"]
128 | self.assertEqual(gene_symbol, "MT-TK")
129 |
130 | def _test_mito(self, filename, genome_build):
131 | parser = GFF3Parser(filename, genome_build, self.FAKE_URL)
132 | genes, transcripts = parser.get_genes_and_transcripts()
133 |
134 | for transcript_accession in self.FAKE_MT_TRANSCRIPTS:
135 | self.assertIn(transcript_accession, transcripts)
136 |
137 | transcript = transcripts["fake-rna-ATP6"]
138 | exons = transcript["genome_builds"][genome_build]["exons"]
139 | first_exon = exons[0]
140 | self.assertEqual(first_exon[0], 8526)
141 | self.assertEqual(first_exon[1], 9207)
142 |
143 | def test_mito_mrna(self):
144 | """ Need to make fake MT transcripts for RefSeq @see https://github.com/SACGF/cdot/issues/72 """
145 | self._test_mito(self.REFSEQ_GFF3_FILENAME_GRCH38_MT, "GRCh38")
146 |
147 | def test_mito_no_mrna(self):
148 | """ Need to make fake MT transcripts for RefSeq @see https://github.com/SACGF/cdot/issues/72 """
149 | self._test_mito(self.REFSEQ_GFF3_FILENAME_GRCH37_MT, "GRCh37")
150 |
--------------------------------------------------------------------------------
/tests/test_json_data_provider_ensembl.py:
--------------------------------------------------------------------------------
1 | import os
2 | import unittest
3 | from abc import ABC, abstractmethod
4 | from inspect import getsourcefile
5 | from os.path import abspath
6 |
7 | import hgvs
8 | from hgvs.assemblymapper import AssemblyMapper
9 | from hgvs.dataproviders.seqfetcher import SeqFetcher
10 | from hgvs.exceptions import HGVSDataNotAvailableError
11 |
12 | from cdot.hgvs.dataproviders import ChainedSeqFetcher
13 | from cdot.hgvs.dataproviders.json_data_provider import JSONDataProvider
14 | from tests.mock_seqfetcher import MockSeqFetcher
15 | from tests.mock_ensembl_tark import MockEnsemblTarkDataProvider
16 |
17 |
class AbstractEnsemblTestCase(unittest.TestCase, ABC):
    """ Shared tests for Ensembl-backed hgvs data providers.

        Subclasses must override setUpClass() and assign cls.json_data_provider.
        The base class raises SkipTest so unittest discovery does not run it directly.
    """
    @classmethod
    def setUpClass(cls):
        """ Subclasses need to override this """
        raise unittest.SkipTest

    def test_transcript(self):
        """ c.1582G>A on ENST00000617537.5 maps to the expected GRCh38 genomic variant """
        am = AssemblyMapper(self.json_data_provider,
                            assembly_name='GRCh38', alt_aln_method='splign', replace_reference=True)
        HGVS_C_TO_G = [
            ('ENST00000617537.5:c.1582G>A', 'NC_000007.14:g.36522056C>T'),
        ]

        hp = hgvs.parser.Parser()
        for hgvs_c, expected_hgvs_g in HGVS_C_TO_G:
            var_c = hp.parse_hgvs_variant(hgvs_c)
            var_g = am.c_to_g(var_c)
            self.assertEqual(str(var_g), expected_hgvs_g)

    def test_get_tx_for_gene(self):
        """ get_tx_for_gene("AOAH") returns the expected transcript on the GRCh38 contig """
        found = False
        expected_transcript = "ENST00000617537.5"
        for tx_data in self.json_data_provider.get_tx_for_gene("AOAH"):
            if tx_data["tx_ac"] == expected_transcript:
                found = True
                self.assertEqual(tx_data["alt_ac"], "NC_000007.14")
                break  # No need to examine remaining transcripts
        self.assertTrue(found)

    def test_get_tx_for_region(self):
        """ An exonic genomic range returns the transcript with expected strand and extent """
        found = False
        expected_transcript = "ENST00000617537.5"
        # Exonic coordinate
        for tx_data in self.json_data_provider.get_tx_for_region("NC_000007.14", "splign", 36530416, 36530514):
            if tx_data["tx_ac"] == expected_transcript:
                found = True
                self.assertEqual(tx_data["alt_strand"], -1)
                self.assertEqual(tx_data["start_i"], 36512940)
                self.assertEqual(tx_data["end_i"], 36724494)
                break

        self.assertTrue(found)

    def test_get_pro_ac_for_tx_ac(self):
        """ Transcript accession resolves to its versioned protein accession """
        pro_ac = self.json_data_provider.get_pro_ac_for_tx_ac("ENST00000617537.5")
        self.assertEqual(pro_ac, "ENSP00000483783.1")

    def test_get_tx_info(self):
        # We only have data for GRCh38 but none for 37

        # Make sure 37 fails
        with self.assertRaises(HGVSDataNotAvailableError):
            self.json_data_provider.get_tx_info("ENST00000617537.5", "NC_000007.13", "splign")

        # Make sure 38 works (should not raise)
        self.json_data_provider.get_tx_info("ENST00000617537.5", "NC_000007.14", "splign")
76 |
77 |
class JsonDataProviderTestCase(AbstractEnsemblTestCase):
    """ Runs the shared Ensembl tests against a file-backed JSONDataProvider """
    @classmethod
    def setUpClass(cls):
        test_data_dir = os.path.join(os.path.dirname(abspath(getsourcefile(lambda: 0))), "test_data")
        json_file = os.path.join(test_data_dir, "cdot.ensembl.grch38.json")
        transcripts_file = os.path.join(test_data_dir, "transcript_sequences.json")
        # Prefer the mocked transcript sequences, fall back to the real SeqFetcher
        seqfetcher = ChainedSeqFetcher(MockSeqFetcher(transcripts_file), SeqFetcher())
        cls.json_data_provider = JSONDataProvider([json_file], seqfetcher=seqfetcher)
88 |
89 |
class EnsemblTarkDataProviderTestCase(AbstractEnsemblTestCase):
    # Runs the shared AbstractEnsemblTestCase tests against the mock Tark provider
    # (presumably backed by the JSON fixtures under test_data/ensembl_tark - see repo layout)
    @classmethod
    def setUpClass(cls):
        cls.json_data_provider = MockEnsemblTarkDataProvider()
94 |
95 |
96 |
# Allow running this test module directly, e.g. "python tests/test_json_data_provider_ensembl.py"
if __name__ == '__main__':
    unittest.main()
99 |
--------------------------------------------------------------------------------
/tests/test_json_data_provider_refseq.py:
--------------------------------------------------------------------------------
1 | import os
2 | import unittest
3 | from inspect import getsourcefile
4 | from os.path import abspath
5 |
6 | import hgvs
7 | from hgvs.assemblymapper import AssemblyMapper
8 | from hgvs.dataproviders.seqfetcher import SeqFetcher
9 | from hgvs.exceptions import HGVSDataNotAvailableError
10 |
11 | from cdot.hgvs.dataproviders import ChainedSeqFetcher
12 | from cdot.hgvs.dataproviders.json_data_provider import JSONDataProvider
13 | from tests.mock_seqfetcher import MockSeqFetcher
14 |
15 |
class TestJSONDataProvider(unittest.TestCase):
    """ Tests for JSONDataProvider using the RefSeq GRCh37 cdot JSON test data """
    @classmethod
    def setUpClass(cls):
        this_file_dir = os.path.dirname(abspath(getsourcefile(lambda: 0)))
        test_json_file = os.path.join(this_file_dir, "test_data/cdot.refseq.grch37.json")
        test_transcripts_file = os.path.join(this_file_dir, "test_data/transcript_sequences.json")
        mock_seqfetcher = MockSeqFetcher(test_transcripts_file)
        # Fall back to the real SeqFetcher for any sequence missing from the mock
        seqfetcher = ChainedSeqFetcher(mock_seqfetcher, SeqFetcher())
        cls.json_data_provider = JSONDataProvider([test_json_file], seqfetcher=seqfetcher)

    def test_transcript(self):
        """ c.1582G>A on NM_001637.3 maps to the expected GRCh37 genomic variant """
        am = AssemblyMapper(self.json_data_provider,
                            assembly_name='GRCh37', alt_aln_method='splign', replace_reference=True)
        HGVS_C_TO_G = [
            ('NM_001637.3:c.1582G>A', 'NC_000007.13:g.36561662C>T'),
        ]

        hp = hgvs.parser.Parser()
        for hgvs_c, expected_hgvs_g in HGVS_C_TO_G:
            var_c = hp.parse_hgvs_variant(hgvs_c)
            var_g = am.c_to_g(var_c)
            self.assertEqual(str(var_g), expected_hgvs_g)

    def test_get_tx_for_gene(self):
        """ get_tx_for_gene("AOAH") returns the expected transcript on the GRCh37 contig """
        found = False
        expected_transcript = "NM_001637.3"
        for tx_data in self.json_data_provider.get_tx_for_gene("AOAH"):
            if tx_data["tx_ac"] == expected_transcript:
                found = True
                self.assertEqual(tx_data["alt_ac"], "NC_000007.13")
                break  # No need to examine remaining transcripts
        self.assertTrue(found)

    def _assert_tx_for_region(self, start, end):
        """ get_tx_for_region over [start, end) must return NM_001637.3 w/expected strand + extent """
        found = False
        expected_transcript = "NM_001637.3"
        for tx_data in self.json_data_provider.get_tx_for_region("NC_000007.13", "splign", start, end):
            if tx_data["tx_ac"] == expected_transcript:
                found = True
                self.assertEqual(tx_data["alt_strand"], -1)
                self.assertEqual(tx_data["start_i"], 36552548)
                self.assertEqual(tx_data["end_i"], 36764154)
                break
        self.assertTrue(found)

    def test_get_tx_for_region(self):
        # Exonic coordinate
        self._assert_tx_for_region(36570024, 36570025)

    def test_get_tx_for_region_intron(self):
        """ Test case for https://github.com/SACGF/cdot/issues/38 """
        # Coordinates below are intronic
        self._assert_tx_for_region(36743533, 36745648)

    def test_get_pro_ac_for_tx_ac(self):
        """ Transcript accession resolves to its versioned protein accession """
        pro_ac = self.json_data_provider.get_pro_ac_for_tx_ac("NM_001637.3")
        self.assertEqual(pro_ac, "NP_001628.1")

    def test_get_gene_info(self):
        """ get_gene_info returns the expected gene info fields for GATA2 """
        gene_info = self.json_data_provider.get_gene_info("GATA2")
        # Summary is long free text - just check it mentions a stable phrase
        summary = gene_info.pop("summary")
        self.assertIn("zinc-finger transcription factors", summary)
        expected = {
            "hgnc": "GATA2",
            "maploc": "3q21.3",
            "descr": "GATA binding protein 2",
            "aliases": "{DCML,IMD21,MONOMAC,NFE1B}",
            "added": None,
        }
        self.assertEqual(gene_info, expected)

    def test_get_tx_info(self):
        # We only have data for GRCh37 but none for 38

        # Make sure 37 works (should not raise)
        self.json_data_provider.get_tx_info("NM_001637.3", "NC_000007.13", "splign")

        # Make sure 38 fails
        with self.assertRaises(HGVSDataNotAvailableError):
            self.json_data_provider.get_tx_info("NM_001637.3", "NC_000007.14", "splign")
106 |
107 |
# Allow running this test module directly, e.g. "python tests/test_json_data_provider_refseq.py"
if __name__ == '__main__':
    unittest.main()
110 |
--------------------------------------------------------------------------------
/tests/test_pyhgvs.py:
--------------------------------------------------------------------------------
1 | import os
2 | import unittest
3 | from inspect import getsourcefile
4 | from os.path import abspath
5 |
6 | import pyhgvs
7 |
8 | from cdot.pyhgvs.pyhgvs_transcript import JSONPyHGVSTranscriptFactory, is_sacgf_pyhgvs_fork
9 | from .genome import MockGenomeTestFile
10 |
11 |
class TestPyHGVS(unittest.TestCase):
    """ Tests for the pyhgvs transcript factory built from cdot JSON """

    def _get_factory_and_genome(self):
        """ Build the GRCh37 transcript factory and mock genome from test data (was duplicated per-test) """
        this_file_dir = os.path.dirname(abspath(getsourcefile(lambda: 0)))
        test_json_file = os.path.join(this_file_dir, "test_data/cdot.refseq.grch37.json")
        factory = JSONPyHGVSTranscriptFactory([test_json_file])
        genome = MockGenomeTestFile(
            db_filename='grch37.fa',
            filename=os.path.join(this_file_dir, 'test_data/grch37.genome'),
            create_data=False)
        return factory, genome

    def test_transcript(self):
        """ HGVS c. names resolve to the expected genomic coordinates and alleles """
        factory, genome = self._get_factory_and_genome()

        HGVS_C_TO_G = [
            ('NM_001637.3:c.1582G>A', 'NC_000007.13:g.36561662C>T'),
        ]

        sacgf_pyhgvs_fork = is_sacgf_pyhgvs_fork()

        def get_transcript(transcript_id):
            return factory.get_transcript_grch37(transcript_id, sacgf_pyhgvs_fork=sacgf_pyhgvs_fork)

        for hgvs_c, expected_hgvs_g in HGVS_C_TO_G:
            result = pyhgvs.parse_hgvs_name(hgvs_c, genome, get_transcript=get_transcript)
            name = pyhgvs.HGVSName(expected_hgvs_g)
            expected = (name.chrom, name.start, name.ref_allele, name.alt_allele)
            self.assertEqual(result, expected)

    def test_non_coding_transcript(self):
        """ NR_ transcripts must come back with is_coding False """
        factory, _ = self._get_factory_and_genome()  # genome not needed for this test

        transcript_id = "NR_023343.1"
        sacgf_pyhgvs_fork = is_sacgf_pyhgvs_fork()
        pyhgvs_transcript = factory.get_transcript_grch37(transcript_id, sacgf_pyhgvs_fork=sacgf_pyhgvs_fork)
        self.assertFalse(pyhgvs_transcript.is_coding, f"Transcript {transcript_id} is non-coding")
52 |
53 |
# Allow running this test module directly, e.g. "python tests/test_pyhgvs.py"
if __name__ == '__main__':
    unittest.main()
56 |
--------------------------------------------------------------------------------
/tests/test_uta_conversion.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from generate_transcript_data.cdot_json import _cigar_to_gap_and_length
3 |
4 |
class UTAConversionTestCase(unittest.TestCase):
    """ Tests for converting UTA cigar strings into cdot gap strings + exon lengths """

    def test_cigar_to_gap_and_length(self):
        """ 'D' cigar operations become 'I' entries in the gap string """
        cigar = '194=1D60=1D184='
        expected_gap = 'M194 I1 M60 I1 M184'

        gap, _ = _cigar_to_gap_and_length(cigar)  # exon length not checked here
        self.assertEqual(gap, expected_gap)

    def test_cigar_full_match(self):
        """ Should return None as perfect match """
        cigar = '194='

        gap, _ = _cigar_to_gap_and_length(cigar)
        self.assertIsNone(gap)

    def test_cigar_merged_matches(self):
        """ Adjacent match operations merge - still a perfect match so no gap """
        cigar = '194=100='

        gap, _ = _cigar_to_gap_and_length(cigar)
        self.assertIsNone(gap)

    def test_cigar_mismatch(self):
        """ Mismatches count as matches for gap purposes and merge with neighbours """
        cigar = '195=1X1D430='  # X will become match and should merge w/first
        expected_gap = "M196 I1 M430"

        gap, _ = _cigar_to_gap_and_length(cigar)
        self.assertEqual(gap, expected_gap)

    def test_cigar_deletion_exon_length(self):
        """ 'I' operations do not contribute to the exon length """
        cigar = '100=50I100='  # 100 match, 50 insertion (in ref, del in transcript), 100 match = 200 exon length

        _, exon_length = _cigar_to_gap_and_length(cigar)
        self.assertEqual(exon_length, 200)
40 |
41 |
# Allow running this test module directly, e.g. "python tests/test_uta_conversion.py"
if __name__ == '__main__':
    unittest.main()
44 |
--------------------------------------------------------------------------------