├── .gitignore ├── .gitlab-ci.yml ├── Dockerfile ├── LICENSE ├── README.md ├── TODO.md ├── clinvcf.nimble ├── clinvcf.png ├── src ├── clinvcf.nim ├── clinvcfpkg │ ├── gff.nim │ ├── hgnc.nim │ ├── lapper.nim │ └── utils.nim ├── compvcf.nim └── extractClinvarSet.nim └── tests ├── all.nim ├── clinvcf_tests.nim ├── files ├── 109.xml ├── 1166.xml ├── 140866.xml ├── 16895.xml ├── 182965.xml ├── 184976.xml ├── 225499.xml ├── 225974.xml ├── 242771.xml ├── 307134.xml ├── 340430.xml ├── 37785.xml ├── 5333.xml ├── 582.xml ├── 618897_2019-05.xml ├── 7108.xml ├── 9.xml ├── 928.xml ├── 9618.xml ├── ADORA2A.gff ├── BRCA2.gff ├── CFTR.gff ├── FTCD.gff ├── MT.gff ├── TREX1.gff └── hgnc_toy.tsv ├── functional-tests.sh ├── gff_tests.nim ├── hgnc_tests.nim └── nim.cfg /.gitignore: -------------------------------------------------------------------------------- 1 | src/clinvcf 2 | ssshtest 3 | old/ 4 | clinvcf 5 | tests/all 6 | extractClinvarSet 7 | src/clinvcf.dSYM/ 8 | tests/all.dSYM/ 9 | -------------------------------------------------------------------------------- /.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | variables: 2 | IMAGE_NAME: $CI_REGISTRY_IMAGE 3 | 4 | include: 5 | - project: "devops/gitlab-ci-template" 6 | file: "/release.gitlab-ci.yml" 7 | - project: "devops/gitlab-ci-template" 8 | file: "/ssh-key.gitlab-ci.yml" 9 | - project: "devops/gitlab-ci-template" 10 | file: "/docker.gitlab-ci.yml" 11 | 12 | stages: 13 | - test 14 | - release 15 | - artefact_build 16 | - delivery_dev 17 | - delivery_staging 18 | - delivery_prod 19 | 20 | release: 21 | extends: .release 22 | 23 | test: 24 | image: registry.gitlab.seq.one/devops/dockerfiles/nim-builder:1.6.14 25 | stage: test 26 | script: 27 | - apt-get update && apt-get install -y git 28 | - nimble build -y 29 | - nimble test 30 | tags: 31 | - bioinfo 32 | 33 | artefact_build: 34 | image: registry.gitlab.seq.one/devops/dockerfiles/nim-builder:1.6.14 35 | stage: 
artefact_build 36 | script: 37 | - apt-get update && apt-get install -y git 38 | - nimble install -y 39 | artifacts: 40 | paths: 41 | - clinvcf 42 | tags: 43 | - bioinfo 44 | only: 45 | - tags 46 | 47 | #################### 48 | # DEPLOY 49 | #################### 50 | delivery_dev: 51 | extends: .build and delivery 52 | stage: delivery_dev 53 | environment: 54 | name: "$ENV_TAG" 55 | variables: 56 | ENV_TAG: dev 57 | DOCKER_BUILD_ARGS: --build-arg VERSION=$CI_COMMIT_REF_NAME 58 | --build-arg PRIVATE_CI_ACCESS_TOKEN=$PRIVATE_CI_ACCESS_TOKEN 59 | when: on_success 60 | tags: 61 | - bioinfo 62 | 63 | #################### 64 | # DEPLOY Staging 65 | #################### 66 | delivery_staging: 67 | extends: .tag and delivery 68 | stage: delivery_staging 69 | environment: 70 | name: "$ENV_TAG" 71 | variables: 72 | ENV_TAG: staging 73 | 74 | #################### 75 | # DEPLOY prod 76 | #################### 77 | delivery_prod: 78 | extends: .tag and delivery 79 | stage: delivery_prod 80 | environment: 81 | name: "$ENV_TAG" 82 | variables: 83 | ENV_TAG: prod 84 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM debian:stable-slim as builder 2 | ARG HTSLIB_VERSION=1.10 3 | 4 | RUN apt-get update && apt-get install --no-install-recommends -y \ 5 | libpcre3 libpcre3-dev \ 6 | make \ 7 | wget \ 8 | libbz2-dev \ 9 | bzip2 \ 10 | ca-certificates \ 11 | liblzma-dev \ 12 | zlib1g-dev libcurl4-gnutls-dev gcc 13 | 14 | # Install HTSLIB 15 | RUN cd /usr/bin && \ 16 | wget https://github.com/samtools/htslib/releases/download/$HTSLIB_VERSION/htslib-$HTSLIB_VERSION.tar.bz2 && \ 17 | tar -vxjf htslib-$HTSLIB_VERSION.tar.bz2 && \ 18 | cd htslib-$HTSLIB_VERSION && \ 19 | make 20 | 21 | # FROM debian:stable-slim 22 | # ARG HTSLIB_VERSION=1.10 23 | 24 | # COPY --from=builder /usr/bin/htslib-$HTSLIB_VERSION /usr/bin/htslib-$HTSLIB_VERSION 25 | 26 | ENV 
LD_LIBRARY_PATH=/usr/bin/htslib-$HTSLIB_VERSION 27 | 28 | 29 | ARG PRIVATE_CI_ACCESS_TOKEN 30 | ARG VERSION 31 | 32 | COPY clinvcf /usr/bin/ 33 | 34 | ENTRYPOINT ["clinvcf"] 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2020 SeqOne 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ClinVCF 2 | 3 | ![ClinVCF-logo](clinvcf.png) 4 | 5 | ClinVCF **generates a VCF file from a ClinVar Full Release** (XML format). It was first developed because we observed missing variants in VCF files provided by NCBI. We later extended its capabilities to provide enhanced Clinvar VCF files by : 6 | 7 | - **Improving Clinvar classification and aggregation method** by [deciphering "conflicting interpretation" records](#clinicalsignificance-correction-module) where almost all submissions go in the same direction. 8 | - **Implementing a more robust [gene annotation module](#gene-annotation)** based on NCBI GFF files. 9 | 10 | ClinVCF is **developed in NimLang, is highly efficient** (~ 5 minutes to generate the VCF from the XML) and supports GRCh37 and GRCh38 genome builds. 
11 | 12 | **clinVCF** is a part of the [**Genome Alert!** framework](https://github.com/SeqOne/GenomeAlert_app) - [Website https://genomealert.univ-grenoble-alpes.fr/](https://genomealert.univ-grenoble-alpes.fr/). 13 | 14 | ## Table of content 15 | 16 | - [ClinVCF](#clinvcf) 17 | - [Table of content](#table-of-content) 18 | - [Quick start](#quick-start) 19 | - [Usage](#usage) 20 | - [Output format](#output-format) 21 | - [Methodology](#methodology) 22 | - [ClinicalSignificance correction module](#clinicalsignificance-correction-module) 23 | - [Gene annotation](#gene-annotation) 24 | - [How to cite](#how-to-cite) 25 | - [License](#license) 26 | - [Misc](#misc) 27 | 28 | ## Quick start 29 | 30 | You need to have [nimlang installed](https://nim-lang.org/install_unix.html) and [hts-nim](https://github.com/brentp/hts-nim) to compile and install clinVCF. 31 | 32 | A clean install script of nim and hts-nim is proposed by Brent Pedersen [nimlang and hts-nim installed](https://github.com/brentp/hts-nim/blob/master/scripts/install.sh) 33 | 34 | ```bash 35 | # Git clone and install 36 | git clone https://github.com/SeqOne/clinvcf.git && cd clinvcf && nimble install 37 | 38 | # Download (latest) Clinvar XML release 39 | wget ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_00-latest.xml.gz 40 | 41 | # Download GFF for gene annotation (GRCh37 or 38) 42 | wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/GRCh37_latest/refseq_identifiers/GRCh37_latest_genomic.gff.gz 43 | wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_genomic.gff.gz 44 | 45 | # Generate clinvar VCF 46 | ## For GRCh37 47 | clinvcf --coding-first --genome GRCh37 ClinVarFullRelease_00-latest.xml.gz | bgzip -c > clinvar_GRCh37.vcf.gz 48 | ## For GRCh38 49 | clinvcf --coding-first --genome GRCh38 ClinVarFullRelease_00-latest.xml.gz | bgzip -c > clinvar_GRCh38.vcf.gz 50 | 51 | ``` 52 | 53 | ## Usage 54 | 55 | ```bash 56 | Usage: clinvcf 
[options] --genome 57 | 58 | Arguments: 59 | --genome Genome assembly to use 60 | 61 | Options: 62 | --filename-date Use xml filename date instead of inner date which may differ 63 | --hgnc HGNC table used for gene name alias corrections 64 | 65 | Gene annotation: 66 | --gff NCBI GFF to annotate variations with genes 67 | --coding-first Give priority to coding gene in annotation (even if intronic and exonic for another gene) 68 | --gene-padding Padding to annotation upstream/downstream genes (not applied for MT) [default: 5000] 69 | ``` 70 | 71 | ### Output format 72 | 73 | ClinVCF generates a VCF with almost identical format as the original NCBI VCF. 74 | 75 | However, not all VCF fields are currently supported by ClinVCF (see table below), and 76 | additional fields are provided. 77 | 78 | | VCF Info field | Status* | Format | Description | Example | 79 | | -------------- | ------- | --------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ---------------------------------------------- | 80 | | **ALLELEID** | Same | *Integer* | the ClinVar Allele ID | `1234` | 81 | | **CLNREVSTAT** | Same | *String* | [ClinVar review status](https://www.ncbi.nlm.nih.gov/clinvar/docs/review_status/) for the Variation ID | `no_assertion_criteria_provided` | 82 | | **CLNSIG** | Same | String | [Clinical significance](https://www.ncbi.nlm.nih.gov/clinvar/docs/clinsig/) for this single variant | `Pathogenic/Likely_Pathogenic` | 83 | | **OLD_CLNSIG** | New | String | Original Clinical significance if variant reclassified by clinVCF correction module | `Conflicting_interpretations_of_pathogenicity` | 84 | | **CLNRECSTAT** | New | Integer | [3-levels stars confidence](#clinicalsignificance-correction-module) of Variant Alert! automatic reclassification. | `3` | 85 | | **GENEINFO** | Same | String | Gene(s) for the variant reported as gene symbol:gene id. 
The gene symbol and id are delimited by a colon (`:`) and each pair is delimited by a vertical bar (`\|`) | `FTCD:10841\|FTCD-AS1:100861507` | 86 | | **MC** | Same | String | comma separated list of molecular consequence in the form of Sequence Ontology `ID\|molecular_consequence` | `SO:0001583\|missense_variant` | 87 | | **RS** | Same | String | dbSNP ID (i.e. rs number) | `80358507` | 88 | 89 | **Status**: *Same* (identical as in original Clinvar VCF), *new* (New field from clinVCF) 90 | 91 | ## Methodology 92 | 93 | ### ClinicalSignificance correction module 94 | 95 | According to the 1.5 * IQR method, we remove outlier submissions and reclassify conflicting status variants according to ClinVar policies. We apply a 3-level star metric according to our reclassification confidence. At least 4 submissions are needed. We only reclassify variants from `conflicting` status to `benign`, `likely benign`, `likely pathogenic` and `pathogenic` status. 96 | 97 | - ⭐ **(1 star)** : default 98 | - ⭐⭐ **(2 stars)** : reclassification remains even if we add a virtual VUS submission 99 | - ⭐⭐⭐ **(3 stars)** : 2 stars requirements and at least 1 pathogenic (or benign) classification 100 | 101 | ### Gene annotation 102 | 103 | 1. **We load all genes from the input GFF** and add them to the index with a padding (5000bp by default and 2bp for MT genes), to annotate upstream / downstream variants. 104 | 2. **For each variant we query the gene index** and retrieve all overlapping genes. 105 | 3. 
**Overlapped genes are later prioritized** in the `GENEINFO` field with two different procedures (depending on the clinVCF parameters) 106 | - If `--coding-first` option is activated : 107 | - We take coding genes over all other genes (except for MT genome) 108 | - If there is a tie, we take exonic (+/-20bp padding) over intronic/intergenic candidates 109 | - If none are exonic, we take the gene with the closest exon 110 | - If both are exonic, we take the oldest gene ID in the NCBI Entrez database 111 | - Default procedure : 112 | - We take coding gene over all other genes (except for MT genome) if the variant is exonic (+/- 20bp) 113 | - If there is a tie, we take exonic (+/-20bp padding) over intronic/intergenic candidates 114 | - If none are exonic, we take the gene with the closest exon 115 | - If both are exonic, we take the oldest gene ID in the NCBI Entrez database 116 | 117 | ## How to cite 118 | 119 | If you use a tool of the Genome Alert! framework, please cite: 120 | > Yauy et al., Genome Alert!: a standardized procedure for genomic variant reinterpretation and automated genotype-phenotype reassessment in clinical routine. medRxiv (2021). [https://doi.org/10.1101/2021.07.13.21260422 121 | ](https://www.medrxiv.org/content/10.1101/2021.07.13.21260422v1) 122 | 123 | ## License 124 | 125 | **clinVCF** is licensed under the Apache License, Version 2.0. See [LICENSE](LICENSE) for the full license text. 
126 | 127 | ## Misc 128 | 129 | **clinVCF** is a part of the [**Genome Alert!** framework](https://github.com/SeqOne/GenomeAlert_app), a collaboration of : 130 | 131 | [![SeqOne](img/logo-seqone.png)](https://seq.one/) 132 | 133 | [![Université Grenoble Alpes](img/logo-uga.png)](https://iab.univ-grenoble-alpes.fr/) 134 | 135 | [![CHU de Rouen](img/logo-CHU.png)](https://www.chu-rouen.fr/service/service-de-genetique/) -------------------------------------------------------------------------------- /TODO.md: -------------------------------------------------------------------------------- 1 | [x] Packaging with Nimble 2 | [x] Sort VCF before 3 | [x] Comp Clinvar VCF and ours 4 | [x] Functional testing 5 | [x] Gitlab CI 6 | [x] Implement new aggregation function 7 | [x] Add RS tag (dbSNP rsid) 8 | [x] Add date header 9 | [x] Handle GenotypeSet (These variants should not be included in the VCF (see case 424779)) 10 | [x] Handle automatic clinsig conversions from NCBI 11 | [x] Multiple submitter is not multiple submissions !!! (see case 307134) 12 | [x] Multiple 3-4 stars subs, take all !!! (see case 7108) 13 | [x] Sort non-ACMG clnsig lexicographically (see case 5333 : drug_response,_risk_factor,_protective => drug_response,_protective,_risk_factor) 14 | [x] Bug "criteria_provided,_single_submitter" that should be "criteria_provided,_conflicting_interpretations" when only one submitter with conflict (see case 1166) 15 | [x] Bug Conflicting only when 1 star or more ! 16 | [x] Memory leak somewhere in xml parsing (see huge memory footprint for extractClinvarSet !!!!) 
17 | [x] Output stats of reclassification 18 | [ ] Get rid of q() xml lib and do it directly with xmltree package (see extractClinvarSet code) 19 | [ ] Gene stats module 20 | [ ] Unit testing 21 | [ ] Add gnomad annot (using API calls and cache) 22 | [ ] Add progress bar when loading variants 23 | [ ] Create README file 24 | [ ] Add NB_STARS tag 25 | [ ] Add a tag with number of submitters / submissions 26 | [ ] Optimize memory usage (variant infos could be stored in cache files and re-loaded at "print" time !) 27 | 28 | 29 | no assertion criteria provided 30 | likely pathogenic - adrenal pheochromocytoma 31 | Converted during submission to Likely pathogenic. 32 | 33 | 34 | Correct discrepancies : 35 | 36 | CLINSIG ERRORS 37 | 38 | A submission has a non-harmonized clinsig value (ex: likely pathogenic - adrenal pheochromocytoma) that 39 | is said to be converted to Likely pathogenic, but information is not really there in XML (or is it ?) 40 | EX: DIFF OF CLNSIG for variant 109 : clinvar_2020-01.vcf = risk_factor <-> clinvar_20191223.vcf.gz = Likely_pathogenic,_risk_factor 41 | EX: DIFF OF CLNSIG for variant 1365 : clinvar_2020-01.vcf = Pathogenic <-> clinvar_20191223.vcf.gz = Pathogenic/Likely_pathogenic 42 | EX: DIFF OF CLNSIG for variant 1762 : clinvar_2020-01.vcf = not_provided <-> clinvar_20191223.vcf.gz = Benign 43 | 44 | https://www.ncbi.nlm.nih.gov/clinvar/variation/1762/ 45 | 46 | It looks like we do not filter-out the 0-star "Pathogenic" submission from this one. 
47 | DIFF OF CLNSIG for variant 928 : clinvar_2020-01.vcf = Pathogenic/Likely_pathogenic <-> clinvar_20191223.vcf.gz = Likely_pathogenic 48 | EX: DIFF OF CLNSIG for variant 1274 : clinvar_2020-01.vcf = Pathogenic/Likely_pathogenic <-> clinvar_20191223.vcf.gz = Pathogenic 49 | 50 | REVSTAT ERRORS 51 | 52 | MISSING VARIANTS 53 | -------------------------------------------------------------------------------- /clinvcf.nimble: -------------------------------------------------------------------------------- 1 | # Package 2 | 3 | version = "0.0.1" 4 | author = "Jérôme Audoux, Sacha Beaumeunier" 5 | description = "Generate a clean Clinvar VCF" 6 | license = "SEQONE" 7 | 8 | 9 | # Dependencies 10 | 11 | requires "hts >= 0.2.20 & <= 0.2.23", "q", "docopt"#, "lapper" 12 | requires "https://github.com/GULPF/tiny_sqlite#head" 13 | requires "regex >= 0.13" 14 | srcDir = "src" 15 | installExt = @["nim"] 16 | 17 | bin = @["clinvcf", "extractClinvarSet", "compvcf"] 18 | 19 | skipDirs = @["tests"] 20 | 21 | import ospaths,strutils 22 | 23 | task test, "run the tests": 24 | exec "nim c -d:useSysAssert -d:useGcAssert --lineDir:on --debuginfo --lineDir:on --debuginfo -r --threads:on tests/all" 25 | exec "bash tests/functional-tests.sh" 26 | -------------------------------------------------------------------------------- /clinvcf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SeqOne/clinvcf/59a63ac0bde68d95c93106f55aab6c2c0c0735c6/clinvcf.png -------------------------------------------------------------------------------- /src/clinvcfpkg/gff.nim: -------------------------------------------------------------------------------- 1 | import tables, hts, strutils, re, algorithm 2 | 3 | import ./lapper 4 | import logging 5 | import strformat 6 | 7 | from ./utils import logger 8 | 9 | type 10 | Region* = ref object of RootObj 11 | chrom*: string 12 | start*: int 13 | stop*: int 14 | 15 | GFFGene* = ref object of Region 
16 | gene_symbol: string 17 | gene_id: int 18 | biotype: string 19 | exons: seq[Region] 20 | 21 | RequestGene = ref object 22 | gene: GFFGene 23 | query: Region 24 | 25 | proc start*(region : Region): int {.inline.} = return region.start 26 | proc stop*(region : Region): int {.inline.} = return region.stop 27 | proc len*(region : Region): int {.inline.} = return region.stop - region.start + 1 28 | proc `==`*(a, b: Region): bool {.inline.} = return a.chrom == b.chrom and a.start == b.start and a.stop == b.stop 29 | proc `$`*(a: Region): string {.inline.} = return a.chrom & ":" & $a.start & "-" & $a.stop 30 | proc isOverlapping*(a: Region, b: Region): bool {.inline.} = return a.stop >= b.start and a.start <= b.stop 31 | proc merge*(a: var Region, b: Region) {.inline.} = a.start = min(a.start, b.start); a.stop = max(a.stop, b.stop) 32 | 33 | # Coversion table from NCBI chromosomes ID's to usual names 34 | let 35 | # FIXME: Should we remove the version from NC_ ids ? 36 | # ClinVar use MT and not M 37 | ncbi_to_chr = { 38 | "NC_000001.10": "1", 39 | "NC_000002.11": "2", 40 | "NC_000003.11": "3", 41 | "NC_000004.11": "4", 42 | "NC_000005.9": "5", 43 | "NC_000006.11": "6", 44 | "NC_000007.13": "7", 45 | "NC_000008.10": "8", 46 | "NC_000009.11": "9", 47 | "NC_000010.10": "10", 48 | "NC_000011.9": "11", 49 | "NC_000012.11": "12", 50 | "NC_000013.10": "13", 51 | "NC_000014.8": "14", 52 | "NC_000015.9": "15", 53 | "NC_000016.9": "16", 54 | "NC_000017.10": "17", 55 | "NC_000018.9": "18", 56 | "NC_000019.9": "19", 57 | "NC_000020.10": "20", 58 | "NC_000021.8": "21", 59 | "NC_000022.10": "22", 60 | "NC_000023.10": "X", 61 | "NC_000024.9": "Y", 62 | "NC_012920.1": "MT", 63 | "NC_000001.11": "1", 64 | "NC_000002.12": "2", 65 | "NC_000003.12": "3", 66 | "NC_000004.12": "4", 67 | "NC_000005.10": "5", 68 | "NC_000006.12": "6", 69 | "NC_000007.14": "7", 70 | "NC_000008.11": "8", 71 | "NC_000009.12": "9", 72 | "NC_000010.11": "10", 73 | "NC_000011.10": "11", 74 | "NC_000012.12": "12", 
75 | "NC_000013.11": "13", 76 | "NC_000014.9": "14", 77 | "NC_000015.10": "15", 78 | "NC_000016.10": "16", 79 | "NC_000017.11": "17", 80 | "NC_000018.10": "18", 81 | "NC_000019.10": "19", 82 | "NC_000020.11": "20", 83 | "NC_000021.9": "21", 84 | "NC_000022.11": "22", 85 | "NC_000023.11": "X", 86 | "NC_000024.10": "Y", 87 | }.toTable 88 | 89 | proc minExonDist*(gene: GFFGene, pos: int, padding : int): int = 90 | var min_dist = -1 91 | for exon in gene.exons: 92 | # Add padding to the exons, in order to consider close intronic regions as "exonic" 93 | # as these are likely to be linked to this gene 94 | let 95 | start = exon.start - padding 96 | stop = exon.stop + padding 97 | if pos >= start and pos <= stop: 98 | min_dist = 0 99 | break 100 | else: 101 | let dist = min(abs(start - pos), abs(pos - stop)) 102 | if min_dist == -1 or dist < min_dist: 103 | min_dist = dist 104 | result = min_dist 105 | 106 | proc minExonDist*(gene: GFFGene, start: int, stop: int, padding : int): int = 107 | result = min(gene.minExonDist(start, padding),gene.minExonDist(stop, padding)) 108 | 109 | proc removeChrPrevix*(chrom: string): string = 110 | if chrom =~ re"""^chr(.*)""": 111 | return matches[0] 112 | else: 113 | return chrom 114 | 115 | proc parseChr*(chrom: string): string {.inline.} = 116 | if ncbi_to_chr.hasKey(chrom): 117 | result = ncbi_to_chr[chrom] 118 | else: 119 | result = removeChrPrevix(chrom) 120 | 121 | proc parseKeyValues*(str: string, global_sep: char, key_value_sep: char): TableRef[string, string] = 122 | let fields = str.split(global_sep) 123 | result = newTable[string, string]() 124 | for f in fields: 125 | let 126 | kv_split = f.split(key_value_sep, 1) 127 | if kv_split.len() == 2: 128 | result[kv_split[0]] = kv_split[1] 129 | else: 130 | stderr.writeLine("[Error] Value fields " & f & " was not a key/value field using separator " & key_value_sep) 131 | 132 | proc loadGenesFromGFF*(gff_file: string, gene_padding : int): (TableRef[string, int], TableRef[string, 
Lapper[GFFGene]]) = 133 | result = (newTable[string, int](), newTable[string, Lapper[GFFGene]]()) 134 | var 135 | fh: BGZ 136 | genes_chr_table = newTable[string, seq[GFFGene]]() # Temp table to load genes per chromosome 137 | genes_name_table = newTable[string, GFFGene]() 138 | 139 | open(fh, gff_file, "r") # NOTE(review): no matching close of fh is visible in this proc - confirm 140 | for line in fh: 141 | # Skip empty lines and headers 142 | if line.len() == 0 or line[0] == '#': 143 | continue 144 | var v = line.split('\t', 3) 145 | 146 | # Only use "BestRefSeq" annotations 147 | # This was disabled as MT annotations are annotated "RefSeq" and not "BestRefSeq" 148 | # if v[1] != "BestRefSeq": 149 | # continue 150 | 151 | # NC_000001.10 BestRefSeq gene 367659 368597 . + . ID=gene-OR4F29;Dbxref=GeneID:729759,HGNC:HGNC:31275;Name=OR4F29;description=olfactory receptor family 4 subfamily F member 29;gbkey=Gene;gene=OR4F29;gene_biotype=protein_coding;gene_synonym=OR7-21 152 | if v[2] == "gene" or v[2] == "pseudogene": 153 | var 154 | v2 = v[3].split('\t') 155 | chrom = parseChr(v[0]) 156 | start = parseInt(v2[0]) 157 | stop = parseInt(v2[1]) 158 | gff_fields = v2[5].parseKeyValues(';','=') 159 | dbxref_fields = gff_fields["Dbxref"].parseKeyValues(',',':') 160 | gene : GFFGene 161 | 162 | if chrom != "MT": 163 | # Add padding to the gene in order to annotate upstream / downstream variants 164 | gene = GFFGene(chrom: chrom, start: start - gene_padding, stop: stop + gene_padding, exons: @[]) 165 | else: 166 | # For MT, we only do +/-2bp padding 167 | gene = GFFGene(chrom: chrom, start: start - 2, stop: stop + 2, exons: @[]) 168 | 169 | gene.gene_symbol = gff_fields["Name"] 170 | if dbxref_fields.hasKey("GeneID"): 171 | gene.gene_id = parseInt(dbxref_fields["GeneID"]) 172 | 173 | result[0][gene.gene_symbol] = gene.gene_id 174 | 175 | if gff_fields.hasKey("gene_biotype"): 176 | gene.biotype = gff_fields["gene_biotype"] 177 | 178 | if genes_chr_table.hasKey(gene.chrom): 179 | genes_chr_table[gene.chrom].add(gene) 180 | else: 181 | genes_chr_table[gene.chrom] = @[gene] 182 | 183
| genes_name_table[gene.gene_symbol] = gene 184 | 185 | # NC_000001.10 Curated Genomic exon 131068 132927 . + . ID=id-CICP27;Parent=gene-CICP27;Dbxref=GeneID:100420257,HGNC:HGNC:48835;gbkey=exon;gene=CICP27 186 | elif v[2] == "exon": 187 | var 188 | v2 = v[3].split('\t') 189 | gff_fields = v2[5].parseKeyValues(';','=') 190 | dbxref_fields = gff_fields["Dbxref"].parseKeyValues(',',':') # NOTE(review): not read anywhere in this branch - confirm it can be dropped 191 | gene_symbol : string 192 | 193 | if gff_fields.hasKey("gene"): 194 | gene_symbol = gff_fields["gene"] 195 | 196 | # This exon belongs to a gene we are annotating, so we keep it 197 | if gene_symbol != "" and genes_name_table.hasKey(gene_symbol): 198 | let 199 | exon = Region(chrom: parseChr(v[0]),start: parseInt(v2[0]), stop: parseInt(v2[1])) 200 | 201 | # Only add unique exons and merge overlapping ones 202 | var i = 0 203 | for e in genes_name_table[gene_symbol].exons.mitems(): 204 | if e == exon: 205 | break 206 | elif e.isOverlapping(exon): 207 | e.merge(exon) 208 | break 209 | inc(i) 210 | 211 | # The exon has not been found / merged, so we add it 212 | if i == genes_name_table[gene_symbol].exons.len(): 213 | genes_name_table[gene_symbol].exons.add(exon) 214 | 215 | # Load the set of genes (per chromosome) into the lapper index 216 | logger.log(lvlInfo, fmt"Create lapper index for file {gff_file}") 217 | for chrom in genes_chr_table.keys(): 218 | result[1][chrom] = lapify(genes_chr_table[chrom]) 219 | 220 | proc cmpGenes*(x, y: RequestGene): int = 221 | ## Comparator used to order candidate genes: protein coding genes win over non-coding ones only under the exon-distance conditions tested below, then the closest exon wins, then the smallest gene_id 
222 | let 223 | x_exon_dist = x.gene.minExonDist(x.query.start, x.query.stop, 20) 224 | y_exon_dist = y.gene.minExonDist(y.query.start, y.query.stop, 20) 225 | 226 | # echo "X: " & x.gene.gene_symbol & " DIST: " & $x_exon_dist & " BIOTYPE: " & x.gene.biotype 227 | # echo "Y: " & y.gene.gene_symbol & " DIST: " & $y_exon_dist & " BIOTYPE: " & y.gene.biotype 228 | 229 | # First we give priority to protein_coding genes if the variant is within 20bp of an exon boundary or both are intronic 230 | # This does not apply to MT 231 | if x.gene.chrom != "MT" and x.gene.biotype == "protein_coding" and y.gene.biotype != "protein_coding" and (x_exon_dist <= 20 or (x_exon_dist > 0 and y_exon_dist > 0)): 232 | return -1 233 | elif x.gene.chrom != "MT" and x.gene.biotype != "protein_coding" and y.gene.biotype == "protein_coding" and (y_exon_dist <= 20 or (x_exon_dist > 0 and y_exon_dist > 0)): 234 | return 1 235 | else: 236 | # Otherwise we give priority to the genes having the closest exon 237 | if x_exon_dist != -1 and y_exon_dist != -1: 238 | # Both are coding or none of them is, we take the one with the closest exon 239 | let exon_dist_cmp = cmp(x_exon_dist, y_exon_dist) 240 | if exon_dist_cmp != 0: 241 | return exon_dist_cmp 242 | elif x_exon_dist >= 0: 243 | return -1 244 | else: 245 | return 1 246 | 247 | # Finally we choose the smallest (oldest) gene_id 248 | return cmp(x.gene.gene_id, y.gene.gene_id) 249 | 250 | proc cmpGenesCodingFirst*(x, y: RequestGene): int = 251 | ## We always select protein coding genes over non-coding genes 252 | 253 | # stderr.writeLine("gene: " & x.gene.gene_symbol & " biotype: " & x.gene.biotype & " dist: " & $x.gene.minExonDist(x.query.start, x.query.stop, 20)) 254 | # stderr.writeLine("gene: " & y.gene.gene_symbol & " biotype: " & y.gene.biotype & " dist: " & $y.gene.minExonDist(x.query.start, x.query.stop, 20)) 255 | 256 | # First we always give priority to protein_coding genes, whatever the distance to their exons 257 | # This does not apply to MT 
258 | if x.gene.chrom != "MT" and x.gene.biotype == "protein_coding" and y.gene.biotype != "protein_coding": 259 | return -1 260 | elif x.gene.chrom != "MT" and x.gene.biotype != "protein_coding" and y.gene.biotype == "protein_coding": 261 | return 1 262 | else: 263 | let 264 | x_exon_dist = x.gene.minExonDist(x.query.start, x.query.stop, 20) 265 | y_exon_dist = y.gene.minExonDist(y.query.start, y.query.stop, 20) 266 | # Otherwise we give priority to the genes having the closest exon 267 | if x_exon_dist != -1 and y_exon_dist != -1: 268 | # Both are coding or none of them is, we take the one with the closest exon 269 | let exon_dist_cmp = cmp(x_exon_dist, y_exon_dist) 270 | if exon_dist_cmp != 0: 271 | return exon_dist_cmp 272 | elif x_exon_dist >= 0: 273 | return -1 274 | else: 275 | return 1 276 | 277 | # Finally we choose the smallest (oldest) gene_id 278 | return cmp(x.gene.gene_id, y.gene.gene_id) 279 | 280 | proc getInfoString*(genes_index: TableRef[string, Lapper[GFFGene]], chrom: string, start: int, stop: int, coding_priority: bool): string = 281 | if genes_index.hasKey(chrom): 282 | var 283 | res = new_seq[GFFGene]() # Store retrieved genes 284 | found_overlapping_genes = genes_index[chrom].find(start, stop, res) 285 | 286 | # We have no overlapping genes, we try to find the nearest ones (upstream and downstream) 287 | if not found_overlapping_genes: 288 | var 289 | res_nearest_up = new_seq[GFFGene]() 290 | res_nearest_down = new_seq[GFFGene]() 291 | found_nearest_up = genes_index[chrom].find_nearest_upstream(start, res_nearest_up) 292 | found_nearest_down = genes_index[chrom].find_nearest_downstream(stop, res_nearest_down) 293 | dist_nearest_up = -1 294 | dist_nearest_down = -1 295 | 296 | if found_nearest_up: 297 | dist_nearest_up = stop - res_nearest_up[0].stop 298 | if found_nearest_down: 299 | dist_nearest_down = res_nearest_down[0].start - start 300 | 301 | if dist_nearest_up != -1 and dist_nearest_down != -1: 302 | # Select the nearest side (upstream or downstream) 303 | if 
dist_nearest_up < dist_nearest_down: 304 | res = res_nearest_up 305 | elif dist_nearest_down < dist_nearest_up: 306 | res = res_nearest_down 307 | # Same distance on both sides: merge the results 308 | else: 309 | res.add(res_nearest_up) 310 | res.add(res_nearest_down) 311 | elif dist_nearest_up != -1: 312 | res = res_nearest_up 313 | elif dist_nearest_down != -1: 314 | res = res_nearest_down 315 | 316 | if res.len() > 0: 317 | # Create object with gene + query interval for sorting (query is necessary for cmpGenes) 318 | var sorted_genes: seq[RequestGene] 319 | for g in res: 320 | sorted_genes.add(RequestGene(gene: g, query: Region(chrom: chrom, start: start, stop: stop))) 321 | 322 | # Sort genes with the requested comparator 323 | if coding_priority: 324 | sorted_genes.sort(cmpGenesCodingFirst) 325 | else: 326 | sorted_genes.sort(cmpGenes) 327 | 328 | var gene_info: seq[string] 329 | for q in sorted_genes: 330 | gene_info.add(q.gene.gene_symbol & ":" & $q.gene.gene_id) 331 | result = gene_info.join("|") 332 | else: 333 | stderr.writeLine("[Error] Chrom " & chrom & " not found in GFF annotations") 334 | -------------------------------------------------------------------------------- /src/clinvcfpkg/hgnc.nim: -------------------------------------------------------------------------------- 1 | import strformat 2 | import strutils 3 | import tables 4 | import logging 5 | 6 | from ./utils import logger 7 | 8 | type 9 | Entrez* = TableRef[string, int] 10 | Alias* = TableRef[string, string] 11 | SharedAlias* = TableRef[string, int] 12 | 13 | type 14 | HgncIndex* = ref object 15 | entrez*: Entrez 16 | alias*: Alias 17 | sharedAlias*: SharedAlias 18 | 19 | proc newEntrez*(): Entrez = 20 | result = newTable[string, int]() 21 | 22 | proc newAlias*(): Alias = 23 | result = newTable[string, string]() 24 | 25 | proc newSharedAlias*(): SharedAlias = 26 | result = newTable[string, int]() 27 | 28 | proc newHgncIndex*(): HgncIndex = 29 | ## Instantiate a new HgncIndex 30 | ## Each gene that has an Entrez ID is stored in the 'entrez' attribute. 
31 | ## Each gene alias is stored in the 'alias' attribute as a key 32 | result = HgncIndex(entrez: newEntrez(), alias: newAlias(), sharedAlias: newSharedAlias()) 33 | 34 | proc initHgncDbfromFile*(file: string): HgncIndex = 35 | ## Create an HgncIndex from a tab-separated HGNC table; raises IOError when a data line is read before the expected header line 36 | result = newHgncIndex() 37 | let f = open(file) 38 | defer: f.close() 39 | var line : string 40 | var isHeader = false 41 | while f.read_line(line): 42 | if line == "Approved symbol\tAlias symbols\tNCBI Gene ID(supplied by NCBI)": 43 | isHeader = true 44 | continue 45 | if not isHeader: 46 | raise newException(IOError, "wrong HGNC table header") 47 | var sl = line.split("\t") 48 | if sl[2] == "": 49 | # skip the gene if no entrezID is defined 50 | continue 51 | result.entrez[sl[0]] = parseInt(sl[2]) 52 | # handle aliases 53 | if sl[1] != "": 54 | var sAlias = sl[1].split(", ") 55 | for g in sAlias: 56 | if result.alias.hasKey(g): 57 | # alias already registered for a previous gene: count it as shared 58 | if not result.sharedAlias.hasKey(g): 59 | result.sharedAlias[g] = 1 60 | else: 61 | inc(result.sharedAlias[g]) 62 | else: 63 | result.alias[g] = sl[0] 64 | # some aliases are shared between genes. These aliases are ambiguous so we remove them 65 | for g, c in result.sharedAlias: 66 | logger.log(lvlInfo, fmt"[initHgncDbfromFile] remove ambiguous shared alias {g}") 67 | if result.alias.hasKey(g): 68 | result.alias.del(g) 69 | 70 | 71 | 72 | 73 | -------------------------------------------------------------------------------- /src/clinvcfpkg/lapper.nim: -------------------------------------------------------------------------------- 1 | ## FIXME: THIS IS A LOCAL COPY OF LAPPER (https://github.com/brentp/nim-lapper) 2 | ## FIRST: WE USE CLOSED INTERVALS AND NOT HALF-OPEN ONES AS IN THE ORIGINAL IMPLEMENTATION 3 | ## IMPLEMENTING TWO NEW PROCEDURES: find_nearest_downstream and find_nearest_upstream 4 | ## THESE MODIFICATIONS SHOULD BE MERGED IN THE LAPPER REPOSITORY TO AVOID MAINTAINING THIS 5 | ## COPY !!! 
6 | ## 7 | ## This module provides a simple data-structure for fast interval searches. It does not use an interval tree, 8 | ## instead, it operates on the assumption that most intervals are of similar length; or, more exactly, that the 9 | ## longest interval in the set is not long compared to the average distance between intervals. On any dataset 10 | ## where that is not the case, this method will not perform well. For cases where this holds true (as it often 11 | ## does with genomic data), we can sort by start and use binary search on the starts, accounting for the length 12 | ## of the longest interval. The advantage of this approach is simplicity of implementation and speed. In realistic 13 | ## tests queries returning the overlapping intervals are 1000 times faster than brute force and queries that merely 14 | ## check for the overlaps are > 5000 times faster. 15 | ## 16 | ## The main methods are `find` and `seek` where the latter uses a cursor and is very fast for cases when the queries 17 | ## are sorted. This is another innovation in this library that allows an additional ~50% speed improvement when 18 | ## consecutive queries are known to be in sorted order. 19 | ## 20 | ## For both find and seek, if the given intervals parameter is nil, the function will return a boolean indicating if 21 | ## any intervals in the set overlap the query. This is much faster than modifying the 22 | ## intervals. 23 | ## 24 | ## The example below shows off most of the API of `Lapper`. 25 | ## 26 | ## .. code-block:: nim 27 | ## import lapper 28 | ## type myinterval = ref object 29 | ## start: int 30 | ## stop: int 31 | ## val: int 32 | ## 33 | ## proc start(m: myinterval): int {.inline.} = return m.start 34 | ## proc stop(m: myinterval): int {.inline.} = return m.stop 35 | ## proc `$`(m:myinterval): string = return "(start:$#, stop:$#, val:$#)" % [$m.start, $m.stop, $m.val] 36 | ## 37 | ## create some fake data 38 | ## .. 
code-block:: nim 39 | ## var ivs = new_seq[myinterval]() 40 | ## for i in countup(0, 100, 10): 41 | ## ivs.add(myinterval(start:i, stop:i + 15, val:0)) 42 | 43 | ## make the Lapper "data-structure" 44 | 45 | ## .. code-block:: nim 46 | ## l = lapify(ivs) 47 | ## empty:seq[myinterval] 48 | 49 | ## .. code-block:: nim 50 | ## l.find(10, 20, empty) 51 | ## notfound = not l.find(200, 300, empty) 52 | ## assert notfound 53 | 54 | ## .. code-block:: nim 55 | ## res = new_seq[myinterval]() 56 | 57 | ## find is the more general case, l.seek gives a speed benefit when consecutive queries are in order. 58 | 59 | ## .. code-block:: nim 60 | ## echo l.find(50, 70, res) 61 | ## echo res 62 | ## # @[(start: 40, stop: 55, val:0), (start: 50, stop: 65, val: 0), (start: 60, stop: 75, val: 0), (start: 70, stop: 85, val: 0)] 63 | ## for r in res: 64 | ## r.val += 1 65 | 66 | ## or we can do a function on each overlapping interval 67 | 68 | ## .. code-block:: nim 69 | ## l.each_seek(50, 60, proc(a:myinterval) = inc(a.val)) 70 | 71 | ## or 72 | 73 | ## .. code-block:: nim 74 | ## l.each_find(50, 60, proc(a:myinterval) = a.val += 10) 75 | 76 | ## .. 
code-block:: nim 77 | ## discard l.seek(50, 70, res) 78 | ## echo res 79 | ## # @[(start:40, stop:55, val:12), (start:50, stop:65, val:12), (start:60, stop:75, val:1)] 80 | import algorithm 81 | import logging 82 | from ./utils import logger 83 | import strformat 84 | 85 | type 86 | 87 | Interval* = concept i 88 | ## An object/tuple must implement these 2 procs (start and stop, both returning int) to use this module 89 | start(i) is int 90 | stop(i) is int 91 | 92 | Lapper*[T] = object 93 | ## Lapper enables fast interval searches 94 | intervals: seq[T] 95 | max_len*: int 96 | cursor: int ## `cursor` is used internally by the ordered searches (`seek` and `each_seek`) 97 | 98 | template overlap*[T:Interval](a: T, start:int, stop:int): bool = 99 | ## overlap returns true if the closed interval `a` overlaps the closed range start..stop 100 | # original half-open test was: return a.start < stop and a.stop > start 101 | a.stop >= start and a.start <= stop 102 | 103 | proc iv_cmp[T:Interval](a, b: T): int = 104 | if a.start < b.start: return -1 105 | if b.start < a.start: return 1 106 | return cmp(a.stop, b.stop) 107 | 108 | proc lapify*[T:Interval](ivs:var seq[T]): Lapper[T] = 109 | ## create a new Lapper object; ivs will be sorted (by start, then by stop) and max_len is set to the largest stop - start. 110 | sort(ivs, iv_cmp) 111 | var l = Lapper[T](max_len: 0, intervals:ivs) 112 | for iv in ivs: 113 | if iv.stop - iv.start > l.max_len: 114 | l.max_len = iv.stop - iv.start 115 | return l 116 | 117 | proc lowerBound[T:Interval](a: var seq[T], start: int): int = 118 | result = a.low 119 | var count = a.high - a.low + 1 120 | var step, pos: int 121 | while count != 0: 122 | step = count div 2 123 | pos = result + step 124 | if a[pos].start < start: 125 | result = pos + 1 126 | count -= step + 1 127 | else: 128 | count = step 129 | 130 | proc len*[T:Interval](L:Lapper[T]): int {.inline.} = 131 | ## len returns the number of intervals in the Lapper 132 | L.intervals.len 133 | 134 | proc find*[T:Interval](L:var Lapper[T], start:int, stop:int, ivs:var seq[T]): bool = 135 | ## fill ivs with all intervals in L that overlap start .. stop (both bounds included). 
136 | #if ivs.len != 0: ivs.set_len(0) 137 | shallow(L.intervals) 138 | var off = lowerBound(L.intervals, start - L.max_len) 139 | var n = 0 140 | for i in off..L.intervals.high: 141 | var x = L.intervals[i] 142 | if x.overlap(start, stop): 143 | if n < ivs.len: 144 | ivs[n] = x 145 | else: 146 | ivs.add(x) 147 | n += 1 148 | elif x.start >= stop: break 149 | if ivs.len > n: 150 | ivs.setLen(n) 151 | return len(ivs) > 0 152 | 153 | proc count*[T:Interval](L:var Lapper[T], start:int, stop:int): int = 154 | ## return the number of intervals in L that overlap start .. stop. 155 | shallow(L.intervals) 156 | var off = lowerBound(L.intervals, start - L.max_len) 157 | for i in off..L.intervals.high: 158 | let x = L.intervals[i] 159 | if x.overlap(start, stop): 160 | result.inc 161 | elif x.start >= stop: break 162 | 163 | proc find_nearest_upstream*[T:Interval](L:var Lapper[T], pos:int, ivs:var seq[T]): bool = 164 | ## Find the nearest upstream (left) interval(s): those sharing the highest stop strictly below pos are appended to ivs 165 | shallow(L.intervals) 166 | var 167 | i = lowerBound(L.intervals, pos) 168 | max_stop = -1 169 | candidates: seq[T] 170 | if len(L.intervals) == i: 171 | let ii:int = i 172 | i = i - 1 173 | logger.log(lvlInfo, fmt"-----> Resetting index [i] value from {ii} to {i} due to an out of bounds index error.") 174 | assert ii - 1 == i 175 | 176 | 177 | # While we have not found an interval or we could find one that will have 178 | # a higher stop position than our current candidate 179 | while i >= 0 and (max_stop == -1 or (max_stop - L.intervals[i].start) < L.max_len): 180 | # We want intervals that are not overlapping our position 181 | if L.intervals[i].stop < pos: 182 | if max_stop == -1 or L.intervals[i].stop > max_stop: 183 | max_stop = L.intervals[i].stop 184 | candidates.setLen(0) 185 | candidates.add(L.intervals[i]) 186 | elif L.intervals[i].stop == max_stop: 187 | candidates.add(L.intervals[i]) 188 | dec(i) 189 | 190 | for c in candidates: 191 | ivs.add(c) 192 | 193 | return ivs.len() > 0 194 | 195 | proc 
find_nearest_downstream*[T:Interval](L:var Lapper[T], pos:int, ivs:var seq[T]): bool = 196 | ## Find the nearest downstream (right) interval(s): those sharing the smallest start strictly above pos are appended to ivs 197 | shallow(L.intervals) 198 | var 199 | i = lowerBound(L.intervals, pos) 200 | min_start = -1 201 | 202 | # While we have not found an interval, or the current one starts at 203 | # the same position as the candidate(s) already kept 204 | while i <= L.intervals.high and (min_start == -1 or L.intervals[i].start == min_start): 205 | # We want intervals that are not overlapping our position 206 | # NOTE(review): the test is strict, so an interval starting exactly at pos is skipped; the former remark about a half-open stop does not match the closed `overlap` test used elsewhere in this module - confirm 207 | if L.intervals[i].start > pos: 208 | min_start = L.intervals[i].start 209 | ivs.add(L.intervals[i]) 210 | inc(i) 211 | 212 | return ivs.len() > 0 213 | 214 | proc each_find*[T:Interval](L:var Lapper[T], start:int, stop:int, fn: proc (v:T)) = 215 | ## call fn(x) for each interval x in L that overlaps start..stop 216 | var off = lowerBound(L.intervals, start - L.max_len) 217 | for i in off..L.intervals.high: 218 | var x = L.intervals[i] 219 | if x.overlap(start, stop): 220 | fn(x) 221 | elif x.start >= stop: break 222 | 223 | proc seek*[T:Interval](L:var Lapper[T], start:int, stop:int, ivs:var seq[T]): bool = 224 | ## fill ivs with all intervals in L that overlap start .. stop inclusive. 225 | ## this method will work when queries to this lapper are in sorted (start) order 226 | ## it uses a linear search from the last query instead of a binary search. 
227 | ## ivs is emptied first; the result is true when at least one overlapping interval was added to it 228 | if ivs.len != 0: ivs.set_len(0) 229 | if L.cursor == 0 or L.intervals[L.cursor].start > start: 230 | L.cursor = lowerBound(L.intervals, start - L.max_len) 231 | while (L.cursor + 1) < L.intervals.high and L.intervals[L.cursor + 1].start < (start - L.max_len): 232 | L.cursor += 1 233 | for i in L.cursor..L.intervals.high: 234 | var x = L.intervals[i] 235 | if x.overlap(start, stop): 236 | ivs.add(x) 237 | elif x.start >= stop: break 238 | return ivs.len != 0 239 | 240 | proc each_seek*[T:Interval](L:var Lapper[T], start:int, stop:int, fn:proc (v:T)) {.inline.} = 241 | ## call fn(x) for each interval x in L that overlaps start..stop 242 | ## this assumes that subsequent calls to this function will be in sorted order. NOTE(review): the loop below tests x.start >= stop and x.stop > start (strict), unlike the closed `overlap` test used by each_find - confirm which is intended 243 | if L.cursor == 0 or L.cursor >= L.intervals.high or L.intervals[L.cursor].start > start: 244 | L.cursor = lowerBound(L.intervals, start - L.max_len) 245 | while (L.cursor + 1) < L.intervals.high and L.intervals[L.cursor + 1].start < (start - L.max_len): 246 | L.cursor += 1 247 | for i in L.cursor..L.intervals.high: 248 | var x = L.intervals[i] 249 | if x.start >= stop: break 250 | elif x.stop > start: 251 | fn(x) 252 | 253 | iterator items*[T:Interval](L: Lapper[T]): T = 254 | for i in L.intervals: yield i 255 | 256 | when isMainModule: 257 | 258 | import random 259 | import times 260 | import strutils 261 | 262 | proc randomi(imin:int, imax:int): int = 263 | return imin + rand(imax - imin) 264 | 265 | proc brute_force(ivs: seq[Interval], start:int, stop:int, res: var seq[Interval]) = 266 | if res.len != 0: res.set_len(0) 267 | for i in ivs: 268 | if i.overlap(start, stop): res.add(i) 269 | 270 | # example implementation 271 | type myinterval = tuple[start:int, stop:int, val:int] 272 | proc start(m: myinterval): int {.inline.} = return m.start 273 | proc stop(m: myinterval): int {.inline.} = return m.stop 274 | 275 | proc make_random(n:int, 
range_max:int, size_min:int, size_max:int): seq[myinterval] = 276 | result = new_seq[myinterval](n) 277 | for i in 0.. 0: 42 | stderr.writeLine("[Log] Found " & $nb_reclassif & " reclassified variants in " & filename) 43 | 44 | 45 | proc main*(argv: seq[string]) = 46 | 47 | # TODO: Create a usage and expose api_keys as options 48 | let doc = format(""" 49 | Usage: compVCF <1.vcf> <2.vcf> 50 | 51 | """) 52 | 53 | let 54 | args = docopt(doc) 55 | vcf1 = $args["<1.vcf>"] 56 | vcf2 = $args["<2.vcf>"] 57 | 58 | stderr.writeLine("[Log] Loading variants from " & vcf1) 59 | var variants1 = vcf1.loadVariantsFromVCF() 60 | stderr.writeLine("[Log] " & $variants1.len() & " variant loaded") 61 | stderr.writeLine("[Log] Loading variants from " & vcf2) 62 | var variants2 = vcf2.loadVariantsFromVCF() 63 | stderr.writeLine("[Log] " & $variants2.len() & " variant loaded") 64 | 65 | var 66 | nb_wrong_clinsig = 0 67 | nb_wrong_revstat = 0 68 | nb_missing_variant_v1 = 0 69 | nb_missing_variant_v2 = 0 70 | 71 | for vid, v1 in variants1: 72 | if variants2.hasKey(vid): 73 | if variants2[vid].clinsig == "reclassified" or v1.clinsig == "reclassified": 74 | continue 75 | if variants2[vid].clinsig != v1.clinsig: 76 | inc(nb_wrong_clinsig) 77 | echo "DIFF OF CLNSIG for variant " & $vid & " : " & vcf1 & " = " & v1.clinsig & " <-> " & vcf2 & " = " & variants2[vid].clinsig 78 | if variants2[vid].revstat != v1.revstat: 79 | inc(nb_wrong_revstat) 80 | echo "DIFF OF REVSTAT for variant " & $vid & " : " & vcf1 & " = " & v1.revstat & " <-> " & vcf2 & " = " & variants2[vid].revstat 81 | else: 82 | echo "MISSING variant " & $vid & " in " & vcf2 83 | inc(nb_missing_variant_v2) 84 | 85 | for vid, v2 in variants2: 86 | if not variants1.hasKey(vid): 87 | inc(nb_missing_variant_v1) 88 | echo "MISSING variant " & $vid & " in " & vcf1 89 | 90 | stderr.writeLine("[Stats] NB_WRONG_CLINSIG " & $nb_wrong_clinsig) 91 | stderr.writeLine("[Stats] NB_WRONG_REVSTAT " & $nb_wrong_revstat) 92 | stderr.writeLine("[Stats] 
NB_MISSING_VARIANT_VCF1 " & $nb_missing_variant_v1) 93 | stderr.writeLine("[Stats] NB_MISSING_VARIANT_VCF2 " & $nb_missing_variant_v2) 94 | 95 | when isMainModule: 96 | main(commandLineParams()) -------------------------------------------------------------------------------- /src/extractClinvarSet.nim: -------------------------------------------------------------------------------- 1 | import httpclient, json, tables 2 | import os, times 3 | import xmltree # Parse XML 4 | import htmlparser 5 | import docopt # Formating the command-line 6 | import strutils # Split string 7 | from streams import newStringStream 8 | import hts 9 | 10 | iterator nextClinvarSet*(file: var BGZ): string = 11 | var chunk: string 12 | for line in file: 13 | if line == "": 14 | yield chunk 15 | chunk = "" 16 | else: 17 | chunk.add(line & "\n") 18 | yield chunk 19 | 20 | proc formatVCFString*(vcf_string: string): string = 21 | result = vcf_string.replace(' ', '_') 22 | 23 | proc findNodes(n: XmlNode, tag: string): seq[XmlNode] = 24 | for xref_node in n: 25 | if xref_node.kind == xnElement: 26 | if xref_node.tag == tag: 27 | result.add(xref_node) 28 | 29 | proc main*(argv: seq[string]) = 30 | 31 | # TODO: Create a usage and expose api_keys as options 32 | let doc = format(""" 33 | Usage: extractClinvarSet 34 | 35 | """) 36 | 37 | let 38 | args = docopt(doc) 39 | searched_id = $args[""] 40 | clinvar_xml_file = $args[""] 41 | #variation_allele_file = $args[""] 42 | #allele_variant_table = loadAlleleVariantTable(variation_allele_file) 43 | 44 | # TODO: Print VCF headers 45 | stderr.writeLine("[Log] Parsing variants from " & clinvar_xml_file) 46 | 47 | var 48 | file : BGZ 49 | parsed_variants = initTable[string, int]() 50 | i = 0 51 | 52 | file.open(clinvar_xml_file, "r") 53 | 54 | for clinvarset_string in file.nextClinvarSet(): 55 | if clinvarset_string != "" and clinvarset_string.startsWith("= min_val and v <= max_val: 49 | filtered_d.add(v) 50 | check filtered_d.len() == 6 51 | 52 | 53 | test 
"test pathology string format": 54 | check formatPathoString(" Factor X Deficiency ") == "Factor_X_Deficiency" 55 | check formatPathoString("Factor (X) Deficiency, pathology") == "Factor_X_Deficiency_pathology" 56 | check formatPathoString("Factor, (X) Deficiency, pathology,|cancer/") == "Factor_X_Deficiency_pathology|cancer" 57 | check formatPathoString("Factor, X, Deficiency ,pathology/| cancer") == "Factor_X_Deficiency_pathology|cancer" 58 | check formatPathoString(" , Factor X,,Deficiency/pathology| , ,cancer/") == "Factor_X_Deficiency_pathology|cancer" 59 | check formatPathoString("CLNDISEASE= , Factor X,,Deficiency/pathology| , ,cancer/") == "CLNDISEASE=Factor_X_Deficiency_pathology|cancer" 60 | 61 | 62 | test "test clinical pathology parsing": 63 | check parseClinicalPathologies("DISEASE", @["coagulation_x_deficiency", "factor_x_deficiency"]) == "CLNDISEASE=coagulation_x_deficiency|factor_x_deficiency" 64 | -------------------------------------------------------------------------------- /tests/files/109.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | current 6 | NM_017849.3(TMEM127):c.245-1G>T AND Pheochromocytoma, susceptibility to 7 | 8 | 9 | current 10 | 11 | no assertion criteria provided 12 | risk factor 13 | 14 | 15 | 16 | 17 | germline 18 | human 19 | not provided 20 | 21 | 22 | literature only 23 | 24 | 25 | In 2 members of a family with pheochromocytoma (171300), Qin et al. (2010) identified a heterozygous germline G-to-T transversion in intron 2 of the TMEM127 gene, resulting in a frameshift and premature termination. Analysis of tumor tissue showed loss of heterozygosity at the TMEM127 locus, consistent with a 2-hit model of tumor suppressor inactivation. Age at onset was 54 and 66 years, respectively, and both patients had bilateral tumors. TMEM127 expression was decreased, consistent with a loss of function. 
26 | 27 | 20154675 28 | 29 | 30 | 31 | 32 | 33 | 34 | NM_017849.3(TMEM127):c.245-1G>T 35 | 36 | 37 | IVS2-1G>T(p.F83fs) 38 | 39 | 40 | LRG_528t1:c.245-1G>T 41 | 42 | 43 | NM_001193304.3:c.245-1G>T 44 | 45 | 46 | NM_017849.3:c.245-1G>T 47 | 48 | 49 | LRG_528:g.16016G>T 50 | 51 | 52 | NG_027695.1:g.16016G>T 53 | 54 | 55 | NC_000002.12:g.96254998C>A 56 | 57 | 58 | NC_000002.11:g.96920736C>A 59 | 60 | 61 | splice acceptor variant 62 | 63 | 64 | 65 | 66 | splice acceptor variant 67 | 68 | 69 | 70 | 71 | IVS2AS, G-T, -1 72 | 73 | 2q11.2 74 | 75 | 76 | 77 | 78 | transmembrane protein 127 79 | 80 | 81 | TMEM127 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 20154675 91 | 92 | 93 | 94 | 95 | 96 | NM_017849.3(TMEM127):c.245-1G>T 97 | 98 | 99 | 100 | 101 | 102 | 103 | Pheochromocytoma, susceptibility to 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | current 113 | 114 | no assertion criteria provided 115 | risk factor 116 | 117 | 118 | 119 | 120 | 121 | germline 122 | human 123 | not provided 124 | 125 | 126 | literature only 127 | 128 | 129 | In 2 members of a family with pheochromocytoma (171300), Qin et al. (2010) identified a heterozygous germline G-to-T transversion in intron 2 of the TMEM127 gene, resulting in a frameshift and premature termination. Analysis of tumor tissue showed loss of heterozygosity at the TMEM127 locus, consistent with a 2-hit model of tumor suppressor inactivation. Age at onset was 54 and 66 years, respectively, and both patients had bilateral tumors. TMEM127 expression was decreased, consistent with a loss of function. 
130 | 131 | 20154675 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | TMEM127, IVS2AS, G-T, -1 140 | 141 | 142 | IVS2AS, G-T, -1 143 | 144 | 145 | 146 | TMEM127 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | PHEOCHROMOCYTOMA, SUSCEPTIBILITY TO 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | current 164 | NM_017849.3(TMEM127):c.245-1G>T AND Pheochromocytoma 165 | 166 | 167 | current 168 | 169 | no assertion criteria provided 170 | Likely pathogenic 171 | 172 | 173 | 174 | 175 | germline 176 | human 177 | not provided 178 | 1 179 | 180 | 181 | literature only 182 | 183 | 184 | not provided 185 | 186 | 187 | 188 | 189 | 190 | NM_017849.3(TMEM127):c.245-1G>T 191 | 192 | 193 | IVS2-1G>T(p.F83fs) 194 | 195 | 196 | LRG_528t1:c.245-1G>T 197 | 198 | 199 | NM_001193304.3:c.245-1G>T 200 | 201 | 202 | NM_017849.3:c.245-1G>T 203 | 204 | 205 | LRG_528:g.16016G>T 206 | 207 | 208 | NG_027695.1:g.16016G>T 209 | 210 | 211 | NC_000002.12:g.96254998C>A 212 | 213 | 214 | NC_000002.11:g.96920736C>A 215 | 216 | 217 | splice acceptor variant 218 | 219 | 220 | 221 | 222 | splice acceptor variant 223 | 224 | 225 | 226 | 227 | IVS2AS, G-T, -1 228 | 229 | 2q11.2 230 | 231 | 232 | 233 | 234 | transmembrane protein 127 235 | 236 | 237 | TMEM127 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 20154675 247 | 248 | 249 | 250 | 251 | 252 | NM_017849.3(TMEM127):c.245-1G>T 253 | 254 | 255 | 256 | 257 | 258 | 259 | Pheochromocytoma 260 | 261 | 262 | 263 | 264 | 265 | Pheochromocytoma, somatic 266 | 267 | 268 | MAX-Related Hereditary Paraganglioma-Pheochromocytoma Syndrome 269 | 270 | 271 | PHEOCHROMOCYTOMA, SUSCEPTIBILITY TO 272 | 273 | 274 | 275 | Chromaffin tumors 276 | 277 | 278 | 279 | Hereditary paraganglioma-pheochromocytoma (PGL/PCC) syndromes are characterized by paragangliomas (tumors that arise from neuroendocrine tissues distributed along the paravertebral axis from the base of the skull to the pelvis) and pheochromocytomas (paragangliomas that are confined to the adrenal 
medulla). Sympathetic paragangliomas cause catecholamine excess; parasympathetic paragangliomas are most often nonsecretory. Extra-adrenal parasympathetic paragangliomas are located predominantly in the skull base and neck (referred to as head and neck PGL [HNPGL]) and sometimes in the upper mediastinum; approximately 95% of such tumors are nonsecretory. In contrast, sympathetic extra-adrenal paragangliomas are generally confined to the lower mediastinum, abdomen, and pelvis, and are typically secretory. Pheochromocytomas, which arise from the adrenal medulla, typically lead to catecholamine excess. Symptoms of PGL/PCC result from either mass effects or catecholamine hypersecretion (e.g., sustained or paroxysmal elevations in blood pressure, headache, episodic profuse sweating, forceful palpitations, pallor, and apprehension or anxiety). The risk for developing metastatic disease is greater for extra-adrenal sympathetic paragangliomas than for pheochromocytomas. 280 | 281 | 282 | 283 | Neoplasm 284 | 285 | 286 | Hereditary cancer syndrome 287 | 288 | 289 | 20301715 290 | NBK1548 291 | 292 | 293 | 24893135 294 | 295 | 296 | 24493721 297 | 298 | 299 | 3419007 300 | 301 | 302 | 24319509 303 | 304 | 305 | 306 | 307 | 308 | 309 | 310 | 311 | 312 | 313 | 314 | current 315 | 316 | no assertion criteria provided 317 | likely pathogenic - adrenal pheochromocytoma 318 | Converted during submission to Likely pathogenic. 
319 | 320 | 321 | 322 | 323 | germline 324 | human 325 | not provided 326 | 1 327 | 328 | 329 | not provided 330 | 331 | 332 | not provided 333 | 334 | 335 | 336 | 337 | 338 | IVS2-1G>T (p.F83fs) 339 | 340 | 341 | NM_017849.3:c.245-1G>T 342 | 343 | 344 | 20154675 345 | 346 | 347 | 348 | 349 | 350 | 351 | 352 | Pheochromocytoma 353 | 354 | 355 | 356 | 357 | 358 | 359 | 360 | 361 | -------------------------------------------------------------------------------- /tests/files/16895.xml: -------------------------------------------------------------------------------- 1 | 2 | current 3 | NM_000106.5(CYP2D6):c.[886C>T;457G>C] AND Debrisoquine, ultrarapid metabolism of 4 | 5 | 6 | current 7 | 8 | no assertion criteria provided 9 | drug response 10 | 11 | 12 | 13 | 14 | germline 15 | human 16 | not provided 17 | 18 | 19 | literature only 20 | 21 | 22 | This allelic variant is also known as CYP2D6*2 or CYP2D6L. 23 | 24 | 25 | In a family in which 2 sibs and their father had MRs of less that 0.02 (ultrarapid phenotype, see 608902), Johansson et al. (1993) found 12 extra copies of the CYP2D6 gene inherited in an autosomal dominant pattern; in a second family in which 2 sibs had MRs of less than 0.1, the authors found 2 extra copies of the CYP2D6 gene. All affected individuals had a variant CYP2D6 gene, termed CYP2D6L, which contained 2 amino acid substitutions: a 2938C-T transition in exon 6, resulting in an arg296-to-cys (R296C), and a 4268G-to-C transversion in exon 9, resulting in a resulting in a ser486-to-thr (S486T) substitution. The MR of individuals with 1 copy of the CYP2D6L gene did not differ from those with the wildtype gene, but there was a correlation between decreased MR and increased copies of the CYP2D6L gene. 26 | 27 | 7903454 28 | 29 | 30 | 31 | Panserat et al. (1994) identified the R296C and S486T changes as 2 major CYP2D6 allozymes in extensive metabolizers (wildtype). 
Residue 296 falls within a presumed substrate recognition site, and residue 486 lies in the vicinity of the heme binding site. 32 | 33 | 7927337 34 | 35 | 36 | 37 | 38 | 39 | 40 | NM_000106.6(CYP2D6):c.886C>T (p.Arg296Cys) 41 | 42 | 43 | LRG_303t1:c.886C>T 44 | 45 | 46 | NM_001025161.3:c.733C>T 47 | 48 | 49 | NM_000106.6:c.886C>T 50 | 51 | 52 | LRG_303:g.7870C>T 53 | 54 | 55 | NG_008376.3:g.7051C>T 56 | 57 | 58 | NG_008376.4:g.7870C>T 59 | 60 | 61 | NC_000022.11:g.42127941G>A 62 | 63 | 64 | NC_000022.10:g.42523943= 65 | 66 | 67 | NM_000106.5:c.886C>T 68 | 69 | 70 | NG_008376.2:g.7941C>T 71 | 72 | 73 | LRG_303p1:p.Arg296Cys 74 | 75 | 76 | P10635:p.Arg296Cys 77 | 78 | 79 | NP_001020332.2:p.Arg245Cys 80 | 81 | 82 | NP_000097.3:p.Arg296Cys 83 | 84 | 85 | NP_000097.3:p.Arg296Cys 86 | 87 | 88 | NP_000097.3:p.Arg296Cys 89 | 90 | 91 | missense variant 92 | 93 | 94 | 95 | 96 | missense variant 97 | 98 | 99 | 100 | 101 | R245C 102 | 103 | 104 | R296C 105 | 106 | 107 | ARG296CYS 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 22q13.2 118 | 119 | 120 | 121 | 122 | cytochrome P450 family 2 subfamily D member 6 123 | 124 | 125 | CYP2D6 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | NBK100662 140 | 141 | 142 | NBK367795 143 | 144 | 145 | NBK425795 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | NM_000106.6(CYP2D6):c.1457G>C (p.Ser486Thr) 154 | 155 | 156 | LRG_303t1:c.1457G>C 157 | 158 | 159 | NM_001025161.3:c.1304G>C 160 | 161 | 162 | NM_000106.6:c.1457G>C 163 | 164 | 165 | LRG_303:g.9200G>C 166 | 167 | 168 | NG_008376.3:g.8381G>C 169 | 170 | 171 | NG_008376.4:g.9200G>C 172 | 173 | 174 | NC_000022.11:g.42126611C>G 175 | 176 | 177 | NC_000022.10:g.42522613= 178 | 179 | 180 | NM_000106.5:c.1457G>C 181 | 182 | 183 | NG_008376.2:g.9271G>C 184 | 185 | 186 | LRG_303p1:p.Ser486Thr 187 | 188 | 189 | P10635:p.Ser486Thr 190 | 191 | 192 | NP_001020332.2:p.Ser435Thr 193 | 194 | 195 | NP_000097.3:p.Ser486Thr 196 | 197 | 198 | 
NP_000097.3:p.Ser486Thr 199 | 200 | 201 | NP_000097.3:p.Ser486Thr 202 | 203 | 204 | NM_000106.5:exon 8 205 | 206 | 207 | missense variant 208 | 209 | 210 | 211 | 212 | missense variant 213 | 214 | 215 | 216 | 217 | S435T 218 | 219 | 220 | S486T 221 | 222 | 223 | SER486THR 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 22q13.2 234 | 235 | 236 | 237 | 238 | cytochrome P450 family 2 subfamily D member 6 239 | 240 | 241 | CYP2D6 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | NM_000106.5(CYP2D6):c.[886C>T;457G>C] 260 | 261 | 262 | CYP2D6, ARG296CYS AND SER486THR 263 | 264 | 265 | 266 | CYP2D6*2 267 | 268 | 269 | NG_008376.2:g.[7941C>T;9271G>C] 270 | 271 | 272 | NM_000106.5:c.[886C>T;457G>C] 273 | 274 | 275 | 276 | 277 | 278 | 279 | 280 | Debrisoquine, ultrarapid metabolism of 281 | 282 | 283 | 284 | 285 | 286 | 287 | 288 | 289 | current 290 | 291 | no assertion criteria provided 292 | drug response 293 | 294 | 295 | 296 | 297 | 298 | germline 299 | human 300 | not provided 301 | 302 | 303 | literature only 304 | 305 | 306 | This allelic variant is also known as CYP2D6*2 or CYP2D6L. 307 | 308 | 309 | In a family in which 2 sibs and their father had MRs of less that 0.02 (ultrarapid phenotype, see 608902), Johansson et al. (1993) found 12 extra copies of the CYP2D6 gene inherited in an autosomal dominant pattern; in a second family in which 2 sibs had MRs of less than 0.1, the authors found 2 extra copies of the CYP2D6 gene. All affected individuals had a variant CYP2D6 gene, termed CYP2D6L, which contained 2 amino acid substitutions: a 2938C-T transition in exon 6, resulting in an arg296-to-cys (R296C), and a 4268G-to-C transversion in exon 9, resulting in a resulting in a ser486-to-thr (S486T) substitution. 
The MR of individuals with 1 copy of the CYP2D6L gene did not differ from those with the wildtype gene, but there was a correlation between decreased MR and increased copies of the CYP2D6L gene. 310 | 311 | 7903454 312 | 313 | 314 | 315 | 316 | Panserat et al. (1994) identified the R296C and S486T changes as 2 major CYP2D6 allozymes in extensive metabolizers (wildtype). Residue 296 falls within a presumed substrate recognition site, and residue 486 lies in the vicinity of the heme binding site. 317 | 318 | 7927337 319 | 320 | 321 | 322 | 323 | 324 | 325 | CYP2D6, ARG296CYS AND SER486THR 326 | 327 | 328 | ARG296CYS AND SER486THR 329 | 330 | 331 | 332 | CYP2D6 333 | 334 | 335 | 336 | 337 | 338 | 339 | 340 | 341 | DEBRISOQUINE, ULTRARAPID METABOLISM OF 342 | 343 | 344 | 345 | 346 | 347 | 348 | -------------------------------------------------------------------------------- /tests/files/225974.xml: -------------------------------------------------------------------------------- 1 | 2 | current 3 | NM_000675.6(ADORA2A):c.-275+1797C>T AND caffeine response - Toxicity/ADR 4 | 5 | 6 | current 7 | 8 | reviewed by expert panel 9 | drug response 10 | 11 | 12 | 13 | 14 | germline 15 | human 16 | yes 17 | 18 | 19 | curation 20 | 21 | 22 | not provided 23 | 24 | 25 | 26 | 27 | 28 | NM_000675.6(ADORA2A):c.-275+1797C>T 29 | 30 | 31 | NM_001278500.1:c.-274-3588C>T 32 | 33 | 34 | NM_000675.6:c.-275+1797C>T 35 | 36 | 37 | NM_001278499.2:c.-275+1817C>T 38 | 39 | 40 | NG_052804.1:g.10947C>T 41 | 42 | 43 | NC_000022.11:g.24429543C>T 44 | 45 | 46 | NC_000022.10:g.24825511C>T 47 | 48 | 49 | NR_028484.3:n.2494G>A 50 | 51 | 52 | intron variant 53 | 54 | 55 | 56 | 57 | intron variant 58 | 59 | 60 | 61 | 62 | intron variant 63 | 64 | 65 | 66 | 67 | non-coding transcript variant 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 22q11.23 78 | 79 | 80 | 81 | 82 | ADORA2A antisense RNA 1 83 | 84 | 85 | ADORA2A-AS1 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | SPECC1L-ADORA2A readthrough (NMD candidate) 
94 | 95 | 96 | SPECC1L-ADORA2A 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | adenosine A2a receptor 105 | 106 | 107 | ADORA2A 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | NM_000675.6(ADORA2A):c.-275+1797C>T 121 | 122 | 123 | 124 | 125 | 126 | caffeine response - Toxicity/ADR 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | current 136 | 137 | reviewed by expert panel 138 | drug response 139 | Drug-variant association: Toxicity/ADR 140 | 141 | 18305461 142 | 143 | 144 | 20520601 145 | 146 | PharmGKB Level of Evidence 2B: Annotation for a variant-drug combination with moderate evidence of an association. The association must be replicated but there may be some studies that do not show statistical significance, and/or the effect size may be small. 147 | 148 | 149 | 150 | 151 | Pharmacogenomics knowledge for personalized medicine 152 | 153 | 22992668 154 | 155 | 156 | 157 | 158 | germline 159 | human 160 | yes 161 | 162 | 163 | curation 164 | 165 | 166 | not provided 167 | 168 | 169 | 170 | 171 | 172 | NC_000022.10:g.24825511C>T 173 | 174 | 175 | 176 | ADORA2A 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | caffeine response - Toxicity/ADR 186 | 187 | 188 | 189 | 190 | https://www.pharmgkb.org/clinicalAnnotation/981201549 191 | 192 | 193 | 194 | 195 | -------------------------------------------------------------------------------- /tests/files/242771.xml: -------------------------------------------------------------------------------- 1 | 2 | current 3 | NM_000106.6(CYP2D6):c.886C>T (p.Arg296Cys) AND not specified 4 | 5 | 6 | current 7 | 8 | criteria provided, single submitter 9 | Benign 10 | 11 | 12 | 13 | 14 | germline 15 | human 16 | yes 17 | 18 | 19 | clinical testing 20 | 21 | 22 | not provided 23 | 24 | 25 | 26 | 27 | 28 | NM_000106.6(CYP2D6):c.886C>T (p.Arg296Cys) 29 | 30 | 31 | LRG_303t1:c.886C>T 32 | 33 | 34 | NM_001025161.3:c.733C>T 35 | 36 | 37 | NM_000106.6:c.886C>T 38 | 39 | 40 | LRG_303:g.7870C>T 41 | 42 | 43 | 
NG_008376.3:g.7051C>T 44 | 45 | 46 | NG_008376.4:g.7870C>T 47 | 48 | 49 | NC_000022.11:g.42127941G>A 50 | 51 | 52 | NC_000022.10:g.42523943= 53 | 54 | 55 | NM_000106.5:c.886C>T 56 | 57 | 58 | NG_008376.2:g.7941C>T 59 | 60 | 61 | LRG_303p1:p.Arg296Cys 62 | 63 | 64 | P10635:p.Arg296Cys 65 | 66 | 67 | NP_001020332.2:p.Arg245Cys 68 | 69 | 70 | NP_000097.3:p.Arg296Cys 71 | 72 | 73 | NP_000097.3:p.Arg296Cys 74 | 75 | 76 | NP_000097.3:p.Arg296Cys 77 | 78 | 79 | missense variant 80 | 81 | 82 | 83 | 84 | missense variant 85 | 86 | 87 | 88 | 89 | R245C 90 | 91 | 92 | R296C 93 | 94 | 95 | ARG296CYS 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 22q13.2 106 | 107 | 108 | 109 | 110 | cytochrome P450 family 2 subfamily D member 6 111 | 112 | 113 | CYP2D6 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | NBK100662 128 | 129 | 130 | NBK367795 131 | 132 | 133 | NBK425795 134 | 135 | 136 | 137 | 138 | 139 | 140 | NM_000106.6(CYP2D6):c.886C>T (p.Arg296Cys) 141 | 142 | 143 | 144 | 145 | 146 | 147 | not specified 148 | 149 | 150 | AllHighlyPenetrant 151 | 152 | 153 | The term 'not specified' was created for use in ClinVar so that submitters can convey the concept that a variant is benign, likely benign, or of uncertain significance for an unspecified set of disorders. This usage was introduced in 2014 to replace AllHighlyPenetrant. 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | current 163 | 164 | criteria provided, single submitter 165 | Benign 166 | This variant is considered likely benign or benign based on one or more of the following criteria: it is a conservative change, it occurs at a poorly conserved position in the protein, it is predicted to be benign by multiple in silico algorithms, and/or has population frequency not consistent with disease. 
167 | 168 | 169 | 170 | 171 | GeneDX Variant Classification (06012015) 172 | 173 | https://submit.ncbi.nlm.nih.gov/ft/byid/7oynscmk/mdi-5616_26957_genedx_interprules_final_061215.pdf 174 | 175 | 176 | 177 | 178 | germline 179 | human 180 | yes 181 | 182 | 183 | clinical testing 184 | 185 | 186 | not provided 187 | 188 | 189 | 190 | 191 | 192 | NM_000106.5:c.886C>T 193 | 194 | 195 | 196 | 197 | CYP2D6 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | not specified 206 | 207 | 208 | 209 | 210 | 211 | 212 | -------------------------------------------------------------------------------- /tests/files/340430.xml: -------------------------------------------------------------------------------- 1 | 2 | current 3 | NM_206965.2(FTCD):c.636+9G>A AND GLUTAMATE FORMIMINOTRANSFERASE DEFICIENCY 4 | 5 | 6 | current 7 | 8 | criteria provided, single submitter 9 | Uncertain significance 10 | 11 | 12 | 13 | 14 | germline 15 | human 16 | unknown 17 | 18 | 19 | clinical testing 20 | 21 | 22 | not provided 23 | 24 | 25 | 26 | 27 | 28 | NM_206965.2(FTCD):c.636+9G>A 29 | 30 | 31 | NM_001350598.1:c.15C>T 32 | 33 | 34 | NM_001320412.2:c.636+9G>A 35 | 36 | 37 | NM_006657.3:c.636+9G>A 38 | 39 | 40 | NM_206965.2:c.636+9G>A 41 | 42 | 43 | NG_016191.1:g.9019G>A 44 | 45 | 46 | NC_000021.9:g.46151549C>T 47 | 48 | 49 | NC_000021.8:g.47571463C>T 50 | 51 | 52 | NM_006657.2:c.636+9G>A 53 | 54 | 55 | NP_001337527.1:p.Asn5= 56 | 57 | 58 | intron variant 59 | 60 | 61 | 62 | 63 | intron variant 64 | 65 | 66 | 67 | 68 | intron variant 69 | 70 | 71 | 72 | 73 | synonymous variant 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 21q22.3 84 | 85 | 86 | 87 | 88 | FTCD antisense RNA 1 89 | 90 | 91 | FTCD-AS1 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | formimidoyltransferase cyclodeaminase 101 | 102 | 103 | FTCD 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | NM_206965.2(FTCD):c.636+9G>A 117 | 118 | 119 | 120 | 121 | 122 | 123 | GLUTAMATE FORMIMINOTRANSFERASE DEFICIENCY 124 | 125 | 
126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | Glutamate formiminotransferase deficiency is an autosomal recessive disorder and the second most common inborn error of folate metabolism. Features of a severe phenotype include elevated levels of formiminoglutamate (FIGLU) in the urine in response to histidine administration, megaloblastic anemia, and mental retardation. Features of a mild phenotype include high urinary excretion of FIGLU in the absence of histidine administration, mild developmental delay, and no hematologic abnormalities (summary by Hilton et al., 2003). 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | current 146 | 147 | criteria provided, single submitter 148 | Uncertain significance 149 | 150 | 151 | 152 | 153 | ICSL Variant Classification 20161018 154 | 155 | https://submit.ncbi.nlm.nih.gov/ft/byid/4jQgNGYk/ICSL_Variant_Classification_20161018.pdf 156 | ICSL_Variant_Classification_20161018.pdf 157 | 158 | 159 | 160 | 161 | germline 162 | human 163 | unknown 164 | 165 | 166 | clinical testing 167 | 168 | 169 | not provided 170 | 171 | 172 | 173 | 174 | 175 | NM_006657.2:c.636+9G>A 176 | 177 | 178 | 179 | FTCD 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | Formiminotransferase Deficiency 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | current 196 | NM_206965.2(FTCD):c.636+9G>A AND not provided 197 | 198 | 199 | current 200 | 201 | criteria provided, single submitter 202 | Likely benign 203 | 204 | 205 | 206 | 207 | germline 208 | human 209 | unknown 210 | 211 | 212 | clinical testing 213 | 214 | 215 | not provided 216 | 217 | 218 | 219 | 220 | 221 | NM_206965.2(FTCD):c.636+9G>A 222 | 223 | 224 | NM_001350598.1:c.15C>T 225 | 226 | 227 | NM_001320412.2:c.636+9G>A 228 | 229 | 230 | NM_006657.3:c.636+9G>A 231 | 232 | 233 | NM_206965.2:c.636+9G>A 234 | 235 | 236 | NG_016191.1:g.9019G>A 237 | 238 | 239 | NC_000021.9:g.46151549C>T 240 | 241 | 242 | NC_000021.8:g.47571463C>T 243 | 244 | 245 | NM_006657.2:c.636+9G>A 246 | 247 | 248 
| NP_001337527.1:p.Asn5= 249 | 250 | 251 | intron variant 252 | 253 | 254 | 255 | 256 | intron variant 257 | 258 | 259 | 260 | 261 | intron variant 262 | 263 | 264 | 265 | 266 | synonymous variant 267 | 268 | 269 | 270 | 271 | 272 | 273 | 274 | 275 | 276 | 21q22.3 277 | 278 | 279 | 280 | 281 | FTCD antisense RNA 1 282 | 283 | 284 | FTCD-AS1 285 | 286 | 287 | 288 | 289 | 290 | 291 | 292 | 293 | formimidoyltransferase cyclodeaminase 294 | 295 | 296 | FTCD 297 | 298 | 299 | 300 | 301 | 302 | 303 | 304 | 305 | 306 | 307 | 308 | 309 | NM_206965.2(FTCD):c.636+9G>A 310 | 311 | 312 | 313 | 314 | 315 | 316 | not provided 317 | 318 | 319 | 320 | The term 'not provided' is registered in MedGen to support identification of submissions to ClinVar for which no condition was named when assessing the variant. 'not provided' differs from 'not specified', which is used when a variant is asserted to be benign, likely benign, or of uncertain significance for conditions that have not been specified. 321 | 322 | 323 | 324 | 325 | 326 | 327 | 328 | 329 | current 330 | 331 | criteria provided, single submitter 332 | Likely benign 333 | 334 | 335 | 336 | 337 | Nykamp K et al. 
(Genet Med 2017) 338 | 339 | 28492532 340 | 341 | 342 | 343 | 344 | germline 345 | human 346 | unknown 347 | 348 | 349 | clinical testing 350 | 351 | 352 | not provided 353 | 354 | 355 | 356 | 357 | 358 | NM_006657.2:c.636+9G>A 359 | 360 | 361 | 362 | FTCD 363 | 364 | 365 | 366 | 367 | 368 | 369 | 370 | not provided 371 | 372 | 373 | 374 | 375 | 376 | 377 | 378 | -------------------------------------------------------------------------------- /tests/files/618897_2019-05.xml: -------------------------------------------------------------------------------- 1 | 2 | current 3 | NC_000007.14:g.117559618_117559619delGA AND Cystic fibrosis 4 | 5 | 6 | current 7 | 8 | criteria provided, single submitter 9 | Likely pathogenic 10 | 11 | 12 | 13 | 14 | unknown 15 | human 16 | yes 17 | 18 | 19 | clinical testing 20 | 21 | 22 | not provided 23 | 24 | 25 | 26 | 27 | 28 | NC_000007.14:g.117559618_117559619delGA 29 | 30 | 31 | NC_000007.14:g.117559618_117559619delGA 32 | 33 | 34 | NC_000007.13:g.117199671_117199672delAG 35 | 36 | 7q31.2 37 | 38 | 39 | 40 | 41 | CFTR antisense RNA 1 42 | 43 | 44 | CFTR-AS1 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | cystic fibrosis transmembrane conductance regulator 53 | 54 | 55 | CFTR 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | NC_000007.14:g.117559618_117559619delGA 66 | 67 | 68 | 69 | 70 | 71 | Cystic fibrosis 72 | 73 | 74 | 75 | 76 | 77 | 78 | CF 79 | 80 | 81 | 82 | 83 | Cystic fibrosis (CF) is a multisystem disease affecting epithelia of the respiratory tract, exocrine pancreas, intestine, hepatobiliary system, and exocrine sweat glands. Morbidities include progressive obstructive lung disease with bronchiectasis, frequent hospitalizations for pulmonary disease, pancreatic insufficiency and malnutrition, recurrent sinusitis and bronchitis, and male infertility. Pulmonary disease is the major cause of morbidity and mortality in CF. Meconium ileus occurs at birth in 15%-20% of newborns with CF. More than 95% of males with CF are infertile. 
Congenital absence of the vas deferens (CAVD) is generally identified during evaluation of infertility or as an incidental finding at the time of a surgical procedure. Hypoplasia or aplasia of the vas deferens and seminal vesicles may occur either bilaterally or unilaterally. Testicular development and function and spermatogenesis are usually normal. 84 | 85 | 86 | 87 | loss of function 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | More than 1,000 CFTR variants have been reported. Most common pathogenic variant is p.Phe508del. 142 | 143 | 144 | 11280952 145 | 146 | 147 | 3110945 148 | 149 | 150 | 20301428 151 | NBK1250 152 | 153 | 154 | 19888064 155 | 156 | 157 | 15789152 158 | 159 | 160 | 3110977 161 | 162 | 163 | 20675678 164 | 165 | 166 | 20605539 167 | 168 | 169 | 19914445 170 | 171 | 172 | 19914443 173 | 174 | 175 | 21938795 176 | 177 | 178 | https://dailymed.nlm.nih.gov/dailymed/lookup.cfm?setid=0ab0c9f8-3eee-4e0f-9f3f-c1e16aaffe25 179 | DailyMed Drug Label, KALYDECO, 2012 180 | 181 | 182 | 12394352 183 | 184 | 185 | 22475884 186 | 187 | 188 | 3148255 189 | 190 | 191 | 21422883 192 | 193 | 194 | 4026598 195 | 196 | 197 | 24014130 198 | 199 | 200 | 25431289 201 | 202 | 203 | 25981758 204 | 205 | 206 | 17761616 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | current 218 | 219 | criteria provided, single submitter 220 | Likely pathogenic 221 | 222 | 223 | 224 | Mendelics Assertion Criteria 2017 225 | 226 | https://submit.ncbi.nlm.nih.gov/ft/byid/chhjzatu/mendelics_assertion_criteria_2017.pdf 227 | 228 | 229 | 230 | 231 | unknown 232 | human 233 | yes 234 | 235 | 236 | clinical testing 237 | 238 | 239 | not provided 240 | 241 | 242 | 243 | 244 | 245 | 
NC_000007.13:g.117199671_117199672delAG 246 | 247 | 248 | 249 | CFTR 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | -------------------------------------------------------------------------------- /tests/files/MT.gff: -------------------------------------------------------------------------------- 1 | ##sequence-region NC_012920.1 1 16569 2 | NC_012920.1 RefSeq region 1 16569 . + . ID=NC_012920.1:1..16569;Dbxref=taxon:9606;Is_circular=true;Name=MT;country=United Kingdom: Great Britain;gbkey=Src;genome=mitochondrion;isolation-source=caucasian;mol_type=genomic DNA;note=this is the rCRS;tissue-type=placenta 3 | NC_012920.1 RefSeq gene 577 647 . + . ID=gene-TRNF;Dbxref=GeneID:4558,HGNC:HGNC:7481,MIM:590070;Name=TRNF;gbkey=Gene;gene=TRNF;gene_biotype=tRNA 4 | NC_012920.1 RefSeq tRNA 577 647 . + . ID=rna-TRNF;Parent=gene-TRNF;Dbxref=GeneID:4558,HGNC:HGNC:7481,MIM:590070;Note=NAR: 1455;anticodon=(pos:611..613);codons=1;gbkey=tRNA;gene=TRNF;product=tRNA-Phe 5 | NC_012920.1 RefSeq exon 577 647 . + . ID=exon-TRNF-1;Parent=rna-TRNF;Dbxref=GeneID:4558,HGNC:HGNC:7481,MIM:590070;Note=NAR: 1455;anticodon=(pos:611..613);codons=1;gbkey=tRNA;gene=TRNF;product=tRNA-Phe 6 | NC_012920.1 RefSeq gene 648 1601 . + . ID=gene-RNR1;Dbxref=GeneID:4549,HGNC:HGNC:7470,MIM:561000;Name=RNR1;gbkey=Gene;gene=RNR1;gene_biotype=rRNA;gene_synonym=MTRNR1 7 | NC_012920.1 RefSeq rRNA 648 1601 . + . ID=rna-RNR1;Parent=gene-RNR1;Dbxref=GeneID:4549,HGNC:HGNC:7470,MIM:561000;Note=12S rRNA%3B 12S ribosomal RNA;gbkey=rRNA;gene=RNR1;product=s-rRNA 8 | NC_012920.1 RefSeq exon 648 1601 . + . ID=exon-RNR1-1;Parent=rna-RNR1;Dbxref=GeneID:4549,HGNC:HGNC:7470,MIM:561000;Note=12S rRNA%3B 12S ribosomal RNA;gbkey=rRNA;gene=RNR1;product=s-rRNA 9 | NC_012920.1 RefSeq gene 1602 1670 . + . ID=gene-TRNV;Dbxref=GeneID:4577,HGNC:HGNC:7500,MIM:590105;Name=TRNV;gbkey=Gene;gene=TRNV;gene_biotype=tRNA;gene_synonym=MTTV 10 | NC_012920.1 RefSeq tRNA 1602 1670 . + . 
ID=rna-TRNV;Parent=gene-TRNV;Dbxref=GeneID:4577,HGNC:HGNC:7500,MIM:590105;Note=NAR: 2053;anticodon=(pos:1633..1635);codons=50;gbkey=tRNA;gene=TRNV;product=tRNA-Val 11 | NC_012920.1 RefSeq exon 1602 1670 . + . ID=exon-TRNV-1;Parent=rna-TRNV;Dbxref=GeneID:4577,HGNC:HGNC:7500,MIM:590105;Note=NAR: 2053;anticodon=(pos:1633..1635);codons=50;gbkey=tRNA;gene=TRNV;product=tRNA-Val 12 | NC_012920.1 RefSeq gene 1671 3229 . + . ID=gene-RNR2;Dbxref=GeneID:4550,HGNC:HGNC:7471,MIM:561010;Name=RNR2;gbkey=Gene;gene=RNR2;gene_biotype=rRNA;gene_synonym=MTRNR2 13 | NC_012920.1 RefSeq rRNA 1671 3229 . + . ID=rna-RNR2;Parent=gene-RNR2;Dbxref=GeneID:4550,HGNC:HGNC:7471,MIM:561010;Note=16S ribosomal RNA%3B 16S rRNA;gbkey=rRNA;gene=RNR2;product=l-rRNA 14 | NC_012920.1 RefSeq exon 1671 3229 . + . ID=exon-RNR2-1;Parent=rna-RNR2;Dbxref=GeneID:4550,HGNC:HGNC:7471,MIM:561010;Note=16S ribosomal RNA%3B 16S rRNA;gbkey=rRNA;gene=RNR2;product=l-rRNA 15 | NC_012920.1 RefSeq sequence_feature 3107 3107 . + . ID=id-NC_012920.1:3107..3107;Note=preserves historical genome annotation numbering;gbkey=misc_feature 16 | NC_012920.1 RefSeq gene 3230 3304 . + . ID=gene-TRNL1;Dbxref=GeneID:4567,HGNC:HGNC:7490,MIM:590050;Name=TRNL1;gbkey=Gene;gene=TRNL1;gene_biotype=tRNA;gene_synonym=MTTL1 17 | NC_012920.1 RefSeq tRNA 3230 3304 . + . ID=rna-TRNL1;Parent=gene-TRNL1;Dbxref=GeneID:4567,HGNC:HGNC:7490,MIM:590050;Note=NAR: 1054;anticodon=(pos:3265..3267);codons=2%2C3;gbkey=tRNA;gene=TRNL1;product=tRNA-Leu 18 | NC_012920.1 RefSeq exon 3230 3304 . + . ID=exon-TRNL1-1;Parent=rna-TRNL1;Dbxref=GeneID:4567,HGNC:HGNC:7490,MIM:590050;Note=NAR: 1054;anticodon=(pos:3265..3267);codons=2%2C3;gbkey=tRNA;gene=TRNL1;product=tRNA-Leu 19 | NC_012920.1 RefSeq gene 3307 4262 . + . ID=gene-ND1;Dbxref=GeneID:4535,HGNC:HGNC:7455,MIM:516000;Name=ND1;gbkey=Gene;gene=ND1;gene_biotype=protein_coding;gene_synonym=MTND1 20 | NC_012920.1 RefSeq CDS 3307 4262 . 
+ 0 ID=cds-YP_003024026.1;Parent=gene-ND1;Dbxref=Genbank:YP_003024026.1,GeneID:4535,HGNC:HGNC:7455,MIM:516000;Name=YP_003024026.1;Note=TAA stop codon is completed by the addition of 3' A residues to the mRNA;gbkey=CDS;gene=ND1;product=NADH dehydrogenase subunit 1;protein_id=YP_003024026.1;transl_except=(pos:4261..4262%2Caa:TERM);transl_table=2 21 | NC_012920.1 RefSeq gene 4263 4331 . + . ID=gene-TRNI;Dbxref=GeneID:4565,HGNC:HGNC:7488,MIM:590045;Name=TRNI;gbkey=Gene;gene=TRNI;gene_biotype=tRNA;gene_synonym=MTTI 22 | NC_012920.1 RefSeq tRNA 4263 4331 . + . ID=rna-TRNI;Parent=gene-TRNI;Dbxref=GeneID:4565,HGNC:HGNC:7488,MIM:590045;Note=NAR: 0997;anticodon=(pos:4292..4294);codons=33;gbkey=tRNA;gene=TRNI;product=tRNA-Ile 23 | NC_012920.1 RefSeq exon 4263 4331 . + . ID=exon-TRNI-1;Parent=rna-TRNI;Dbxref=GeneID:4565,HGNC:HGNC:7488,MIM:590045;Note=NAR: 0997;anticodon=(pos:4292..4294);codons=33;gbkey=tRNA;gene=TRNI;product=tRNA-Ile 24 | NC_012920.1 RefSeq gene 4329 4400 . - . ID=gene-TRNQ;Dbxref=GeneID:4572,HGNC:HGNC:7495,MIM:590030;Name=TRNQ;gbkey=Gene;gene=TRNQ;gene_biotype=tRNA;gene_synonym=MTTQ 25 | NC_012920.1 RefSeq tRNA 4329 4400 . - . ID=rna-TRNQ;Parent=gene-TRNQ;Dbxref=GeneID:4572,HGNC:HGNC:7495,MIM:590030;Note=NAR: 0597;anticodon=(pos:complement(4365..4367));codons=26;gbkey=tRNA;gene=TRNQ;product=tRNA-Gln 26 | NC_012920.1 RefSeq exon 4329 4400 . - . ID=exon-TRNQ-1;Parent=rna-TRNQ;Dbxref=GeneID:4572,HGNC:HGNC:7495,MIM:590030;Note=NAR: 0597;anticodon=(pos:complement(4365..4367));codons=26;gbkey=tRNA;gene=TRNQ;product=tRNA-Gln 27 | NC_012920.1 RefSeq gene 4402 4469 . + . ID=gene-TRNM;Dbxref=GeneID:4569,HGNC:HGNC:7492,MIM:590065;Name=TRNM;gbkey=Gene;gene=TRNM;gene_biotype=tRNA;gene_synonym=MTTM 28 | NC_012920.1 RefSeq tRNA 4402 4469 . + . ID=rna-TRNM;Parent=gene-TRNM;Dbxref=GeneID:4569,HGNC:HGNC:7492,MIM:590065;Note=NAR: 1297;anticodon=(pos:4432..4434);codons=35;gbkey=tRNA;gene=TRNM;product=tRNA-Met 29 | NC_012920.1 RefSeq exon 4402 4469 . + . 
ID=exon-TRNM-1;Parent=rna-TRNM;Dbxref=GeneID:4569,HGNC:HGNC:7492,MIM:590065;Note=NAR: 1297;anticodon=(pos:4432..4434);codons=35;gbkey=tRNA;gene=TRNM;product=tRNA-Met 30 | NC_012920.1 RefSeq gene 4470 5511 . + . ID=gene-ND2;Dbxref=GeneID:4536,HGNC:HGNC:7456,MIM:516001;Name=ND2;gbkey=Gene;gene=ND2;gene_biotype=protein_coding;gene_synonym=MTND2 31 | NC_012920.1 RefSeq CDS 4470 5511 . + 0 ID=cds-YP_003024027.1;Parent=gene-ND2;Dbxref=Genbank:YP_003024027.1,GeneID:4536,HGNC:HGNC:7456,MIM:516001;Name=YP_003024027.1;Note=TAA stop codon is completed by the addition of 3' A residues to the mRNA;gbkey=CDS;gene=ND2;product=NADH dehydrogenase subunit 2;protein_id=YP_003024027.1;transl_except=(pos:5511..5511%2Caa:TERM);transl_table=2 32 | NC_012920.1 RefSeq gene 5512 5579 . + . ID=gene-TRNW;Dbxref=GeneID:4578,HGNC:HGNC:7501,MIM:590095;Name=TRNW;gbkey=Gene;gene=TRNW;gene_biotype=tRNA;gene_synonym=MTTW 33 | NC_012920.1 RefSeq tRNA 5512 5579 . + . ID=rna-TRNW;Parent=gene-TRNW;Dbxref=GeneID:4578,HGNC:HGNC:7501,MIM:590095;Note=NAR: 1897;anticodon=(pos:5544..5546);codons=14;gbkey=tRNA;gene=TRNW;product=tRNA-Trp 34 | NC_012920.1 RefSeq exon 5512 5579 . + . ID=exon-TRNW-1;Parent=rna-TRNW;Dbxref=GeneID:4578,HGNC:HGNC:7501,MIM:590095;Note=NAR: 1897;anticodon=(pos:5544..5546);codons=14;gbkey=tRNA;gene=TRNW;product=tRNA-Trp 35 | NC_012920.1 RefSeq gene 5587 5655 . - . ID=gene-TRNA;Dbxref=GeneID:4553,HGNC:HGNC:7475,MIM:590000;Name=TRNA;gbkey=Gene;gene=TRNA;gene_biotype=tRNA;gene_synonym=MTTA 36 | NC_012920.1 RefSeq tRNA 5587 5655 . - . ID=rna-TRNA;Parent=gene-TRNA;Dbxref=GeneID:4553,HGNC:HGNC:7475,MIM:590000;Note=NAR: 0097;anticodon=(pos:complement(5623..5625));codons=54;gbkey=tRNA;gene=TRNA;product=tRNA-Ala 37 | NC_012920.1 RefSeq exon 5587 5655 . - . ID=exon-TRNA-1;Parent=rna-TRNA;Dbxref=GeneID:4553,HGNC:HGNC:7475,MIM:590000;Note=NAR: 0097;anticodon=(pos:complement(5623..5625));codons=54;gbkey=tRNA;gene=TRNA;product=tRNA-Ala 38 | NC_012920.1 RefSeq gene 5657 5729 . - . 
ID=gene-TRNN;Dbxref=GeneID:4570,HGNC:HGNC:7493,MIM:590010;Name=TRNN;gbkey=Gene;gene=TRNN;gene_biotype=tRNA;gene_synonym=MTTN 39 | NC_012920.1 RefSeq tRNA 5657 5729 . - . ID=rna-TRNN;Parent=gene-TRNN;Dbxref=GeneID:4570,HGNC:HGNC:7493,MIM:590010;Note=NAR: 0297;anticodon=(pos:complement(5694..5696));codons=41;gbkey=tRNA;gene=TRNN;product=tRNA-Asn 40 | NC_012920.1 RefSeq exon 5657 5729 . - . ID=exon-TRNN-1;Parent=rna-TRNN;Dbxref=GeneID:4570,HGNC:HGNC:7493,MIM:590010;Note=NAR: 0297;anticodon=(pos:complement(5694..5696));codons=41;gbkey=tRNA;gene=TRNN;product=tRNA-Asn 41 | NC_012920.1 RefSeq gene 5761 5826 . - . ID=gene-TRNC;Dbxref=GeneID:4511,HGNC:HGNC:7477,MIM:590020;Name=TRNC;gbkey=Gene;gene=TRNC;gene_biotype=tRNA;gene_synonym=MTTC 42 | NC_012920.1 RefSeq tRNA 5761 5826 . - . ID=rna-TRNC;Parent=gene-TRNC;Dbxref=GeneID:4511,HGNC:HGNC:7477,MIM:590020;Note=NAR: 0497;anticodon=(pos:complement(5796..5798));codons=13;gbkey=tRNA;gene=TRNC;product=tRNA-Cys 43 | NC_012920.1 RefSeq exon 5761 5826 . - . ID=exon-TRNC-1;Parent=rna-TRNC;Dbxref=GeneID:4511,HGNC:HGNC:7477,MIM:590020;Note=NAR: 0497;anticodon=(pos:complement(5796..5798));codons=13;gbkey=tRNA;gene=TRNC;product=tRNA-Cys 44 | NC_012920.1 RefSeq gene 5826 5891 . - . ID=gene-TRNY;Dbxref=GeneID:4579,HGNC:HGNC:7502,MIM:590100;Name=TRNY;gbkey=Gene;gene=TRNY;gene_biotype=tRNA;gene_synonym=MTTY 45 | NC_012920.1 RefSeq tRNA 5826 5891 . - . ID=rna-TRNY;Parent=gene-TRNY;Dbxref=GeneID:4579,HGNC:HGNC:7502,MIM:590100;Note=NAR: 1997;anticodon=(pos:complement(5860..5862));codons=9;gbkey=tRNA;gene=TRNY;product=tRNA-Tyr 46 | NC_012920.1 RefSeq exon 5826 5891 . - . ID=exon-TRNY-1;Parent=rna-TRNY;Dbxref=GeneID:4579,HGNC:HGNC:7502,MIM:590100;Note=NAR: 1997;anticodon=(pos:complement(5860..5862));codons=9;gbkey=tRNA;gene=TRNY;product=tRNA-Tyr 47 | NC_012920.1 RefSeq gene 5904 7445 . + . 
ID=gene-COX1;Dbxref=GeneID:4512,HGNC:HGNC:7419,MIM:516030;Name=COX1;gbkey=Gene;gene=COX1;gene_biotype=protein_coding;gene_synonym=COI,MTCO1 48 | NC_012920.1 RefSeq CDS 5904 7445 . + 0 ID=cds-YP_003024028.1;Parent=gene-COX1;Dbxref=Genbank:YP_003024028.1,GeneID:4512,HGNC:HGNC:7419,MIM:516030;Name=YP_003024028.1;Note=cytochrome c oxidase I;gbkey=CDS;gene=COX1;product=cytochrome c oxidase subunit I;protein_id=YP_003024028.1;transl_table=2 49 | NC_012920.1 RefSeq gene 7446 7514 . - . ID=gene-TRNS1;Dbxref=GeneID:4574,HGNC:HGNC:7497,MIM:590080;Name=TRNS1;gbkey=Gene;gene=TRNS1;gene_biotype=tRNA;gene_synonym=MTTS1 50 | NC_012920.1 RefSeq tRNA 7446 7514 . - . ID=rna-TRNS1;Parent=gene-TRNS1;Dbxref=GeneID:4574,HGNC:HGNC:7497,MIM:590080;Note=NAR: 1697;anticodon=(pos:complement(7482..7484));codons=4%2C5%2C6%2C7;gbkey=tRNA;gene=TRNS1;product=tRNA-Ser 51 | NC_012920.1 RefSeq exon 7446 7514 . - . ID=exon-TRNS1-1;Parent=rna-TRNS1;Dbxref=GeneID:4574,HGNC:HGNC:7497,MIM:590080;Note=NAR: 1697;anticodon=(pos:complement(7482..7484));codons=4%2C5%2C6%2C7;gbkey=tRNA;gene=TRNS1;product=tRNA-Ser 52 | NC_012920.1 RefSeq gene 7518 7585 . + . ID=gene-TRND;Dbxref=GeneID:4555,HGNC:HGNC:7478,MIM:590015;Name=TRND;gbkey=Gene;gene=TRND;gene_biotype=tRNA;gene_synonym=MTTD 53 | NC_012920.1 RefSeq tRNA 7518 7585 . + . ID=rna-TRND;Parent=gene-TRND;Dbxref=GeneID:4555,HGNC:HGNC:7478,MIM:590015;Note=NAR: 0397;anticodon=(pos:7548..7550);codons=57;gbkey=tRNA;gene=TRND;product=tRNA-Asp 54 | NC_012920.1 RefSeq exon 7518 7585 . + . ID=exon-TRND-1;Parent=rna-TRND;Dbxref=GeneID:4555,HGNC:HGNC:7478,MIM:590015;Note=NAR: 0397;anticodon=(pos:7548..7550);codons=57;gbkey=tRNA;gene=TRND;product=tRNA-Asp 55 | NC_012920.1 RefSeq gene 7586 8269 . + . ID=gene-COX2;Dbxref=GeneID:4513,HGNC:HGNC:7421,MIM:516040;Name=COX2;gbkey=Gene;gene=COX2;gene_biotype=protein_coding;gene_synonym=COII,MTCO2 56 | NC_012920.1 RefSeq CDS 7586 8269 . 
+ 0 ID=cds-YP_003024029.1;Parent=gene-COX2;Dbxref=Genbank:YP_003024029.1,GeneID:4513,HGNC:HGNC:7421,MIM:516040;Name=YP_003024029.1;Note=cytochrome c oxidase II;gbkey=CDS;gene=COX2;product=cytochrome c oxidase subunit II;protein_id=YP_003024029.1;transl_table=2 57 | NC_012920.1 RefSeq gene 8295 8364 . + . ID=gene-TRNK;Dbxref=GeneID:4566,HGNC:HGNC:7489,MIM:590060;Name=TRNK;gbkey=Gene;gene=TRNK;gene_biotype=tRNA;gene_synonym=MTTK 58 | NC_012920.1 RefSeq tRNA 8295 8364 . + . ID=rna-TRNK;Parent=gene-TRNK;Dbxref=GeneID:4566,HGNC:HGNC:7489,MIM:590060;Note=NAR: 1197;anticodon=(pos:8323..8325);codons=42;gbkey=tRNA;gene=TRNK;product=tRNA-Lys 59 | NC_012920.1 RefSeq exon 8295 8364 . + . ID=exon-TRNK-1;Parent=rna-TRNK;Dbxref=GeneID:4566,HGNC:HGNC:7489,MIM:590060;Note=NAR: 1197;anticodon=(pos:8323..8325);codons=42;gbkey=tRNA;gene=TRNK;product=tRNA-Lys 60 | NC_012920.1 RefSeq gene 8366 8572 . + . ID=gene-ATP8;Dbxref=GeneID:4509,HGNC:HGNC:7415,MIM:516070;Name=ATP8;gbkey=Gene;gene=ATP8;gene_biotype=protein_coding;gene_synonym=ATPase8,MTATP8 61 | NC_012920.1 RefSeq CDS 8366 8572 . + 0 ID=cds-YP_003024030.1;Parent=gene-ATP8;Dbxref=Genbank:YP_003024030.1,GeneID:4509,HGNC:HGNC:7415,MIM:516070;Name=YP_003024030.1;Note=ATP synthase 8%3B ATPase subunit 8;gbkey=CDS;gene=ATP8;product=ATP synthase F0 subunit 8;protein_id=YP_003024030.1;transl_table=2 62 | NC_012920.1 RefSeq gene 8527 9207 . + . ID=gene-ATP6;Dbxref=GeneID:4508,HGNC:HGNC:7414,MIM:516060;Name=ATP6;gbkey=Gene;gene=ATP6;gene_biotype=protein_coding;gene_synonym=ATPase6,MTATP6 63 | NC_012920.1 RefSeq CDS 8527 9207 . + 0 ID=cds-YP_003024031.1;Parent=gene-ATP6;Dbxref=Genbank:YP_003024031.1,GeneID:4508,HGNC:HGNC:7414,MIM:516060;Name=YP_003024031.1;Note=ATP synthase 6%3B ATPase subunit 6;gbkey=CDS;gene=ATP6;product=ATP synthase F0 subunit 6;protein_id=YP_003024031.1;transl_table=2 64 | NC_012920.1 RefSeq gene 9207 9990 . + . 
ID=gene-COX3;Dbxref=GeneID:4514,HGNC:HGNC:7422,MIM:516050;Name=COX3;gbkey=Gene;gene=COX3;gene_biotype=protein_coding;gene_synonym=COIII,MTCO3 65 | NC_012920.1 RefSeq CDS 9207 9990 . + 0 ID=cds-YP_003024032.1;Parent=gene-COX3;Dbxref=Genbank:YP_003024032.1,GeneID:4514,HGNC:HGNC:7422,MIM:516050;Name=YP_003024032.1;Note=TAA stop codon is completed by the addition of 3' A residues to the mRNA;gbkey=CDS;gene=COX3;product=cytochrome c oxidase subunit III;protein_id=YP_003024032.1;transl_except=(pos:9990..9990%2Caa:TERM);transl_table=2 66 | NC_012920.1 RefSeq gene 9991 10058 . + . ID=gene-TRNG;Dbxref=GeneID:4563,HGNC:HGNC:7486,MIM:590035;Name=TRNG;gbkey=Gene;gene=TRNG;gene_biotype=tRNA;gene_synonym=MTTG 67 | NC_012920.1 RefSeq tRNA 9991 10058 . + . ID=rna-TRNG;Parent=gene-TRNG;Dbxref=GeneID:4563,HGNC:HGNC:7486,MIM:590035;Note=NAR: 0797;anticodon=(pos:10021..10023);codons=62;gbkey=tRNA;gene=TRNG;product=tRNA-Gly 68 | NC_012920.1 RefSeq exon 9991 10058 . + . ID=exon-TRNG-1;Parent=rna-TRNG;Dbxref=GeneID:4563,HGNC:HGNC:7486,MIM:590035;Note=NAR: 0797;anticodon=(pos:10021..10023);codons=62;gbkey=tRNA;gene=TRNG;product=tRNA-Gly 69 | NC_012920.1 RefSeq gene 10059 10404 . + . ID=gene-ND3;Dbxref=GeneID:4537,HGNC:HGNC:7458,MIM:516002;Name=ND3;gbkey=Gene;gene=ND3;gene_biotype=protein_coding;gene_synonym=MTND3 70 | NC_012920.1 RefSeq CDS 10059 10404 . + 0 ID=cds-YP_003024033.1;Parent=gene-ND3;Dbxref=Genbank:YP_003024033.1,GeneID:4537,HGNC:HGNC:7458,MIM:516002;Name=YP_003024033.1;Note=TAA stop codon is completed by the addition of 3' A residues to the mRNA;gbkey=CDS;gene=ND3;product=NADH dehydrogenase subunit 3;protein_id=YP_003024033.1;transl_except=(pos:10404..10404%2Caa:TERM);transl_table=2 71 | NC_012920.1 RefSeq gene 10405 10469 . + . ID=gene-TRNR;Dbxref=GeneID:4573,HGNC:HGNC:7496,MIM:590005;Name=TRNR;gbkey=Gene;gene=TRNR;gene_biotype=tRNA;gene_synonym=MTTR 72 | NC_012920.1 RefSeq tRNA 10405 10469 . + . 
ID=rna-TRNR;Parent=gene-TRNR;Dbxref=GeneID:4573,HGNC:HGNC:7496,MIM:590005;Note=NAR: 0197;anticodon=(pos:10435..10437);codons=30;gbkey=tRNA;gene=TRNR;product=tRNA-Arg 73 | NC_012920.1 RefSeq exon 10405 10469 . + . ID=exon-TRNR-1;Parent=rna-TRNR;Dbxref=GeneID:4573,HGNC:HGNC:7496,MIM:590005;Note=NAR: 0197;anticodon=(pos:10435..10437);codons=30;gbkey=tRNA;gene=TRNR;product=tRNA-Arg 74 | NC_012920.1 RefSeq gene 10470 10766 . + . ID=gene-ND4L;Dbxref=GeneID:4539,HGNC:HGNC:7460,MIM:516004;Name=ND4L;gbkey=Gene;gene=ND4L;gene_biotype=protein_coding;gene_synonym=MTND4L 75 | NC_012920.1 RefSeq CDS 10470 10766 . + 0 ID=cds-YP_003024034.1;Parent=gene-ND4L;Dbxref=Genbank:YP_003024034.1,GeneID:4539,HGNC:HGNC:7460,MIM:516004;Name=YP_003024034.1;gbkey=CDS;gene=ND4L;product=NADH dehydrogenase subunit 4L;protein_id=YP_003024034.1;transl_table=2 76 | NC_012920.1 RefSeq gene 10760 12137 . + . ID=gene-ND4;Dbxref=GeneID:4538,HGNC:HGNC:7459,MIM:516003;Name=ND4;gbkey=Gene;gene=ND4;gene_biotype=protein_coding;gene_synonym=MTND4 77 | NC_012920.1 RefSeq CDS 10760 12137 . + 0 ID=cds-YP_003024035.1;Parent=gene-ND4;Dbxref=Genbank:YP_003024035.1,GeneID:4538,HGNC:HGNC:7459,MIM:516003;Name=YP_003024035.1;Note=TAA stop codon is completed by the addition of 3' A residues to the mRNA;gbkey=CDS;gene=ND4;product=NADH dehydrogenase subunit 4;protein_id=YP_003024035.1;transl_except=(pos:12137..12137%2Caa:TERM);transl_table=2 78 | NC_012920.1 RefSeq gene 12138 12206 . + . ID=gene-TRNH;Dbxref=GeneID:4564,HGNC:HGNC:7487,MIM:590040;Name=TRNH;gbkey=Gene;gene=TRNH;gene_biotype=tRNA;gene_synonym=MTTH 79 | NC_012920.1 RefSeq tRNA 12138 12206 . + . ID=rna-TRNH;Parent=gene-TRNH;Dbxref=GeneID:4564,HGNC:HGNC:7487,MIM:590040;Note=NAR: 0897;anticodon=(pos:12168..12170);codons=25;gbkey=tRNA;gene=TRNH;product=tRNA-His 80 | NC_012920.1 RefSeq exon 12138 12206 . + . 
ID=exon-TRNH-1;Parent=rna-TRNH;Dbxref=GeneID:4564,HGNC:HGNC:7487,MIM:590040;Note=NAR: 0897;anticodon=(pos:12168..12170);codons=25;gbkey=tRNA;gene=TRNH;product=tRNA-His 81 | NC_012920.1 RefSeq gene 12207 12265 . + . ID=gene-TRNS2;Dbxref=GeneID:4575,HGNC:HGNC:7498,MIM:590085;Name=TRNS2;gbkey=Gene;gene=TRNS2;gene_biotype=tRNA;gene_synonym=MTTS2 82 | NC_012920.1 RefSeq tRNA 12207 12265 . + . ID=rna-TRNS2;Parent=gene-TRNS2;Dbxref=GeneID:4575,HGNC:HGNC:7498,MIM:590085;Note=NAR: 1656;anticodon=(pos:12226..12228);codons=44%2C45;gbkey=tRNA;gene=TRNS2;product=tRNA-Ser 83 | NC_012920.1 RefSeq exon 12207 12265 . + . ID=exon-TRNS2-1;Parent=rna-TRNS2;Dbxref=GeneID:4575,HGNC:HGNC:7498,MIM:590085;Note=NAR: 1656;anticodon=(pos:12226..12228);codons=44%2C45;gbkey=tRNA;gene=TRNS2;product=tRNA-Ser 84 | NC_012920.1 RefSeq gene 12266 12336 . + . ID=gene-TRNL2;Dbxref=GeneID:4568,HGNC:HGNC:7491,MIM:590055;Name=TRNL2;gbkey=Gene;gene=TRNL2;gene_biotype=tRNA;gene_synonym=MTTL2 85 | NC_012920.1 RefSeq tRNA 12266 12336 . + . ID=rna-TRNL2;Parent=gene-TRNL2;Dbxref=GeneID:4568,HGNC:HGNC:7491,MIM:590055;Note=NAR: 1097;anticodon=(pos:12298..12300);codons=16%2C17%2C18%2C19;gbkey=tRNA;gene=TRNL2;product=tRNA-Leu 86 | NC_012920.1 RefSeq exon 12266 12336 . + . ID=exon-TRNL2-1;Parent=rna-TRNL2;Dbxref=GeneID:4568,HGNC:HGNC:7491,MIM:590055;Note=NAR: 1097;anticodon=(pos:12298..12300);codons=16%2C17%2C18%2C19;gbkey=tRNA;gene=TRNL2;product=tRNA-Leu 87 | NC_012920.1 RefSeq gene 12337 14148 . + . ID=gene-ND5;Dbxref=GeneID:4540,HGNC:HGNC:7461,MIM:516005;Name=ND5;gbkey=Gene;gene=ND5;gene_biotype=protein_coding;gene_synonym=MTND5 88 | NC_012920.1 RefSeq CDS 12337 14148 . + 0 ID=cds-YP_003024036.1;Parent=gene-ND5;Dbxref=Genbank:YP_003024036.1,GeneID:4540,HGNC:HGNC:7461,MIM:516005;Name=YP_003024036.1;gbkey=CDS;gene=ND5;product=NADH dehydrogenase subunit 5;protein_id=YP_003024036.1;transl_table=2 89 | NC_012920.1 RefSeq gene 14149 14673 . - . 
ID=gene-ND6;Dbxref=GeneID:4541,HGNC:HGNC:7462,MIM:516006;Name=ND6;gbkey=Gene;gene=ND6;gene_biotype=protein_coding;gene_synonym=MTND6 90 | NC_012920.1 RefSeq CDS 14149 14673 . - 0 ID=cds-YP_003024037.1;Parent=gene-ND6;Dbxref=Genbank:YP_003024037.1,GeneID:4541,HGNC:HGNC:7462,MIM:516006;Name=YP_003024037.1;gbkey=CDS;gene=ND6;product=NADH dehydrogenase subunit 6;protein_id=YP_003024037.1;transl_table=2 91 | NC_012920.1 RefSeq gene 14674 14742 . - . ID=gene-TRNE;Dbxref=GeneID:4556,HGNC:HGNC:7479,MIM:590025;Name=TRNE;gbkey=Gene;gene=TRNE;gene_biotype=tRNA;gene_synonym=MTTE 92 | NC_012920.1 RefSeq tRNA 14674 14742 . - . ID=rna-TRNE;Parent=gene-TRNE;Dbxref=GeneID:4556,HGNC:HGNC:7479,MIM:590025;Note=NAR: 0697;anticodon=(pos:complement(14710..14712));codons=58;gbkey=tRNA;gene=TRNE;product=tRNA-Glu 93 | NC_012920.1 RefSeq exon 14674 14742 . - . ID=exon-TRNE-1;Parent=rna-TRNE;Dbxref=GeneID:4556,HGNC:HGNC:7479,MIM:590025;Note=NAR: 0697;anticodon=(pos:complement(14710..14712));codons=58;gbkey=tRNA;gene=TRNE;product=tRNA-Glu 94 | NC_012920.1 RefSeq gene 14747 15887 . + . ID=gene-CYTB;Dbxref=GeneID:4519,HGNC:HGNC:7427,MIM:516020;Name=CYTB;gbkey=Gene;gene=CYTB;gene_biotype=protein_coding;gene_synonym=MTCYB 95 | NC_012920.1 RefSeq CDS 14747 15887 . + 0 ID=cds-YP_003024038.1;Parent=gene-CYTB;Dbxref=Genbank:YP_003024038.1,GeneID:4519,HGNC:HGNC:7427,MIM:516020;Name=YP_003024038.1;Note=TAA stop codon is completed by the addition of 3' A residues to the mRNA;gbkey=CDS;gene=CYTB;product=cytochrome b;protein_id=YP_003024038.1;transl_except=(pos:15887..15887%2Caa:TERM);transl_table=2 96 | NC_012920.1 RefSeq gene 15888 15953 . + . ID=gene-TRNT;Dbxref=GeneID:4576,HGNC:HGNC:7499,MIM:590090;Name=TRNT;gbkey=Gene;gene=TRNT;gene_biotype=tRNA;gene_synonym=MTTT 97 | NC_012920.1 RefSeq tRNA 15888 15953 . + . 
ID=rna-TRNT;Parent=gene-TRNT;Dbxref=GeneID:4576,HGNC:HGNC:7499,MIM:590090;Note=NAR: 1797;anticodon=(pos:15919..15921);codons=38;gbkey=tRNA;gene=TRNT;product=tRNA-Thr 98 | NC_012920.1 RefSeq exon 15888 15953 . + . ID=exon-TRNT-1;Parent=rna-TRNT;Dbxref=GeneID:4576,HGNC:HGNC:7499,MIM:590090;Note=NAR: 1797;anticodon=(pos:15919..15921);codons=38;gbkey=tRNA;gene=TRNT;product=tRNA-Thr 99 | NC_012920.1 RefSeq gene 15956 16023 . - . ID=gene-TRNP;Dbxref=GeneID:4571,HGNC:HGNC:7494,MIM:590075;Name=TRNP;gbkey=Gene;gene=TRNP;gene_biotype=tRNA;gene_synonym=MTTP 100 | NC_012920.1 RefSeq tRNA 15956 16023 . - . ID=rna-TRNP;Parent=gene-TRNP;Dbxref=GeneID:4571,HGNC:HGNC:7494,MIM:590075;Note=NAR: 1597;anticodon=(pos:complement(15990..15992));codons=22;gbkey=tRNA;gene=TRNP;product=tRNA-Pro 101 | NC_012920.1 RefSeq exon 15956 16023 . - . ID=exon-TRNP-1;Parent=rna-TRNP;Dbxref=GeneID:4571,HGNC:HGNC:7494,MIM:590075;Note=NAR: 1597;anticodon=(pos:complement(15990..15992));codons=22;gbkey=tRNA;gene=TRNP;product=tRNA-Pro 102 | NC_012920.1 RefSeq D_loop 16024 17145 . - . ID=id-NC_012920.1:1..16569;gbkey=D-loop 103 | -------------------------------------------------------------------------------- /tests/files/TREX1.gff: -------------------------------------------------------------------------------- 1 | NC_000003.11 BestRefSeq gene 48488183 48509044 . + . ID=gene-ATRIP-TREX1;Dbxref=GeneID:111822955;Name=ATRIP-TREX1;description=ATRIP-TREX1 readthrough;gbkey=Gene;gene=ATRIP-TREX1;gene_biotype=lncRNA 2 | NC_000003.11 BestRefSeq lnc_RNA 48488183 48509044 . + . ID=rna-NR_153405.1;Parent=gene-ATRIP-TREX1;Dbxref=GeneID:111822955,Genbank:NR_153405.1;Name=NR_153405.1;Note=The RefSeq transcript has 1 substitution compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=ncRNA;gene=ATRIP-TREX1;inference=similar to RNA sequence (same species):RefSeq:NR_153405.1;product=ATRIP-TREX1 readthrough;transcript_id=NR_153405.1 3 | NC_000003.11 BestRefSeq exon 48488183 48488496 . + . 
ID=exon-NR_153405.1-1;Parent=rna-NR_153405.1;Dbxref=GeneID:111822955,Genbank:NR_153405.1;Note=The RefSeq transcript has 1 substitution compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=ncRNA;gene=ATRIP-TREX1;inference=similar to RNA sequence (same species):RefSeq:NR_153405.1;product=ATRIP-TREX1 readthrough;transcript_id=NR_153405.1 4 | NC_000003.11 BestRefSeq exon 48491443 48491576 . + . ID=exon-NR_153405.1-2;Parent=rna-NR_153405.1;Dbxref=GeneID:111822955,Genbank:NR_153405.1;Note=The RefSeq transcript has 1 substitution compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=ncRNA;gene=ATRIP-TREX1;inference=similar to RNA sequence (same species):RefSeq:NR_153405.1;product=ATRIP-TREX1 readthrough;transcript_id=NR_153405.1 5 | NC_000003.11 BestRefSeq exon 48493135 48493305 . + . ID=exon-NR_153405.1-3;Parent=rna-NR_153405.1;Dbxref=GeneID:111822955,Genbank:NR_153405.1;Note=The RefSeq transcript has 1 substitution compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=ncRNA;gene=ATRIP-TREX1;inference=similar to RNA sequence (same species):RefSeq:NR_153405.1;product=ATRIP-TREX1 readthrough;transcript_id=NR_153405.1 6 | NC_000003.11 BestRefSeq exon 48495700 48495818 . + . ID=exon-NR_153405.1-4;Parent=rna-NR_153405.1;Dbxref=GeneID:111822955,Genbank:NR_153405.1;Note=The RefSeq transcript has 1 substitution compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=ncRNA;gene=ATRIP-TREX1;inference=similar to RNA sequence (same species):RefSeq:NR_153405.1;product=ATRIP-TREX1 readthrough;transcript_id=NR_153405.1 7 | NC_000003.11 BestRefSeq exon 48498659 48498816 . + . 
ID=exon-NR_153405.1-5;Parent=rna-NR_153405.1;Dbxref=GeneID:111822955,Genbank:NR_153405.1;Note=The RefSeq transcript has 1 substitution compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=ncRNA;gene=ATRIP-TREX1;inference=similar to RNA sequence (same species):RefSeq:NR_153405.1;product=ATRIP-TREX1 readthrough;transcript_id=NR_153405.1 8 | NC_000003.11 BestRefSeq exon 48500758 48500853 . + . ID=exon-NR_153405.1-6;Parent=rna-NR_153405.1;Dbxref=GeneID:111822955,Genbank:NR_153405.1;Note=The RefSeq transcript has 1 substitution compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=ncRNA;gene=ATRIP-TREX1;inference=similar to RNA sequence (same species):RefSeq:NR_153405.1;product=ATRIP-TREX1 readthrough;transcript_id=NR_153405.1 9 | NC_000003.11 BestRefSeq exon 48501186 48501315 . + . ID=exon-NR_153405.1-7;Parent=rna-NR_153405.1;Dbxref=GeneID:111822955,Genbank:NR_153405.1;Note=The RefSeq transcript has 1 substitution compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=ncRNA;gene=ATRIP-TREX1;inference=similar to RNA sequence (same species):RefSeq:NR_153405.1;product=ATRIP-TREX1 readthrough;transcript_id=NR_153405.1 10 | NC_000003.11 BestRefSeq exon 48501509 48502198 . + . ID=exon-NR_153405.1-8;Parent=rna-NR_153405.1;Dbxref=GeneID:111822955,Genbank:NR_153405.1;Note=The RefSeq transcript has 1 substitution compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=ncRNA;gene=ATRIP-TREX1;inference=similar to RNA sequence (same species):RefSeq:NR_153405.1;product=ATRIP-TREX1 readthrough;transcript_id=NR_153405.1 11 | NC_000003.11 BestRefSeq exon 48502760 48502844 . + . 
ID=exon-NR_153405.1-9;Parent=rna-NR_153405.1;Dbxref=GeneID:111822955,Genbank:NR_153405.1;Note=The RefSeq transcript has 1 substitution compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=ncRNA;gene=ATRIP-TREX1;inference=similar to RNA sequence (same species):RefSeq:NR_153405.1;product=ATRIP-TREX1 readthrough;transcript_id=NR_153405.1 12 | NC_000003.11 BestRefSeq exon 48505144 48505280 . + . ID=exon-NR_153405.1-10;Parent=rna-NR_153405.1;Dbxref=GeneID:111822955,Genbank:NR_153405.1;Note=The RefSeq transcript has 1 substitution compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=ncRNA;gene=ATRIP-TREX1;inference=similar to RNA sequence (same species):RefSeq:NR_153405.1;product=ATRIP-TREX1 readthrough;transcript_id=NR_153405.1 13 | NC_000003.11 BestRefSeq exon 48505440 48505531 . + . ID=exon-NR_153405.1-11;Parent=rna-NR_153405.1;Dbxref=GeneID:111822955,Genbank:NR_153405.1;Note=The RefSeq transcript has 1 substitution compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=ncRNA;gene=ATRIP-TREX1;inference=similar to RNA sequence (same species):RefSeq:NR_153405.1;product=ATRIP-TREX1 readthrough;transcript_id=NR_153405.1 14 | NC_000003.11 BestRefSeq exon 48505981 48506061 . + . ID=exon-NR_153405.1-12;Parent=rna-NR_153405.1;Dbxref=GeneID:111822955,Genbank:NR_153405.1;Note=The RefSeq transcript has 1 substitution compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=ncRNA;gene=ATRIP-TREX1;inference=similar to RNA sequence (same species):RefSeq:NR_153405.1;product=ATRIP-TREX1 readthrough;transcript_id=NR_153405.1 15 | NC_000003.11 BestRefSeq exon 48506230 48506482 . + . 
ID=exon-NR_153405.1-13;Parent=rna-NR_153405.1;Dbxref=GeneID:111822955,Genbank:NR_153405.1;Note=The RefSeq transcript has 1 substitution compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=ncRNA;gene=ATRIP-TREX1;inference=similar to RNA sequence (same species):RefSeq:NR_153405.1;product=ATRIP-TREX1 readthrough;transcript_id=NR_153405.1 16 | NC_000003.11 BestRefSeq exon 48506886 48507708 . + . ID=exon-NR_153405.1-14;Parent=rna-NR_153405.1;Dbxref=GeneID:111822955,Genbank:NR_153405.1;Note=The RefSeq transcript has 1 substitution compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=ncRNA;gene=ATRIP-TREX1;inference=similar to RNA sequence (same species):RefSeq:NR_153405.1;product=ATRIP-TREX1 readthrough;transcript_id=NR_153405.1 17 | NC_000003.11 BestRefSeq exon 48508029 48509044 . + . ID=exon-NR_153405.1-15;Parent=rna-NR_153405.1;Dbxref=GeneID:111822955,Genbank:NR_153405.1;Note=The RefSeq transcript has 1 substitution compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=ncRNA;gene=ATRIP-TREX1;inference=similar to RNA sequence (same species):RefSeq:NR_153405.1;product=ATRIP-TREX1 readthrough;transcript_id=NR_153405.1 18 | NC_000003.11 BestRefSeq gene 48507229 48509044 . + . ID=gene-TREX1;Dbxref=GeneID:11277,HGNC:HGNC:12269,MIM:606609;Name=TREX1;description=three prime repair exonuclease 1;gbkey=Gene;gene=TREX1;gene_biotype=protein_coding;gene_synonym=AGS1,CRV,DRN3,HERNS 19 | NC_000003.11 BestRefSeq mRNA 48507229 48509044 . + . 
ID=rna-NM_007248.5;Parent=gene-TREX1;Dbxref=GeneID:11277,Genbank:NM_007248.5,HGNC:HGNC:12269,MIM:606609;Name=NM_007248.5;Note=The RefSeq transcript has 1 substitution compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=mRNA;gene=TREX1;inference=similar to RNA sequence%2C mRNA (same species):RefSeq:NM_007248.5;product=three prime repair exonuclease 1%2C transcript variant 5;transcript_id=NM_007248.5 20 | NC_000003.11 BestRefSeq exon 48507229 48507568 . + . ID=exon-NM_007248.5-1;Parent=rna-NM_007248.5;Dbxref=GeneID:11277,Genbank:NM_007248.5,HGNC:HGNC:12269,MIM:606609;Note=The RefSeq transcript has 1 substitution compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=mRNA;gene=TREX1;inference=similar to RNA sequence%2C mRNA (same species):RefSeq:NM_007248.5;product=three prime repair exonuclease 1%2C transcript variant 5;transcript_id=NM_007248.5 21 | NC_000003.11 BestRefSeq exon 48508067 48509044 . + . ID=exon-NM_007248.5-2;Parent=rna-NM_007248.5;Dbxref=GeneID:11277,Genbank:NM_007248.5,HGNC:HGNC:12269,MIM:606609;Note=The RefSeq transcript has 1 substitution compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=mRNA;gene=TREX1;inference=similar to RNA sequence%2C mRNA (same species):RefSeq:NM_007248.5;product=three prime repair exonuclease 1%2C transcript variant 5;transcript_id=NM_007248.5 22 | NC_000003.11 BestRefSeq CDS 48508085 48508999 . + 0 ID=cds-NP_009179.2;Parent=rna-NM_007248.5;Dbxref=CCDS:CCDS59451.1,GeneID:11277,Genbank:NP_009179.2,HGNC:HGNC:12269,MIM:606609;Name=NP_009179.2;Note=isoform c is encoded by transcript variant 5;gbkey=CDS;gene=TREX1;product=three-prime repair exonuclease 1 isoform c;protein_id=NP_009179.2 23 | NC_000003.11 BestRefSeq mRNA 48507629 48509044 . + . 
ID=rna-NM_033629.6;Parent=gene-TREX1;Dbxref=GeneID:11277,Genbank:NM_033629.6,HGNC:HGNC:12269,MIM:606609;Name=NM_033629.6;Note=The RefSeq transcript has 1 substitution compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=mRNA;gene=TREX1;inference=similar to RNA sequence%2C mRNA (same species):RefSeq:NM_033629.6;product=three prime repair exonuclease 1%2C transcript variant 4;tag=MANE Select;transcript_id=NM_033629.6 24 | NC_000003.11 BestRefSeq exon 48507629 48507708 . + . ID=exon-NM_033629.6-1;Parent=rna-NM_033629.6;Dbxref=GeneID:11277,Genbank:NM_033629.6,HGNC:HGNC:12269,MIM:606609;Note=The RefSeq transcript has 1 substitution compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=mRNA;gene=TREX1;inference=similar to RNA sequence%2C mRNA (same species):RefSeq:NM_033629.6;product=three prime repair exonuclease 1%2C transcript variant 4;tag=MANE Select;transcript_id=NM_033629.6 25 | NC_000003.11 BestRefSeq exon 48508029 48509044 . + . ID=exon-NM_033629.6-2;Parent=rna-NM_033629.6;Dbxref=GeneID:11277,Genbank:NM_033629.6,HGNC:HGNC:12269,MIM:606609;Note=The RefSeq transcript has 1 substitution compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=mRNA;gene=TREX1;inference=similar to RNA sequence%2C mRNA (same species):RefSeq:NM_033629.6;product=three prime repair exonuclease 1%2C transcript variant 4;tag=MANE Select;transcript_id=NM_033629.6 26 | NC_000003.11 BestRefSeq CDS 48508055 48508999 . + 0 ID=cds-NP_338599.1;Parent=rna-NM_033629.6;Dbxref=CCDS:CCDS2769.1,GeneID:11277,Genbank:NP_338599.1,HGNC:HGNC:12269,MIM:606609;Name=NP_338599.1;Note=isoform b is encoded by transcript variant 4;gbkey=CDS;gene=TREX1;product=three-prime repair exonuclease 1 isoform b;protein_id=NP_338599.1;tag=MANE Select 27 | NC_000023.10 BestRefSeq gene 122734420 122866902 . - . 
ID=gene-THOC2;Dbxref=GeneID:57187,HGNC:HGNC:19073,MIM:300395;Name=THOC2;description=THO complex 2;gbkey=Gene;gene=THOC2;gene_biotype=protein_coding;gene_synonym=CXorf3,dJ506G2.1,hTREX120,MRX12,MRX35,THO2 28 | -------------------------------------------------------------------------------- /tests/files/hgnc_toy.tsv: -------------------------------------------------------------------------------- 1 | Approved symbol Alias symbols NCBI Gene ID(supplied by NCBI) 2 | BRCA2 FAD, FAD1, BRCC2, XRCC11 675 3 | TMEM127 FLJ20507, FLJ22257 55654 4 | TRPV4 OTRPC4, TRP12, VROAC, VRL-2, VR-OAC, CMT2C 59341 5 | GLB1 EBP 2720 6 | GLB1L2 89944 7 | GLB1L3 FLJ90231 112937 8 | GLB1L MGC10771 79411 9 | PLGLB1 PRP-B 5343 10 | SH3GLB1 CGI-61, KIAA0491, Bif-1, PPP1R70 51100 11 | AIF1 IRT-1, AIF-1, Em:AF129756.17, IBA1 199 12 | ARHGEF12 KIAA0382, LARG 23365 13 | C1QTNF12 MGC105127, CTRP12, ADIPOLIN 388581 14 | CD300A Irp60, CMRF35H, CMRF-35-H9, IRC1, IRC2, IGSF12 11314 15 | CYP4F12 66002 16 | DCAF12 DKFZP434O125, MGC1058, CT102, TCC52 25853 17 | DCAF12L1 KIAA1892L 139170 18 | DCAF12L2 340578 19 | F12 2161 20 | FGF12-AS1 100873986 21 | FGF12-AS2 100873987 22 | FGF12-AS3 100873988 23 | FGF12B 24 | FGF12 FHF1 2257 25 | KIF12 113220 26 | KLF12 AP-2rep, HSPC122, AP2REP 11278 27 | LINC00588 DKFZP434F122 26138 28 | MAPK10 JNK3, p493F12, p54bSAPK 5602 29 | MKRN3-AS1 FNZ127, NCRNA00009, ZNF127-AS 10108 30 | NBPF12 COAS1 149013 31 | OR1F12 hs6M1-35P, OR1F12Q 442179 32 | OR4F12 33 | PHF12 PF1, KIAA1523 57649 34 | PRAMEF12 OTTHUMG00000001927 390999 35 | PSLNR LA16c-83F12.6 106146148 36 | RET PTC, CDHF12, RET51, CDHR16 5979 37 | RFPL3 RNF120 10738 38 | RNF121 FLJ11099 55298 39 | RNF122 FLJ12526 79845 40 | RNF123 FLJ12565, KPC1 63891 41 | RNF125 FLJ20456 54941 42 | RNF126 FLJ20552 55658 43 | RNF126P1 376412 44 | RNF128 FLJ23516, GRAIL 79589 45 | SPINT1-AS1 RP11-532F12.5 102724362 46 | SRSF12 SRrp35, SFRS19 135295 47 | TAF12 TAFII20 6883 48 | TCF12 HEB, HTF4, HsT17266, bHLHb20, p64 6938 49 | TNFRSF12A 
FN14, TweakR, CD266 51330 50 | TNFRSF12L DR3L 51 | TNFSF12-TNFSF13 TWE-PRIL 407977 52 | TNFSF12 TWEAK, DR3LG, APO3L 8742 53 | TRAF3IP1 MIP-T3, DKFZP434F124, MIPT3, IFT54, FAP116 26146 54 | UQCRHP1 Em:AF129756.18 100130756 55 | ZBTB20 ODA-8S, DKFZp566F123, DPZF 26137 56 | ZNF101 HZF12, DKFZp570I0164 94039 57 | ZNF120 58 | ZNF121 ZHC32, ZNF20 7675 59 | ZNF122 60 | ZNF123P HZF-1 100188891 61 | ZNF124 HZF16, HZF-16 7678 62 | ZNF125 HZF3, HZF-3 63 | ZNF126 HZF2, HZF-2 64 | ZNF128 65 | ZNF129 66 | ZNF12 KOX3, GIOT-3 7559 67 | ZNF92 HPF12, TF12 168374 68 | SLC34A1 NAPI-3, NPTIIa, SLC11 6569 69 | CFTR-AS1 BGas 111082987 70 | CFTR MRP7, ABC35, TNR-CFTR, dJ760C5.1, CFTR/MRP 1080 71 | CFTRP1 dJ760C5.1 140871 72 | CFTRP2 107080633 73 | CFTRP3 106481718 74 | TAS2R16 T2R16 50833 75 | APAH1 76 | CPA6 CPAH 57094 77 | PAH PH 5053 78 | PHYHIP KIAA0273, PAHX-AP 9796 79 | PHYH PAHX, RD, PHYH1 5264 80 | PTLAH FPAH 81 | SLC22A6 ROAT1, PAHT, OAT1 9356 82 | ADHFE1 FLJ32430 137872 83 | HAMP LEAP-1, HEPC, HFE2B, LEAP1 57817 84 | HFE-AS1 85 | HFE HLA-H, HFE1 3077 86 | HJV JH, HFE2A, RGMC, hemojuvelin, haemojuvelin 148738 87 | SLC40A1 MTP1, IREG1, FPN1, HFE4 30061 88 | TFR2 HFE3, TFRC2 7036 89 | TREX1 DRN3 11277 90 | CFTR-AS1 BGas 111082987 91 | CFTR MRP7, ABC35, TNR-CFTR, dJ760C5.1, CFTR/MRP 1080 92 | CFTRP1 dJ760C5.1 140871 93 | CFTRP2 107080633 94 | CFTRP3 106481718 95 | ADORA2A-AS1 FLJ34651 646023 96 | ADORA2A RDC8 135 97 | SPECC1L-ADORA2A 101730217 98 | FTCD 10841 99 | FTCD-AS1 100861507 100 | FTCDNL1 FONG 348751 101 | CYP2D6 CPD6, P450-DB1, CYP2D, P450C2D 1565 102 | MUTYH MYH 4595 103 | MSH6 2956 104 | ABHD15-AS1 linc-TP53I13, lnc-TP53I13 104355133 105 | EI24 PIG8, TP53I8, EPG4 9538 106 | ENC1 PIG10, ENC-1, TP53I10, KLHL37 8507 107 | GAMT PIG2, TP53I2 2593 108 | LGALS7 GAL7, PIG1, TP53I1, LGALS7A 3963 109 | LITAF PIG7, SIMPLE, FLJ38636, TP53I7 9516 110 | MAD1L1 HsMAD1, TXBP181, MAD1, PIG9, TP53I9 8379 111 | PRODH HSPOX2, PRODH1, PIG6, PRODH2, TP53I6 5625 112 | PTGES MGST-IV, PIG12, 
MGST1-L1, TP53I12 9536 113 | SAA1 PIG4, TP53I4 6288 114 | TOPORS TP53BPL, LUN 10210 115 | TP53AIP1 p53AIP1 63970 116 | TP53BP1 53BP1, p202, TDRD30 7158 117 | TP53BP2P1 94299 118 | TP53BP2 PPP1R13A, ASPP2, 53BP2 7159 119 | TP53COR1 linc-p21, lincRNA-p21, Trp53cor1 102800311 120 | TP53CP p53CP 121 | TP53I11 PIG11 9537 122 | TP53I13 DSCP1 90313 123 | TP53I3 PIG3 9540 124 | TP53INP1 DKFZp434M1317, FLJ22139, P53DINP1, SIP, TP53INP1A, TP53INP1B, Teap 94241 125 | TP53INP2 FLJ21759, FLJ23500, DKFZp434B2411, DKFZp434O0827, dJ1181N3.1, PINH, DOR 58476 126 | TP53L NBP 127 | TP53 p53, LFS1 7157 128 | TP53RK dJ101A2.2, prpk, Nori-2p, BUD32, TPRKB 112858 129 | TP53TG1 H_RG012D21.9, LINC00096 11257 130 | TP53TG3B 729355 131 | TP53TG3C 653550 132 | TP53TG3D 729264 133 | TP53TG3E 102724101 134 | TP53TG3F 102724127 135 | TP53TG3GP 106660619 136 | TP53TG3HP 100130700 137 | TP53TG3 P53TG3, TP53TG3A 24150 138 | TP53TG5 CLG01, dJ453C12.5 27296 139 | TMEM127 FLJ20507, FLJ22257 55654 140 | -------------------------------------------------------------------------------- /tests/functional-tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | test -e ssshtest || wget -q https://raw.githubusercontent.com/ryanlayer/ssshtest/master/ssshtest 3 | 4 | . 
ssshtest 5 | 6 | nim c -d:debug -d:useSysAssert -d:useGcAssert --lineDir:on --debuginfo --boundChecks:on -x:on src/clinvcf 7 | grch37_version="--genome GRCh37" 8 | exe=./src/clinvcf 9 | 10 | run simple_parsing $exe --hgnc tests/files/hgnc_toy.tsv --gff tests/files/BRCA2.gff $grch37_version tests/files/37785.xml 11 | assert_exit_code 0 12 | assert_in_stdout "##fileDate=2019-12-31" 13 | assert_in_stdout "13 32893387 37785 T A" 14 | assert_in_stdout "CLNSIG=Conflicting_interpretations_of_pathogenicity" 15 | assert_in_stdout "ALLELEID=46341" 16 | assert_in_stdout "GENEINFO=BRCA2:675" 17 | assert_in_stdout "CLNREVSTAT=criteria_provided,_conflicting_interpretations" 18 | assert_in_stdout "MC=SO:0001583|missense_variant" 19 | assert_in_stdout "RS=80358507" 20 | assert_in_stdout "VARIANTTYPE=single_nucleotide_variant" 21 | assert_in_stdout "VARIANTLENGTH=1" 22 | 23 | # Check integration of NCBI clinsig conversion 24 | run ncbi_clnsig_conversion $exe --hgnc tests/files/hgnc_toy.tsv $grch37_version tests/files/109.xml 25 | assert_exit_code 0 26 | assert_in_stdout "CLNSIG=Likely_pathogenic,_risk_factor" 27 | assert_in_stdout "VARIANTTYPE=single_nucleotide_variant" 28 | assert_in_stdout "VARIANTLENGTH=1" 29 | 30 | # Multiple submissions from the same submitter 31 | run mutli_subs_from_same_submitter $exe --hgnc tests/files/hgnc_toy.tsv $grch37_version tests/files/307134.xml 32 | assert_exit_code 0 33 | assert_in_stdout "CLNREVSTAT=criteria_provided,_single_submitter" 34 | assert_in_stdout "VARIANTTYPE=single_nucleotide_variant" 35 | assert_in_stdout "VARIANTLENGTH=1" 36 | 37 | # Multiple submissions from the same submitter 38 | run skip_het_compound $exe --hgnc tests/files/hgnc_toy.tsv $grch37_version tests/files/928.xml 39 | assert_exit_code 0 40 | assert_in_stdout "CLNSIG=Likely_pathogenic" 41 | 42 | # Conflicting variants should always have a conflicting ReviewStatus, 43 | # even if all submissions are from the same submitter 44 | run same_submitter_conflict $exe --hgnc
tests/files/hgnc_toy.tsv $grch37_version tests/files/1166.xml 45 | assert_exit_code 0 46 | assert_in_stdout "CLNREVSTAT=criteria_provided,_conflicting_interpretations" 47 | assert_in_stdout "VARIANTTYPE=single_nucleotide_variant" 48 | assert_in_stdout "VARIANTLENGTH=1" 49 | 50 | # Multiple 3-4 star submissions: take them all (see case 7108) 51 | run run_multiple_3_4_star_subs $exe --hgnc tests/files/hgnc_toy.tsv $grch37_version tests/files/7108.xml 52 | assert_exit_code 0 53 | assert_in_stdout "CLNSIG=Pathogenic,_drug_response" 54 | assert_in_stdout "CLNREVSTAT=reviewed_by_expert_panel" 55 | assert_in_stdout "VARIANTTYPE=single_nucleotide_variant" 56 | assert_in_stdout "VARIANTLENGTH=1" 57 | 58 | # Sort non-ACMG clnsig lexicographically 59 | run sort_non_acmg_cnlsig_tags $exe --hgnc tests/files/hgnc_toy.tsv $grch37_version tests/files/5333.xml 60 | assert_exit_code 0 61 | assert_in_stdout "CLNSIG=Affects,_risk_factor" 62 | assert_in_stdout "VARIANTTYPE=single_nucleotide_variant" 63 | assert_in_stdout "VARIANTLENGTH=1" 64 | 65 | # Sort non-ACMG clnsig lexicographically 66 | run expert_panel $exe --hgnc tests/files/hgnc_toy.tsv $grch37_version tests/files/582.xml 67 | assert_exit_code 0 68 | assert_in_stdout "CLNSIG=Pathogenic;" 69 | assert_in_stdout "VARIANTTYPE=single_nucleotide_variant" 70 | assert_in_stdout "VARIANTLENGTH=1" 71 | 72 | # Correction of conflicting interpretation 73 | run conflict_deciphering $exe --hgnc tests/files/hgnc_toy.tsv $grch37_version tests/files/9.xml 74 | assert_exit_code 0 75 | assert_in_stdout "CLNSIG=Pathogenic" 76 | assert_in_stdout "OLD_CLNSIG=Conflicting_interpretations_of_pathogenicity" 77 | assert_in_stdout "VARIANTTYPE=single_nucleotide_variant" 78 | assert_in_stdout "VARIANTLENGTH=1" 79 | 80 | # Handle multiple genes and select the preferred one from HGVS 81 | run multi_gene_selection $exe --hgnc tests/files/hgnc_toy.tsv --gff tests/files/TREX1.gff $grch37_version tests/files/225499.xml 82 | assert_exit_code 0 83 | assert_in_stdout
"GENEINFO=TREX1:11277" 84 | assert_in_stdout "VARIANTTYPE=Duplication" 85 | assert_in_stdout "VARIANTLENGTH=1" 86 | 87 | # Handle multiple genes and select the preferred one based on submissions (HGVS has no gene) 88 | run multi_gene_selection $exe --hgnc tests/files/hgnc_toy.tsv --gff tests/files/CFTR.gff $grch37_version tests/files/618897_2019-05.xml 89 | assert_exit_code 0 90 | assert_in_stdout "GENEINFO=CFTR:1080" 91 | assert_in_stdout "VARIANTTYPE=Deletion" 92 | assert_in_stdout "VARIANTLENGTH=1" 93 | 94 | # Mitochondrial annotations 95 | run mito_anno $exe --hgnc tests/files/hgnc_toy.tsv --gff tests/files/MT.gff $grch37_version tests/files/9618.xml 96 | assert_exit_code 0 97 | assert_in_stdout "GENEINFO=TRNE:4556" 98 | assert_in_stdout "VARIANTTYPE=single_nucleotide_variant" 99 | assert_in_stdout "VARIANTLENGTH=1" 100 | 101 | # Coding-first option for gene annotation, to force use of the protein-coding annotation 102 | run coding_first_control $exe --hgnc tests/files/hgnc_toy.tsv --gff tests/files/ADORA2A.gff $grch37_version tests/files/225974.xml 103 | assert_exit_code 0 104 | assert_in_stdout "GENEINFO=ADORA2A:135" 105 | assert_in_stdout "VARIANTTYPE=single_nucleotide_variant" 106 | assert_in_stdout "VARIANTLENGTH=1" 107 | run coding_first_option $exe --hgnc tests/files/hgnc_toy.tsv --gff tests/files/ADORA2A.gff --coding-first $grch37_version tests/files/225974.xml 108 | assert_exit_code 0 109 | assert_in_stdout "GENEINFO=ADORA2A:135" 110 | assert_in_stdout "VARIANTTYPE=single_nucleotide_variant" 111 | assert_in_stdout "VARIANTLENGTH=1" 112 | # Consider close exonic regions (20bp padding) as exonic in the gene prioritization module 113 | # In this case the variant is in an FTCD-AS1 (protein-coding) exon but 9bp away from FTCD.
114 | # We discriminate between these two genes by gene_id: FTCD's is smaller than FTCD-AS1's 115 | run close_exonic_region $exe --hgnc tests/files/hgnc_toy.tsv --gff tests/files/FTCD.gff $grch37_version tests/files/340430.xml 116 | assert_exit_code 0 117 | assert_in_stdout "GENEINFO=FTCD:10841" 118 | assert_in_stdout "VARIANTTYPE=single_nucleotide_variant" 119 | assert_in_stdout "VARIANTLENGTH=1" 120 | 121 | # For an antivariant (allele identical to the reference), 122 | # we use "." as the alternate allele representation 123 | run antivariant $exe --hgnc tests/files/hgnc_toy.tsv $grch37_version tests/files/242771.xml 124 | assert_exit_code 0 125 | assert_in_stdout "22 42523943 242771 A ." 126 | assert_in_stdout "VARIANTTYPE=single_nucleotide_variant" 127 | assert_in_stdout "VARIANTLENGTH=1" 128 | 129 | # Haplotypes should not be exported in the VCF 130 | run haplotype $exe --hgnc tests/files/hgnc_toy.tsv $grch37_version tests/files/16895.xml 131 | assert_equal "$(grep -v '^#' $STDOUT_FILE)" "" 132 | 133 | # 3-stars reclassification system 134 | run three_star_reclassification $exe --hgnc tests/files/hgnc_toy.tsv $grch37_version tests/files/184976.xml 135 | assert_exit_code 0 136 | assert_in_stdout "CLNSIG=Pathogenic/Likely_pathogenic" 137 | assert_in_stdout "CLNRECSTAT=3" 138 | assert_in_stdout "VARIANTTYPE=single_nucleotide_variant" 139 | assert_in_stdout "VARIANTLENGTH=1" 140 | 141 | run two_star_reclassification $exe --hgnc tests/files/hgnc_toy.tsv $grch37_version tests/files/140866.xml 142 | assert_exit_code 0 143 | assert_in_stdout "CLNSIG=Likely_pathogenic" 144 | assert_in_stdout "CLNRECSTAT=2" 145 | assert_in_stdout "VARIANTTYPE=single_nucleotide_variant" 146 | assert_in_stdout "VARIANTLENGTH=1" 147 | 148 | run one_star_reclassification $exe --hgnc tests/files/hgnc_toy.tsv $grch37_version tests/files/182965.xml 149 | assert_exit_code 0 150 | assert_in_stdout "CLNRECSTAT=1" 151 | assert_in_stdout "VARIANTTYPE=single_nucleotide_variant" 152 | assert_in_stdout
"VARIANTLENGTH=1" 153 | 154 | # Pathology parsing 155 | run pathology_field_parsing $exe --hgnc tests/files/hgnc_toy.tsv $grch37_version tests/files/109.xml 156 | assert_exit_code 0 157 | assert_in_stdout "CLNDISEASE=pheochromocytoma_susceptibility_to|pheochromocytoma" 158 | assert_in_stdout "VARIANTTYPE=single_nucleotide_variant" 159 | assert_in_stdout "VARIANTLENGTH=1" -------------------------------------------------------------------------------- /tests/gff_tests.nim: -------------------------------------------------------------------------------- 1 | import unittest 2 | import clinvcfpkg/gff 3 | 4 | suite "test GFF functions": 5 | 6 | test "test interval": 7 | var 8 | a = Region(chrom: "A", start: 1, stop: 8) 9 | b = Region(chrom: "A", start: 5, stop: 12) 10 | c = Region(chrom: "A", start: 1, stop: 4) 11 | d = Region(chrom: "A", start: 1, stop: 4) 12 | 13 | check a.isOverlapping(b) == true 14 | check a.isOverlapping(c) == true 15 | check b.isOverlapping(c) == false 16 | 17 | check (d == c) == true 18 | a.merge(b) 19 | check a.start == 1 20 | check a.stop == 12 21 | check b.start == 5 22 | check b.stop == 12 23 | -------------------------------------------------------------------------------- /tests/hgnc_tests.nim: -------------------------------------------------------------------------------- 1 | import unittest 2 | import tables 3 | import clinvcfpkg/hgnc 4 | 5 | suite "test HGNC functions": 6 | 7 | test "test initHgncDbfromFile": 8 | var f = "tests/files/hgnc_toy.tsv" 9 | var hgncIndex = initHgncDbfromFile(f) 10 | # assert that an alias resolves to its gene symbol (LFS1 -> TP53) 11 | check hgncIndex.alias["LFS1"] == "TP53" 12 | -------------------------------------------------------------------------------- /tests/nim.cfg: -------------------------------------------------------------------------------- 1 | path = "../src" --------------------------------------------------------------------------------