├── .gitignore
├── .gitlab-ci.yml
├── Dockerfile
├── LICENSE
├── README.md
├── TODO.md
├── clinvcf.nimble
├── clinvcf.png
├── src
    ├── clinvcf.nim
    ├── clinvcfpkg
    │   ├── gff.nim
    │   ├── hgnc.nim
    │   ├── lapper.nim
    │   └── utils.nim
    ├── compvcf.nim
    └── extractClinvarSet.nim
└── tests
    ├── all.nim
    ├── clinvcf_tests.nim
    ├── files
        ├── 109.xml
        ├── 1166.xml
        ├── 140866.xml
        ├── 16895.xml
        ├── 182965.xml
        ├── 184976.xml
        ├── 225499.xml
        ├── 225974.xml
        ├── 242771.xml
        ├── 307134.xml
        ├── 340430.xml
        ├── 37785.xml
        ├── 5333.xml
        ├── 582.xml
        ├── 618897_2019-05.xml
        ├── 7108.xml
        ├── 9.xml
        ├── 928.xml
        ├── 9618.xml
        ├── ADORA2A.gff
        ├── BRCA2.gff
        ├── CFTR.gff
        ├── FTCD.gff
        ├── MT.gff
        ├── TREX1.gff
        └── hgnc_toy.tsv
    ├── functional-tests.sh
    ├── gff_tests.nim
    ├── hgnc_tests.nim
    └── nim.cfg


/.gitignore:
--------------------------------------------------------------------------------
1 | src/clinvcf
2 | ssshtest
3 | old/
4 | clinvcf
5 | tests/all
6 | extractClinvarSet
7 | src/clinvcf.dSYM/
8 | tests/all.dSYM/
9 | 


--------------------------------------------------------------------------------
/.gitlab-ci.yml:
--------------------------------------------------------------------------------
 1 | variables:
 2 |   IMAGE_NAME: $CI_REGISTRY_IMAGE  # image name = this project's container registry path
 3 | # Shared templates from another project; presumably they define the hidden jobs extended below - TODO confirm
 4 | include:
 5 |   - project: "devops/gitlab-ci-template"
 6 |     file: "/release.gitlab-ci.yml"
 7 |   - project: "devops/gitlab-ci-template"
 8 |     file: "/ssh-key.gitlab-ci.yml"
 9 |   - project: "devops/gitlab-ci-template"
10 |     file: "/docker.gitlab-ci.yml"
11 | # Stages run in the order listed; every job below names one of them
12 | stages:
13 |   - test
14 |   - release
15 |   - artefact_build
16 |   - delivery_dev
17 |   - delivery_staging
18 |   - delivery_prod
19 | # Release job: nothing is defined here, everything comes from the extended `.release` job
20 | release:
21 |   extends: .release
22 | 
23 | test:  # builds the package and runs its test task
24 |   image: registry.gitlab.seq.one/devops/dockerfiles/nim-builder:1.6.14
25 |   stage: test
26 |   script:
27 |     - apt-get update && apt-get install -y git  # presumably needed so nimble can fetch git-hosted dependencies - TODO confirm
28 |     - nimble build -y
29 |     - nimble test  # runs the `test` task declared in clinvcf.nimble
30 |   tags:
31 |     - bioinfo  # only runners carrying this tag pick up the job
32 | 
33 | artefact_build:  # builds the clinvcf binary and publishes it as a job artifact
34 |   image: registry.gitlab.seq.one/devops/dockerfiles/nim-builder:1.6.14
35 |   stage: artefact_build
36 |   script:
37 |     - apt-get update && apt-get install -y git
38 |     - nimble install -y
39 |   artifacts:
40 |     paths:
41 |       - clinvcf  # the Dockerfile copies a file of this name from the build context
42 |   tags:
43 |     - bioinfo
44 |   only:
45 |     - tags  # job is created for tag pipelines only
46 | 
47 | ####################
48 | # DEPLOY
49 | ####################
50 | delivery_dev:
51 |   extends: .build and delivery  # hidden job not defined in this file; presumably from the included templates
52 |   stage: delivery_dev
53 |   environment:
54 |     name: "$ENV_TAG"  # resolves to the ENV_TAG variable set below
55 |   variables:
56 |     ENV_TAG: dev
57 |     DOCKER_BUILD_ARGS: >-  # explicit folded scalar; yields the same single-line string as before
58 |       --build-arg VERSION=$CI_COMMIT_REF_NAME --build-arg PRIVATE_CI_ACCESS_TOKEN=$PRIVATE_CI_ACCESS_TOKEN
59 |   when: on_success  # NOTE(review): the token above is passed as a plain build arg - confirm this is intended
60 |   tags:
61 |     - bioinfo
62 | 
63 | ####################
64 | # DEPLOY staging - job body comes from the hidden `.tag and delivery` job (not defined in this file)
65 | ####################
66 | delivery_staging:
67 |   extends: .tag and delivery
68 |   stage: delivery_staging
69 |   environment:
70 |     name: "$ENV_TAG"  # resolves to the ENV_TAG variable set below
71 |   variables:
72 |     ENV_TAG: staging
73 | 
74 | ####################
75 | # DEPLOY prod - job body comes from the hidden `.tag and delivery` job (not defined in this file)
76 | ####################
77 | delivery_prod:
78 |   extends: .tag and delivery
79 |   stage: delivery_prod
80 |   environment:
81 |     name: "$ENV_TAG"  # resolves to the ENV_TAG variable set below
82 |   variables:
83 |     ENV_TAG: prod
84 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM debian:stable-slim AS builder
 2 | ARG HTSLIB_VERSION=1.10
 3 | # Toolchain and headers used to compile htslib; apt lists are removed in the same layer to keep it small
 4 | RUN apt-get update && apt-get install --no-install-recommends -y \
 5 |     libpcre3 libpcre3-dev \
 6 |     make \
 7 |     wget \
 8 |     libbz2-dev \
 9 |     bzip2 \
10 |     ca-certificates \
11 |     liblzma-dev \
12 |     zlib1g-dev libcurl4-gnutls-dev gcc && rm -rf /var/lib/apt/lists/*
13 | 
14 | # Download and build htslib in /usr/bin/htslib-$HTSLIB_VERSION; the tarball is deleted once extracted
15 | RUN cd /usr/bin && \
16 |     wget https://github.com/samtools/htslib/releases/download/$HTSLIB_VERSION/htslib-$HTSLIB_VERSION.tar.bz2 && \
17 |     tar -vxjf htslib-$HTSLIB_VERSION.tar.bz2 && rm -f htslib-$HTSLIB_VERSION.tar.bz2 && \
18 |     cd htslib-$HTSLIB_VERSION && \
19 |     make
20 | 
21 | # FROM debian:stable-slim
22 | # ARG HTSLIB_VERSION=1.10
23 | 
24 | # COPY --from=builder /usr/bin/htslib-$HTSLIB_VERSION /usr/bin/htslib-$HTSLIB_VERSION
25 | 
26 | ENV LD_LIBRARY_PATH=/usr/bin/htslib-$HTSLIB_VERSION
27 | 
28 | # Build args supplied by CI through --build-arg; no instruction in this file references them
29 | ARG PRIVATE_CI_ACCESS_TOKEN
30 | ARG VERSION
31 | # Expects a prebuilt `clinvcf` binary at the root of the build context
32 | COPY clinvcf /usr/bin/
33 | 
34 | ENTRYPOINT ["clinvcf"]
35 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright 2020 SeqOne
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # ClinVCF
  2 | 
  3 | ![ClinVCF-logo](clinvcf.png)
  4 | 
  5 | ClinVCF **generates a VCF file from a ClinVar Full Release** (XML format). It was first developed because we observed missing variants in VCF files provided by NCBI. We later extended its capabilities to provide enhanced ClinVar VCF files by:
  6 | 
  7 | - **Improving Clinvar classification and aggregation method** by [deciphering "conflicting interpretation" records](#clinicalsignificance-correction-module) where almost all submissions go in the same direction.
  8 | - **Implementing a more robust [gene annotation module](#gene-annotation)** based on NCBI GFF files.
  9 | 
 10 | ClinVCF is **developed in NimLang and is highly efficient** (~ 5 minutes to generate the VCF from the XML) and supports GRCh37 and GRCh38 genome builds.
 11 | 
 12 | **clinVCF** is a part of the [**Genome Alert!** framework](https://github.com/SeqOne/GenomeAlert_app) - [Website https://genomealert.univ-grenoble-alpes.fr/](https://genomealert.univ-grenoble-alpes.fr/).
 13 | 
 14 | ## Table of content
 15 | 
 16 | - [ClinVCF](#clinvcf)
 17 |   - [Table of content](#table-of-content)
 18 |   - [Quick start](#quick-start)
 19 |   - [Usage](#usage)
 20 |     - [Output format](#output-format)
 21 |   - [Methodology](#methodology)
 22 |     - [ClinicalSignificance correction module](#clinicalsignificance-correction-module)
 23 |     - [Gene annotation](#gene-annotation)
 24 |   - [How to cite](#how-to-cite)
 25 |   - [License](#license)
 26 |   - [Misc](#misc)
 27 | 
 28 | ## Quick start
 29 | 
 30 | You need to have [nimlang installed](https://nim-lang.org/install_unix.html) and [hts-nim](https://github.com/brentp/hts-nim) to compile and install clinVCF.
 31 | 
 32 | Brent Pedersen provides a clean install script for nim and hts-nim: [install.sh](https://github.com/brentp/hts-nim/blob/master/scripts/install.sh).
 33 | 
 34 | ```bash
 35 | # Git clone and install
 36 | git clone https://github.com/SeqOne/clinvcf.git && cd clinvcf && nimble install
 37 | 
 38 | # Download (latest) Clinvar XML release
 39 | wget ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_00-latest.xml.gz
 40 | 
 41 | # Download GFF for gene annotation (GRCh37 or 38)
 42 | wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/GRCh37_latest/refseq_identifiers/GRCh37_latest_genomic.gff.gz
 43 | wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_genomic.gff.gz
 44 | 
 45 | # Generate clinvar VCF
 46 | ## For GRCh37
 47 | clinvcf --coding-first --genome GRCh37 ClinVarFullRelease_00-latest.xml.gz | bgzip -c > clinvar_GRCh37.vcf.gz
 48 | ## For GRCh38
 49 | clinvcf --coding-first  --genome GRCh38 ClinVarFullRelease_00-latest.xml.gz | bgzip -c > clinvar_GRCh38.vcf.gz
 50 | 
 51 | ```
 52 | 
 53 | ## Usage
 54 | 
 55 | ```bash
 56 | Usage: clinvcf [options] --genome <version> <clinvar.xml.gz>
 57 | 
 58 | Arguments:
 59 |   --genome <version>              Genome assembly to use
 60 |   
 61 | Options:
 62 |   --filename-date                 Use xml filename date instead of inner date which may differ
 63 |   --hgnc <table>                  HGNC table used for gene name alias corrections
 64 | 
 65 | Gene annotation:
 66 |   --gff <file>                    NCBI GFF to annotate variations with genes
 67 |   --coding-first                  Give priority to coding gene in annotation (even if intronic and exonic for another gene)
 68 |   --gene-padding <int>            Padding to annotation upstream/downstream genes (not applied for MT) [default: 5000]
 69 | ```
 70 | 
 71 | ### Output format
 72 | 
 73 | ClinVCF generates a VCF with almost identical format as the original NCBI VCF.
 74 | 
 75 | However, not all VCF fields are currently supported by ClinVCF (see table below), and
 76 | additional fields are provided.
 77 | 
 78 | | VCF Info field | Status* | Format    | Description                                                                                                                                                        | Example                                        |
 79 | | -------------- | ------- | --------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ---------------------------------------------- |
 80 | | **ALLELEID**   | Same    | *Integer* | the ClinVar Allele ID                                                                                                                                              | `1234`                                         |
 81 | | **CLNREVSTAT** | Same    | *String*  | [ClinVar review status](https://www.ncbi.nlm.nih.gov/clinvar/docs/review_status/) for the Variation ID                                                             | `no_assertion_criteria_provided`               |
 82 | | **CLNSIG**     | Same    | String    | [Clinical significance](https://www.ncbi.nlm.nih.gov/clinvar/docs/clinsig/) for this single variant                                                                | `Pathogenic/Likely_Pathogenic`                 |
 83 | | **OLD_CLNSIG** | New     | String    | Original Clinical significance if variant reclassified by clinVCF correction module                                                                                | `Conflicting_interpretations_of_pathogenicity` |
 84 | | **CLNRECSTAT** | New     | Integer   | [3-levels stars confidence](#clinicalsignificance-correction-module) of Variant Alert! automatic reclassification.                                                 | `3`                                            |
 85 | | **GENEINFO**   | Same    | String    | Gene(s) for the variant reported as gene symbol:gene id. The gene symbol and id are delimited by a colon (`:`) and each pair is delimited by a vertical bar (`\|`) | `FTCD:10841\|FTCD-AS1:100861507`               |
 86 | | **MC**         | Same    | String    | comma separated list of molecular consequence in the form of Sequence Ontology `ID\|molecular_consequence`                                                         | `SO:0001583\|missense_variant`                 |
 87 | | **RS**         | Same    | String    | dbSNP ID (i.e. rs number)                                                                                                                                          | `80358507`                                     |
 88 | 
 89 | **Status**: *Same* (identical as in original Clinvar VCF), *new* (New field from clinVCF)
 90 | 
 91 | ## Methodology
 92 | 
 93 | ### ClinicalSignificance correction module
 94 | 
 95 | According to the 1.5 * IQR method, we remove outlier submissions and reclassify conflicting status variants according to ClinVar policies. We apply a 3-level star metric reflecting our reclassification confidence. At least 4 submissions are needed. We only reclassify variants from `conflicting` status to `benign`, `likely benign`, `likely pathogenic` and `pathogenic` status.
 96 | 
 97 | - ⭐ **(1 star)** : default
 98 | - ⭐⭐ **(2 stars)** : reclassification remains even if we add a virtual VUS submission
 99 | - ⭐⭐⭐ **(3 stars)** : 2 stars requirements and at least 1 pathogenic (or benign) classification
100 | 
101 | ### Gene annotation
102 | 
103 | 1. **We load all genes from the input GFF** and add them to the index with a padding (5000bp by default and 2bp for MT genes), to annotate upstream / downstream variants.
104 | 2. **For each variant we query the gene index** and retrieve all overlapping genes.
105 | 3. **Overlapped genes are later prioritized** in the `GENEINFO` field with two different procedures (depending on clinVCF parameters)
106 |      - If `--coding-first` option is activated :
107 |        - We take coding genes over all other genes (except for MT genome)
108 |        - If we have an equality we take exonic (+/-20bp padding) over intronic/intergenic candidates
109 |        - If none are exonic, we take the gene with closest exon
110 |        - If both are exonic, we take the oldest gene ID in NCBI Entrez database
111 |      - Default procedure :
112 |        - We take coding gene over all other genes (except for MT genome) if the variant is exonic (+/- 20bp)
113 |        - If we have an equality we take exonic (+/-20bp padding) over intronic/intergenic candidates
114 |        - If none are exonic, we take the gene with closest exon
115 |        - If both are exonic, we take the oldest gene ID in NCBI Entrez database
116 | 
117 | ## How to cite
118 | 
119 | If you use a tool of the Genome Alert! framework, please cite:
120 | > Yauy et al., Genome Alert!: a standardized procedure for genomic variant reinterpretation and automated genotype-phenotype reassessment in clinical routine. medRxiv (2021).
121 | > [https://doi.org/10.1101/2021.07.13.21260422](https://www.medrxiv.org/content/10.1101/2021.07.13.21260422v1)
122 | 
123 | ## License
124 | 
125 | **clinVCF** is licensed under the Apache License, Version 2.0. See [LICENSE](LICENSE) for the full license text.
126 | 
127 | ## Misc
128 | 
129 | **clinVCF** is a part of the [**Genome Alert!** framework](https://github.com/SeqOne/GenomeAlert_app), a collaboration of :
130 | 
131 | [![SeqOne](img/logo-seqone.png)](https://seq.one/)
132 | 
133 | [![Université Grenoble Alpes](img/logo-uga.png)](https://iab.univ-grenoble-alpes.fr/)
134 | 
135 | [![CHU de Rouen](img/logo-CHU.png)](https://www.chu-rouen.fr/service/service-de-genetique/)


--------------------------------------------------------------------------------
/TODO.md:
--------------------------------------------------------------------------------
 1 | [x] Packaging with Nimble
 2 | [x] Sort VCF before
 3 | [x] Comp Clinvar VCF and ours
 4 | [x] Functional testing
 5 | [x] Gitlab CI
 6 | [x] Implement new aggregation function
 7 | [x] Add RS tag (dbSNP rsid)
 8 | [x] Add date header
 9 | [x] Handle GenotypeSet (These variants should not be included in the VCF (see case 424779))
10 | [x] Handle automatic clinsig conversions from NCBI
11 | [x] Multiple submitter is not multiple submissions !!! (see case 307134)
12 | [x] Multiple 3-4 stars subs, take all !!! (see case 7108)
13 | [x] Sort non-ACMG clnsig lexicographically (see case 5333 : drug_response,_risk_factor,_protective =>  drug_response,_protective,_risk_factor) 
14 | [x] Bug "criteria_provided,_single_submitter" that should be "criteria_provided,_conflicting_interpretations" when only one submitter with conflict (see case 1166)
15 | [x] Bug Conflicting only when 1 star or more !
16 | [x] Memory leak somewhere in xml parsing (see huge memory footprint for extractClinvarSet !!!!)
17 | [x] Output stats of reclassification
18 | [ ] Get rid of q() xml lib and do it directly with xmltree package (see extractClinvarSet code)
19 | [ ] Gene stats module
20 | [ ] Unit testing
21 | [ ] Add gnomad annot (using API calls and cache)
22 | [ ] Add progress bar when loading variants
23 | [ ] Create README file
24 | [ ] Add NB_STARS tag 
25 | [ ] Add a tag with number of submitters / submissions
26 | [ ] Optimize memory usage (variant infos could be stored in cache files and re-loaded at "print" time !)
27 | 
28 | <ClinicalSignificance>
29 |       <ReviewStatus>no assertion criteria provided</ReviewStatus>
30 |       <Description>likely pathogenic - adrenal pheochromocytoma</Description>
31 |       <Comment Type="ConvertedByNCBI">Converted during submission to Likely pathogenic.</Comment>
32 |     </ClinicalSignificance>
33 | 
34 | Correct discrepancies :
35 | 
36 | CLINSIG ERRORS 
37 | 
38 | A submission has a non-harmonized clinsig value (ex: likely pathogenic - adrenal pheochromocytoma) that
39 | is said to be converted to Likely pathogenic, but information is not really there in XML (or is it ?)
40 | EX: DIFF OF CLNSIG for variant 109 : clinvar_2020-01.vcf = risk_factor <-> clinvar_20191223.vcf.gz = Likely_pathogenic,_risk_factor
41 | EX: DIFF OF CLNSIG for variant 1365 : clinvar_2020-01.vcf = Pathogenic <-> clinvar_20191223.vcf.gz = Pathogenic/Likely_pathogenic
42 | EX: DIFF OF CLNSIG for variant 1762 : clinvar_2020-01.vcf = not_provided <-> clinvar_20191223.vcf.gz = Benign
43 | 
44 | https://www.ncbi.nlm.nih.gov/clinvar/variation/1762/
45 | 
46 | It looks like we do not filter-out the 0-star "Pathogenic" submission from this one.
47 | DIFF OF CLNSIG for variant 928 : clinvar_2020-01.vcf = Pathogenic/Likely_pathogenic <-> clinvar_20191223.vcf.gz = Likely_pathogenic
48 | EX: DIFF OF CLNSIG for variant 1274 : clinvar_2020-01.vcf = Pathogenic/Likely_pathogenic <-> clinvar_20191223.vcf.gz = Pathogenic
49 | 
50 | REVSTAT ERRORS
51 | 
52 | MISSING VARIANTS
53 | 


--------------------------------------------------------------------------------
/clinvcf.nimble:
--------------------------------------------------------------------------------
 1 | # Package
 2 | 
 3 | version       = "0.0.1"
 4 | author        = "Jérôme Audoux, Sacha Beaumeunier"
 5 | description   = "Generate a clean Clinvar VCF"
 6 | license       = "Apache-2.0" # matches the LICENSE file and the README (Apache License, Version 2.0)
 7 | 
 8 | 
 9 | # Dependencies
10 | # "lapper" stays commented out below: a lapper.nim module is vendored under src/clinvcfpkg
11 | requires "hts >= 0.2.20 & <= 0.2.23", "q", "docopt"#, "lapper"
12 | requires "https://github.com/GULPF/tiny_sqlite#head"
13 | requires "regex >= 0.13"
14 | srcDir = "src"
15 | installExt = @["nim"]
16 | # Executables built from the sources in srcDir
17 | bin = @["clinvcf", "extractClinvarSet", "compvcf"]
18 | # The tests directory is left out of the installed package
19 | skipDirs = @["tests"]
20 | 
21 | import ospaths,strutils
22 | 
23 | task test, "run the tests": # invoked by `nimble test`
24 |   exec "nim c -d:useSysAssert -d:useGcAssert --lineDir:on --debuginfo -r --threads:on tests/all" # compile and run tests/all with each flag passed once
25 |   exec "bash tests/functional-tests.sh" # then run the shell functional tests
26 | 


--------------------------------------------------------------------------------
/clinvcf.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SeqOne/clinvcf/59a63ac0bde68d95c93106f55aab6c2c0c0735c6/clinvcf.png


--------------------------------------------------------------------------------
/src/clinvcfpkg/gff.nim:
--------------------------------------------------------------------------------
  1 | import tables, hts, strutils, re, algorithm
  2 | 
  3 | import ./lapper
  4 | import logging
  5 | import strformat
  6 | 
  7 | from ./utils import logger
  8 | 
  9 | type
 10 |   Region* = ref object of RootObj # interval on one chromosome
 11 |     chrom*: string # chromosome name
 12 |     start*: int # first position of the interval
 13 |     stop*: int # last position; `len` in this module counts it, so the interval is closed
 14 |   # Gene record; its own fields carry no `*`, so they are private to this module
 15 |   GFFGene* = ref object of Region
 16 |     gene_symbol: string
 17 |     gene_id: int # presumably the NCBI gene id reported in GENEINFO - TODO confirm
 18 |     biotype: string # presumably the gene biotype read from the GFF - TODO confirm
 19 |     exons: seq[Region]
 20 |   # Pairs a gene with a query region; not exported. NOTE(review): its usage is outside this excerpt
 21 |   RequestGene = ref object
 22 |     gene: GFFGene
 23 |     query: Region
 24 | 
 25 | proc start*(region : Region): int {.inline.} = return region.start # proc form of the field; lapper's Interval concept calls start(i)
 26 | proc stop*(region : Region): int {.inline.} = return region.stop # proc form of the field; lapper's Interval concept calls stop(i)
 27 | proc len*(region : Region): int {.inline.} = return region.stop - region.start + 1 # both ends are counted (closed interval)
 28 | proc `==`*(a, b: Region): bool {.inline.} = return a.chrom == b.chrom and a.start == b.start and a.stop == b.stop # field-wise equality instead of ref identity
 29 | proc `$`*(a: Region): string {.inline.} = return a.chrom & ":" & $a.start & "-" & $a.stop # renders as "chrom:start-stop"
 30 | proc isOverlapping*(a: Region, b: Region): bool {.inline.} = return a.stop >= b.start and a.start <= b.stop # closed-interval test; chrom is not compared
 31 | proc merge*(a: var Region, b: Region) {.inline.} = a.start = min(a.start, b.start); a.stop = max(a.stop, b.stop) # grow a in place to the span covering both a and b
 32 | 
 33 | # Conversion table from NCBI chromosome IDs to usual names
 34 | let
 35 |   # FIXME: Should we remove the version from NC_ ids ?
 36 |   # ClinVar use MT and not M
 37 |   ncbi_to_chr = {
 38 |     "NC_000001.10": "1",
 39 |     "NC_000002.11": "2",
 40 |     "NC_000003.11": "3",
 41 |     "NC_000004.11": "4",
 42 |     "NC_000005.9": "5",
 43 |     "NC_000006.11": "6",
 44 |     "NC_000007.13": "7",
 45 |     "NC_000008.10": "8",
 46 |     "NC_000009.11": "9",
 47 |     "NC_000010.10": "10",
 48 |     "NC_000011.9": "11",
 49 |     "NC_000012.11": "12",
 50 |     "NC_000013.10": "13",
 51 |     "NC_000014.8": "14",
 52 |     "NC_000015.9": "15",
 53 |     "NC_000016.9": "16",
 54 |     "NC_000017.10": "17",
 55 |     "NC_000018.9": "18",
 56 |     "NC_000019.9": "19",
 57 |     "NC_000020.10": "20",
 58 |     "NC_000021.8": "21",
 59 |     "NC_000022.10": "22",
 60 |     "NC_000023.10": "X",
 61 |     "NC_000024.9": "Y",
 62 |     "NC_012920.1": "MT",
 63 |     "NC_000001.11": "1",
 64 |     "NC_000002.12": "2",
 65 |     "NC_000003.12": "3",
 66 |     "NC_000004.12": "4",
 67 |     "NC_000005.10": "5",
 68 |     "NC_000006.12": "6",
 69 |     "NC_000007.14": "7",
 70 |     "NC_000008.11": "8",
 71 |     "NC_000009.12": "9",
 72 |     "NC_000010.11": "10",
 73 |     "NC_000011.10": "11",
 74 |     "NC_000012.12": "12",
 75 |     "NC_000013.11": "13",
 76 |     "NC_000014.9": "14",
 77 |     "NC_000015.10": "15",
 78 |     "NC_000016.10": "16",
 79 |     "NC_000017.11": "17",
 80 |     "NC_000018.10": "18",
 81 |     "NC_000019.10": "19",
 82 |     "NC_000020.11": "20",
 83 |     "NC_000021.9": "21",
 84 |     "NC_000022.11": "22",
 85 |     "NC_000023.11": "X",
 86 |     "NC_000024.10": "Y",
 87 |     }.toTable
 88 | 
 89 | proc minExonDist*(gene: GFFGene, pos: int, padding : int): int = # distance from pos to the nearest padded exon of gene
 90 |   var min_dist = -1 # -1 is what gets returned when the gene has no exons
 91 |   for exon in gene.exons:
 92 |     # Add padding to the exons, in order to consider close intronic regions as "exonic"
 93 |     # as these are likely to be linked to this gene
 94 |     let
 95 |       start = exon.start - padding
 96 |       stop = exon.stop + padding
 97 |     if pos >= start and pos <= stop: # pos lies inside the padded exon: nothing can be closer
 98 |       min_dist = 0
 99 |       break
100 |     else:
101 |       let dist = min(abs(start - pos), abs(pos - stop)) # distance to the closer padded boundary
102 |       if min_dist == -1 or dist < min_dist:
103 |         min_dist = dist
104 |   result = min_dist
105 | 
106 | proc minExonDist*(gene: GFFGene, start: int, stop: int, padding : int): int = # smaller of the two endpoint distances to the padded exons
107 |   result = min(gene.minExonDist(start, padding),gene.minExonDist(stop, padding)) # NOTE(review): only the endpoints are tested, so a range that fully spans an exon still gets a non-zero distance - confirm this is intended
108 | 
109 | proc removeChrPrevix*(chrom: string): string = # strip a leading "chr" from a chromosome name, if present
110 |   if chrom =~ re"""^chr(.*)""":
111 |     return matches[0] # first capture group, i.e. everything after "chr"
112 |   else:
113 |     return chrom
114 | 
115 | proc parseChr*(chrom: string): string {.inline.} = # translate a GFF sequence name into the short chromosome name (e.g. "1", "X", "MT")
116 |   if ncbi_to_chr.hasKey(chrom): # versioned NCBI accession listed in ncbi_to_chr
117 |     result = ncbi_to_chr[chrom]
118 |   else: # anything else: only drop a "chr" prefix if there is one
119 |     result = removeChrPrevix(chrom)
120 | 
121 | proc parseKeyValues*(str: string, global_sep: char, key_value_sep: char): TableRef[string, string] = # parse text such as "k1=v1;k2=v2" into a key -> value table
122 |   let fields = str.split(global_sep)
123 |   result = newTable[string, string]()
124 |   for f in fields:
125 |     let
126 |       kv_split = f.split(key_value_sep, 1) # split on the first separator only, so values may themselves contain it
127 |     if kv_split.len() == 2:
128 |       result[kv_split[0]] = kv_split[1] # a repeated key keeps its last value
129 |     else:
130 |       stderr.writeLine("[Error] Value fields " & f & " was not a key/value field using separator " & key_value_sep) # malformed field: reported on stderr and skipped
131 | 
132 | proc loadGenesFromGFF*(gff_file: string, gene_padding : int): (TableRef[string, int], TableRef[string, Lapper[GFFGene]]) = # returns (gene symbol -> gene id, chromosome -> Lapper index of the padded genes)
133 |   result = (newTable[string, int](), newTable[string, Lapper[GFFGene]]())
134 |   var
135 |     fh: BGZ
136 |     genes_chr_table = newTable[string, seq[GFFGene]]() # Temp table to load genes per-chromosomes
137 |     genes_name_table = newTable[string, GFFGene]()
138 | 
139 |   open(fh, gff_file, "r")
140 |   for line in fh:
141 |     # Skip headers
142 |     if line.len() == 0 or line[0] == '#':
143 |       continue
144 |     var v = line.split('\t', 3) # v[0]=seqid, v[1]=source, v[2]=feature type, v[3]=remaining columns (still tab-joined)
145 | 
146 |     # Only use "BestRefSeq" annotations
147 |     # This was disable as MT annotations are annotated "RefSeq" and not "BestRefSeq"
148 |     # if v[1] != "BestRefSeq":
149 |     #   continue
150 | 
151 |     # NC_000001.10    BestRefSeq      gene    367659  368597  .       +       .       ID=gene-OR4F29;Dbxref=GeneID:729759,HGNC:HGNC:31275;Name=OR4F29;description=olfactory receptor family 4 subfamily F member 29;gbkey=Gene;gene=OR4F29;gene_biotype=protein_coding;gene_synonym=OR7-21
152 |     if v[2] == "gene" or v[2] == "pseudogene":
153 |       var
154 |         v2 = v[3].split('\t') # v2[0]=start, v2[1]=stop, v2[5]=attributes
155 |         chrom = parseChr(v[0])
156 |         start = parseInt(v2[0])
157 |         stop = parseInt(v2[1])
158 |         gff_fields = v2[5].parseKeyValues(';','=')
159 |         dbxref_fields = (if gff_fields.hasKey("Dbxref"): gff_fields["Dbxref"].parseKeyValues(',',':') else: newTable[string, string]()) # a gene without Dbxref keeps gene_id 0 instead of raising KeyError
160 |         gene : GFFGene
161 | 
162 |       if chrom != "MT":
163 |         # Add padding of gene to annotate upstream / downstream genes
164 |         gene = GFFGene(chrom: chrom, start: start - gene_padding, stop: stop + gene_padding, exons: @[])
165 |       else:
166 |         # For MT, we only do +/-2bp padding
167 |         gene = GFFGene(chrom: chrom, start: start - 2, stop: stop + 2, exons: @[])
168 | 
169 |       gene.gene_symbol = gff_fields["Name"]
170 |       if dbxref_fields.hasKey("GeneID"):
171 |         gene.gene_id = parseInt(dbxref_fields["GeneID"])
172 | 
173 |       result[0][gene.gene_symbol] = gene.gene_id
174 | 
175 |       if gff_fields.hasKey("gene_biotype"):
176 |         gene.biotype = gff_fields["gene_biotype"]
177 | 
178 |       if genes_chr_table.hasKey(gene.chrom):
179 |         genes_chr_table[gene.chrom].add(gene)
180 |       else:
181 |         genes_chr_table[gene.chrom] = @[gene]
182 | 
183 |       genes_name_table[gene.gene_symbol] = gene
184 | 
185 |     # NC_000001.10    Curated Genomic exon    131068  132927  .       +       .       ID=id-CICP27;Parent=gene-CICP27;Dbxref=GeneID:100420257,HGNC:HGNC:48835;gbkey=exon;gene=CICP27
186 |     elif v[2] == "exon":
187 |       var
188 |         v2 = v[3].split('\t')
189 |         gff_fields = v2[5].parseKeyValues(';','=')
190 |         # Dbxref is deliberately not parsed here: exons only need the "gene" attribute
191 |         gene_symbol : string
192 | 
193 |       if gff_fields.hasKey("gene"):
194 |         gene_symbol = gff_fields["gene"]
195 | 
196 |       # This exon belongs to a gene we are annotating, we catch it
197 |       if gene_symbol != "" and genes_name_table.hasKey(gene_symbol):
198 |         let
199 |           exon = Region(chrom: parseChr(v[0]),start: parseInt(v2[0]), stop: parseInt(v2[1]))
200 | 
201 |         # Only add uniq exons and merge overlapping ones
202 |         var i = 0
203 |         for e in genes_name_table[gene_symbol].exons.mitems():
204 |           if e == exon:
205 |             break
206 |           elif e.isOverlapping(exon):
207 |             e.merge(exon)
208 |             break
209 |           inc(i)
210 | 
211 |         # The exon has not been found / merged (the loop ran to the end), we add it
212 |         if i == genes_name_table[gene_symbol].exons.len():
213 |           genes_name_table[gene_symbol].exons.add(exon)
214 | 
215 |   # Load set of genes (per chromosome) to lapper index
216 |   logger.log(lvlInfo, fmt"Create lapper index for file {gff_file}")
217 |   for chrom in genes_chr_table.keys():
218 |     result[1][chrom] = lapify(genes_chr_table[chrom])
219 | 
220 | proc cmpGenes*(x, y: RequestGene): int =
221 |   ## Sort comparator (negative: x ranks first): protein coding wins near an exon or when both hits are intronic, then closest exon, then lowest gene_id
222 |   let
223 |     x_exon_dist = x.gene.minExonDist(x.query.start, x.query.stop, 20)
224 |     y_exon_dist = y.gene.minExonDist(y.query.start, y.query.stop, 20)
225 | 
226 |   # echo "X: " & x.gene.gene_symbol & " DIST: " & $x_exon_dist & " BIOTYPE: " & x.gene.biotype
227 |   # echo "Y: " & y.gene.gene_symbol & " DIST: " & $y_exon_dist & " BIOTYPE: " & y.gene.biotype
228 | 
229 |   # First we give priority to protein_coding genes if variants is at 20bp of an exon boundary or both are intronic
230 |   # This does not apply for MT
231 |   if x.gene.chrom != "MT" and x.gene.biotype == "protein_coding" and y.gene.biotype != "protein_coding" and (x_exon_dist <= 20 or (x_exon_dist > 0 and y_exon_dist > 0)):
232 |     return -1
233 |   elif x.gene.chrom != "MT" and x.gene.biotype != "protein_coding" and y.gene.biotype == "protein_coding" and (y_exon_dist <= 20 or (x_exon_dist > 0 and y_exon_dist > 0)):
234 |     return 1
235 |   else:
236 |     # Otherwise we give priority to the genes having the closest exon
237 |     if x_exon_dist != -1 and y_exon_dist != -1:
238 |       # Both are coding or none of them is, we take the one with the closest exon
239 |       let exon_dist_cmp = cmp(x_exon_dist, y_exon_dist)
240 |       if exon_dist_cmp != 0:
241 |         return exon_dist_cmp
242 |     elif x_exon_dist >= 0: # only x has exons
243 |       return -1
244 |     elif y_exon_dist >= 0: # only y has exons
245 |       return 1
246 | 
247 |   # Finally (same distance, or neither gene has exons) we chose the oldest gene_id, which keeps the ordering symmetric
248 |   return cmp(x.gene.gene_id, y.gene.gene_id)
249 | 
250 | proc cmpGenesCodingFirst*(x, y: RequestGene): int =
251 |   ## Sort comparator (negative: x ranks first): protein coding always wins over non-coding, then closest exon, then lowest gene_id
252 | 
253 |   # stderr.writeLine("gene: " & x.gene.gene_symbol & " biotype: " & x.gene.biotype & " dist: " & $x.gene.minExonDist(x.query.start, x.query.stop, 20))
254 |   # stderr.writeLine("gene: " & y.gene.gene_symbol & " biotype: " & y.gene.biotype & " dist: " & $y.gene.minExonDist(x.query.start, x.query.stop, 20))
255 | 
256 |   # First we give priority to protein_coding genes, whatever the distance to their exons
257 |   # This does not apply for MT
258 |   if x.gene.chrom != "MT" and x.gene.biotype == "protein_coding" and y.gene.biotype != "protein_coding":
259 |     return -1
260 |   elif x.gene.chrom != "MT" and x.gene.biotype != "protein_coding" and y.gene.biotype == "protein_coding":
261 |     return 1
262 |   else:
263 |     let
264 |       x_exon_dist = x.gene.minExonDist(x.query.start, x.query.stop, 20)
265 |       y_exon_dist = y.gene.minExonDist(y.query.start, y.query.stop, 20)
266 |     # Otherwise we give priority to the genes having the closest exon
267 |     if x_exon_dist != -1 and y_exon_dist != -1:
268 |       # Both are coding or none of them is, we take the one with the closest exon
269 |       let exon_dist_cmp = cmp(x_exon_dist, y_exon_dist)
270 |       if exon_dist_cmp != 0:
271 |         return exon_dist_cmp
272 |     elif x_exon_dist >= 0: # only x has exons
273 |       return -1
274 |     elif y_exon_dist >= 0: # only y has exons
275 |       return 1
276 | 
277 |   # Finally (same distance, or neither gene has exons) we chose the oldest gene_id, which keeps the ordering symmetric
278 |   return cmp(x.gene.gene_id, y.gene.gene_id)
279 | 
280 | proc getInfoString*(genes_index: TableRef[string, Lapper[GFFGene]], chrom: string, start: int, stop: int, coding_priority: bool): string = # builds "SYMBOL:ID|SYMBOL:ID|..." for the genes at (or nearest to) chrom:start-stop, best ranked first
281 |   if genes_index.hasKey(chrom):
282 |     var
283 |       res = new_seq[GFFGene]() # Store retrieved genes
284 |       found_overlapping_genes = genes_index[chrom].find(start, stop, res)
285 | 
286 |     # We have no overlapping genes, we try to find the nearest ones (upstream and downstream)
287 |     if not found_overlapping_genes:
288 |       var
289 |         res_nearest_up = new_seq[GFFGene]()
290 |         res_nearest_down = new_seq[GFFGene]()
291 |         found_nearest_up = genes_index[chrom].find_nearest_upstream(start, res_nearest_up)
292 |         found_nearest_down = genes_index[chrom].find_nearest_downstream(stop, res_nearest_down)
293 |         dist_nearest_up = -1 # -1 means "no gene found on that side"
294 |         dist_nearest_down = -1
295 | 
296 |       # Both distances are measured from the far end of the query, so each one is
297 |       # larger than the real gap by the same (stop - start) and they remain comparable
298 |       if found_nearest_up:
299 |         dist_nearest_up = stop - res_nearest_up[0].stop
300 |       if found_nearest_down:
301 |         dist_nearest_down = res_nearest_down[0].start - start
302 | 
303 |       if dist_nearest_up != -1 and dist_nearest_down != -1:
304 |         # Select nearest_up genes
305 |         if dist_nearest_up < dist_nearest_down:
306 |           res = res_nearest_up
307 |         elif dist_nearest_down < dist_nearest_up:
308 |           res = res_nearest_down
309 |         # Merge result
310 |         else:
311 |           res.add(res_nearest_up)
312 |           res.add(res_nearest_down)
313 |       elif dist_nearest_up != -1:
314 |         res = res_nearest_up
315 |       elif dist_nearest_down != -1:
316 |         res = res_nearest_down
317 | 
318 |     if res.len() > 0:
319 |       # Create object with gene + query interval for sorting (query is necessary for cmpGenes / cmpGenesCodingFirst)
320 |       var sorted_genes: seq[RequestGene]
321 |       for g in res:
322 |         sorted_genes.add(RequestGene(gene: g, query: Region(chrom: chrom, start: start, stop: stop)))
323 | 
324 |       # Sort genes
325 |       if coding_priority:
326 |         sorted_genes.sort(cmpGenesCodingFirst)
327 |       else:
328 |         sorted_genes.sort(cmpGenes)
329 | 
330 |       var gene_info: seq[string]
331 |       for q in sorted_genes:
332 |         gene_info.add(q.gene.gene_symbol & ":" & $q.gene.gene_id)
333 |       result = gene_info.join("|")
334 |   else:
335 |     stderr.writeLine("[Error] Chrom " & chrom & " not found in GFF annotations") # result stays the empty string
334 | 


--------------------------------------------------------------------------------
/src/clinvcfpkg/hgnc.nim:
--------------------------------------------------------------------------------
 1 | import strformat
 2 | import strutils
 3 | import tables
 4 | import logging
 5 | 
 6 | from ./utils import logger
 7 | 
 8 | type 
 9 |   Entrez* = TableRef[string, int]
10 |   Alias* = TableRef[string, string]
11 |   SharedAlias* = TableRef[string, int]
12 | 
13 | type
14 |   HgncIndex* = ref object
15 |     entrez*: Entrez
16 |     alias*: Alias
17 |     sharedAlias*: SharedAlias
18 | 
19 | proc newEntrez*(): Entrez =
20 |   result = newTable[string, int]()
21 | 
22 | proc newAlias*(): Alias =
23 |   result = newTable[string, string]()
24 | 
25 | proc newSharedAlias*(): SharedAlias =
26 |   result =  newTable[string, int]()
27 | 
28 | proc newHgncIndex*(): HgncIndex =
29 |   ## Instantiate a new HgncIndex
30 |   ## Each gene that have an Entrez ID is stored in 'entrez' attribute.
31 |   ## Each gene alias is stored in 'alias' attribute as key
32 |   result = HgncIndex(entrez: newEntrez(), alias: newAlias(), sharedAlias: newSharedAlias())
33 | 
34 | proc initHgncDbfromFile*(file: string): HgncIndex =
35 |   ## Create an HgncIndex from a tab-separated HGNC table (approved symbol, alias symbols, NCBI gene ID); raises IOError if a line precedes the expected header
36 |   result = newHgncIndex()
37 |   let f = open(file)
38 |   defer: f.close()
39 |   var line : string
40 |   var isHeader = false # becomes true once the header line has been read
41 |   while f.read_line(line):
42 |     if line == "Approved symbol\tAlias symbols\tNCBI Gene ID(supplied by NCBI)":
43 |       isHeader = true
44 |       continue
45 |     if not isHeader:
46 |       raise newException(IOError, "wrong HGNC table header")
47 |     var sl = line.split("\t")
48 |     if sl.len < 3 or sl[2] == "":
49 |       # next if the line is blank/truncated or no entrezID is defined
50 |       continue
51 |     result.entrez[sl[0]] = parseInt(sl[2])
52 |     # handle alias
53 |     if sl[1] != "":
54 |       var sAlias = sl[1].split(", ")
55 |       for g in sAlias:
56 |         if result.alias.hasKey(g):
57 |           # alias already claimed by another line: count it as shared
58 |           if not result.sharedAlias.hasKey(g):
59 |             result.sharedAlias[g] = 1
60 |           else:
61 |             inc(result.sharedAlias[g])
62 |         else:
63 |           result.alias[g] = sl[0]
64 |   # some alias are shared between genes. These alias are ambiguous so we remove them
65 |   for g, c in result.sharedAlias:
66 |     logger.log(lvlInfo, fmt"[initHgncDbfromFile] remove ambiguous shared alias {g}")
67 |     if result.alias.hasKey(g):
68 |       result.alias.del(g)
69 | 
70 |             
71 | 
72 | 
73 | 


--------------------------------------------------------------------------------
/src/clinvcfpkg/lapper.nim:
--------------------------------------------------------------------------------
  1 | ## FIXME: THIS IS A LOCAL COPY OF LAPPER (https://github.com/brentp/nim-lapper)
  2 | ## FIRST: WE USE CLOSED-INTERVALS AND NOT HALF-OPENED AS IN ORIGINAL IMPLEMENTATION
  3 | ## IMPLEMENTING TWO NEW PROCEDURE: find_nearest_downstream and find_nearest_upstream
  4 | ## THESE MODIFICATION SHOULD BE MERGED IN THE LAPPER REPOSITORY TO AVOID MAINTAINING THIS
  5 | ## COPY !!!
  6 | ##
  7 | ## This module provides a simple data-structure for fast interval searches. It does not use an interval tree,
  8 | ## instead, it operates on the assumption that most intervals are of similar length; or, more exactly, that the
  9 | ## longest interval in the set is not long compared to the average distance between intervals. On any dataset
 10 | ## where that is not the case, this method will not perform well. For cases where this holds true (as it often
 11 | ## does with genomic data), we can sort by start and use binary search on the starts, accounting for the length
 12 | ## of the longest interval. The advantage of this approach is simplicity of implementation and speed. In realistic
 13 | ## tests queries returning the overlapping intervals are 1000 times faster than brute force and queries that merely
 14 | ## check for the overlaps are > 5000 times faster.
 15 | ##
 16 | ## The main methods are `find` and `seek` where the latter uses a cursor and is very fast for cases when the queries
 17 | ## are sorted. This is another innovation in this library that allows an addition ~50% speed improvement when
 18 | ## consecutive queries are known to be in sort order.
 19 | ##
 20 | ## For both find and seek, if the given intervals parameter is nil, the function will return a boolean indicating if
 21 | ## any intervals in the set overlap the query. This is much faster than modifying the
 22 | ## intervals.
 23 | ##
 24 | ## The example below shows off most of the API of `Lapper`.
 25 | ##
 26 | ## .. code-block:: nim
 27 | ##   import lapper
 28 | ##   type myinterval = ref object
 29 | ##      start: int
 30 | ##      stop: int
 31 | ##      val: int
 32 | ##
 33 | ##    proc start(m: myinterval): int {.inline.} = return m.start
 34 | ##    proc stop(m: myinterval): int {.inline.} = return m.stop
 35 | ##    proc `$`(m:myinterval): string = return "(start:$#, stop:$#, val:$#)" % [$m.start, $m.stop, $m.val]
 36 | ##
 37 | ##  create some fake data
 38 | ## .. code-block:: nim
 39 | ##  var ivs = new_seq[myinterval]()
 40 | ##  for i in countup(0, 100, 10):
 41 | ##    ivs.add(myinterval(start:i, stop:i + 15, val:0))
 42 | 
 43 | ##  make the Lapper "data-structure"
 44 | 
 45 | ## .. code-block:: nim
 46 | ##  l = lapify(ivs)
 47 | ##  empty:seq[myinterval]
 48 | 
 49 | ## .. code-block:: nim
 50 | ##  l.find(10, 20, empty)
 51 | ##  notfound = not l.find(200, 300, empty)
 52 | ##  assert notfound
 53 | 
 54 | ## .. code-block:: nim
 55 | ##  res = new_seq[myinterval]()
 56 | 
 57 | ##  find is the more general case, l.seek gives a speed benefit when consecutive queries are in order.
 58 | 
 59 | ## .. code-block:: nim
 60 | ##  echo l.find(50, 70, res)
 61 | ##  echo res
 62 | ##  # @[(start: 40, stop: 55, val:0), (start: 50, stop: 65, val: 0), (start: 60, stop: 75, val: 0), (start: 70, stop: 85, val: 0)]
 63 | ##  for r in res:
 64 | ##     r.val += 1
 65 | 
 66 | ## or we can do a function on each overlapping interval
 67 | 
 68 | ## .. code-block:: nim
 69 | ##   l.each_seek(50, 60, proc(a:myinterval) = inc(a.val))
 70 | 
 71 | ## or
 72 | 
 73 | ## .. code-block:: nim
 74 | ##   l.each_find(50, 60, proc(a:myinterval) = a.val += 10)
 75 | 
 76 | ## .. code-block:: nim
 77 | ##   discard l.seek(50, 70, res)
 78 | ##   echo res
 79 | ##   # @[(start:40, stop:55, val:12), (start:50, stop:65, val:12), (start:60, stop:75, val:1)]
 80 | import algorithm
 81 | import logging
 82 | from ./utils import logger
 83 | import strformat
 84 | 
 85 | type
 86 | 
 87 |   Interval* = concept i
 88 |     ## An object/tuple must implement these 2 methods to use this module
 89 |     start(i) is int
 90 |     stop(i) is int
 91 | 
 92 |   Lapper*[T] = object
 93 |     ## Lapper enables fast interval searches
 94 |     intervals: seq[T]
 95 |     max_len*: int
 96 |     cursor: int ## `cursor` is used internally by ordered find
 97 | 
 98 | template overlap*[T:Interval](a: T, start:int, stop:int): bool =
 99 |   ## overlap returns true if the closed intervals [a.start, a.stop] and [start, stop] overlap
100 |   # (the half-open test this replaces was: a.start < stop and a.stop > start)
101 |   a.stop >= start and a.start <= stop
102 | 
103 | proc iv_cmp[T:Interval](a, b: T): int =
104 |     if a.start < b.start: return -1
105 |     if b.start < a.start: return 1
106 |     return cmp(a.stop, b.stop)
107 | 
108 | proc lapify*[T:Interval](ivs:var seq[T]): Lapper[T] =
109 |   ## create a new Lapper object; ivs will be sorted.
110 |   sort(ivs, iv_cmp)
111 |   var l = Lapper[T](max_len: 0, intervals:ivs)
112 |   for iv in ivs:
113 |     if iv.stop - iv.start > l.max_len:
114 |       l.max_len = iv.stop - iv.start
115 |   return l
116 | 
117 | proc lowerBound[T:Interval](a: var seq[T], start: int): int =
118 |   result = a.low
119 |   var count = a.high - a.low + 1
120 |   var step, pos: int
121 |   while count != 0:
122 |     step = count div 2
123 |     pos = result + step
124 |     if a[pos].start < start:
125 |       result = pos + 1
126 |       count -= step + 1
127 |     else:
128 |       count = step
129 | 
130 | proc len*[T:Interval](L:Lapper[T]): int {.inline.} =
131 |   ## len returns the number of intervals in the Lapper
132 |   L.intervals.len
133 | 
134 | proc find*[T:Interval](L:var Lapper[T], start:int, stop:int, ivs:var seq[T]): bool =
135 |   ## fill ivs with all intervals in L that overlap start .. stop (inclusive); returns true if any was found.
136 |   # ivs is not cleared up front: its slots are overwritten and it is cut to the number of hits at the end
137 |   shallow(L.intervals)
138 |   var off = lowerBound(L.intervals, start - L.max_len) # an interval starting before start - max_len cannot reach start
139 |   var n = 0 # number of hits written so far
140 |   for i in off..L.intervals.high:
141 |     var x = L.intervals[i]
142 |     if x.overlap(start, stop):
143 |       if n < ivs.len:
144 |         ivs[n] = x
145 |       else:
146 |         ivs.add(x)
147 |       n += 1
148 |     elif x.start >= stop: break # intervals are sorted by start: nothing further can overlap
149 |   if ivs.len > n:
150 |     ivs.setLen(n)
151 |   return len(ivs) > 0
152 | 
153 | proc count*[T:Interval](L:var Lapper[T], start:int, stop:int): int =
154 |   ## return the number of intervals in L that overlap start .. stop (inclusive).
155 |   shallow(L.intervals)
156 |   var off = lowerBound(L.intervals, start - L.max_len)
157 |   for i in off..L.intervals.high:
158 |     let x = L.intervals[i]
159 |     if x.overlap(start, stop):
160 |       result.inc
161 |     elif x.start >= stop: break
162 | 
163 | proc find_nearest_upstream*[T:Interval](L:var Lapper[T], pos:int, ivs:var seq[T]): bool =
164 |   ## Append to ivs the interval(s) ending closest before pos (stop < pos, ties are all kept); returns true if ivs is non-empty
165 |   shallow(L.intervals)
166 |   var
167 |     i = lowerBound(L.intervals, pos)
168 |     max_stop = -1 # highest stop seen so far among intervals ending before pos; -1 = none yet
169 |     candidates: seq[T]
170 |   if len(L.intervals) == i: # every interval starts before pos: begin the scan at the last one
171 |     let ii:int = i
172 |     i = i - 1
173 |     logger.log(lvlInfo, fmt"-----> Resetting index [i] value from {ii} to {i} due to an out of bounds index error.")
174 |     assert ii - 1 == i
175 | 
176 | 
177 |   # While we have not found an interval or we could find one that will have
178 |   # a higher stop position than our current candidate
179 |   while i >= 0 and (max_stop == -1 or (max_stop - L.intervals[i].start) < L.max_len):
180 |     # We want intervals that are not overlapping our position
181 |     if L.intervals[i].stop < pos:
182 |       if max_stop == -1 or L.intervals[i].stop > max_stop:
183 |         max_stop = L.intervals[i].stop
184 |         candidates.setLen(0)
185 |         candidates.add(L.intervals[i])
186 |       elif  L.intervals[i].stop == max_stop:
187 |         candidates.add(L.intervals[i])
188 |     dec(i)
189 | 
190 |   for c in candidates:
191 |     ivs.add(c)
192 | 
193 |   return ivs.len() > 0
194 | 
195 | proc find_nearest_downstream*[T:Interval](L:var Lapper[T], pos:int, ivs:var seq[T]): bool =
196 |   ## Append to ivs the interval(s) starting closest after pos (start > pos, ties are all kept); returns true if ivs is non-empty
197 |   shallow(L.intervals)
198 |   var
199 |     i = lowerBound(L.intervals, pos)
200 |     min_start = -1 # start of the nearest downstream interval; -1 = none found yet
201 | 
202 |   # Walk right from the first interval with start >= pos until one starting after pos is found,
203 |   # then keep going only while the following intervals share that same start
204 |   while i <= L.intervals.high and (min_start == -1 or L.intervals[i].start == min_start):
205 |     # We want intervals that are not overlapping our position
206 |     # Note: an interval starting exactly at pos is skipped
207 |     if L.intervals[i].start > pos:
208 |       min_start =  L.intervals[i].start
209 |       ivs.add(L.intervals[i])
210 |     inc(i)
211 | 
212 |   return ivs.len() > 0
213 | 
214 | proc each_find*[T:Interval](L:var Lapper[T], start:int, stop:int, fn: proc (v:T)) =
215 |   ## call fn(x) for each interval x in L that overlaps start..stop
216 |   var off = lowerBound(L.intervals, start - L.max_len)
217 |   for i in off..L.intervals.high:
218 |     var x = L.intervals[i]
219 |     if x.overlap(start, stop):
220 |       fn(x)
221 |     elif x.start >= stop: break
222 | 
223 | proc seek*[T:Interval](L:var Lapper[T], start:int, stop:int, ivs:var seq[T]): bool =
224 |   ## fill ivs with all intervals in L that overlap start .. stop inclusive.
225 |   ## this method will work when queries to this lapper are in sorted (start) order
226 |   ## it uses a linear search from the last query instead of a binary search.
227 |   ## ivs is cleared first; returns true if at least one overlapping interval was found
228 |   if ivs.len != 0: ivs.set_len(0)
229 |   if L.cursor == 0 or L.cursor >= L.intervals.high or L.intervals[L.cursor].start > start: # same guard as each_seek: a cursor left at/after the end by a previous query must not be indexed
230 |     L.cursor = lowerBound(L.intervals, start - L.max_len)
231 |   while (L.cursor + 1) < L.intervals.high and L.intervals[L.cursor + 1].start < (start - L.max_len):
232 |     L.cursor += 1
233 |   for i in L.cursor..L.intervals.high:
234 |     var x = L.intervals[i]
235 |     if x.overlap(start, stop):
236 |       ivs.add(x)
237 |     elif x.start >= stop: break
238 |   return ivs.len != 0
239 | 
240 | proc each_seek*[T:Interval](L:var Lapper[T], start:int, stop:int, fn:proc (v:T)) {.inline.} =
241 |   ## call fn(x) for each interval x in L that overlaps start..stop (inclusive, same test as find/seek)
242 |   ## this assumes that subsequent calls to this function will be in sorted order
243 |   if L.cursor == 0 or L.cursor >= L.intervals.high or L.intervals[L.cursor].start > start:
244 |     L.cursor = lowerBound(L.intervals, start - L.max_len)
245 |   while (L.cursor + 1) < L.intervals.high and L.intervals[L.cursor + 1].start < (start - L.max_len):
246 |     L.cursor += 1
247 |   for i in L.cursor..L.intervals.high:
248 |     var x = L.intervals[i]
249 |     if x.overlap(start, stop): # closed-interval test, so intervals touching start or stop are reported too
250 |       fn(x)
251 |     elif x.start >= stop: break
252 | 
253 | iterator items*[T:Interval](L: Lapper[T]): T =
254 |   for i in L.intervals: yield i
255 | 
256 | when isMainModule:
257 | 
258 |   import random
259 |   import times
260 |   import strutils
261 | 
262 |   proc randomi(imin:int, imax:int): int =
263 |       return imin + rand(imax - imin)
264 | 
265 |   proc brute_force(ivs: seq[Interval], start:int, stop:int, res: var seq[Interval]) =
266 |     if res.len != 0: res.set_len(0)
267 |     for i in ivs:
268 |       if i.overlap(start, stop): res.add(i)
269 | 
270 |   # example implementation
271 |   type myinterval = tuple[start:int, stop:int, val:int]
272 |   proc start(m: myinterval): int {.inline.} = return m.start
273 |   proc stop(m: myinterval): int {.inline.} = return m.stop
274 | 
275 |   proc make_random(n:int, range_max:int, size_min:int, size_max:int): seq[myinterval] =
276 |     result = new_seq[myinterval](n)
277 |     for i in 0..<n:
278 |       var s = randomi(0, range_max)
279 |       var e = s + randomi(size_min, size_max)
280 |       var m:myinterval = (s, e, 0)
281 |       result[i] = m
282 | 
283 |   var
284 |     N = 100000
285 |     ntimes = 40
286 |     brute_step = 10
287 | 
288 |   var intervals = make_random(N, 50000000, 500, 20000)
289 |   echo "running tests and comparisons on $# random intervals" % [$N]
290 |   var icopy = intervals
291 | 
292 |   var t = cpuTime()
293 |   var res = new_seq[myinterval]()
294 | 
295 |   for i in countup(0, intervals.len - brute_step, brute_step):
296 |     var iv = intervals[i]
297 |     brute_force(intervals, iv.start, iv.stop, res)
298 | 
299 |   var brute_time = cpuTime() - t
300 |   echo "time for brute force search on 1/$#th of the data:" % [$brute_step], brute_time
301 | 
302 |   t = cpuTime()
303 | 
304 |   var lap = lapify(intervals)
305 |   echo "time to create Lapper:", cpuTime() - t
306 | 
307 |   t = cpuTime()
308 |   for k in 0..<ntimes:
309 |     for iv in icopy:
310 |       discard lap.find(iv.start, iv.stop, res)
311 |       if len(res) == 0:
312 |         echo "0 bad!!!"
313 |   var lap_time = cpuTime() - t
314 |   echo "time to do $# searches ($# reps) in Lapper:" % [$(N * ntimes), $ntimes], lap_time, " speedup:", (brute_time * float64(brute_step)) / (lap_time / float64(ntimes))
315 | 
316 |   t = cpuTime()
317 |   for k in 0..<ntimes:
318 |     for iv in intervals:
319 |       discard lap.seek(iv.start, iv.stop, res)
320 |       if len(res) == 0:
321 |         echo "1 bad!!!"
322 |   lap_time = cpuTime() - t
323 |   echo "time to do $# seek-searches ($# reps) in Lapper:" % [$(N * ntimes), $ntimes], lap_time, " speedup:", (brute_time * float64(brute_step)) / (lap_time / float64(ntimes))
324 | 
325 |   var iempty: seq[myinterval]
326 |   t = cpuTime()
327 |   for k in 0..<ntimes:
328 |     for iv in icopy:
329 |       if 0 == lap.count(iv.start, iv.stop):
330 |         echo "2 bad!!!"
331 |   lap_time = cpuTime() - t
332 |   echo "time to do $# presence tests ($# reps) in Lapper:" % [$(N * ntimes), $ntimes], lap_time, " speedup:", (brute_time * float64(brute_step)) / (lap_time / float64(ntimes))
333 | 
334 |   t = cpuTime()
335 |   for k in 0..<ntimes:
336 |     for iv in intervals:
337 |       if not lap.seek(iv.start, iv.stop, iempty):
338 |         echo "3 bad!!!"
339 |   lap_time = cpuTime() - t
340 |   echo "time to do $# seek-presence tests ($# reps) in Lapper:" % [$(N * ntimes), $ntimes], lap_time, " speedup:", (brute_time * float64(brute_step)) / (lap_time / float64(ntimes))
341 | 
342 |   t = cpuTime()
343 |   for k in 0..<ntimes:
344 |     for iv in intervals:
345 |       var n = 0
346 |       lap.each_seek(iv.start, iv.stop, (proc(f:myinterval) = (if iv.start == f.start: n.inc)))
347 |       if n == 0:
348 |         echo "4 bad!!!"
349 |   lap_time = cpuTime() - t
350 |   echo "time to do $# each-seek-presence tests ($# reps) in Lapper:" % [$(N * ntimes), $ntimes], lap_time, " speedup:", (brute_time * float64(brute_step)) / (lap_time / float64(ntimes))
351 | 
352 | 
353 |   var brute_res = new_seq[myinterval]()
354 |   var error = 0
355 | 
356 |   t = cpuTime()
357 |   var res2 = new_seq[myinterval](10)
358 |   var res3 = new_seq_of_cap[myinterval](10)
359 |   var res4 = new_seq_of_cap[myinterval](10)
360 |   proc do_each_find(m:myinterval) = res3.add(m)
361 |   proc do_each_seek(m:myinterval) = res4.add(m)
362 |   icopy.sort(iv_cmp)
363 | 
364 |   for iv in icopy:
365 |     brute_force(icopy, iv.start, iv.stop, brute_res)
366 |     discard lap.find(iv.start, iv.stop, res)
367 |     discard lap.seek(iv.start, iv.stop, res2)
368 | 
369 |     res3.set_len(0)
370 |     lap.each_find(iv.start, iv.stop, do_each_find)
371 | 
372 |     res4.set_len(0)
373 |     lap.each_seek(iv.start, iv.stop, do_each_seek)
374 | 
375 |     if not lap.seek(iv.start, iv.stop, iempty):
376 |       echo "4 bad!! should have found it"
377 |     sort(brute_res, iv_cmp)
378 |     sort(res, iv_cmp)
379 |     sort(res2, iv_cmp)
380 |     sort(res3, iv_cmp)
381 |     sort(res4, iv_cmp)
382 | 
383 |     for i, b in brute_res:
384 |         if b.start != res[i].start or b.start != res2[i].start or b.start != res3[i].start or b.start != res4[i].start:
385 |           echo "5 bad!!! ", len(res), " ", len(res2)
386 |           error = 1
387 |         if b.stop != res[i].stop or b.stop != res2[i].stop or res3[i].stop != b.stop or res4[i].stop != b.stop:
388 |           echo "6 bad!!! ", len(res), " ", len(res2)
389 |           error = 1
390 |   echo "time to check each result:", cpuTime() - t
391 |   quit(error)
392 | 


--------------------------------------------------------------------------------
/src/clinvcfpkg/utils.nim:
--------------------------------------------------------------------------------
1 | import logging
2 | 
3 | var logger* = newConsoleLogger(fmtStr="[$datetime] - $appname - $levelname : ", useStderr=true) # Exported console logger; writes to stderr, each message prefixed with "[datetime] - appname - levelname : "
4 | 


--------------------------------------------------------------------------------
/src/compvcf.nim:
--------------------------------------------------------------------------------
 1 | import httpclient, json, tables, os, times
 2 | import docopt
 3 | import strutils # Split string
 4 | import hts
 5 | 
 6 | type
 7 | 
 8 |   ClinVariant* = ref object # One VCF record reduced to the fields this tool compares
 9 |     variant_id: int # parsed from the third tab-separated column (v[2]) by loadVariantsFromVCF
10 |     clinsig: string # CLNSIG INFO value, or "reclassified" when the record carries OLD_CLNSIG
11 |     revstat: string # CLNREVSTAT INFO value, or "reclassified" when the record carries OLD_CLNSIG
12 | 
13 | proc loadVariantsFromVCF(filename: string): TableRef[int, ClinVariant] =
14 |   ## Reads the VCF `filename` through hts `BGZ` and returns its records keyed by
15 |   ## the integer ID column. A record with OLD_CLNSIG is stored as "reclassified";
16 |   ## otherwise it is kept only when both CLNSIG and CLNREVSTAT are present.
17 |   ## Raises ValueError (from parseInt) when a kept record has a non-integer ID.
18 |   var file : BGZ
19 |   var nb_reclassif = 0
20 |   result = newTable[int, ClinVariant]()
21 |   file.open(filename, "r")
22 |   for line in file:
23 |     if line.len() == 0 or line[0] == '#':
24 |       continue
25 |     let v = line.split('\t')
26 |     if v.len < 8: continue # truncated record without an INFO column
27 |     var h = initTable[string, string]()
28 |     for field in v[7].split(';'):
29 |       # maxsplit keeps any '=' inside the value; flag-only or empty entries
30 |       # (e.g. "DB", ".") have no value and are stored as "" instead of crashing.
31 |       let a = field.split('=', maxsplit = 1)
32 |       h[a[0]] = (if a.len > 1: a[1] else: "")
33 |     # We ignore re-classified variants because they have a different clinsig
34 |     if h.hasKey("OLD_CLNSIG"):
35 |       inc(nb_reclassif)
36 |       var variant = ClinVariant(variant_id: v[2].parseInt(), clinsig: "reclassified", revstat: "reclassified")
37 |       result[variant.variant_id] = variant
38 |     elif h.hasKey("CLNSIG") and h.hasKey("CLNREVSTAT"):
39 |       var variant = ClinVariant(variant_id: v[2].parseInt(), clinsig: h["CLNSIG"], revstat: h["CLNREVSTAT"])
40 |       result[variant.variant_id] = variant
41 |   if nb_reclassif > 0:
42 |     stderr.writeLine("[Log] Found " & $nb_reclassif & " reclassified variants in " & filename)
43 | 
44 | 
45 | proc main*(argv: seq[string]) =
46 |   ## Compares the CLNSIG and CLNREVSTAT values of two VCFs variant by variant: differences and missing IDs are echoed to stdout, progress and final counters are written to stderr.
47 |   # TODO: Create a usage and expose api_keys as options
48 |   let doc = format("""
49 | Usage: compVCF <1.vcf> <2.vcf>
50 | 
51 |   """)
52 | 
53 |   let
54 |     args = docopt(doc, argv = argv) # parse the arguments handed to main, not the process command line
55 |     vcf1 = $args["<1.vcf>"]
56 |     vcf2 = $args["<2.vcf>"]
57 | 
58 |   stderr.writeLine("[Log] Loading variants from " & vcf1)
59 |   var variants1 = vcf1.loadVariantsFromVCF()
60 |   stderr.writeLine("[Log] " & $variants1.len() & " variant loaded")
61 |   stderr.writeLine("[Log] Loading variants from " & vcf2)
62 |   var variants2 = vcf2.loadVariantsFromVCF()
63 |   stderr.writeLine("[Log] " & $variants2.len() & " variant loaded")
64 | 
65 |   var
66 |     nb_wrong_clinsig = 0
67 |     nb_wrong_revstat = 0
68 |     nb_missing_variant_v1 = 0
69 |     nb_missing_variant_v2 = 0
70 |   # Pass 1: each variant of the first VCF is compared with its counterpart in the second, or reported missing.
71 |   for vid, v1 in variants1:
72 |     if variants2.hasKey(vid):
73 |       let v2 = variants2[vid]
74 |       if v2.clinsig == "reclassified" or v1.clinsig == "reclassified": continue # reclassified records are not compared
75 |       if v2.clinsig != v1.clinsig:
76 |         inc(nb_wrong_clinsig)
77 |         echo "DIFF OF CLNSIG for variant " & $vid & " : " & vcf1 & " = " & v1.clinsig & " <-> " & vcf2 & " = " & v2.clinsig
78 |       if v2.revstat != v1.revstat:
79 |         inc(nb_wrong_revstat)
80 |         echo "DIFF OF REVSTAT for variant " & $vid & " : " & vcf1 & " = " & v1.revstat & " <-> " & vcf2 & " = " & v2.revstat
81 |     else:
82 |       echo "MISSING variant " & $vid & " in " & vcf2
83 |       inc(nb_missing_variant_v2)
84 |   # Pass 2: variants that only exist in the second VCF.
85 |   for vid in variants2.keys:
86 |     if not variants1.hasKey(vid):
87 |       inc(nb_missing_variant_v1)
88 |       echo "MISSING variant " & $vid & " in " & vcf1
89 | 
90 |   stderr.writeLine("[Stats] NB_WRONG_CLINSIG " & $nb_wrong_clinsig)
91 |   stderr.writeLine("[Stats] NB_WRONG_REVSTAT " & $nb_wrong_revstat)
92 |   stderr.writeLine("[Stats] NB_MISSING_VARIANT_VCF1 " & $nb_missing_variant_v1)
93 |   stderr.writeLine("[Stats] NB_MISSING_VARIANT_VCF2 " & $nb_missing_variant_v2)
94 | 
95 | when isMainModule: # entry point only when this file is compiled as the main module, not when it is imported
96 |   main(commandLineParams())


--------------------------------------------------------------------------------
/src/extractClinvarSet.nim:
--------------------------------------------------------------------------------
 1 | import httpclient, json, tables
 2 | import os, times
 3 | import xmltree # Parse XML
 4 | import htmlparser
 5 | import docopt # Formating the command-line
 6 | import strutils # Split string
 7 | from streams import newStringStream
 8 | import hts
 9 | 
10 | iterator nextClinvarSet*(file: var BGZ): string =
11 |   ## Yields the lines of `file` grouped into chunks separated by empty lines; each kept line is re-terminated with "\n" and the trailing chunk is always yielded.
12 |   var buffer = ""
13 |   for row in file:
14 |     if row.len > 0: buffer.add(row & "\n")
15 |     else:
16 |       yield buffer
17 |       buffer = ""
18 |   yield buffer
19 | 
20 | proc formatVCFString*(vcf_string: string): string =
21 |   replace(vcf_string, ' ', '_') # every space becomes an underscore; all other characters are left untouched
22 | 
23 | proc findNodes(n: XmlNode, tag: string): seq[XmlNode] =
24 |   ## Collects the direct children of `n` that are elements named exactly `tag` (no recursion into grandchildren).
25 |   for child in n:
26 |     if child.kind == xnElement and child.tag == tag:
27 |       result.add(child)
28 | 
29 | proc main*(argv: seq[string]) =
30 |   ## Streams the ClinVar XML file chunk by chunk and echoes to stdout every <ClinVarSet> chunk
31 |   ## whose ReferenceClinVarAssertion/MeasureSet "ID" attribute equals <variant_id>.
32 |   # TODO: Create a usage and expose api_keys as options
33 |   let doc = format("""
34 | Usage: extractClinvarSet <clinvar.xml.gz> <variant_id>
35 | 
36 |   """)
37 | 
38 |   let
39 |     args = docopt(doc, argv = argv) # parse the arguments handed to main, not the process command line
40 |     searched_id = $args["<variant_id>"]
41 |     clinvar_xml_file = $args["<clinvar.xml.gz>"]
42 |     #variation_allele_file = $args["<variation_allele.txt.gz>"]
43 |     #allele_variant_table = loadAlleleVariantTable(variation_allele_file)
44 | 
45 |   # TODO: Print VCF headers
46 |   stderr.writeLine("[Log] Parsing variants from " & clinvar_xml_file)
47 | 
48 |   var file : BGZ
49 |   file.open(clinvar_xml_file, "r")
50 | 
51 |   for clinvarset_string in file.nextClinvarSet():
52 |     # startsWith already rejects empty chunks, so no separate emptiness test is needed
53 |     if not clinvarset_string.startsWith("<ClinVarSet"):
54 |       continue
55 |     # NOTE(review): the lower-case tag names searched below assume parseHtml
56 |     # lower-cases element names — confirm against the htmlparser documentation.
57 |     let root = parseHtml(newStringStream(clinvarset_string))
58 |     for clinvarset_node in root.findNodes("clinvarset"):
59 |       for reference_clinvar_assertion_nodes in clinvarset_node.findNodes("referenceclinvarassertion"):
60 |         for measureset_node in reference_clinvar_assertion_nodes.findNodes("measureset"):
61 |           # the whole raw chunk is printed once per matching MeasureSet
62 |           if measureset_node.attr("ID") == searched_id:
63 |             echo clinvarset_string
64 | 
65 | when isMainModule: # entry point only when this file is compiled as the main module, not when it is imported
66 |   main(commandLineParams())
67 | 


--------------------------------------------------------------------------------
/tests/all.nim:
--------------------------------------------------------------------------------
1 | import ./clinvcf_tests.nim
2 | import ./gff_tests.nim
3 | import ./hgnc_tests.nim


--------------------------------------------------------------------------------
/tests/clinvcf_tests.nim:
--------------------------------------------------------------------------------
 1 | import unittest, tables, hts, strutils
 2 | import clinvcf
 3 | 
 4 | suite "test utils functions": # unit checks for helpers imported from the clinvcf module
 5 | 
 6 |   test "test nbStars": # each review status is expected to map to a star count between 0 and 4
 7 |     check rsNoAssertion.nbStars() == 0
 8 |     check rsNoAssertionCriteria.nbStars() == 0
 9 |     check rsNoAssertionVariant.nbStars() == 0
10 |     check rsSingleSubmitter.nbStars() == 1
11 |     check rsConflicting.nbStars() == 1
12 |     check rsMutlipleSubmitterNoConflict.nbStars() == 2
13 |     check rsExpertPanel.nbStars() == 3
14 |     check rsPracticeGuideline.nbStars() == 4
15 | 
16 |   test "test clinsig conversion": # parseEnum must resolve each clinical-significance label to its ClinSig member
17 |     check parseEnum[ClinSig]("Benign") == csBenign
18 |     check parseEnum[ClinSig]("Benign/Likely benign") == csBenignLikelyBenign
19 |     check parseEnum[ClinSig]("Likely benign") == csLikelyBenign
20 |     check parseEnum[ClinSig]("Uncertain significance") == csUncertainSignificance
21 |     check parseEnum[ClinSig]("Likely pathogenic") == csLikelyPathogenic
22 |     check parseEnum[ClinSig]("Pathogenic") == csPathogenic
23 |     check parseEnum[ClinSig]("Pathogenic/Likely pathogenic") == csPathogenicLikelyPathogenic
24 |     check parseEnum[ClinSig]("drug response") == csDrugResponse
25 |     check parseEnum[ClinSig]("association") == csAssociation
26 |     check parseEnum[ClinSig]("risk factor") == csRiskFactor
27 |     check parseEnum[ClinSig]("protective") == csProtective
28 |     check parseEnum[ClinSig]("Affects") == csAffects
29 |     check parseEnum[ClinSig]("conflicting data from submitters") == csConflictingDataFromSubmitters
30 |     check parseEnum[ClinSig]("other") == csOther
31 |     check parseEnum[ClinSig]("not provided") == csUnknown # "not provided" is expected to map to csUnknown
32 | 
33 |   test "test parseNCBIConversionComment": # the significance named at the end of the NCBI conversion comment is extracted
34 |     check parseNCBIConversionComment("Converted during submission to Likely pathogenic.") == csLikelyPathogenic
35 |     check parseNCBIConversionComment("Converted during submission to Benign.") == csBenign
36 | 
37 | 
38 |   test "test IQRoutlierBounds":
39 |     let
40 |       d = @[3.0,3.0,4.0,4.0,4.0,4.0]
41 |       (min_val, max_val) = d.IQRoutlierBounds()
42 | 
43 |     check quantile(d, 0.25) == 3.25
44 |     check quantile(d, 0.75) == 4
45 | 
46 |     var filtered_d : seq[float32]
47 |     for v in d:
48 |       if v >= min_val and v <= max_val:
49 |         filtered_d.add(v)
50 |     check filtered_d.len() == 6 # all six values are expected to lie inside the bounds, so none is filtered out
51 | 
52 | 
53 |   test "test pathology string format": # expected outputs: '_' between words, '|' kept as separator, a leading "CLNDISEASE=" kept
54 |     check formatPathoString(" Factor X Deficiency ") == "Factor_X_Deficiency"
55 |     check formatPathoString("Factor (X) Deficiency, pathology") == "Factor_X_Deficiency_pathology"
56 |     check formatPathoString("Factor, (X) Deficiency, pathology,|cancer/") == "Factor_X_Deficiency_pathology|cancer"
57 |     check formatPathoString("Factor, X, Deficiency      ,pathology/| cancer") == "Factor_X_Deficiency_pathology|cancer"
58 |     check formatPathoString("  , Factor X,,Deficiency/pathology| , ,cancer/") == "Factor_X_Deficiency_pathology|cancer"
59 |     check formatPathoString("CLNDISEASE=  , Factor X,,Deficiency/pathology| , ,cancer/") == "CLNDISEASE=Factor_X_Deficiency_pathology|cancer"
60 | 
61 | 
62 |   test "test clinical pathology parsing": # expected output is "CLN" & key & "=" followed by the names joined with '|'
63 |     check parseClinicalPathologies("DISEASE", @["coagulation_x_deficiency", "factor_x_deficiency"]) == "CLNDISEASE=coagulation_x_deficiency|factor_x_deficiency"
64 | 


--------------------------------------------------------------------------------
/tests/files/109.xml:
--------------------------------------------------------------------------------
  1 | <?xml version="1.0" encoding="UTF-8" standalone="yes"?>
  2 | <ReleaseSet Dated="2019-12-31" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" Type="full" xsi:noNamespaceSchemaLocation="http://ftp.ncbi.nlm.nih.gov/pub/clinvar/xsd_public/clinvar_public_1.59.xsd">
  3 | 
  4 | <ClinVarSet ID="48728680">
  5 |   <RecordStatus>current</RecordStatus>
  6 |   <Title>NM_017849.3(TMEM127):c.245-1G&gt;T AND Pheochromocytoma, susceptibility to</Title>
  7 |   <ReferenceClinVarAssertion DateCreated="2012-08-13" DateLastUpdated="2019-12-17" ID="57632">
  8 |     <ClinVarAccession Acc="RCV000000129" Version="3" Type="RCV" DateUpdated="2019-12-17"/>
  9 |     <RecordStatus>current</RecordStatus>
 10 |     <ClinicalSignificance DateLastEvaluated="2010-03-01">
 11 |       <ReviewStatus>no assertion criteria provided</ReviewStatus>
 12 |       <Description>risk factor</Description>
 13 |     </ClinicalSignificance>
 14 |     <Assertion Type="variation to disease"/>
 15 |     <ObservedIn>
 16 |       <Sample>
 17 |         <Origin>germline</Origin>
 18 |         <Species TaxonomyId="9606">human</Species>
 19 |         <AffectedStatus>not provided</AffectedStatus>
 20 |       </Sample>
 21 |       <Method>
 22 |         <MethodType>literature only</MethodType>
 23 |       </Method>
 24 |       <ObservedData ID="49701161">
 25 |         <Attribute Type="Description">In 2 members of a family with pheochromocytoma (171300), Qin et al. (2010) identified a heterozygous germline G-to-T transversion in intron 2 of the TMEM127 gene, resulting in a frameshift and premature termination. Analysis of tumor tissue showed loss of heterozygosity at the TMEM127 locus, consistent with a 2-hit model of tumor suppressor inactivation. Age at onset was 54 and 66 years, respectively, and both patients had bilateral tumors. TMEM127 expression was decreased, consistent with a loss of function.</Attribute>
 26 |         <Citation Type="general">
 27 |           <ID Source="PubMed">20154675</ID>
 28 |         </Citation>
 29 |       </ObservedData>
 30 |     </ObservedIn>
 31 |     <MeasureSet Type="Variant" ID="109" Acc="VCV000000109" Version="1">
 32 |       <Measure Type="single nucleotide variant" ID="15148">
 33 |         <Name>
 34 |           <ElementValue Type="Preferred">NM_017849.3(TMEM127):c.245-1G&gt;T</ElementValue>
 35 |         </Name>
 36 |         <Name>
 37 |           <ElementValue Type="Alternate">IVS2-1G&gt;T(p.F83fs)</ElementValue>
 38 |         </Name>
 39 |         <AttributeSet>
 40 |           <Attribute Accession="LRG_528t1" Change="c.245-1G&gt;T" Type="HGVS, coding, LRG">LRG_528t1:c.245-1G&gt;T</Attribute>
 41 |         </AttributeSet>
 42 |         <AttributeSet>
 43 |           <Attribute Accession="NM_001193304" Version="3" Change="c.245-1G&gt;T" Type="HGVS, coding, RefSeq">NM_001193304.3:c.245-1G&gt;T</Attribute>
 44 |         </AttributeSet>
 45 |         <AttributeSet>
 46 |           <Attribute Accession="NM_017849" Version="3" Change="c.245-1G&gt;T" Type="HGVS, coding, RefSeq">NM_017849.3:c.245-1G&gt;T</Attribute>
 47 |         </AttributeSet>
 48 |         <AttributeSet>
 49 |           <Attribute Accession="LRG_528" Change="g.16016G&gt;T" Type="HGVS, genomic, LRG">LRG_528:g.16016G&gt;T</Attribute>
 50 |         </AttributeSet>
 51 |         <AttributeSet>
 52 |           <Attribute Accession="NG_027695" Version="1" Change="g.16016G&gt;T" Type="HGVS, genomic, RefSeqGene">NG_027695.1:g.16016G&gt;T</Attribute>
 53 |         </AttributeSet>
 54 |         <AttributeSet>
 55 |           <Attribute Accession="NC_000002" Version="12" Change="g.96254998C&gt;A" Type="HGVS, genomic, top level" integerValue="38">NC_000002.12:g.96254998C&gt;A</Attribute>
 56 |         </AttributeSet>
 57 |         <AttributeSet>
 58 |           <Attribute Accession="NC_000002" Version="11" Change="g.96920736C&gt;A" Type="HGVS, genomic, top level, previous" integerValue="37">NC_000002.11:g.96920736C&gt;A</Attribute>
 59 |         </AttributeSet>
 60 |         <AttributeSet>
 61 |           <Attribute Type="MolecularConsequence">splice acceptor variant</Attribute>
 62 |           <XRef ID="SO:0001574" DB="Sequence Ontology"/>
 63 |           <XRef ID="NM_001193304.3:c.245-1G&gt;T" DB="RefSeq"/>
 64 |         </AttributeSet>
 65 |         <AttributeSet>
 66 |           <Attribute Type="MolecularConsequence">splice acceptor variant</Attribute>
 67 |           <XRef ID="SO:0001574" DB="Sequence Ontology"/>
 68 |           <XRef ID="NM_017849.3:c.245-1G&gt;T" DB="RefSeq"/>
 69 |         </AttributeSet>
 70 |         <AttributeSet>
 71 |           <Attribute Type="nucleotide change">IVS2AS, G-T, -1</Attribute>
 72 |         </AttributeSet>
 73 |         <CytogeneticLocation>2q11.2</CytogeneticLocation>
 74 |         <SequenceLocation Assembly="GRCh38" AssemblyAccessionVersion="GCF_000001405.38" AssemblyStatus="current" Chr="2" Accession="NC_000002.12" start="96254998" stop="96254998" display_start="96254998" display_stop="96254998" variantLength="1" positionVCF="96254998" referenceAlleleVCF="C" alternateAlleleVCF="A"/>
 75 |         <SequenceLocation Assembly="GRCh37" AssemblyAccessionVersion="GCF_000001405.25" AssemblyStatus="previous" Chr="2" Accession="NC_000002.11" start="96920736" stop="96920736" display_start="96920736" display_stop="96920736" variantLength="1" positionVCF="96920736" referenceAlleleVCF="C" alternateAlleleVCF="A"/>
 76 |         <MeasureRelationship Type="within single gene">
 77 |           <Name>
 78 |             <ElementValue Type="Preferred">transmembrane protein 127</ElementValue>
 79 |           </Name>
 80 |           <Symbol>
 81 |             <ElementValue Type="Preferred">TMEM127</ElementValue>
 82 |           </Symbol>
 83 |           <SequenceLocation Assembly="GRCh38" AssemblyAccessionVersion="GCF_000001405.38" AssemblyStatus="current" Chr="2" Accession="NC_000002.12" start="96248514" stop="96265997" display_start="96248514" display_stop="96265997" Strand="-"/>
 84 |           <SequenceLocation Assembly="GRCh37" AssemblyAccessionVersion="GCF_000001405.25" AssemblyStatus="previous" Chr="2" Accession="NC_000002.11" start="96915945" stop="96931750" display_start="96915945" display_stop="96931750" variantLength="15806" Strand="-"/>
 85 |           <XRef ID="55654" DB="Gene"/>
 86 |           <XRef Type="MIM" ID="613403" DB="OMIM"/>
 87 |           <XRef ID="HGNC:26038" DB="HGNC"/>
 88 |         </MeasureRelationship>
 89 |         <Citation Type="general">
 90 |           <ID Source="PubMed">20154675</ID>
 91 |         </Citation>
 92 |         <XRef Type="Allelic variant" ID="613403.0003" DB="OMIM"/>
 93 |         <XRef Type="rs" ID="121908821" DB="dbSNP"/>
 94 |       </Measure>
 95 |       <Name>
 96 |         <ElementValue Type="Preferred">NM_017849.3(TMEM127):c.245-1G&gt;T</ElementValue>
 97 |       </Name>
 98 |       <XRef ID="CA113858" DB="ClinGen"/>
 99 |     </MeasureSet>
100 |     <TraitSet Type="Disease" ID="49">
101 |       <Trait ID="9611" Type="Disease">
102 |         <Name>
103 |           <ElementValue Type="Preferred">Pheochromocytoma, susceptibility to</ElementValue>
104 |         </Name>
105 |         <XRef ID="C3149711" DB="MedGen"/>
106 |       </Trait>
107 |     </TraitSet>
108 |   </ReferenceClinVarAssertion>
109 |   <ClinVarAssertion ID="20272">
110 |     <ClinVarSubmissionID localKey="613403.0003_PHEOCHROMOCYTOMA, SUSCEPTIBILITY TO" submitter="OMIM" submitterDate="2013-09-06" title="TMEM127, IVS2AS, G-T, -1 _PHEOCHROMOCYTOMA, SUSCEPTIBILITY TO"/>
111 |     <ClinVarAccession Acc="SCV000020272" Version="2" Type="SCV" OrgID="3" OrganizationCategory="resource" OrgType="primary" DateUpdated="2019-03-31"/>
112 |     <RecordStatus>current</RecordStatus>
113 |     <ClinicalSignificance DateLastEvaluated="2010-03-01">
114 |       <ReviewStatus>no assertion criteria provided</ReviewStatus>
115 |       <Description>risk factor</Description>
116 |     </ClinicalSignificance>
117 |     <Assertion Type="variation to disease"/>
118 |     <ExternalID DB="OMIM" ID="613403.0003" Type="Allelic variant"/>
119 |     <ObservedIn>
120 |       <Sample>
121 |         <Origin>germline</Origin>
122 |         <Species>human</Species>
123 |         <AffectedStatus>not provided</AffectedStatus>
124 |       </Sample>
125 |       <Method>
126 |         <MethodType>literature only</MethodType>
127 |       </Method>
128 |       <ObservedData>
129 |         <Attribute Type="Description">In 2 members of a family with pheochromocytoma (171300), Qin et al. (2010) identified a heterozygous germline G-to-T transversion in intron 2 of the TMEM127 gene, resulting in a frameshift and premature termination. Analysis of tumor tissue showed loss of heterozygosity at the TMEM127 locus, consistent with a 2-hit model of tumor suppressor inactivation. Age at onset was 54 and 66 years, respectively, and both patients had bilateral tumors. TMEM127 expression was decreased, consistent with a loss of function.</Attribute>
130 |         <Citation>
131 |           <ID Source="PubMed">20154675</ID>
132 |         </Citation>
133 |         <XRef DB="OMIM" ID="171300" Type="MIM"/>
134 |       </ObservedData>
135 |     </ObservedIn>
136 |     <MeasureSet Type="Variant">
137 |       <Measure Type="Variation">
138 |         <Name>
139 |           <ElementValue Type="Preferred">TMEM127, IVS2AS, G-T, -1</ElementValue>
140 |         </Name>
141 |         <AttributeSet>
142 |           <Attribute Type="NonHGVS">IVS2AS, G-T, -1</Attribute>
143 |         </AttributeSet>
144 |         <MeasureRelationship Type="variant in gene">
145 |           <Symbol>
146 |             <ElementValue Type="Preferred">TMEM127</ElementValue>
147 |           </Symbol>
148 |         </MeasureRelationship>
149 |         <XRef DB="OMIM" ID="613403.0003" Type="Allelic variant"/>
150 |       </Measure>
151 |     </MeasureSet>
152 |     <TraitSet Type="Disease">
153 |       <Trait Type="Disease">
154 |         <Name>
155 |           <ElementValue Type="Preferred">PHEOCHROMOCYTOMA, SUSCEPTIBILITY TO</ElementValue>
156 |         </Name>
157 |       </Trait>
158 |     </TraitSet>
159 |   </ClinVarAssertion>
160 | </ClinVarSet>
161 | 
162 | <ClinVarSet ID="48739896">
163 |   <RecordStatus>current</RecordStatus>
164 |   <Title>NM_017849.3(TMEM127):c.245-1G&gt;T AND Pheochromocytoma</Title>
165 |   <ReferenceClinVarAssertion DateCreated="2014-04-23" DateLastUpdated="2019-12-17" ID="269101">
166 |     <ClinVarAccession Acc="RCV000114824" Version="1" Type="RCV" DateUpdated="2019-12-17"/>
167 |     <RecordStatus>current</RecordStatus>
168 |     <ClinicalSignificance>
169 |       <ReviewStatus>no assertion criteria provided</ReviewStatus>
170 |       <Description>Likely pathogenic</Description>
171 |     </ClinicalSignificance>
172 |     <Assertion Type="variation to disease"/>
173 |     <ObservedIn>
174 |       <Sample>
175 |         <Origin>germline</Origin>
176 |         <Species TaxonomyId="9606">human</Species>
177 |         <AffectedStatus>not provided</AffectedStatus>
178 |         <NumberTested>1</NumberTested>
179 |       </Sample>
180 |       <Method>
181 |         <MethodType>literature only</MethodType>
182 |       </Method>
183 |       <ObservedData ID="49712714">
184 |         <Attribute Type="Description">not provided</Attribute>
185 |       </ObservedData>
186 |     </ObservedIn>
187 |     <MeasureSet Type="Variant" ID="109" Acc="VCV000000109" Version="1">
188 |       <Measure Type="single nucleotide variant" ID="15148">
189 |         <Name>
190 |           <ElementValue Type="Preferred">NM_017849.3(TMEM127):c.245-1G&gt;T</ElementValue>
191 |         </Name>
192 |         <Name>
193 |           <ElementValue Type="Alternate">IVS2-1G&gt;T(p.F83fs)</ElementValue>
194 |         </Name>
195 |         <AttributeSet>
196 |           <Attribute Accession="LRG_528t1" Change="c.245-1G&gt;T" Type="HGVS, coding, LRG">LRG_528t1:c.245-1G&gt;T</Attribute>
197 |         </AttributeSet>
198 |         <AttributeSet>
199 |           <Attribute Accession="NM_001193304" Version="3" Change="c.245-1G&gt;T" Type="HGVS, coding, RefSeq">NM_001193304.3:c.245-1G&gt;T</Attribute>
200 |         </AttributeSet>
201 |         <AttributeSet>
202 |           <Attribute Accession="NM_017849" Version="3" Change="c.245-1G&gt;T" Type="HGVS, coding, RefSeq">NM_017849.3:c.245-1G&gt;T</Attribute>
203 |         </AttributeSet>
204 |         <AttributeSet>
205 |           <Attribute Accession="LRG_528" Change="g.16016G&gt;T" Type="HGVS, genomic, LRG">LRG_528:g.16016G&gt;T</Attribute>
206 |         </AttributeSet>
207 |         <AttributeSet>
208 |           <Attribute Accession="NG_027695" Version="1" Change="g.16016G&gt;T" Type="HGVS, genomic, RefSeqGene">NG_027695.1:g.16016G&gt;T</Attribute>
209 |         </AttributeSet>
210 |         <AttributeSet>
211 |           <Attribute Accession="NC_000002" Version="12" Change="g.96254998C&gt;A" Type="HGVS, genomic, top level" integerValue="38">NC_000002.12:g.96254998C&gt;A</Attribute>
212 |         </AttributeSet>
213 |         <AttributeSet>
214 |           <Attribute Accession="NC_000002" Version="11" Change="g.96920736C&gt;A" Type="HGVS, genomic, top level, previous" integerValue="37">NC_000002.11:g.96920736C&gt;A</Attribute>
215 |         </AttributeSet>
216 |         <AttributeSet>
217 |           <Attribute Type="MolecularConsequence">splice acceptor variant</Attribute>
218 |           <XRef ID="SO:0001574" DB="Sequence Ontology"/>
219 |           <XRef ID="NM_001193304.3:c.245-1G&gt;T" DB="RefSeq"/>
220 |         </AttributeSet>
221 |         <AttributeSet>
222 |           <Attribute Type="MolecularConsequence">splice acceptor variant</Attribute>
223 |           <XRef ID="SO:0001574" DB="Sequence Ontology"/>
224 |           <XRef ID="NM_017849.3:c.245-1G&gt;T" DB="RefSeq"/>
225 |         </AttributeSet>
226 |         <AttributeSet>
227 |           <Attribute Type="nucleotide change">IVS2AS, G-T, -1</Attribute>
228 |         </AttributeSet>
229 |         <CytogeneticLocation>2q11.2</CytogeneticLocation>
230 |         <SequenceLocation Assembly="GRCh38" AssemblyAccessionVersion="GCF_000001405.38" AssemblyStatus="current" Chr="2" Accession="NC_000002.12" start="96254998" stop="96254998" display_start="96254998" display_stop="96254998" variantLength="1" positionVCF="96254998" referenceAlleleVCF="C" alternateAlleleVCF="A"/>
231 |         <SequenceLocation Assembly="GRCh37" AssemblyAccessionVersion="GCF_000001405.25" AssemblyStatus="previous" Chr="2" Accession="NC_000002.11" start="96920736" stop="96920736" display_start="96920736" display_stop="96920736" variantLength="1" positionVCF="96920736" referenceAlleleVCF="C" alternateAlleleVCF="A"/>
232 |         <MeasureRelationship Type="within single gene">
233 |           <Name>
234 |             <ElementValue Type="Preferred">transmembrane protein 127</ElementValue>
235 |           </Name>
236 |           <Symbol>
237 |             <ElementValue Type="Preferred">TMEM127</ElementValue>
238 |           </Symbol>
239 |           <SequenceLocation Assembly="GRCh38" AssemblyAccessionVersion="GCF_000001405.38" AssemblyStatus="current" Chr="2" Accession="NC_000002.12" start="96248514" stop="96265997" display_start="96248514" display_stop="96265997" Strand="-"/>
240 |           <SequenceLocation Assembly="GRCh37" AssemblyAccessionVersion="GCF_000001405.25" AssemblyStatus="previous" Chr="2" Accession="NC_000002.11" start="96915945" stop="96931750" display_start="96915945" display_stop="96931750" variantLength="15806" Strand="-"/>
241 |           <XRef ID="55654" DB="Gene"/>
242 |           <XRef Type="MIM" ID="613403" DB="OMIM"/>
243 |           <XRef ID="HGNC:26038" DB="HGNC"/>
244 |         </MeasureRelationship>
245 |         <Citation Type="general">
246 |           <ID Source="PubMed">20154675</ID>
247 |         </Citation>
248 |         <XRef Type="Allelic variant" ID="613403.0003" DB="OMIM"/>
249 |         <XRef Type="rs" ID="121908821" DB="dbSNP"/>
250 |       </Measure>
251 |       <Name>
252 |         <ElementValue Type="Preferred">NM_017849.3(TMEM127):c.245-1G&gt;T</ElementValue>
253 |       </Name>
254 |       <XRef ID="CA113858" DB="ClinGen"/>
255 |     </MeasureSet>
256 |     <TraitSet Type="Disease" ID="572">
257 |       <Trait ID="3796" Type="Disease">
258 |         <Name>
259 |           <ElementValue Type="Preferred">Pheochromocytoma</ElementValue>
260 |           <XRef ID="Pheochromocytoma/5718" DB="Genetic Alliance"/>
261 |           <XRef ID="HP:0002666" DB="Human Phenotype Ontology"/>
262 |           <XRef ID="7385" DB="Office of Rare Diseases"/>
263 |         </Name>
264 |         <Name>
265 |           <ElementValue Type="Alternate">Pheochromocytoma, somatic</ElementValue>
266 |         </Name>
267 |         <Name>
268 |           <ElementValue Type="Alternate">MAX-Related Hereditary Paraganglioma-Pheochromocytoma Syndrome</ElementValue>
269 |         </Name>
270 |         <Name>
271 |           <ElementValue Type="Alternate">PHEOCHROMOCYTOMA, SUSCEPTIBILITY TO</ElementValue>
272 |           <XRef Type="MIM" ID="171300" DB="OMIM"/>
273 |         </Name>
274 |         <Name>
275 |           <ElementValue Type="Alternate">Chromaffin tumors</ElementValue>
276 |           <XRef ID="HP:0002666" DB="Human Phenotype Ontology"/>
277 |         </Name>
278 |         <AttributeSet>
279 |           <Attribute Type="public definition">Hereditary paraganglioma-pheochromocytoma (PGL/PCC) syndromes are characterized by paragangliomas (tumors that arise from neuroendocrine tissues distributed along the paravertebral axis from the base of the skull to the pelvis) and pheochromocytomas (paragangliomas that are confined to the adrenal medulla). Sympathetic paragangliomas cause catecholamine excess; parasympathetic paragangliomas are most often nonsecretory. Extra-adrenal parasympathetic paragangliomas are located predominantly in the skull base and neck (referred to as head and neck PGL [HNPGL]) and sometimes in the upper mediastinum; approximately 95% of such tumors are nonsecretory. In contrast, sympathetic extra-adrenal paragangliomas are generally confined to the lower mediastinum, abdomen, and pelvis, and are typically secretory. Pheochromocytomas, which arise from the adrenal medulla, typically lead to catecholamine excess. Symptoms of PGL/PCC result from either mass effects or catecholamine hypersecretion (e.g., sustained or paroxysmal elevations in blood pressure, headache, episodic profuse sweating, forceful palpitations, pallor, and apprehension or anxiety). The risk for developing metastatic disease is greater for extra-adrenal sympathetic paragangliomas than for pheochromocytomas.</Attribute>
280 |           <XRef ID="NBK1548" DB="GeneReviews"/>
281 |         </AttributeSet>
282 |         <AttributeSet>
283 |           <Attribute Type="keyword">Neoplasm</Attribute>
284 |         </AttributeSet>
285 |         <AttributeSet>
286 |           <Attribute Type="keyword">Hereditary cancer syndrome</Attribute>
287 |         </AttributeSet>
288 |         <Citation Type="review" Abbrev="GeneReviews">
289 |           <ID Source="PubMed">20301715</ID>
290 |           <ID Source="BookShelf">NBK1548</ID>
291 |         </Citation>
292 |         <Citation Type="practice guideline" Abbrev="ES, 2014">
293 |           <ID Source="PubMed">24893135</ID>
294 |         </Citation>
295 |         <Citation Type="practice guideline" Abbrev="ASCO, 2014">
296 |           <ID Source="PubMed">24493721</ID>
297 |         </Citation>
298 |         <Citation Type="practice guideline" Abbrev="NANETS, 2010">
299 |           <ID Source="pmc">3419007</ID>
300 |         </Citation>
301 |         <Citation Type="practice guideline" Abbrev="KCRNC, 2013">
302 |           <ID Source="PubMed">24319509</ID>
303 |         </Citation>
304 |         <XRef ID="C0031511" DB="MedGen"/>
305 |         <XRef ID="29072" DB="Orphanet"/>
306 |         <XRef Type="MIM" ID="171300" DB="OMIM"/>
307 |         <XRef Type="primary" ID="HP:0002666" DB="Human Phenotype Ontology"/>
308 |       </Trait>
309 |     </TraitSet>
310 |   </ReferenceClinVarAssertion>
311 |   <ClinVarAssertion ID="268918" SubmissionName="ute_01">
312 |     <ClinVarSubmissionID localKey="NM_017849.3:c.245-1G&gt;T|FCC-IOV" submitter="Familial Cancer Clinic,Veneto Institute of Oncology" submitterDate="2010-12-21"/>
313 |     <ClinVarAccession Acc="SCV000148719" Version="1" Type="SCV" OrgID="500224" OrganizationCategory="laboratory" OrgType="primary" DateUpdated="2019-03-31"/>
314 |     <RecordStatus>current</RecordStatus>
315 |     <ClinicalSignificance>
316 |       <ReviewStatus>no assertion criteria provided</ReviewStatus>
317 |       <Description>likely pathogenic - adrenal pheochromocytoma</Description>
318 |       <Comment Type="ConvertedByNCBI">Converted during submission to Likely pathogenic.</Comment>
319 |     </ClinicalSignificance>
320 |     <Assertion Type="variation to disease"/>
321 |     <ObservedIn>
322 |       <Sample>
323 |         <Origin>germline</Origin>
324 |         <Species>human</Species>
325 |         <AffectedStatus>not provided</AffectedStatus>
326 |         <NumberTested>1</NumberTested>
327 |       </Sample>
328 |       <Method>
329 |         <MethodType>not provided</MethodType>
330 |       </Method>
331 |       <ObservedData>
332 |         <Attribute Type="Description">not provided</Attribute>
333 |       </ObservedData>
334 |     </ObservedIn>
335 |     <MeasureSet Type="Variant">
336 |       <Measure Type="Variation">
337 |         <Name>
338 |           <ElementValue Type="Alternate">IVS2-1G&gt;T (p.F83fs)</ElementValue>
339 |         </Name>
340 |         <AttributeSet>
341 |           <Attribute Type="HGVS">NM_017849.3:c.245-1G&gt;T</Attribute>
342 |         </AttributeSet>
343 |         <Citation Type="general">
344 |           <ID Source="PubMed">20154675</ID>
345 |         </Citation>
346 |         <XRef DB="OMIM" ID="613403.0003" Type="Allelic variant"/>
347 |       </Measure>
348 |     </MeasureSet>
349 |     <TraitSet Type="Disease">
350 |       <Trait Type="Disease">
351 |         <Name>
352 |           <ElementValue Type="Preferred">Pheochromocytoma</ElementValue>
353 |         </Name>
354 |         <XRef DB="OMIM" ID="171300" Type="MIM"/>
355 |       </Trait>
356 |     </TraitSet>
357 |   </ClinVarAssertion>
358 | </ClinVarSet>
359 | 
360 | 
361 | 


--------------------------------------------------------------------------------
/tests/files/16895.xml:
--------------------------------------------------------------------------------
  1 | <ClinVarSet ID="47079688">
  2 |   <RecordStatus>current</RecordStatus>
  3 |   <Title>NM_000106.5(CYP2D6):c.[886C&gt;T;457G&gt;C] AND Debrisoquine, ultrarapid metabolism of</Title>
  4 |   <ReferenceClinVarAssertion DateCreated="2012-08-13" DateLastUpdated="2019-11-02" ID="75894">
  5 |     <ClinVarAccession Acc="RCV000018391" Version="27" Type="RCV" DateUpdated="2019-11-02"/>
  6 |     <RecordStatus>current</RecordStatus>
  7 |     <ClinicalSignificance DateLastEvaluated="2015-05-18">
  8 |       <ReviewStatus>no assertion criteria provided</ReviewStatus>
  9 |       <Description>drug response</Description>
 10 |     </ClinicalSignificance>
 11 |     <Assertion Type="variation to disease"/>
 12 |     <ObservedIn>
 13 |       <Sample>
 14 |         <Origin>germline</Origin>
 15 |         <Species TaxonomyId="9606">human</Species>
 16 |         <AffectedStatus>not provided</AffectedStatus>
 17 |       </Sample>
 18 |       <Method>
 19 |         <MethodType>literature only</MethodType>
 20 |       </Method>
 21 |       <ObservedData ID="47916362">
 22 |         <Attribute Type="Description">This allelic variant is also known as CYP2D6*2 or CYP2D6L.</Attribute>
 23 |       </ObservedData>
 24 |       <ObservedData ID="47916362">
 25 |         <Attribute Type="Description">In a family in which 2 sibs and their father had MRs of less that 0.02 (ultrarapid phenotype, see 608902), Johansson et al. (1993) found 12 extra copies of the CYP2D6 gene inherited in an autosomal dominant pattern; in a second family in which 2 sibs had MRs of less than 0.1, the authors found 2 extra copies of the CYP2D6 gene. All affected individuals had a variant CYP2D6 gene, termed CYP2D6L, which contained 2 amino acid substitutions: a 2938C-T transition in exon 6, resulting in an arg296-to-cys (R296C), and a 4268G-to-C transversion in exon 9, resulting in a resulting in a ser486-to-thr (S486T) substitution. The MR of individuals with 1 copy of the CYP2D6L gene did not differ from those with the wildtype gene, but there was a correlation between decreased MR and increased copies of the CYP2D6L gene.</Attribute>
 26 |         <Citation Type="general">
 27 |           <ID Source="PubMed">7903454</ID>
 28 |         </Citation>
 29 |       </ObservedData>
 30 |       <ObservedData ID="47916362">
 31 |         <Attribute Type="Description">Panserat et al. (1994) identified the R296C and S486T changes as 2 major CYP2D6 allozymes in extensive metabolizers (wildtype). Residue 296 falls within a presumed substrate recognition site, and residue 486 lies in the vicinity of the heme binding site.</Attribute>
 32 |         <Citation Type="general">
 33 |           <ID Source="PubMed">7927337</ID>
 34 |         </Citation>
 35 |       </ObservedData>
 36 |     </ObservedIn>
 37 |     <MeasureSet Type="Haplotype" ID="16895" Acc="VCV000016895" Version="1">
 38 |       <Measure Type="single nucleotide variant" ID="31934">
 39 |         <Name>
 40 |           <ElementValue Type="Preferred">NM_000106.6(CYP2D6):c.886C&gt;T (p.Arg296Cys)</ElementValue>
 41 |         </Name>
 42 |         <AttributeSet>
 43 |           <Attribute Accession="LRG_303t1" Change="c.886C&gt;T" Type="HGVS, coding, LRG">LRG_303t1:c.886C&gt;T</Attribute>
 44 |         </AttributeSet>
 45 |         <AttributeSet>
 46 |           <Attribute Accession="NM_001025161" Version="3" Change="c.733C&gt;T" Type="HGVS, coding, RefSeq">NM_001025161.3:c.733C&gt;T</Attribute>
 47 |         </AttributeSet>
 48 |         <AttributeSet>
 49 |           <Attribute Accession="NM_000106" Version="6" Change="c.886C&gt;T" Type="HGVS, coding, RefSeq">NM_000106.6:c.886C&gt;T</Attribute>
 50 |         </AttributeSet>
 51 |         <AttributeSet>
 52 |           <Attribute Accession="LRG_303" Change="g.7870C&gt;T" Type="HGVS, genomic, LRG">LRG_303:g.7870C&gt;T</Attribute>
 53 |         </AttributeSet>
 54 |         <AttributeSet>
 55 |           <Attribute Accession="NG_008376" Version="3" Change="g.7051C&gt;T" Type="HGVS, genomic, RefSeqGene">NG_008376.3:g.7051C&gt;T</Attribute>
 56 |         </AttributeSet>
 57 |         <AttributeSet>
 58 |           <Attribute Accession="NG_008376" Version="4" Change="g.7870C&gt;T" Type="HGVS, genomic, RefSeqGene">NG_008376.4:g.7870C&gt;T</Attribute>
 59 |         </AttributeSet>
 60 |         <AttributeSet>
 61 |           <Attribute Accession="NC_000022" Version="11" Change="g.42127941G&gt;A" Type="HGVS, genomic, top level" integerValue="38">NC_000022.11:g.42127941G&gt;A</Attribute>
 62 |         </AttributeSet>
 63 |         <AttributeSet>
 64 |           <Attribute Accession="NC_000022" Version="10" Change="g.42523943=" Type="HGVS, genomic, top level, previous" integerValue="37">NC_000022.10:g.42523943=</Attribute>
 65 |         </AttributeSet>
 66 |         <AttributeSet>
 67 |           <Attribute Accession="NM_000106" Version="5" Change="c.886C&gt;T" Type="HGVS, previous">NM_000106.5:c.886C&gt;T</Attribute>
 68 |         </AttributeSet>
 69 |         <AttributeSet>
 70 |           <Attribute Accession="NG_008376" Version="2" Change="g.7941C&gt;T" Type="HGVS, previous">NG_008376.2:g.7941C&gt;T</Attribute>
 71 |         </AttributeSet>
 72 |         <AttributeSet>
 73 |           <Attribute Accession="LRG_303p1" Change="p.Arg296Cys" Type="HGVS, protein">LRG_303p1:p.Arg296Cys</Attribute>
 74 |         </AttributeSet>
 75 |         <AttributeSet>
 76 |           <Attribute Accession="P10635" Change="p.Arg296Cys" Type="HGVS, protein">P10635:p.Arg296Cys</Attribute>
 77 |         </AttributeSet>
 78 |         <AttributeSet>
 79 |           <Attribute Accession="NP_001020332" Version="2" Change="p.Arg245Cys" Type="HGVS, protein, RefSeq">NP_001020332.2:p.Arg245Cys</Attribute>
 80 |         </AttributeSet>
 81 |         <AttributeSet>
 82 |           <Attribute Accession="NP_000097" Version="3" Change="p.Arg296Cys" Type="HGVS, protein, RefSeq">NP_000097.3:p.Arg296Cys</Attribute>
 83 |         </AttributeSet>
 84 |         <AttributeSet>
 85 |           <Attribute Accession="NP_000097" Version="3" Change="p.Arg296Cys" Type="HGVS, protein, RefSeq">NP_000097.3:p.Arg296Cys</Attribute>
 86 |         </AttributeSet>
 87 |         <AttributeSet>
 88 |           <Attribute Accession="NP_000097" Version="3" Change="p.Arg296Cys" Type="HGVS, protein, RefSeq">NP_000097.3:p.Arg296Cys</Attribute>
 89 |         </AttributeSet>
 90 |         <AttributeSet>
 91 |           <Attribute Type="MolecularConsequence">missense variant</Attribute>
 92 |           <XRef ID="SO:0001583" DB="Sequence Ontology"/>
 93 |           <XRef ID="NM_000106.6:c.886C&gt;T" DB="RefSeq"/>
 94 |         </AttributeSet>
 95 |         <AttributeSet>
 96 |           <Attribute Type="MolecularConsequence">missense variant</Attribute>
 97 |           <XRef ID="SO:0001583" DB="Sequence Ontology"/>
 98 |           <XRef ID="NM_001025161.3:c.733C&gt;T" DB="RefSeq"/>
 99 |         </AttributeSet>
100 |         <AttributeSet>
101 |           <Attribute Type="ProteinChange1LetterCode">R245C</Attribute>
102 |         </AttributeSet>
103 |         <AttributeSet>
104 |           <Attribute Type="ProteinChange1LetterCode">R296C</Attribute>
105 |         </AttributeSet>
106 |         <AttributeSet>
107 |           <Attribute Type="ProteinChange3LetterCode">ARG296CYS</Attribute>
108 |         </AttributeSet>
109 |         <AlleleFrequencyList>
110 |           <AlleleFrequency Value="0.40045" Source="NHLBI Exome Sequencing Project (ESP) Exome Variant Server"/>
111 |           <AlleleFrequency Value="0.35923" Source="1000 Genomes Project"/>
112 |           <AlleleFrequency Value="0.34334" Source="Exome Aggregation Consortium (ExAC)"/>
113 |           <AlleleFrequency Value="0.38555" Source="The Genome Aggregation Database (gnomAD)"/>
114 |           <AlleleFrequency Value="0.37647" Source="Trans-Omics for Precision Medicine (TOPMed)"/>
115 |         </AlleleFrequencyList>
116 |         <GlobalMinorAlleleFrequency Value="0.35923" Source="1000 Genomes Project" MinorAllele="A"/>
117 |         <CytogeneticLocation>22q13.2</CytogeneticLocation>
118 |         <SequenceLocation Assembly="GRCh38" AssemblyAccessionVersion="GCF_000001405.38" AssemblyStatus="current" Chr="22" Accession="NC_000022.11" start="42127941" stop="42127941" display_start="42127941" display_stop="42127941" variantLength="1" positionVCF="42127941" referenceAlleleVCF="G" alternateAlleleVCF="A"/>
119 |         <SequenceLocation Assembly="GRCh37" AssemblyAccessionVersion="GCF_000001405.25" AssemblyStatus="previous" Chr="22" Accession="NC_000022.10" start="42523943" stop="42523943" display_start="42523943" display_stop="42523943" variantLength="1" positionVCF="42523943" referenceAlleleVCF="A" alternateAlleleVCF="A"/>
120 |         <MeasureRelationship Type="within single gene">
121 |           <Name>
122 |             <ElementValue Type="Preferred">cytochrome P450 family 2 subfamily D member 6</ElementValue>
123 |           </Name>
124 |           <Symbol>
125 |             <ElementValue Type="Preferred">CYP2D6</ElementValue>
126 |           </Symbol>
127 |           <SequenceLocation Assembly="GRCh38" AssemblyAccessionVersion="GCF_000001405.38" AssemblyStatus="current" Chr="22" Accession="NC_000022.11" start="42126499" stop="42130810" display_start="42126499" display_stop="42130810" Strand="-"/>
128 |           <SequenceLocation Assembly="GRCh38" AssemblyAccessionVersion="GCF_000001405.38" AssemblyStatus="current" Chr="22" Accession="NT_187682.1" start="48840" stop="53151" display_start="48840" display_stop="53151" Strand="-"/>
129 |           <SequenceLocation Assembly="GRCh38" AssemblyAccessionVersion="GCF_000001405.38" AssemblyStatus="current" Chr="22" Accession="NW_004504305.1" start="48826" stop="53137" display_start="48826" display_stop="53137" Strand="-"/>
130 |           <SequenceLocation Assembly="GRCh38" AssemblyAccessionVersion="GCF_000001405.38" AssemblyStatus="current" Chr="22" Accession="NW_009646208.1" start="12065" stop="16376" display_start="12065" display_stop="16376" Strand="-"/>
131 |           <SequenceLocation Assembly="GRCh38" AssemblyAccessionVersion="GCF_000001405.38" AssemblyStatus="current" Chr="22" Accession="NW_014040931.1" start="20088" stop="24399" display_start="20088" display_stop="24399" Strand="-"/>
132 |           <SequenceLocation Assembly="GRCh38" AssemblyAccessionVersion="GCF_000001405.38" AssemblyStatus="current" Chr="22" Accession="NW_015148968.1" start="4240" stop="8551" display_start="4240" display_stop="8551" Strand="-"/>
133 |           <SequenceLocation Assembly="GRCh37" AssemblyAccessionVersion="GCF_000001405.25" AssemblyStatus="previous" Chr="22" Accession="NC_000022.10" start="42522500" stop="42526882" display_start="42522500" display_stop="42526882" variantLength="4383" Strand="-"/>
134 |           <XRef ID="1565" DB="Gene"/>
135 |           <XRef Type="MIM" ID="124030" DB="OMIM"/>
136 |           <XRef ID="HGNC:2625" DB="HGNC"/>
137 |         </MeasureRelationship>
138 |         <Citation Type="review" Abbrev="Medical Genetics Summaries">
139 |           <ID Source="BookShelf">NBK100662</ID>
140 |         </Citation>
141 |         <Citation Type="review" Abbrev="Medical Genetics Summaries">
142 |           <ID Source="BookShelf">NBK367795</ID>
143 |         </Citation>
144 |         <Citation Type="review" Abbrev="Medical Genetics Summaries">
145 |           <ID Source="BookShelf">NBK425795</ID>
146 |         </Citation>
147 |         <XRef ID="P10635#VAR_008340" DB="UniProtKB"/>
148 |         <XRef Type="Allelic variant" ID="124030.0007" DB="OMIM"/>
149 |         <XRef Type="rs" ID="16947" DB="dbSNP"/>
150 |       </Measure>
151 |       <Measure Type="single nucleotide variant" ID="38485">
152 |         <Name>
153 |           <ElementValue Type="Preferred">NM_000106.6(CYP2D6):c.1457G&gt;C (p.Ser486Thr)</ElementValue>
154 |         </Name>
155 |         <AttributeSet>
156 |           <Attribute Accession="LRG_303t1" Change="c.1457G&gt;C" Type="HGVS, coding, LRG">LRG_303t1:c.1457G&gt;C</Attribute>
157 |         </AttributeSet>
158 |         <AttributeSet>
159 |           <Attribute Accession="NM_001025161" Version="3" Change="c.1304G&gt;C" Type="HGVS, coding, RefSeq">NM_001025161.3:c.1304G&gt;C</Attribute>
160 |         </AttributeSet>
161 |         <AttributeSet>
162 |           <Attribute Accession="NM_000106" Version="6" Change="c.1457G&gt;C" Type="HGVS, coding, RefSeq">NM_000106.6:c.1457G&gt;C</Attribute>
163 |         </AttributeSet>
164 |         <AttributeSet>
165 |           <Attribute Accession="LRG_303" Change="g.9200G&gt;C" Type="HGVS, genomic, LRG">LRG_303:g.9200G&gt;C</Attribute>
166 |         </AttributeSet>
167 |         <AttributeSet>
168 |           <Attribute Accession="NG_008376" Version="3" Change="g.8381G&gt;C" Type="HGVS, genomic, RefSeqGene">NG_008376.3:g.8381G&gt;C</Attribute>
169 |         </AttributeSet>
170 |         <AttributeSet>
171 |           <Attribute Accession="NG_008376" Version="4" Change="g.9200G&gt;C" Type="HGVS, genomic, RefSeqGene">NG_008376.4:g.9200G&gt;C</Attribute>
172 |         </AttributeSet>
173 |         <AttributeSet>
174 |           <Attribute Accession="NC_000022" Version="11" Change="g.42126611C&gt;G" Type="HGVS, genomic, top level" integerValue="38">NC_000022.11:g.42126611C&gt;G</Attribute>
175 |         </AttributeSet>
176 |         <AttributeSet>
177 |           <Attribute Accession="NC_000022" Version="10" Change="g.42522613=" Type="HGVS, genomic, top level, previous" integerValue="37">NC_000022.10:g.42522613=</Attribute>
178 |         </AttributeSet>
179 |         <AttributeSet>
180 |           <Attribute Accession="NM_000106" Version="5" Change="c.1457G&gt;C" Type="HGVS, previous">NM_000106.5:c.1457G&gt;C</Attribute>
181 |         </AttributeSet>
182 |         <AttributeSet>
183 |           <Attribute Accession="NG_008376" Version="2" Change="g.9271G&gt;C" Type="HGVS, previous">NG_008376.2:g.9271G&gt;C</Attribute>
184 |         </AttributeSet>
185 |         <AttributeSet>
186 |           <Attribute Accession="LRG_303p1" Change="p.Ser486Thr" Type="HGVS, protein">LRG_303p1:p.Ser486Thr</Attribute>
187 |         </AttributeSet>
188 |         <AttributeSet>
189 |           <Attribute Accession="P10635" Change="p.Ser486Thr" Type="HGVS, protein">P10635:p.Ser486Thr</Attribute>
190 |         </AttributeSet>
191 |         <AttributeSet>
192 |           <Attribute Accession="NP_001020332" Version="2" Change="p.Ser435Thr" Type="HGVS, protein, RefSeq">NP_001020332.2:p.Ser435Thr</Attribute>
193 |         </AttributeSet>
194 |         <AttributeSet>
195 |           <Attribute Accession="NP_000097" Version="3" Change="p.Ser486Thr" Type="HGVS, protein, RefSeq">NP_000097.3:p.Ser486Thr</Attribute>
196 |         </AttributeSet>
197 |         <AttributeSet>
198 |           <Attribute Accession="NP_000097" Version="3" Change="p.Ser486Thr" Type="HGVS, protein, RefSeq">NP_000097.3:p.Ser486Thr</Attribute>
199 |         </AttributeSet>
200 |         <AttributeSet>
201 |           <Attribute Accession="NP_000097" Version="3" Change="p.Ser486Thr" Type="HGVS, protein, RefSeq">NP_000097.3:p.Ser486Thr</Attribute>
202 |         </AttributeSet>
203 |         <AttributeSet>
204 |           <Attribute Type="Location">NM_000106.5:exon 8</Attribute>
205 |         </AttributeSet>
206 |         <AttributeSet>
207 |           <Attribute Type="MolecularConsequence">missense variant</Attribute>
208 |           <XRef ID="SO:0001583" DB="Sequence Ontology"/>
209 |           <XRef ID="NM_000106.6:c.1457G&gt;C" DB="RefSeq"/>
210 |         </AttributeSet>
211 |         <AttributeSet>
212 |           <Attribute Type="MolecularConsequence">missense variant</Attribute>
213 |           <XRef ID="SO:0001583" DB="Sequence Ontology"/>
214 |           <XRef ID="NM_001025161.3:c.1304G&gt;C" DB="RefSeq"/>
215 |         </AttributeSet>
216 |         <AttributeSet>
217 |           <Attribute Type="ProteinChange1LetterCode">S435T</Attribute>
218 |         </AttributeSet>
219 |         <AttributeSet>
220 |           <Attribute Type="ProteinChange1LetterCode">S486T</Attribute>
221 |         </AttributeSet>
222 |         <AttributeSet>
223 |           <Attribute Type="ProteinChange3LetterCode">SER486THR</Attribute>
224 |         </AttributeSet>
225 |         <AlleleFrequencyList>
226 |           <AlleleFrequency Value="0.59168" Source="NHLBI Exome Sequencing Project (ESP) Exome Variant Server"/>
227 |           <AlleleFrequency Value="0.59884" Source="1000 Genomes Project"/>
228 |           <AlleleFrequency Value="0.54444" Source="Exome Aggregation Consortium (ExAC)"/>
229 |           <AlleleFrequency Value="0.58040" Source="The Genome Aggregation Database (gnomAD)"/>
230 |           <AlleleFrequency Value="0.54901" Source="The Genome Aggregation Database (gnomAD), exomes"/>
231 |         </AlleleFrequencyList>
232 |         <GlobalMinorAlleleFrequency Value="0.40116" Source="1000 Genomes Project" MinorAllele="C"/>
233 |         <CytogeneticLocation>22q13.2</CytogeneticLocation>
234 |         <SequenceLocation Assembly="GRCh38" AssemblyAccessionVersion="GCF_000001405.38" AssemblyStatus="current" Chr="22" Accession="NC_000022.11" start="42126611" stop="42126611" display_start="42126611" display_stop="42126611" variantLength="1" positionVCF="42126611" referenceAlleleVCF="C" alternateAlleleVCF="G"/>
235 |         <SequenceLocation Assembly="GRCh37" AssemblyAccessionVersion="GCF_000001405.25" AssemblyStatus="previous" Chr="22" Accession="NC_000022.10" start="42522613" stop="42522613" display_start="42522613" display_stop="42522613" variantLength="1" positionVCF="42522613" referenceAlleleVCF="G" alternateAlleleVCF="G"/>
236 |         <MeasureRelationship Type="within single gene">
237 |           <Name>
238 |             <ElementValue Type="Preferred">cytochrome P450 family 2 subfamily D member 6</ElementValue>
239 |           </Name>
240 |           <Symbol>
241 |             <ElementValue Type="Preferred">CYP2D6</ElementValue>
242 |           </Symbol>
243 |           <SequenceLocation Assembly="GRCh38" AssemblyAccessionVersion="GCF_000001405.38" AssemblyStatus="current" Chr="22" Accession="NC_000022.11" start="42126499" stop="42130810" display_start="42126499" display_stop="42130810" Strand="-"/>
244 |           <SequenceLocation Assembly="GRCh38" AssemblyAccessionVersion="GCF_000001405.38" AssemblyStatus="current" Chr="22" Accession="NT_187682.1" start="48840" stop="53151" display_start="48840" display_stop="53151" Strand="-"/>
245 |           <SequenceLocation Assembly="GRCh38" AssemblyAccessionVersion="GCF_000001405.38" AssemblyStatus="current" Chr="22" Accession="NW_004504305.1" start="48826" stop="53137" display_start="48826" display_stop="53137" Strand="-"/>
246 |           <SequenceLocation Assembly="GRCh38" AssemblyAccessionVersion="GCF_000001405.38" AssemblyStatus="current" Chr="22" Accession="NW_009646208.1" start="12065" stop="16376" display_start="12065" display_stop="16376" Strand="-"/>
247 |           <SequenceLocation Assembly="GRCh38" AssemblyAccessionVersion="GCF_000001405.38" AssemblyStatus="current" Chr="22" Accession="NW_014040931.1" start="20088" stop="24399" display_start="20088" display_stop="24399" Strand="-"/>
248 |           <SequenceLocation Assembly="GRCh38" AssemblyAccessionVersion="GCF_000001405.38" AssemblyStatus="current" Chr="22" Accession="NW_015148968.1" start="4240" stop="8551" display_start="4240" display_stop="8551" Strand="-"/>
249 |           <SequenceLocation Assembly="GRCh37" AssemblyAccessionVersion="GCF_000001405.25" AssemblyStatus="previous" Chr="22" Accession="NC_000022.10" start="42522500" stop="42526882" display_start="42522500" display_stop="42526882" variantLength="4383" Strand="-"/>
250 |           <XRef ID="1565" DB="Gene"/>
251 |           <XRef Type="MIM" ID="124030" DB="OMIM"/>
252 |           <XRef ID="HGNC:2625" DB="HGNC"/>
253 |         </MeasureRelationship>
254 |         <XRef ID="P10635#VAR_008341" DB="UniProtKB"/>
255 |         <XRef Type="Allelic variant" ID="124030.0007" DB="OMIM"/>
256 |         <XRef Type="rs" ID="1135840" DB="dbSNP"/>
257 |       </Measure>
258 |       <Name>
259 |         <ElementValue Type="Preferred">NM_000106.5(CYP2D6):c.[886C&gt;T;457G&gt;C]</ElementValue>
260 |       </Name>
261 |       <Name>
262 |         <ElementValue Type="Alternate">CYP2D6, ARG296CYS AND SER486THR</ElementValue>
263 |         <XRef Type="Allelic variant" ID="124030.0007" DB="OMIM"/>
264 |       </Name>
265 |       <Symbol>
266 |         <ElementValue Type="Preferred">CYP2D6*2</ElementValue>
267 |       </Symbol>
268 |       <AttributeSet>
269 |         <Attribute Type="HGVS, genomic, RefSeqGene" Change="g.[7941C&gt;T;9271G&gt;C]">NG_008376.2:g.[7941C&gt;T;9271G&gt;C]</Attribute>
270 |       </AttributeSet>
271 |       <AttributeSet>
272 |         <Attribute Type="HGVS, coding, RefSeq" Change="c.[886C&gt;T;457G&gt;C]">NM_000106.5:c.[886C&gt;T;457G&gt;C]</Attribute>
273 |       </AttributeSet>
274 |       <XRef ID="CA039192" DB="ClinGen"/>
275 |       <XRef Type="Allelic variant" ID="124030.0007" DB="OMIM"/>
276 |     </MeasureSet>
277 |     <TraitSet Type="Disease" ID="5542">
278 |       <Trait ID="9095" Type="DrugResponse">
279 |         <Name>
280 |           <ElementValue Type="Preferred">Debrisoquine, ultrarapid metabolism of</ElementValue>
281 |         </Name>
282 |         <XRef ID="C1837157" DB="MedGen"/>
283 |       </Trait>
284 |     </TraitSet>
285 |   </ReferenceClinVarAssertion>
286 |   <ClinVarAssertion ID="38673">
287 |     <ClinVarSubmissionID localKey="124030.0007_DEBRISOQUINE, ULTRARAPID METABOLISM OF" submitter="OMIM" submitterDate="2012-06-07" title="CYP2D6, ARG296CYS AND SER486THR_DEBRISOQUINE, ULTRARAPID METABOLISM OF"/>
288 |     <ClinVarAccession Acc="SCV000038673" Version="1" Type="SCV" OrgID="3" OrganizationCategory="resource" OrgType="primary" DateUpdated="2019-08-03"/>
289 |     <RecordStatus>current</RecordStatus>
290 |     <ClinicalSignificance DateLastEvaluated="2015-05-18">
291 |       <ReviewStatus>no assertion criteria provided</ReviewStatus>
292 |       <Description>drug response</Description>
293 |     </ClinicalSignificance>
294 |     <Assertion Type="variation to disease"/>
295 |     <ExternalID DB="OMIM" ID="124030.0007" Type="Allelic variant"/>
296 |     <ObservedIn>
297 |       <Sample>
298 |         <Origin>germline</Origin>
299 |         <Species>human</Species>
300 |         <AffectedStatus>not provided</AffectedStatus>
301 |       </Sample>
302 |       <Method>
303 |         <MethodType>literature only</MethodType>
304 |       </Method>
305 |       <ObservedData>
306 |         <Attribute Type="Description">This allelic variant is also known as CYP2D6*2 or CYP2D6L.</Attribute>
307 |       </ObservedData>
308 |       <ObservedData>
309 |         <Attribute Type="Description">In a family in which 2 sibs and their father had MRs of less that 0.02 (ultrarapid phenotype, see 608902), Johansson et al. (1993) found 12 extra copies of the CYP2D6 gene inherited in an autosomal dominant pattern; in a second family in which 2 sibs had MRs of less than 0.1, the authors found 2 extra copies of the CYP2D6 gene. All affected individuals had a variant CYP2D6 gene, termed CYP2D6L, which contained 2 amino acid substitutions: a 2938C-T transition in exon 6, resulting in an arg296-to-cys (R296C), and a 4268G-to-C transversion in exon 9, resulting in a resulting in a ser486-to-thr (S486T) substitution. The MR of individuals with 1 copy of the CYP2D6L gene did not differ from those with the wildtype gene, but there was a correlation between decreased MR and increased copies of the CYP2D6L gene.</Attribute>
310 |         <Citation>
311 |           <ID Source="PubMed">7903454</ID>
312 |         </Citation>
313 |         <XRef DB="OMIM" ID="608902" Type="MIM"/>
314 |       </ObservedData>
315 |       <ObservedData>
316 |         <Attribute Type="Description">Panserat et al. (1994) identified the R296C and S486T changes as 2 major CYP2D6 allozymes in extensive metabolizers (wildtype). Residue 296 falls within a presumed substrate recognition site, and residue 486 lies in the vicinity of the heme binding site.</Attribute>
317 |         <Citation>
318 |           <ID Source="PubMed">7927337</ID>
319 |         </Citation>
320 |       </ObservedData>
321 |     </ObservedIn>
322 |     <MeasureSet Type="Variant">
323 |       <Measure Type="Variation">
324 |         <Name>
325 |           <ElementValue Type="Preferred">CYP2D6, ARG296CYS AND SER486THR</ElementValue>
326 |         </Name>
327 |         <AttributeSet>
328 |           <Attribute Type="NonHGVS">ARG296CYS AND SER486THR</Attribute>
329 |         </AttributeSet>
330 |         <MeasureRelationship Type="variant in gene">
331 |           <Symbol>
332 |             <ElementValue Type="Preferred">CYP2D6</ElementValue>
333 |           </Symbol>
334 |         </MeasureRelationship>
335 |         <XRef DB="OMIM" ID="124030.0007" Type="Allelic variant"/>
336 |       </Measure>
337 |     </MeasureSet>
338 |     <TraitSet Type="Disease">
339 |       <Trait Type="Disease">
340 |         <Name>
341 |           <ElementValue Type="Preferred">DEBRISOQUINE, ULTRARAPID METABOLISM OF</ElementValue>
342 |         </Name>
343 |       </Trait>
344 |     </TraitSet>
345 |   </ClinVarAssertion>
346 | </ClinVarSet>
347 | 
348 | 


--------------------------------------------------------------------------------
/tests/files/225974.xml:
--------------------------------------------------------------------------------
  1 | <ClinVarSet ID="47660574">
  2 |   <RecordStatus>current</RecordStatus>
  3 |   <Title>NM_000675.6(ADORA2A):c.-275+1797C&gt;T AND caffeine response - Toxicity/ADR</Title>
  4 |   <ReferenceClinVarAssertion DateCreated="2018-07-07" DateLastUpdated="2019-11-02" ID="1527200">
  5 |     <ClinVarAccession Acc="RCV000660765" Version="1" Type="RCV" DateUpdated="2019-11-02"/>
  6 |     <RecordStatus>current</RecordStatus>
  7 |     <ClinicalSignificance DateLastEvaluated="2017-12-20">
  8 |       <ReviewStatus>reviewed by expert panel</ReviewStatus>
  9 |       <Description>drug response</Description>
 10 |     </ClinicalSignificance>
 11 |     <Assertion Type="variation to disease"/>
 12 |     <ObservedIn>
 13 |       <Sample>
 14 |         <Origin>germline</Origin>
 15 |         <Species TaxonomyId="9606">human</Species>
 16 |         <AffectedStatus>yes</AffectedStatus>
 17 |       </Sample>
 18 |       <Method>
 19 |         <MethodType>curation</MethodType>
 20 |       </Method>
 21 |       <ObservedData ID="48568031">
 22 |         <Attribute Type="Description">not provided</Attribute>
 23 |       </ObservedData>
 24 |     </ObservedIn>
 25 |     <MeasureSet Type="Variant" ID="225974" Acc="VCV000225974" Version="1">
 26 |       <Measure Type="single nucleotide variant" ID="227811">
 27 |         <Name>
 28 |           <ElementValue Type="Preferred">NM_000675.6(ADORA2A):c.-275+1797C&gt;T</ElementValue>
 29 |         </Name>
 30 |         <AttributeSet>
 31 |           <Attribute Accession="NM_001278500" Version="1" Change="c.-274-3588C&gt;T" Type="HGVS, coding, RefSeq">NM_001278500.1:c.-274-3588C&gt;T</Attribute>
 32 |         </AttributeSet>
 33 |         <AttributeSet>
 34 |           <Attribute Accession="NM_000675" Version="6" Change="c.-275+1797C&gt;T" Type="HGVS, coding, RefSeq">NM_000675.6:c.-275+1797C&gt;T</Attribute>
 35 |         </AttributeSet>
 36 |         <AttributeSet>
 37 |           <Attribute Accession="NM_001278499" Version="2" Change="c.-275+1817C&gt;T" Type="HGVS, coding, RefSeq">NM_001278499.2:c.-275+1817C&gt;T</Attribute>
 38 |         </AttributeSet>
 39 |         <AttributeSet>
 40 |           <Attribute Accession="NG_052804" Version="1" Change="g.10947C&gt;T" Type="HGVS, genomic, RefSeqGene">NG_052804.1:g.10947C&gt;T</Attribute>
 41 |         </AttributeSet>
 42 |         <AttributeSet>
 43 |           <Attribute Accession="NC_000022" Version="11" Change="g.24429543C&gt;T" Type="HGVS, genomic, top level" integerValue="38">NC_000022.11:g.24429543C&gt;T</Attribute>
 44 |         </AttributeSet>
 45 |         <AttributeSet>
 46 |           <Attribute Accession="NC_000022" Version="10" Change="g.24825511C&gt;T" Type="HGVS, genomic, top level, previous" integerValue="37">NC_000022.10:g.24825511C&gt;T</Attribute>
 47 |         </AttributeSet>
 48 |         <AttributeSet>
 49 |           <Attribute Accession="NR_028484" Version="3" Change="n.2494G&gt;A" Type="HGVS, non-coding">NR_028484.3:n.2494G&gt;A</Attribute>
 50 |         </AttributeSet>
 51 |         <AttributeSet>
 52 |           <Attribute Type="MolecularConsequence">intron variant</Attribute>
 53 |           <XRef ID="SO:0001627" DB="Sequence Ontology"/>
 54 |           <XRef ID="NM_000675.6:c.-275+1797C&gt;T" DB="RefSeq"/>
 55 |         </AttributeSet>
 56 |         <AttributeSet>
 57 |           <Attribute Type="MolecularConsequence">intron variant</Attribute>
 58 |           <XRef ID="SO:0001627" DB="Sequence Ontology"/>
 59 |           <XRef ID="NM_001278499.2:c.-275+1817C&gt;T" DB="RefSeq"/>
 60 |         </AttributeSet>
 61 |         <AttributeSet>
 62 |           <Attribute Type="MolecularConsequence">intron variant</Attribute>
 63 |           <XRef ID="SO:0001627" DB="Sequence Ontology"/>
 64 |           <XRef ID="NM_001278500.1:c.-274-3588C&gt;T" DB="RefSeq"/>
 65 |         </AttributeSet>
 66 |         <AttributeSet>
 67 |           <Attribute Type="MolecularConsequence">non-coding transcript variant</Attribute>
 68 |           <XRef ID="SO:0001619" DB="Sequence Ontology"/>
 69 |           <XRef ID="NR_028484.3:n.2494G&gt;A" DB="RefSeq"/>
 70 |         </AttributeSet>
 71 |         <AlleleFrequencyList>
 72 |           <AlleleFrequency Value="0.40056" Source="1000 Genomes Project"/>
 73 |           <AlleleFrequency Value="0.48943" Source="The Genome Aggregation Database (gnomAD)"/>
 74 |           <AlleleFrequency Value="0.47828" Source="Trans-Omics for Precision Medicine (TOPMed)"/>
 75 |         </AlleleFrequencyList>
 76 |         <GlobalMinorAlleleFrequency Value="0.40056" Source="1000 Genomes Project" MinorAllele="T"/>
 77 |         <CytogeneticLocation>22q11.23</CytogeneticLocation>
 78 |         <SequenceLocation Assembly="GRCh38" AssemblyAccessionVersion="GCF_000001405.38" AssemblyStatus="current" Chr="22" Accession="NC_000022.11" start="24429543" stop="24429543" display_start="24429543" display_stop="24429543" variantLength="1" positionVCF="24429543" referenceAlleleVCF="C" alternateAlleleVCF="T"/>
 79 |         <SequenceLocation Assembly="GRCh37" AssemblyAccessionVersion="GCF_000001405.25" AssemblyStatus="previous" Chr="22" Accession="NC_000022.10" start="24825511" stop="24825511" display_start="24825511" display_stop="24825511" variantLength="1" positionVCF="24825511" referenceAlleleVCF="C" alternateAlleleVCF="T"/>
 80 |         <MeasureRelationship Type="within multiple genes by overlap">
 81 |           <Name>
 82 |             <ElementValue Type="Preferred">ADORA2A antisense RNA 1</ElementValue>
 83 |           </Name>
 84 |           <Symbol>
 85 |             <ElementValue Type="Preferred">ADORA2A-AS1</ElementValue>
 86 |           </Symbol>
 87 |           <SequenceLocation Assembly="GRCh38" AssemblyAccessionVersion="GCF_000001405.38" AssemblyStatus="current" Chr="22" Accession="NC_000022.11" start="24429206" stop="24495074" display_start="24429206" display_stop="24495074" Strand="-"/>
 88 |           <XRef ID="646023" DB="Gene"/>
 89 |           <XRef ID="HGNC:37122" DB="HGNC"/>
 90 |         </MeasureRelationship>
 91 |         <MeasureRelationship Type="within multiple genes by overlap">
 92 |           <Name>
 93 |             <ElementValue Type="Preferred">SPECC1L-ADORA2A readthrough (NMD candidate)</ElementValue>
 94 |           </Name>
 95 |           <Symbol>
 96 |             <ElementValue Type="Preferred">SPECC1L-ADORA2A</ElementValue>
 97 |           </Symbol>
 98 |           <SequenceLocation Assembly="GRCh38" AssemblyAccessionVersion="GCF_000001405.38" AssemblyStatus="current" Chr="22" Accession="NC_000022.11" start="24270817" stop="24442360" display_start="24270817" display_stop="24442360" Strand="+"/>
 99 |           <XRef ID="101730217" DB="Gene"/>
100 |           <XRef ID="HGNC:49185" DB="HGNC"/>
101 |         </MeasureRelationship>
102 |         <MeasureRelationship Type="within multiple genes by overlap">
103 |           <Name>
104 |             <ElementValue Type="Preferred">adenosine A2a receptor</ElementValue>
105 |           </Name>
106 |           <Symbol>
107 |             <ElementValue Type="Preferred">ADORA2A</ElementValue>
108 |           </Symbol>
109 |           <SequenceLocation Assembly="GRCh38" AssemblyAccessionVersion="GCF_000001405.38" AssemblyStatus="current" Chr="22" Accession="NC_000022.11" start="24423597" stop="24442360" display_start="24423597" display_stop="24442360" Strand="+"/>
110 |           <SequenceLocation Assembly="GRCh37" AssemblyAccessionVersion="GCF_000001405.25" AssemblyStatus="previous" Chr="22" Accession="NC_000022.10" start="24823529" stop="24838324" display_start="24823529" display_stop="24838324" variantLength="14796" Strand="+"/>
111 |           <XRef ID="135" DB="Gene"/>
112 |           <XRef Type="MIM" ID="102776" DB="OMIM"/>
113 |           <XRef ID="HGNC:263" DB="HGNC"/>
114 |         </MeasureRelationship>
115 |         <XRef ID="981201549PA448710" DB="PharmGKB"/>
116 |         <XRef ID="981201549" DB="PharmGKB Clinical Annotation"/>
117 |         <XRef Type="rs" ID="2298383" DB="dbSNP"/>
118 |       </Measure>
119 |       <Name>
120 |         <ElementValue Type="Preferred">NM_000675.6(ADORA2A):c.-275+1797C&gt;T</ElementValue>
121 |       </Name>
122 |     </MeasureSet>
123 |     <TraitSet Type="DrugResponse" ID="26888">
124 |       <Trait ID="35407" Type="DrugResponse">
125 |         <Name>
126 |           <ElementValue Type="Preferred">caffeine response - Toxicity/ADR</ElementValue>
127 |         </Name>
128 |         <XRef ID="CN236491" DB="MedGen"/>
129 |       </Trait>
130 |     </TraitSet>
131 |   </ReferenceClinVarAssertion>
132 |   <ClinVarAssertion ID="1525587" SubmissionName="PharmGKB-ClinVar 2018-05">
133 |     <ClinVarSubmissionID localKey="981201549PA448710|caffeine response - Toxicity/ADR" submittedAssembly="GRCh38" submitter="PharmGKB" submitterDate="2018-06-18"/>
134 |     <ClinVarAccession Acc="SCV000783004" Version="1" Type="SCV" OrgID="500295" OrganizationCategory="resource" OrgType="primary" DateUpdated="2019-03-31"/>
135 |     <RecordStatus>current</RecordStatus>
136 |     <ClinicalSignificance DateLastEvaluated="2017-12-20">
137 |       <ReviewStatus>reviewed by expert panel</ReviewStatus>
138 |       <Description>drug response</Description>
139 |       <ExplanationOfInterpretation>Drug-variant association: Toxicity/ADR</ExplanationOfInterpretation>
140 |       <Citation>
141 |         <ID Source="PubMed">18305461</ID>
142 |       </Citation>
143 |       <Citation>
144 |         <ID Source="PubMed">20520601</ID>
145 |       </Citation>
146 |       <Comment Type="public">PharmGKB Level of Evidence 2B: Annotation for a variant-drug combination with moderate evidence of an association. The association must be replicated but there may be some studies that do not show statistical significance, and/or the effect size may be small.</Comment>
147 |     </ClinicalSignificance>
148 |     <Assertion Type="variation to disease"/>
149 |     <ExternalID DB="Pharmacogenomics Knowledge Base" ID="981201549PA448710"/>
150 |     <AttributeSet>
151 |       <Attribute Type="AssertionMethod">Pharmacogenomics knowledge for personalized medicine</Attribute>
152 |       <Citation>
153 |         <ID Source="PubMed">22992668</ID>
154 |       </Citation>
155 |     </AttributeSet>
156 |     <ObservedIn>
157 |       <Sample>
158 |         <Origin>germline</Origin>
159 |         <Species TaxonomyId="9606">human</Species>
160 |         <AffectedStatus>yes</AffectedStatus>
161 |       </Sample>
162 |       <Method>
163 |         <MethodType>curation</MethodType>
164 |       </Method>
165 |       <ObservedData>
166 |         <Attribute Type="Description">not provided</Attribute>
167 |       </ObservedData>
168 |     </ObservedIn>
169 |     <MeasureSet Type="Variant">
170 |       <Measure Type="Variation">
171 |         <AttributeSet>
172 |           <Attribute Type="HGVS">NC_000022.10:g.24825511C&gt;T</Attribute>
173 |         </AttributeSet>
174 |         <MeasureRelationship Type="variant in gene">
175 |           <Symbol>
176 |             <ElementValue Type="Preferred">ADORA2A</ElementValue>
177 |           </Symbol>
178 |         </MeasureRelationship>
179 |         <XRef DB="dbSNP" ID="2298383" Type="rsNumber"/>
180 |       </Measure>
181 |     </MeasureSet>
182 |     <TraitSet Type="DrugResponse">
183 |       <Trait Type="DrugResponse">
184 |         <Name>
185 |           <ElementValue Type="Preferred">caffeine response - Toxicity/ADR</ElementValue>
186 |         </Name>
187 |       </Trait>
188 |     </TraitSet>
189 |     <Citation>
190 |       <URL>https://www.pharmgkb.org/clinicalAnnotation/981201549</URL>
191 |     </Citation>
192 |   </ClinVarAssertion>
193 | </ClinVarSet>
194 | 
195 | 


--------------------------------------------------------------------------------
/tests/files/242771.xml:
--------------------------------------------------------------------------------
  1 | <ClinVarSet ID="47617087">
  2 |   <RecordStatus>current</RecordStatus>
  3 |   <Title>NM_000106.6(CYP2D6):c.886C&gt;T (p.Arg296Cys) AND not specified</Title>
  4 |   <ReferenceClinVarAssertion DateCreated="2018-04-09" DateLastUpdated="2019-11-02" ID="1433287">
  5 |     <ClinVarAccession Acc="RCV000616933" Version="1" Type="RCV" DateUpdated="2019-11-02"/>
  6 |     <RecordStatus>current</RecordStatus>
  7 |     <ClinicalSignificance DateLastEvaluated="2018-02-27">
  8 |       <ReviewStatus>criteria provided, single submitter</ReviewStatus>
  9 |       <Description>Benign</Description>
 10 |     </ClinicalSignificance>
 11 |     <Assertion Type="variation to disease"/>
 12 |     <ObservedIn>
 13 |       <Sample>
 14 |         <Origin>germline</Origin>
 15 |         <Species TaxonomyId="9606">human</Species>
 16 |         <AffectedStatus>yes</AffectedStatus>
 17 |       </Sample>
 18 |       <Method>
 19 |         <MethodType>clinical testing</MethodType>
 20 |       </Method>
 21 |       <ObservedData ID="48509535">
 22 |         <Attribute Type="Description">not provided</Attribute>
 23 |       </ObservedData>
 24 |     </ObservedIn>
 25 |     <MeasureSet Type="Variant" ID="242771" Acc="VCV000242771" Version="1">
 26 |       <Measure Type="single nucleotide variant" ID="31934">
 27 |         <Name>
 28 |           <ElementValue Type="Preferred">NM_000106.6(CYP2D6):c.886C&gt;T (p.Arg296Cys)</ElementValue>
 29 |         </Name>
 30 |         <AttributeSet>
 31 |           <Attribute Accession="LRG_303t1" Change="c.886C&gt;T" Type="HGVS, coding, LRG">LRG_303t1:c.886C&gt;T</Attribute>
 32 |         </AttributeSet>
 33 |         <AttributeSet>
 34 |           <Attribute Accession="NM_001025161" Version="3" Change="c.733C&gt;T" Type="HGVS, coding, RefSeq">NM_001025161.3:c.733C&gt;T</Attribute>
 35 |         </AttributeSet>
 36 |         <AttributeSet>
 37 |           <Attribute Accession="NM_000106" Version="6" Change="c.886C&gt;T" Type="HGVS, coding, RefSeq">NM_000106.6:c.886C&gt;T</Attribute>
 38 |         </AttributeSet>
 39 |         <AttributeSet>
 40 |           <Attribute Accession="LRG_303" Change="g.7870C&gt;T" Type="HGVS, genomic, LRG">LRG_303:g.7870C&gt;T</Attribute>
 41 |         </AttributeSet>
 42 |         <AttributeSet>
 43 |           <Attribute Accession="NG_008376" Version="3" Change="g.7051C&gt;T" Type="HGVS, genomic, RefSeqGene">NG_008376.3:g.7051C&gt;T</Attribute>
 44 |         </AttributeSet>
 45 |         <AttributeSet>
 46 |           <Attribute Accession="NG_008376" Version="4" Change="g.7870C&gt;T" Type="HGVS, genomic, RefSeqGene">NG_008376.4:g.7870C&gt;T</Attribute>
 47 |         </AttributeSet>
 48 |         <AttributeSet>
 49 |           <Attribute Accession="NC_000022" Version="11" Change="g.42127941G&gt;A" Type="HGVS, genomic, top level" integerValue="38">NC_000022.11:g.42127941G&gt;A</Attribute>
 50 |         </AttributeSet>
 51 |         <AttributeSet>
 52 |           <Attribute Accession="NC_000022" Version="10" Change="g.42523943=" Type="HGVS, genomic, top level, previous" integerValue="37">NC_000022.10:g.42523943=</Attribute>
 53 |         </AttributeSet>
 54 |         <AttributeSet>
 55 |           <Attribute Accession="NM_000106" Version="5" Change="c.886C&gt;T" Type="HGVS, previous">NM_000106.5:c.886C&gt;T</Attribute>
 56 |         </AttributeSet>
 57 |         <AttributeSet>
 58 |           <Attribute Accession="NG_008376" Version="2" Change="g.7941C&gt;T" Type="HGVS, previous">NG_008376.2:g.7941C&gt;T</Attribute>
 59 |         </AttributeSet>
 60 |         <AttributeSet>
 61 |           <Attribute Accession="LRG_303p1" Change="p.Arg296Cys" Type="HGVS, protein">LRG_303p1:p.Arg296Cys</Attribute>
 62 |         </AttributeSet>
 63 |         <AttributeSet>
 64 |           <Attribute Accession="P10635" Change="p.Arg296Cys" Type="HGVS, protein">P10635:p.Arg296Cys</Attribute>
 65 |         </AttributeSet>
 66 |         <AttributeSet>
 67 |           <Attribute Accession="NP_001020332" Version="2" Change="p.Arg245Cys" Type="HGVS, protein, RefSeq">NP_001020332.2:p.Arg245Cys</Attribute>
 68 |         </AttributeSet>
 69 |         <AttributeSet>
 70 |           <Attribute Accession="NP_000097" Version="3" Change="p.Arg296Cys" Type="HGVS, protein, RefSeq">NP_000097.3:p.Arg296Cys</Attribute>
 71 |         </AttributeSet>
 72 |         <AttributeSet>
 73 |           <Attribute Accession="NP_000097" Version="3" Change="p.Arg296Cys" Type="HGVS, protein, RefSeq">NP_000097.3:p.Arg296Cys</Attribute>
 74 |         </AttributeSet>
 75 |         <AttributeSet>
 76 |           <Attribute Accession="NP_000097" Version="3" Change="p.Arg296Cys" Type="HGVS, protein, RefSeq">NP_000097.3:p.Arg296Cys</Attribute>
 77 |         </AttributeSet>
 78 |         <AttributeSet>
 79 |           <Attribute Type="MolecularConsequence">missense variant</Attribute>
 80 |           <XRef ID="SO:0001583" DB="Sequence Ontology"/>
 81 |           <XRef ID="NM_000106.6:c.886C&gt;T" DB="RefSeq"/>
 82 |         </AttributeSet>
 83 |         <AttributeSet>
 84 |           <Attribute Type="MolecularConsequence">missense variant</Attribute>
 85 |           <XRef ID="SO:0001583" DB="Sequence Ontology"/>
 86 |           <XRef ID="NM_001025161.3:c.733C&gt;T" DB="RefSeq"/>
 87 |         </AttributeSet>
 88 |         <AttributeSet>
 89 |           <Attribute Type="ProteinChange1LetterCode">R245C</Attribute>
 90 |         </AttributeSet>
 91 |         <AttributeSet>
 92 |           <Attribute Type="ProteinChange1LetterCode">R296C</Attribute>
 93 |         </AttributeSet>
 94 |         <AttributeSet>
 95 |           <Attribute Type="ProteinChange3LetterCode">ARG296CYS</Attribute>
 96 |         </AttributeSet>
 97 |         <AlleleFrequencyList>
 98 |           <AlleleFrequency Value="0.40045" Source="NHLBI Exome Sequencing Project (ESP) Exome Variant Server"/>
 99 |           <AlleleFrequency Value="0.35923" Source="1000 Genomes Project"/>
100 |           <AlleleFrequency Value="0.34334" Source="Exome Aggregation Consortium (ExAC)"/>
101 |           <AlleleFrequency Value="0.38555" Source="The Genome Aggregation Database (gnomAD)"/>
102 |           <AlleleFrequency Value="0.37647" Source="Trans-Omics for Precision Medicine (TOPMed)"/>
103 |         </AlleleFrequencyList>
104 |         <GlobalMinorAlleleFrequency Value="0.35923" Source="1000 Genomes Project" MinorAllele="A"/>
105 |         <CytogeneticLocation>22q13.2</CytogeneticLocation>
106 |         <SequenceLocation Assembly="GRCh38" AssemblyAccessionVersion="GCF_000001405.38" AssemblyStatus="current" Chr="22" Accession="NC_000022.11" start="42127941" stop="42127941" display_start="42127941" display_stop="42127941" variantLength="1" positionVCF="42127941" referenceAlleleVCF="G" alternateAlleleVCF="A"/>
107 |         <SequenceLocation Assembly="GRCh37" AssemblyAccessionVersion="GCF_000001405.25" AssemblyStatus="previous" Chr="22" Accession="NC_000022.10" start="42523943" stop="42523943" display_start="42523943" display_stop="42523943" variantLength="1" positionVCF="42523943" referenceAlleleVCF="A" alternateAlleleVCF="A"/>
108 |         <MeasureRelationship Type="within single gene">
109 |           <Name>
110 |             <ElementValue Type="Preferred">cytochrome P450 family 2 subfamily D member 6</ElementValue>
111 |           </Name>
112 |           <Symbol>
113 |             <ElementValue Type="Preferred">CYP2D6</ElementValue>
114 |           </Symbol>
115 |           <SequenceLocation Assembly="GRCh38" AssemblyAccessionVersion="GCF_000001405.38" AssemblyStatus="current" Chr="22" Accession="NC_000022.11" start="42126499" stop="42130810" display_start="42126499" display_stop="42130810" Strand="-"/>
116 |           <SequenceLocation Assembly="GRCh38" AssemblyAccessionVersion="GCF_000001405.38" AssemblyStatus="current" Chr="22" Accession="NT_187682.1" start="48840" stop="53151" display_start="48840" display_stop="53151" Strand="-"/>
117 |           <SequenceLocation Assembly="GRCh38" AssemblyAccessionVersion="GCF_000001405.38" AssemblyStatus="current" Chr="22" Accession="NW_004504305.1" start="48826" stop="53137" display_start="48826" display_stop="53137" Strand="-"/>
118 |           <SequenceLocation Assembly="GRCh38" AssemblyAccessionVersion="GCF_000001405.38" AssemblyStatus="current" Chr="22" Accession="NW_009646208.1" start="12065" stop="16376" display_start="12065" display_stop="16376" Strand="-"/>
119 |           <SequenceLocation Assembly="GRCh38" AssemblyAccessionVersion="GCF_000001405.38" AssemblyStatus="current" Chr="22" Accession="NW_014040931.1" start="20088" stop="24399" display_start="20088" display_stop="24399" Strand="-"/>
120 |           <SequenceLocation Assembly="GRCh38" AssemblyAccessionVersion="GCF_000001405.38" AssemblyStatus="current" Chr="22" Accession="NW_015148968.1" start="4240" stop="8551" display_start="4240" display_stop="8551" Strand="-"/>
121 |           <SequenceLocation Assembly="GRCh37" AssemblyAccessionVersion="GCF_000001405.25" AssemblyStatus="previous" Chr="22" Accession="NC_000022.10" start="42522500" stop="42526882" display_start="42522500" display_stop="42526882" variantLength="4383" Strand="-"/>
122 |           <XRef ID="1565" DB="Gene"/>
123 |           <XRef Type="MIM" ID="124030" DB="OMIM"/>
124 |           <XRef ID="HGNC:2625" DB="HGNC"/>
125 |         </MeasureRelationship>
126 |         <Citation Type="review" Abbrev="Medical Genetics Summaries">
127 |           <ID Source="BookShelf">NBK100662</ID>
128 |         </Citation>
129 |         <Citation Type="review" Abbrev="Medical Genetics Summaries">
130 |           <ID Source="BookShelf">NBK367795</ID>
131 |         </Citation>
132 |         <Citation Type="review" Abbrev="Medical Genetics Summaries">
133 |           <ID Source="BookShelf">NBK425795</ID>
134 |         </Citation>
135 |         <XRef ID="P10635#VAR_008340" DB="UniProtKB"/>
136 |         <XRef Type="Allelic variant" ID="124030.0007" DB="OMIM"/>
137 |         <XRef Type="rs" ID="16947" DB="dbSNP"/>
138 |       </Measure>
139 |       <Name>
140 |         <ElementValue Type="Preferred">NM_000106.6(CYP2D6):c.886C&gt;T (p.Arg296Cys)</ElementValue>
141 |       </Name>
142 |       <XRef ID="CA039022" DB="ClinGen"/>
143 |     </MeasureSet>
144 |     <TraitSet Type="Disease" ID="9590">
145 |       <Trait ID="16789" Type="Disease">
146 |         <Name>
147 |           <ElementValue Type="Preferred">not specified</ElementValue>
148 |         </Name>
149 |         <Name>
150 |           <ElementValue Type="Alternate">AllHighlyPenetrant</ElementValue>
151 |         </Name>
152 |         <AttributeSet>
153 |           <Attribute Type="public definition">The term 'not specified' was created for use in ClinVar so that submitters can convey the concept that a variant is benign, likely benign, or of uncertain significance for an unspecified set of disorders.  This usage was introduced in 2014 to replace AllHighlyPenetrant.</Attribute>
154 |         </AttributeSet>
155 |         <XRef ID="CN169374" DB="MedGen"/>
156 |       </Trait>
157 |     </TraitSet>
158 |   </ReferenceClinVarAssertion>
159 |   <ClinVarAssertion ID="1405438" SubmissionName="SUB3839901">
160 |     <ClinVarSubmissionID localKey="GDX:1826531|Not Provided" submittedAssembly="GRCh37" submitter="GeneDx" submitterDate="2018-03-26"/>
161 |     <ClinVarAccession Acc="SCV000724283" Version="1" Type="SCV" OrgID="26957" OrganizationCategory="laboratory" OrgType="primary" DateUpdated="2019-08-03"/>
162 |     <RecordStatus>current</RecordStatus>
163 |     <ClinicalSignificance DateLastEvaluated="2018-02-27">
164 |       <ReviewStatus>criteria provided, single submitter</ReviewStatus>
165 |       <Description>Benign</Description>
166 |       <Comment>This variant is considered likely benign or benign based on one or more of the following criteria: it is a conservative change, it occurs at a poorly conserved position in the protein, it is predicted to be benign by multiple in silico algorithms, and/or has population frequency not consistent with disease.</Comment>
167 |     </ClinicalSignificance>
168 |     <Assertion Type="variation to disease"/>
169 |     <ExternalID DB="GeneDx" ID="GDX:1826531"/>
170 |     <AttributeSet>
171 |       <Attribute Type="AssertionMethod">GeneDX Variant Classification (06012015)</Attribute>
172 |       <Citation>
173 |         <URL>https://submit.ncbi.nlm.nih.gov/ft/byid/7oynscmk/mdi-5616_26957_genedx_interprules_final_061215.pdf</URL>
174 |       </Citation>
175 |     </AttributeSet>
176 |     <ObservedIn>
177 |       <Sample>
178 |         <Origin>germline</Origin>
179 |         <Species TaxonomyId="9606">human</Species>
180 |         <AffectedStatus>yes</AffectedStatus>
181 |       </Sample>
182 |       <Method>
183 |         <MethodType>clinical testing</MethodType>
184 |       </Method>
185 |       <ObservedData>
186 |         <Attribute Type="Description">not provided</Attribute>
187 |       </ObservedData>
188 |     </ObservedIn>
189 |     <MeasureSet Type="Variant">
190 |       <Measure Type="Variation">
191 |         <AttributeSet>
192 |           <Attribute Type="HGVS">NM_000106.5:c.886C&gt;T</Attribute>
193 |         </AttributeSet>
194 |         <SequenceLocation Assembly="GRCh37" Chr="22" alternateAllele="A" referenceAllele="A" start="42523943" stop="42523943" variantLength="1"/>
195 |         <MeasureRelationship Type="variant in gene">
196 |           <Symbol>
197 |             <ElementValue Type="Preferred">CYP2D6</ElementValue>
198 |           </Symbol>
199 |         </MeasureRelationship>
200 |       </Measure>
201 |     </MeasureSet>
202 |     <TraitSet Type="Disease">
203 |       <Trait Type="Disease">
204 |         <Name>
205 |           <ElementValue Type="Preferred">not specified</ElementValue>
206 |         </Name>
207 |       </Trait>
208 |     </TraitSet>
209 |   </ClinVarAssertion>
210 | </ClinVarSet>
211 | 
212 | 


--------------------------------------------------------------------------------
/tests/files/340430.xml:
--------------------------------------------------------------------------------
  1 | <ClinVarSet ID="48788085">
  2 |   <RecordStatus>current</RecordStatus>
  3 |   <Title>NM_206965.2(FTCD):c.636+9G&gt;A AND GLUTAMATE FORMIMINOTRANSFERASE DEFICIENCY</Title>
  4 |   <ReferenceClinVarAssertion DateCreated="2016-12-05" DateLastUpdated="2019-12-17" ID="872805">
  5 |     <ClinVarAccession Acc="RCV000334054" Version="1" Type="RCV" DateUpdated="2019-12-17"/>
  6 |     <RecordStatus>current</RecordStatus>
  7 |     <ClinicalSignificance DateLastEvaluated="2016-06-14">
  8 |       <ReviewStatus>criteria provided, single submitter</ReviewStatus>
  9 |       <Description>Uncertain significance</Description>
 10 |     </ClinicalSignificance>
 11 |     <Assertion Type="variation to disease"/>
 12 |     <ObservedIn>
 13 |       <Sample>
 14 |         <Origin>germline</Origin>
 15 |         <Species TaxonomyId="9606">human</Species>
 16 |         <AffectedStatus>unknown</AffectedStatus>
 17 |       </Sample>
 18 |       <Method>
 19 |         <MethodType>clinical testing</MethodType>
 20 |       </Method>
 21 |       <ObservedData ID="49416197">
 22 |         <Attribute Type="Description">not provided</Attribute>
 23 |       </ObservedData>
 24 |     </ObservedIn>
 25 |     <MeasureSet Type="Variant" ID="340430" Acc="VCV000340430" Version="2">
 26 |       <Measure Type="single nucleotide variant" ID="351952">
 27 |         <Name>
 28 |           <ElementValue Type="Preferred">NM_206965.2(FTCD):c.636+9G&gt;A</ElementValue>
 29 |         </Name>
 30 |         <AttributeSet>
 31 |           <Attribute Accession="NM_001350598" Version="1" Change="c.15C&gt;T" Type="HGVS, coding, RefSeq">NM_001350598.1:c.15C&gt;T</Attribute>
 32 |         </AttributeSet>
 33 |         <AttributeSet>
 34 |           <Attribute Accession="NM_001320412" Version="2" Change="c.636+9G&gt;A" Type="HGVS, coding, RefSeq">NM_001320412.2:c.636+9G&gt;A</Attribute>
 35 |         </AttributeSet>
 36 |         <AttributeSet>
 37 |           <Attribute Accession="NM_006657" Version="3" Change="c.636+9G&gt;A" Type="HGVS, coding, RefSeq">NM_006657.3:c.636+9G&gt;A</Attribute>
 38 |         </AttributeSet>
 39 |         <AttributeSet>
 40 |           <Attribute Accession="NM_206965" Version="2" Change="c.636+9G&gt;A" Type="HGVS, coding, RefSeq">NM_206965.2:c.636+9G&gt;A</Attribute>
 41 |         </AttributeSet>
 42 |         <AttributeSet>
 43 |           <Attribute Accession="NG_016191" Version="1" Change="g.9019G&gt;A" Type="HGVS, genomic, RefSeqGene">NG_016191.1:g.9019G&gt;A</Attribute>
 44 |         </AttributeSet>
 45 |         <AttributeSet>
 46 |           <Attribute Accession="NC_000021" Version="9" Change="g.46151549C&gt;T" Type="HGVS, genomic, top level" integerValue="38">NC_000021.9:g.46151549C&gt;T</Attribute>
 47 |         </AttributeSet>
 48 |         <AttributeSet>
 49 |           <Attribute Accession="NC_000021" Version="8" Change="g.47571463C&gt;T" Type="HGVS, genomic, top level, previous" integerValue="37">NC_000021.8:g.47571463C&gt;T</Attribute>
 50 |         </AttributeSet>
 51 |         <AttributeSet>
 52 |           <Attribute Accession="NM_006657" Version="2" Change="c.636+9G&gt;A" Type="HGVS, previous">NM_006657.2:c.636+9G&gt;A</Attribute>
 53 |         </AttributeSet>
 54 |         <AttributeSet>
 55 |           <Attribute Accession="NP_001337527" Version="1" Change="p.Asn5=" Type="HGVS, protein, RefSeq">NP_001337527.1:p.Asn5=</Attribute>
 56 |         </AttributeSet>
 57 |         <AttributeSet>
 58 |           <Attribute Type="MolecularConsequence">intron variant</Attribute>
 59 |           <XRef ID="SO:0001627" DB="Sequence Ontology"/>
 60 |           <XRef ID="NM_001320412.2:c.636+9G&gt;A" DB="RefSeq"/>
 61 |         </AttributeSet>
 62 |         <AttributeSet>
 63 |           <Attribute Type="MolecularConsequence">intron variant</Attribute>
 64 |           <XRef ID="SO:0001627" DB="Sequence Ontology"/>
 65 |           <XRef ID="NM_006657.3:c.636+9G&gt;A" DB="RefSeq"/>
 66 |         </AttributeSet>
 67 |         <AttributeSet>
 68 |           <Attribute Type="MolecularConsequence">intron variant</Attribute>
 69 |           <XRef ID="SO:0001627" DB="Sequence Ontology"/>
 70 |           <XRef ID="NM_206965.2:c.636+9G&gt;A" DB="RefSeq"/>
 71 |         </AttributeSet>
 72 |         <AttributeSet>
 73 |           <Attribute Type="MolecularConsequence">synonymous variant</Attribute>
 74 |           <XRef ID="SO:0001819" DB="Sequence Ontology"/>
 75 |           <XRef ID="NM_001350598.1:c.15C&gt;T" DB="RefSeq"/>
 76 |         </AttributeSet>
 77 |         <AlleleFrequencyList>
 78 |           <AlleleFrequency Value="0.00062" Source="NHLBI Exome Sequencing Project (ESP) Exome Variant Server"/>
 79 |           <AlleleFrequency Value="0.00025" Source="Exome Aggregation Consortium (ExAC)"/>
 80 |           <AlleleFrequency Value="0.00064" Source="The Genome Aggregation Database (gnomAD)"/>
 81 |           <AlleleFrequency Value="0.00037" Source="Trans-Omics for Precision Medicine (TOPMed)"/>
 82 |         </AlleleFrequencyList>
 83 |         <CytogeneticLocation>21q22.3</CytogeneticLocation>
 84 |         <SequenceLocation Assembly="GRCh38" AssemblyAccessionVersion="GCF_000001405.38" AssemblyStatus="current" Chr="21" Accession="NC_000021.9" start="46151549" stop="46151549" display_start="46151549" display_stop="46151549" variantLength="1" positionVCF="46151549" referenceAlleleVCF="C" alternateAlleleVCF="T"/>
 85 |         <SequenceLocation Assembly="GRCh37" AssemblyAccessionVersion="GCF_000001405.25" AssemblyStatus="previous" Chr="21" Accession="NC_000021.8" start="47571463" stop="47571463" display_start="47571463" display_stop="47571463" variantLength="1" positionVCF="47571463" referenceAlleleVCF="C" alternateAlleleVCF="T"/>
 86 |         <MeasureRelationship Type="within multiple genes by overlap">
 87 |           <Name>
 88 |             <ElementValue Type="Preferred">FTCD antisense RNA 1</ElementValue>
 89 |           </Name>
 90 |           <Symbol>
 91 |             <ElementValue Type="Preferred">FTCD-AS1</ElementValue>
 92 |           </Symbol>
 93 |           <SequenceLocation Assembly="GRCh38" AssemblyAccessionVersion="GCF_000001405.38" AssemblyStatus="current" Chr="21" Accession="NC_000021.9" start="46151375" stop="46152647" display_start="46151375" display_stop="46152647" Strand="+"/>
 94 |           <SequenceLocation Assembly="GRCh38" AssemblyAccessionVersion="GCF_000001405.38" AssemblyStatus="current" Chr="21" Accession="NT_187626.1" start="10434" stop="11706" display_start="10434" display_stop="11706" Strand="+"/>
 95 |           <XRef ID="100861507" DB="Gene"/>
 96 |           <XRef ID="HGNC:40243" DB="HGNC"/>
 97 |         </MeasureRelationship>
 98 |         <MeasureRelationship Type="within multiple genes by overlap">
 99 |           <Name>
100 |             <ElementValue Type="Preferred">formimidoyltransferase cyclodeaminase</ElementValue>
101 |           </Name>
102 |           <Symbol>
103 |             <ElementValue Type="Preferred">FTCD</ElementValue>
104 |           </Symbol>
105 |           <SequenceLocation Assembly="GRCh38" AssemblyAccessionVersion="GCF_000001405.38" AssemblyStatus="current" Chr="21" Accession="NC_000021.9" start="46135981" stop="46156482" display_start="46135981" display_stop="46156482" Strand="-"/>
106 |           <SequenceLocation Assembly="GRCh38" AssemblyAccessionVersion="GCF_000001405.38" AssemblyStatus="current" Chr="21" Accession="NT_187626.1" start="4876" stop="14638" display_start="4876" display_stop="14638" Strand="-"/>
107 |           <SequenceLocation Assembly="GRCh37" AssemblyAccessionVersion="GCF_000001405.25" AssemblyStatus="previous" Chr="21" Accession="NC_000021.8" start="47556175" stop="47575480" display_start="47556175" display_stop="47575480" variantLength="19306" Strand="-"/>
108 |           <XRef ID="10841" DB="Gene"/>
109 |           <XRef Type="MIM" ID="606806" DB="OMIM"/>
110 |           <XRef ID="HGNC:3974" DB="HGNC"/>
111 |         </MeasureRelationship>
112 |         <XRef ID="750092" DB="Illumina Clinical Services Laboratory,Illumina"/>
113 |         <XRef Type="rs" ID="370596374" DB="dbSNP"/>
114 |       </Measure>
115 |       <Name>
116 |         <ElementValue Type="Preferred">NM_206965.2(FTCD):c.636+9G&gt;A</ElementValue>
117 |       </Name>
118 |       <XRef ID="CA10073801" DB="ClinGen"/>
119 |     </MeasureSet>
120 |     <TraitSet Type="Disease" ID="1099">
121 |       <Trait ID="1296" Type="Disease">
122 |         <Name>
123 |           <ElementValue Type="Preferred">GLUTAMATE FORMIMINOTRANSFERASE DEFICIENCY</ElementValue>
124 |           <XRef ID="Glutamate+Formiminotransferase+Deficiency/3104" DB="Genetic Alliance"/>
125 |           <XRef Type="MIM" ID="229100" DB="OMIM"/>
126 |           <XRef Type="Allelic variant" ID="606806.0001" DB="OMIM"/>
127 |           <XRef Type="Allelic variant" ID="606806.0002" DB="OMIM"/>
128 |           <XRef Type="Allelic variant" ID="606806.0003" DB="OMIM"/>
129 |           <XRef ID="9279" DB="Office of Rare Diseases"/>
130 |           <XRef ID="59761008" DB="SNOMED CT"/>
131 |         </Name>
132 |         <AttributeSet>
133 |           <Attribute Type="public definition">Glutamate formiminotransferase deficiency is an autosomal recessive disorder and the second most common inborn error of folate metabolism. Features of a severe phenotype include elevated levels of formiminoglutamate (FIGLU) in the urine in response to histidine administration, megaloblastic anemia, and mental retardation. Features of a mild phenotype include high urinary excretion of FIGLU in the absence of histidine administration, mild developmental delay, and no hematologic abnormalities (summary by Hilton et al., 2003).</Attribute>
134 |           <XRef Type="MIM" ID="229100" DB="OMIM"/>
135 |         </AttributeSet>
136 |         <XRef ID="C0268609" DB="MedGen"/>
137 |         <XRef ID="51208" DB="Orphanet"/>
138 |         <XRef Type="MIM" ID="229100" DB="OMIM"/>
139 |       </Trait>
140 |     </TraitSet>
141 |   </ReferenceClinVarAssertion>
142 |   <ClinVarAssertion ID="750422" SubmissionName="ICSL_2016Q4">
143 |     <ClinVarSubmissionID localKey="750092|Formiminotransferase Deficiency" submittedAssembly="GRCh37" submitter="Illumina Clinical Services Laboratory,Illumina" submitterDate="2016-10-18"/>
144 |     <ClinVarAccession Acc="SCV000436901" Version="2" Type="SCV" OrgID="504895" OrganizationCategory="laboratory" OrgType="primary" DateUpdated="2019-03-31"/>
145 |     <RecordStatus>current</RecordStatus>
146 |     <ClinicalSignificance DateLastEvaluated="2016-06-14">
147 |       <ReviewStatus>criteria provided, single submitter</ReviewStatus>
148 |       <Description>Uncertain significance</Description>
149 |     </ClinicalSignificance>
150 |     <Assertion Type="variation to disease"/>
151 |     <ExternalID DB="Illumina Clinical Services Laboratory" ID="750092"/>
152 |     <AttributeSet>
153 |       <Attribute Type="AssertionMethod">ICSL Variant Classification 20161018</Attribute>
154 |       <Citation Type="general">
155 |         <URL>https://submit.ncbi.nlm.nih.gov/ft/byid/4jQgNGYk/ICSL_Variant_Classification_20161018.pdf</URL>
156 |         <CitationText>ICSL_Variant_Classification_20161018.pdf</CitationText>
157 |       </Citation>
158 |     </AttributeSet>
159 |     <ObservedIn>
160 |       <Sample>
161 |         <Origin>germline</Origin>
162 |         <Species TaxonomyId="9606">human</Species>
163 |         <AffectedStatus>unknown</AffectedStatus>
164 |       </Sample>
165 |       <Method>
166 |         <MethodType>clinical testing</MethodType>
167 |       </Method>
168 |       <ObservedData>
169 |         <Attribute Type="Description">not provided</Attribute>
170 |       </ObservedData>
171 |     </ObservedIn>
172 |     <MeasureSet Type="Variant">
173 |       <Measure Type="Variation">
174 |         <AttributeSet>
175 |           <Attribute Type="HGVS">NM_006657.2:c.636+9G&gt;A</Attribute>
176 |         </AttributeSet>
177 |         <MeasureRelationship Type="variant in gene">
178 |           <Symbol>
179 |             <ElementValue Type="Preferred">FTCD</ElementValue>
180 |           </Symbol>
181 |         </MeasureRelationship>
182 |       </Measure>
183 |     </MeasureSet>
184 |     <TraitSet Type="Disease">
185 |       <Trait Type="Disease">
186 |         <Name>
187 |           <ElementValue Type="Preferred">Formiminotransferase Deficiency</ElementValue>
188 |         </Name>
189 |       </Trait>
190 |     </TraitSet>
191 |   </ClinVarAssertion>
192 | </ClinVarSet>
193 | 
194 | <ClinVarSet ID="48926956">
195 |   <RecordStatus>current</RecordStatus>
196 |   <Title>NM_206965.2(FTCD):c.636+9G&gt;A AND not provided</Title>
197 |   <ReferenceClinVarAssertion DateCreated="2019-12-15" DateLastUpdated="2019-12-17" ID="2116811">
198 |     <ClinVarAccession Acc="RCV000876765" Version="1" Type="RCV" DateUpdated="2019-12-17"/>
199 |     <RecordStatus>current</RecordStatus>
200 |     <ClinicalSignificance DateLastEvaluated="2018-02-12">
201 |       <ReviewStatus>criteria provided, single submitter</ReviewStatus>
202 |       <Description>Likely benign</Description>
203 |     </ClinicalSignificance>
204 |     <Assertion Type="variation to disease"/>
205 |     <ObservedIn>
206 |       <Sample>
207 |         <Origin>germline</Origin>
208 |         <Species TaxonomyId="9606">human</Species>
209 |         <AffectedStatus>unknown</AffectedStatus>
210 |       </Sample>
211 |       <Method>
212 |         <MethodType>clinical testing</MethodType>
213 |       </Method>
214 |       <ObservedData ID="49815428">
215 |         <Attribute Type="Description">not provided</Attribute>
216 |       </ObservedData>
217 |     </ObservedIn>
218 |     <MeasureSet Type="Variant" ID="340430" Acc="VCV000340430" Version="2">
219 |       <Measure Type="single nucleotide variant" ID="351952">
220 |         <Name>
221 |           <ElementValue Type="Preferred">NM_206965.2(FTCD):c.636+9G&gt;A</ElementValue>
222 |         </Name>
223 |         <AttributeSet>
224 |           <Attribute Accession="NM_001350598" Version="1" Change="c.15C&gt;T" Type="HGVS, coding, RefSeq">NM_001350598.1:c.15C&gt;T</Attribute>
225 |         </AttributeSet>
226 |         <AttributeSet>
227 |           <Attribute Accession="NM_001320412" Version="2" Change="c.636+9G&gt;A" Type="HGVS, coding, RefSeq">NM_001320412.2:c.636+9G&gt;A</Attribute>
228 |         </AttributeSet>
229 |         <AttributeSet>
230 |           <Attribute Accession="NM_006657" Version="3" Change="c.636+9G&gt;A" Type="HGVS, coding, RefSeq">NM_006657.3:c.636+9G&gt;A</Attribute>
231 |         </AttributeSet>
232 |         <AttributeSet>
233 |           <Attribute Accession="NM_206965" Version="2" Change="c.636+9G&gt;A" Type="HGVS, coding, RefSeq">NM_206965.2:c.636+9G&gt;A</Attribute>
234 |         </AttributeSet>
235 |         <AttributeSet>
236 |           <Attribute Accession="NG_016191" Version="1" Change="g.9019G&gt;A" Type="HGVS, genomic, RefSeqGene">NG_016191.1:g.9019G&gt;A</Attribute>
237 |         </AttributeSet>
238 |         <AttributeSet>
239 |           <Attribute Accession="NC_000021" Version="9" Change="g.46151549C&gt;T" Type="HGVS, genomic, top level" integerValue="38">NC_000021.9:g.46151549C&gt;T</Attribute>
240 |         </AttributeSet>
241 |         <AttributeSet>
242 |           <Attribute Accession="NC_000021" Version="8" Change="g.47571463C&gt;T" Type="HGVS, genomic, top level, previous" integerValue="37">NC_000021.8:g.47571463C&gt;T</Attribute>
243 |         </AttributeSet>
244 |         <AttributeSet>
245 |           <Attribute Accession="NM_006657" Version="2" Change="c.636+9G&gt;A" Type="HGVS, previous">NM_006657.2:c.636+9G&gt;A</Attribute>
246 |         </AttributeSet>
247 |         <AttributeSet>
248 |           <Attribute Accession="NP_001337527" Version="1" Change="p.Asn5=" Type="HGVS, protein, RefSeq">NP_001337527.1:p.Asn5=</Attribute>
249 |         </AttributeSet>
250 |         <AttributeSet>
251 |           <Attribute Type="MolecularConsequence">intron variant</Attribute>
252 |           <XRef ID="SO:0001627" DB="Sequence Ontology"/>
253 |           <XRef ID="NM_001320412.2:c.636+9G&gt;A" DB="RefSeq"/>
254 |         </AttributeSet>
255 |         <AttributeSet>
256 |           <Attribute Type="MolecularConsequence">intron variant</Attribute>
257 |           <XRef ID="SO:0001627" DB="Sequence Ontology"/>
258 |           <XRef ID="NM_006657.3:c.636+9G&gt;A" DB="RefSeq"/>
259 |         </AttributeSet>
260 |         <AttributeSet>
261 |           <Attribute Type="MolecularConsequence">intron variant</Attribute>
262 |           <XRef ID="SO:0001627" DB="Sequence Ontology"/>
263 |           <XRef ID="NM_206965.2:c.636+9G&gt;A" DB="RefSeq"/>
264 |         </AttributeSet>
265 |         <AttributeSet>
266 |           <Attribute Type="MolecularConsequence">synonymous variant</Attribute>
267 |           <XRef ID="SO:0001819" DB="Sequence Ontology"/>
268 |           <XRef ID="NM_001350598.1:c.15C&gt;T" DB="RefSeq"/>
269 |         </AttributeSet>
270 |         <AlleleFrequencyList>
271 |           <AlleleFrequency Value="0.00062" Source="NHLBI Exome Sequencing Project (ESP) Exome Variant Server"/>
272 |           <AlleleFrequency Value="0.00025" Source="Exome Aggregation Consortium (ExAC)"/>
273 |           <AlleleFrequency Value="0.00064" Source="The Genome Aggregation Database (gnomAD)"/>
274 |           <AlleleFrequency Value="0.00037" Source="Trans-Omics for Precision Medicine (TOPMed)"/>
275 |         </AlleleFrequencyList>
276 |         <CytogeneticLocation>21q22.3</CytogeneticLocation>
277 |         <SequenceLocation Assembly="GRCh38" AssemblyAccessionVersion="GCF_000001405.38" AssemblyStatus="current" Chr="21" Accession="NC_000021.9" start="46151549" stop="46151549" display_start="46151549" display_stop="46151549" variantLength="1" positionVCF="46151549" referenceAlleleVCF="C" alternateAlleleVCF="T"/>
278 |         <SequenceLocation Assembly="GRCh37" AssemblyAccessionVersion="GCF_000001405.25" AssemblyStatus="previous" Chr="21" Accession="NC_000021.8" start="47571463" stop="47571463" display_start="47571463" display_stop="47571463" variantLength="1" positionVCF="47571463" referenceAlleleVCF="C" alternateAlleleVCF="T"/>
279 |         <MeasureRelationship Type="within multiple genes by overlap">
280 |           <Name>
281 |             <ElementValue Type="Preferred">FTCD antisense RNA 1</ElementValue>
282 |           </Name>
283 |           <Symbol>
284 |             <ElementValue Type="Preferred">FTCD-AS1</ElementValue>
285 |           </Symbol>
286 |           <SequenceLocation Assembly="GRCh38" AssemblyAccessionVersion="GCF_000001405.38" AssemblyStatus="current" Chr="21" Accession="NC_000021.9" start="46151375" stop="46152647" display_start="46151375" display_stop="46152647" Strand="+"/>
287 |           <SequenceLocation Assembly="GRCh38" AssemblyAccessionVersion="GCF_000001405.38" AssemblyStatus="current" Chr="21" Accession="NT_187626.1" start="10434" stop="11706" display_start="10434" display_stop="11706" Strand="+"/>
288 |           <XRef ID="100861507" DB="Gene"/>
289 |           <XRef ID="HGNC:40243" DB="HGNC"/>
290 |         </MeasureRelationship>
291 |         <MeasureRelationship Type="within multiple genes by overlap">
292 |           <Name>
293 |             <ElementValue Type="Preferred">formimidoyltransferase cyclodeaminase</ElementValue>
294 |           </Name>
295 |           <Symbol>
296 |             <ElementValue Type="Preferred">FTCD</ElementValue>
297 |           </Symbol>
298 |           <SequenceLocation Assembly="GRCh38" AssemblyAccessionVersion="GCF_000001405.38" AssemblyStatus="current" Chr="21" Accession="NC_000021.9" start="46135981" stop="46156482" display_start="46135981" display_stop="46156482" Strand="-"/>
299 |           <SequenceLocation Assembly="GRCh38" AssemblyAccessionVersion="GCF_000001405.38" AssemblyStatus="current" Chr="21" Accession="NT_187626.1" start="4876" stop="14638" display_start="4876" display_stop="14638" Strand="-"/>
300 |           <SequenceLocation Assembly="GRCh37" AssemblyAccessionVersion="GCF_000001405.25" AssemblyStatus="previous" Chr="21" Accession="NC_000021.8" start="47556175" stop="47575480" display_start="47556175" display_stop="47575480" variantLength="19306" Strand="-"/>
301 |           <XRef ID="10841" DB="Gene"/>
302 |           <XRef Type="MIM" ID="606806" DB="OMIM"/>
303 |           <XRef ID="HGNC:3974" DB="HGNC"/>
304 |         </MeasureRelationship>
305 |         <XRef ID="750092" DB="Illumina Clinical Services Laboratory,Illumina"/>
306 |         <XRef Type="rs" ID="370596374" DB="dbSNP"/>
307 |       </Measure>
308 |       <Name>
309 |         <ElementValue Type="Preferred">NM_206965.2(FTCD):c.636+9G&gt;A</ElementValue>
310 |       </Name>
311 |       <XRef ID="CA10073801" DB="ClinGen"/>
312 |     </MeasureSet>
313 |     <TraitSet Type="Disease" ID="9460">
314 |       <Trait ID="17556" Type="Disease">
315 |         <Name>
316 |           <ElementValue Type="Preferred">not provided</ElementValue>
317 |           <XRef ID="13DG0619" DB="Developmental Genetics Unit,King Faisal Specialist Hospital &amp; Research Centre"/>
318 |         </Name>
319 |         <AttributeSet>
320 |           <Attribute Type="public definition">The term 'not provided' is registered in MedGen to support identification of submissions to ClinVar for which no condition was named when assessing the variant. 'not provided' differs from 'not specified', which is used when a variant is asserted to be benign, likely benign, or of uncertain significance for conditions that have not been specified.</Attribute>
321 |         </AttributeSet>
322 |         <XRef ID="CN517202" DB="MedGen"/>
323 |       </Trait>
324 |     </TraitSet>
325 |   </ReferenceClinVarAssertion>
326 |   <ClinVarAssertion ID="1987113" SubmissionName="SUB5321749">
327 |     <ClinVarSubmissionID localKey="1299127|MedGen:CN517202" submittedAssembly="GRCh37" submitter="Invitae" submitterDate="2019-03-14"/>
328 |     <ClinVarAccession Acc="SCV001019381" Version="1" Type="SCV" OrgID="500031" OrganizationCategory="laboratory" OrgType="primary" DateUpdated="2019-12-17"/>
329 |     <RecordStatus>current</RecordStatus>
330 |     <ClinicalSignificance DateLastEvaluated="2018-02-12">
331 |       <ReviewStatus>criteria provided, single submitter</ReviewStatus>
332 |       <Description>Likely benign</Description>
333 |     </ClinicalSignificance>
334 |     <Assertion Type="variation to disease"/>
335 |     <ExternalID DB="Invitae" ID="1299127"/>
336 |     <AttributeSet>
337 |       <Attribute Type="AssertionMethod">Nykamp K et al. (Genet Med 2017)</Attribute>
338 |       <Citation>
339 |         <ID Source="PubMed">28492532</ID>
340 |       </Citation>
341 |     </AttributeSet>
342 |     <ObservedIn>
343 |       <Sample>
344 |         <Origin>germline</Origin>
345 |         <Species TaxonomyId="9606">human</Species>
346 |         <AffectedStatus>unknown</AffectedStatus>
347 |       </Sample>
348 |       <Method>
349 |         <MethodType>clinical testing</MethodType>
350 |       </Method>
351 |       <ObservedData>
352 |         <Attribute Type="Description">not provided</Attribute>
353 |       </ObservedData>
354 |     </ObservedIn>
355 |     <MeasureSet Type="Variant">
356 |       <Measure Type="Variation">
357 |         <AttributeSet>
358 |           <Attribute Type="HGVS">NM_006657.2:c.636+9G&gt;A</Attribute>
359 |         </AttributeSet>
360 |         <MeasureRelationship Type="variant in gene">
361 |           <Symbol>
362 |             <ElementValue Type="Preferred">FTCD</ElementValue>
363 |           </Symbol>
364 |         </MeasureRelationship>
365 |       </Measure>
366 |     </MeasureSet>
367 |     <TraitSet Type="Disease">
368 |       <Trait Type="Disease">
369 |         <Name>
370 |           <ElementValue Type="Preferred">not provided</ElementValue>
371 |         </Name>
372 |         <XRef DB="MedGen" ID="CN517202" Type="CUI"/>
373 |       </Trait>
374 |     </TraitSet>
375 |   </ClinVarAssertion>
376 | </ClinVarSet>
377 | 
378 | 


--------------------------------------------------------------------------------
/tests/files/618897_2019-05.xml:
--------------------------------------------------------------------------------
  1 | <ClinVarSet ID="40235997">
  2 |   <RecordStatus>current</RecordStatus>
  3 |   <Title>NC_000007.14:g.117559618_117559619delGA AND Cystic fibrosis</Title>
  4 |   <ReferenceClinVarAssertion DateCreated="2019-02-17" DateLastUpdated="2019-03-30" ID="1735217">
  5 |     <ClinVarAccession Acc="RCV000757792" Version="1" Type="RCV" DateUpdated="2019-03-31"/>
  6 |     <RecordStatus>current</RecordStatus>
  7 |     <ClinicalSignificance DateLastEvaluated="2018-11-05">
  8 |       <ReviewStatus>criteria provided, single submitter</ReviewStatus>
  9 |       <Description>Likely pathogenic</Description>
 10 |     </ClinicalSignificance>
 11 |     <Assertion Type="variation to disease"/>
 12 |     <ObservedIn>
 13 |       <Sample>
 14 |         <Origin>unknown</Origin>
 15 |         <Species TaxonomyId="9606">human</Species>
 16 |         <AffectedStatus>yes</AffectedStatus>
 17 |       </Sample>
 18 |       <Method>
 19 |         <MethodType>clinical testing</MethodType>
 20 |       </Method>
 21 |       <ObservedData ID="38643589">
 22 |         <Attribute Type="Description">not provided</Attribute>
 23 |       </ObservedData>
 24 |     </ObservedIn>
 25 |     <MeasureSet Type="Variant" ID="618897" Acc="VCV000618897" Version="1">
 26 |       <Measure Type="Deletion" ID="610325">
 27 |         <Name>
 28 |           <ElementValue Type="Preferred">NC_000007.14:g.117559618_117559619delGA</ElementValue>
 29 |         </Name>
 30 |         <AttributeSet>
 31 |           <Attribute Change="g.117559618_117559619delGA" Accession="NC_000007" Version="14" Type="HGVS, genomic, top level" integerValue="38">NC_000007.14:g.117559618_117559619delGA</Attribute>
 32 |         </AttributeSet>
 33 |         <AttributeSet>
 34 |           <Attribute Change="g.117199671_117199672delAG" Accession="NC_000007" Version="13" Type="HGVS, genomic, top level, previous" integerValue="37">NC_000007.13:g.117199671_117199672delAG</Attribute>
 35 |         </AttributeSet>
 36 |         <CytogeneticLocation>7q31.2</CytogeneticLocation>
 37 |         <SequenceLocation Assembly="GRCh38" AssemblyAccessionVersion="GCF_000001405.38" AssemblyStatus="current" Chr="7" Accession="NC_000007.14" start="117559618" stop="117559619" display_start="117559618" display_stop="117559619" referenceAllele="GA" alternateAllele="-" positionVCF="117559616" referenceAlleleVCF="TAG" alternateAlleleVCF="T"/>
 38 |         <SequenceLocation Assembly="GRCh37" AssemblyAccessionVersion="GCF_000001405.25" AssemblyStatus="previous" Chr="7" Accession="NC_000007.13" start="117199671" stop="117199672" display_start="117199671" display_stop="117199672" referenceAllele="AG" alternateAllele="-" positionVCF="117199670" referenceAlleleVCF="TAG" alternateAlleleVCF="T"/>
 39 |         <MeasureRelationship Type="within multiple genes by overlap">
 40 |           <Name>
 41 |             <ElementValue Type="Preferred">CFTR antisense RNA 1</ElementValue>
 42 |           </Name>
 43 |           <Symbol>
 44 |             <ElementValue Type="Preferred">CFTR-AS1</ElementValue>
 45 |           </Symbol>
 46 |           <SequenceLocation Assembly="GRCh38" AssemblyAccessionVersion="GCF_000001405.38" AssemblyStatus="current" Chr="7" Accession="NC_000007.14" start="117542521" stop="117564676" display_start="117542521" display_stop="117564676" Strand="-"/>
 47 |           <XRef ID="111082987" DB="Gene"/>
 48 |           <XRef ID="HGNC:40144" DB="HGNC"/>
 49 |         </MeasureRelationship>
 50 |         <MeasureRelationship Type="within multiple genes by overlap">
 51 |           <Name>
 52 |             <ElementValue Type="Preferred">cystic fibrosis transmembrane conductance regulator</ElementValue>
 53 |           </Name>
 54 |           <Symbol>
 55 |             <ElementValue Type="Preferred">CFTR</ElementValue>
 56 |           </Symbol>
 57 |           <SequenceLocation Assembly="GRCh38" AssemblyAccessionVersion="GCF_000001405.38" AssemblyStatus="current" Chr="7" Accession="NC_000007.14" start="117479963" stop="117668665" display_start="117479963" display_stop="117668665" Strand="+"/>
 58 |           <SequenceLocation Assembly="GRCh37" AssemblyAccessionVersion="GCF_000001405.25" AssemblyStatus="previous" Chr="7" Accession="NC_000007.13" start="117120016" stop="117308718" display_start="117120016" display_stop="117308718" variantLength="188703" Strand="+"/>
 59 |           <XRef ID="1080" DB="Gene"/>
 60 |           <XRef Type="MIM" ID="602421" DB="OMIM"/>
 61 |           <XRef ID="HGNC:1884" DB="HGNC"/>
 62 |         </MeasureRelationship>
 63 |       </Measure>
 64 |       <Name>
 65 |         <ElementValue Type="Preferred">NC_000007.14:g.117559618_117559619delGA</ElementValue>
 66 |       </Name>
 67 |     </MeasureSet>
 68 |     <TraitSet Type="Disease" ID="1961">
 69 |       <Trait ID="1695" Type="Disease">
 70 |         <Name>
 71 |           <ElementValue Type="Preferred">Cystic fibrosis</ElementValue>
 72 |           <XRef ID="Cystic+Fibrosis/2071" DB="Genetic Alliance"/>
 73 |           <XRef Type="MIM" ID="219700" DB="OMIM"/>
 74 |           <XRef ID="6233" DB="Office of Rare Diseases"/>
 75 |           <XRef ID="190905008" DB="SNOMED CT"/>
 76 |         </Name>
 77 |         <Symbol>
 78 |           <ElementValue Type="Preferred">CF</ElementValue>
 79 |           <XRef Type="MIM" ID="219700" DB="OMIM"/>
 80 |           <XRef ID="6233" DB="Office of Rare Diseases"/>
 81 |         </Symbol>
 82 |         <AttributeSet>
 83 |           <Attribute Type="public definition">Cystic fibrosis (CF) is a multisystem disease affecting epithelia of the respiratory tract, exocrine pancreas, intestine, hepatobiliary system, and exocrine sweat glands. Morbidities include progressive obstructive lung disease with bronchiectasis, frequent hospitalizations for pulmonary disease, pancreatic insufficiency and malnutrition, recurrent sinusitis and bronchitis, and male infertility. Pulmonary disease is the major cause of morbidity and mortality in CF. Meconium ileus occurs at birth in 15%-20% of newborns with CF. More than 95% of males with CF are infertile. Congenital absence of the vas deferens (CAVD) is generally identified during evaluation of infertility or as an incidental finding at the time of a surgical procedure. Hypoplasia or aplasia of the vas deferens and seminal vesicles may occur either bilaterally or unilaterally. Testicular development and function and spermatogenesis are usually normal.</Attribute>
 84 |           <XRef ID="NBK1250" DB="GeneReviews"/>
 85 |         </AttributeSet>
 86 |         <AttributeSet>
 87 |           <Attribute Type="disease mechanism" integerValue="273">loss of function</Attribute>
 88 |           <XRef ID="GTR000004176" DB="Genetic Testing Registry (GTR)"/>
 89 |           <XRef ID="GTR000005248" DB="Genetic Testing Registry (GTR)"/>
 90 |           <XRef ID="GTR000025280" DB="Genetic Testing Registry (GTR)"/>
 91 |           <XRef ID="GTR000028916" DB="Genetic Testing Registry (GTR)"/>
 92 |           <XRef ID="GTR000074114" DB="Genetic Testing Registry (GTR)"/>
 93 |           <XRef ID="GTR000194834" DB="Genetic Testing Registry (GTR)"/>
 94 |           <XRef ID="GTR000209428" DB="Genetic Testing Registry (GTR)"/>
 95 |           <XRef ID="GTR000226639" DB="Genetic Testing Registry (GTR)"/>
 96 |           <XRef ID="GTR000277984" DB="Genetic Testing Registry (GTR)"/>
 97 |           <XRef ID="GTR000288561" DB="Genetic Testing Registry (GTR)"/>
 98 |           <XRef ID="GTR000320929" DB="Genetic Testing Registry (GTR)"/>
 99 |           <XRef ID="GTR000324499" DB="Genetic Testing Registry (GTR)"/>
100 |           <XRef ID="GTR000328569" DB="Genetic Testing Registry (GTR)"/>
101 |           <XRef ID="GTR000330969" DB="Genetic Testing Registry (GTR)"/>
102 |           <XRef ID="GTR000332363" DB="Genetic Testing Registry (GTR)"/>
103 |           <XRef ID="GTR000501120" DB="Genetic Testing Registry (GTR)"/>
104 |           <XRef ID="GTR000501211" DB="Genetic Testing Registry (GTR)"/>
105 |           <XRef ID="GTR000501918" DB="Genetic Testing Registry (GTR)"/>
106 |           <XRef ID="GTR000501920" DB="Genetic Testing Registry (GTR)"/>
107 |           <XRef ID="GTR000501921" DB="Genetic Testing Registry (GTR)"/>
108 |           <XRef ID="GTR000502992" DB="Genetic Testing Registry (GTR)"/>
109 |           <XRef ID="GTR000505699" DB="Genetic Testing Registry (GTR)"/>
110 |           <XRef ID="GTR000506511" DB="Genetic Testing Registry (GTR)"/>
111 |           <XRef ID="GTR000507003" DB="Genetic Testing Registry (GTR)"/>
112 |           <XRef ID="GTR000507006" DB="Genetic Testing Registry (GTR)"/>
113 |           <XRef ID="GTR000507010" DB="Genetic Testing Registry (GTR)"/>
114 |           <XRef ID="GTR000507864" DB="Genetic Testing Registry (GTR)"/>
115 |           <XRef ID="GTR000507950" DB="Genetic Testing Registry (GTR)"/>
116 |           <XRef ID="GTR000508782" DB="Genetic Testing Registry (GTR)"/>
117 |           <XRef ID="GTR000508810" DB="Genetic Testing Registry (GTR)"/>
118 |           <XRef ID="GTR000508811" DB="Genetic Testing Registry (GTR)"/>
119 |           <XRef ID="GTR000508812" DB="Genetic Testing Registry (GTR)"/>
120 |           <XRef ID="GTR000508889" DB="Genetic Testing Registry (GTR)"/>
121 |           <XRef ID="GTR000508890" DB="Genetic Testing Registry (GTR)"/>
122 |           <XRef ID="GTR000508893" DB="Genetic Testing Registry (GTR)"/>
123 |           <XRef ID="GTR000508894" DB="Genetic Testing Registry (GTR)"/>
124 |           <XRef ID="GTR000514611" DB="Genetic Testing Registry (GTR)"/>
125 |           <XRef ID="GTR000515757" DB="Genetic Testing Registry (GTR)"/>
126 |           <XRef ID="GTR000515758" DB="Genetic Testing Registry (GTR)"/>
127 |           <XRef ID="GTR000520059" DB="Genetic Testing Registry (GTR)"/>
128 |           <XRef ID="GTR000521501" DB="Genetic Testing Registry (GTR)"/>
129 |           <XRef ID="GTR000521905" DB="Genetic Testing Registry (GTR)"/>
130 |           <XRef ID="GTR000522527" DB="Genetic Testing Registry (GTR)"/>
131 |           <XRef ID="GTR000528606" DB="Genetic Testing Registry (GTR)"/>
132 |           <XRef ID="GTR000529460" DB="Genetic Testing Registry (GTR)"/>
133 |           <XRef ID="GTR000530118" DB="Genetic Testing Registry (GTR)"/>
134 |           <XRef ID="GTR000553017" DB="Genetic Testing Registry (GTR)"/>
135 |           <XRef ID="GTR000556535" DB="Genetic Testing Registry (GTR)"/>
136 |           <XRef ID="GTR000556536" DB="Genetic Testing Registry (GTR)"/>
137 |           <XRef ID="GTR000558875" DB="Genetic Testing Registry (GTR)"/>
138 |           <XRef ID="GTR000558928" DB="Genetic Testing Registry (GTR)"/>
139 |         </AttributeSet>
140 |         <AttributeSet>
141 |           <Attribute Type="disease mechanism">More than 1,000 CFTR variants have been reported.  Most common pathogenic variant is p.Phe508del.</Attribute>
142 |         </AttributeSet>
143 |         <Citation Type="practice guideline" Abbrev="ACMG/ACOG, 2001">
144 |           <ID Source="PubMed">11280952</ID>
145 |         </Citation>
146 |         <Citation Type="practice guideline" Abbrev="ACMG, 2004">
147 |           <ID Source="pmc">3110945</ID>
148 |         </Citation>
149 |         <Citation Type="review" Abbrev="GeneReviews">
150 |           <ID Source="PubMed">20301428</ID>
151 |           <ID Source="BookShelf">NBK1250</ID>
152 |         </Citation>
153 |         <Citation Type="practice guideline" Abbrev="ACOG, 2009">
154 |           <ID Source="PubMed">19888064</ID>
155 |         </Citation>
156 |         <Citation Type="practice guideline" Abbrev="NSGC, 2005">
157 |           <ID Source="PubMed">15789152</ID>
158 |         </Citation>
159 |         <Citation Type="practice guideline" Abbrev="ACMG, 2008">
160 |           <ID Source="pmc">3110977</ID>
161 |         </Citation>
162 |         <Citation Type="practice guideline" Abbrev="CPGPT/CF FPT Committees, 2010">
163 |           <ID Source="PubMed">20675678</ID>
164 |         </Citation>
165 |         <Citation Type="practice guideline" Abbrev="ECFS, 2010">
166 |           <ID Source="PubMed">20605539</ID>
167 |         </Citation>
168 |         <Citation Type="practice guideline" Abbrev="CFF, 2009">
169 |           <ID Source="PubMed">19914445</ID>
170 |         </Citation>
171 |         <Citation Type="practice guideline" Abbrev="CFF, 2009">
172 |           <ID Source="PubMed">19914443</ID>
173 |         </Citation>
174 |         <Citation Type="practice guideline" Abbrev="ACMG ACT Sheets, 2010">
175 |           <ID Source="PubMed">21938795</ID>
176 |         </Citation>
177 |         <Citation Type="practice guideline" Abbrev="DailyMed Drug Label, 2012">
178 |           <URL>https://dailymed.nlm.nih.gov/dailymed/lookup.cfm?setid=0ab0c9f8-3eee-4e0f-9f3f-c1e16aaffe25</URL>
179 |           <CitationText>DailyMed Drug Label, KALYDECO, 2012</CitationText>
180 |         </Citation>
181 |         <Citation Type="practice guideline" Abbrev="ACMG Lab QA, 2002">
182 |           <ID Source="PubMed">12394352</ID>
183 |         </Citation>
184 |         <Citation Type="practice guideline" Abbrev="CDC, 2012">
185 |           <ID Source="PubMed">22475884</ID>
186 |         </Citation>
187 |         <Citation Type="Suggested Reading" Abbrev="Accurso et al., 2010">
188 |           <ID Source="pmc">3148255</ID>
189 |         </Citation>
190 |         <Citation Type="practice guideline" Abbrev="ACOG, 2011">
191 |           <ID Source="PubMed">21422883</ID>
192 |         </Citation>
193 |         <Citation Type="practice guideline" Abbrev="CPIC, 2014">
194 |           <ID Source="pmc">4026598</ID>
195 |         </Citation>
196 |         <Citation Type="practice guideline" Abbrev="NSGC, 2014">
197 |           <ID Source="PubMed">24014130</ID>
198 |         </Citation>
199 |         <Citation Type="Position Statement" Abbrev="HGS Australasia, 2014">
200 |           <ID Source="PubMed">25431289</ID>
201 |         </Citation>
202 |         <Citation Type="Suggested Reading" Abbrev="Wainwright et al., 2015">
203 |           <ID Source="PubMed">25981758</ID>
204 |         </Citation>
205 |         <Citation Type="practice guideline" Abbrev="CFF, 2007">
206 |           <ID Source="PubMed">17761616</ID>
207 |         </Citation>
208 |         <XRef ID="C0010674" DB="MedGen"/>
209 |         <XRef ID="586" DB="Orphanet"/>
210 |         <XRef Type="MIM" ID="219700" DB="OMIM"/>
211 |       </Trait>
212 |     </TraitSet>
213 |   </ReferenceClinVarAssertion>
214 |   <ClinVarAssertion ID="1732820" SubmissionName="MENDELICS_CLINVAR_014">
215 |     <ClinVarSubmissionID localKey="NC_000007.13:g.117199671_117199672delAG|OMIM:219700" submittedAssembly="GRCh37" submitter="Mendelics" submitterDate="2018-11-11"/>
216 |     <ClinVarAccession Acc="SCV000886168" Version="1" Type="SCV" OrgID="500035" OrganizationCategory="laboratory" OrgType="primary" DateUpdated="2019-03-31"/>
217 |     <RecordStatus>current</RecordStatus>
218 |     <ClinicalSignificance DateLastEvaluated="2018-11-05">
219 |       <ReviewStatus>criteria provided, single submitter</ReviewStatus>
220 |       <Description>Likely pathogenic</Description>
221 |     </ClinicalSignificance>
222 |     <Assertion Type="variation to disease"/>
223 |     <AttributeSet>
224 |       <Attribute Type="AssertionMethod">Mendelics Assertion Criteria 2017</Attribute>
225 |       <Citation>
226 |         <URL>https://submit.ncbi.nlm.nih.gov/ft/byid/chhjzatu/mendelics_assertion_criteria_2017.pdf</URL>
227 |       </Citation>
228 |     </AttributeSet>
229 |     <ObservedIn>
230 |       <Sample>
231 |         <Origin>unknown</Origin>
232 |         <Species TaxonomyId="9606">human</Species>
233 |         <AffectedStatus>yes</AffectedStatus>
234 |       </Sample>
235 |       <Method>
236 |         <MethodType>clinical testing</MethodType>
237 |       </Method>
238 |       <ObservedData>
239 |         <Attribute Type="Description">not provided</Attribute>
240 |       </ObservedData>
241 |     </ObservedIn>
242 |     <MeasureSet Type="Variant">
243 |       <Measure Type="Variation">
244 |         <AttributeSet>
245 |           <Attribute Type="HGVS">NC_000007.13:g.117199671_117199672delAG</Attribute>
246 |         </AttributeSet>
247 |         <MeasureRelationship Type="variant in gene">
248 |           <Symbol>
249 |             <ElementValue Type="Preferred">CFTR</ElementValue>
250 |           </Symbol>
251 |         </MeasureRelationship>
252 |       </Measure>
253 |     </MeasureSet>
254 |     <TraitSet Type="Disease">
255 |       <Trait Type="Disease">
256 |         <XRef DB="OMIM" ID="219700" Type="MIM"/>
257 |       </Trait>
258 |     </TraitSet>
259 |   </ClinVarAssertion>
260 | </ClinVarSet>
261 | 
262 | 


--------------------------------------------------------------------------------
/tests/files/MT.gff:
--------------------------------------------------------------------------------
  1 | ##sequence-region NC_012920.1 1 16569
  2 | NC_012920.1	RefSeq	region	1	16569	.	+	.	ID=NC_012920.1:1..16569;Dbxref=taxon:9606;Is_circular=true;Name=MT;country=United Kingdom: Great Britain;gbkey=Src;genome=mitochondrion;isolation-source=caucasian;mol_type=genomic DNA;note=this is the rCRS;tissue-type=placenta
  3 | NC_012920.1	RefSeq	gene	577	647	.	+	.	ID=gene-TRNF;Dbxref=GeneID:4558,HGNC:HGNC:7481,MIM:590070;Name=TRNF;gbkey=Gene;gene=TRNF;gene_biotype=tRNA
  4 | NC_012920.1	RefSeq	tRNA	577	647	.	+	.	ID=rna-TRNF;Parent=gene-TRNF;Dbxref=GeneID:4558,HGNC:HGNC:7481,MIM:590070;Note=NAR: 1455;anticodon=(pos:611..613);codons=1;gbkey=tRNA;gene=TRNF;product=tRNA-Phe
  5 | NC_012920.1	RefSeq	exon	577	647	.	+	.	ID=exon-TRNF-1;Parent=rna-TRNF;Dbxref=GeneID:4558,HGNC:HGNC:7481,MIM:590070;Note=NAR: 1455;anticodon=(pos:611..613);codons=1;gbkey=tRNA;gene=TRNF;product=tRNA-Phe
  6 | NC_012920.1	RefSeq	gene	648	1601	.	+	.	ID=gene-RNR1;Dbxref=GeneID:4549,HGNC:HGNC:7470,MIM:561000;Name=RNR1;gbkey=Gene;gene=RNR1;gene_biotype=rRNA;gene_synonym=MTRNR1
  7 | NC_012920.1	RefSeq	rRNA	648	1601	.	+	.	ID=rna-RNR1;Parent=gene-RNR1;Dbxref=GeneID:4549,HGNC:HGNC:7470,MIM:561000;Note=12S rRNA%3B 12S ribosomal RNA;gbkey=rRNA;gene=RNR1;product=s-rRNA
  8 | NC_012920.1	RefSeq	exon	648	1601	.	+	.	ID=exon-RNR1-1;Parent=rna-RNR1;Dbxref=GeneID:4549,HGNC:HGNC:7470,MIM:561000;Note=12S rRNA%3B 12S ribosomal RNA;gbkey=rRNA;gene=RNR1;product=s-rRNA
  9 | NC_012920.1	RefSeq	gene	1602	1670	.	+	.	ID=gene-TRNV;Dbxref=GeneID:4577,HGNC:HGNC:7500,MIM:590105;Name=TRNV;gbkey=Gene;gene=TRNV;gene_biotype=tRNA;gene_synonym=MTTV
 10 | NC_012920.1	RefSeq	tRNA	1602	1670	.	+	.	ID=rna-TRNV;Parent=gene-TRNV;Dbxref=GeneID:4577,HGNC:HGNC:7500,MIM:590105;Note=NAR: 2053;anticodon=(pos:1633..1635);codons=50;gbkey=tRNA;gene=TRNV;product=tRNA-Val
 11 | NC_012920.1	RefSeq	exon	1602	1670	.	+	.	ID=exon-TRNV-1;Parent=rna-TRNV;Dbxref=GeneID:4577,HGNC:HGNC:7500,MIM:590105;Note=NAR: 2053;anticodon=(pos:1633..1635);codons=50;gbkey=tRNA;gene=TRNV;product=tRNA-Val
 12 | NC_012920.1	RefSeq	gene	1671	3229	.	+	.	ID=gene-RNR2;Dbxref=GeneID:4550,HGNC:HGNC:7471,MIM:561010;Name=RNR2;gbkey=Gene;gene=RNR2;gene_biotype=rRNA;gene_synonym=MTRNR2
 13 | NC_012920.1	RefSeq	rRNA	1671	3229	.	+	.	ID=rna-RNR2;Parent=gene-RNR2;Dbxref=GeneID:4550,HGNC:HGNC:7471,MIM:561010;Note=16S ribosomal RNA%3B 16S rRNA;gbkey=rRNA;gene=RNR2;product=l-rRNA
 14 | NC_012920.1	RefSeq	exon	1671	3229	.	+	.	ID=exon-RNR2-1;Parent=rna-RNR2;Dbxref=GeneID:4550,HGNC:HGNC:7471,MIM:561010;Note=16S ribosomal RNA%3B 16S rRNA;gbkey=rRNA;gene=RNR2;product=l-rRNA
 15 | NC_012920.1	RefSeq	sequence_feature	3107	3107	.	+	.	ID=id-NC_012920.1:3107..3107;Note=preserves historical genome annotation numbering;gbkey=misc_feature
 16 | NC_012920.1	RefSeq	gene	3230	3304	.	+	.	ID=gene-TRNL1;Dbxref=GeneID:4567,HGNC:HGNC:7490,MIM:590050;Name=TRNL1;gbkey=Gene;gene=TRNL1;gene_biotype=tRNA;gene_synonym=MTTL1
 17 | NC_012920.1	RefSeq	tRNA	3230	3304	.	+	.	ID=rna-TRNL1;Parent=gene-TRNL1;Dbxref=GeneID:4567,HGNC:HGNC:7490,MIM:590050;Note=NAR: 1054;anticodon=(pos:3265..3267);codons=2%2C3;gbkey=tRNA;gene=TRNL1;product=tRNA-Leu
 18 | NC_012920.1	RefSeq	exon	3230	3304	.	+	.	ID=exon-TRNL1-1;Parent=rna-TRNL1;Dbxref=GeneID:4567,HGNC:HGNC:7490,MIM:590050;Note=NAR: 1054;anticodon=(pos:3265..3267);codons=2%2C3;gbkey=tRNA;gene=TRNL1;product=tRNA-Leu
 19 | NC_012920.1	RefSeq	gene	3307	4262	.	+	.	ID=gene-ND1;Dbxref=GeneID:4535,HGNC:HGNC:7455,MIM:516000;Name=ND1;gbkey=Gene;gene=ND1;gene_biotype=protein_coding;gene_synonym=MTND1
 20 | NC_012920.1	RefSeq	CDS	3307	4262	.	+	0	ID=cds-YP_003024026.1;Parent=gene-ND1;Dbxref=Genbank:YP_003024026.1,GeneID:4535,HGNC:HGNC:7455,MIM:516000;Name=YP_003024026.1;Note=TAA stop codon is completed by the addition of 3' A residues to the mRNA;gbkey=CDS;gene=ND1;product=NADH dehydrogenase subunit 1;protein_id=YP_003024026.1;transl_except=(pos:4261..4262%2Caa:TERM);transl_table=2
 21 | NC_012920.1	RefSeq	gene	4263	4331	.	+	.	ID=gene-TRNI;Dbxref=GeneID:4565,HGNC:HGNC:7488,MIM:590045;Name=TRNI;gbkey=Gene;gene=TRNI;gene_biotype=tRNA;gene_synonym=MTTI
 22 | NC_012920.1	RefSeq	tRNA	4263	4331	.	+	.	ID=rna-TRNI;Parent=gene-TRNI;Dbxref=GeneID:4565,HGNC:HGNC:7488,MIM:590045;Note=NAR: 0997;anticodon=(pos:4292..4294);codons=33;gbkey=tRNA;gene=TRNI;product=tRNA-Ile
 23 | NC_012920.1	RefSeq	exon	4263	4331	.	+	.	ID=exon-TRNI-1;Parent=rna-TRNI;Dbxref=GeneID:4565,HGNC:HGNC:7488,MIM:590045;Note=NAR: 0997;anticodon=(pos:4292..4294);codons=33;gbkey=tRNA;gene=TRNI;product=tRNA-Ile
 24 | NC_012920.1	RefSeq	gene	4329	4400	.	-	.	ID=gene-TRNQ;Dbxref=GeneID:4572,HGNC:HGNC:7495,MIM:590030;Name=TRNQ;gbkey=Gene;gene=TRNQ;gene_biotype=tRNA;gene_synonym=MTTQ
 25 | NC_012920.1	RefSeq	tRNA	4329	4400	.	-	.	ID=rna-TRNQ;Parent=gene-TRNQ;Dbxref=GeneID:4572,HGNC:HGNC:7495,MIM:590030;Note=NAR: 0597;anticodon=(pos:complement(4365..4367));codons=26;gbkey=tRNA;gene=TRNQ;product=tRNA-Gln
 26 | NC_012920.1	RefSeq	exon	4329	4400	.	-	.	ID=exon-TRNQ-1;Parent=rna-TRNQ;Dbxref=GeneID:4572,HGNC:HGNC:7495,MIM:590030;Note=NAR: 0597;anticodon=(pos:complement(4365..4367));codons=26;gbkey=tRNA;gene=TRNQ;product=tRNA-Gln
 27 | NC_012920.1	RefSeq	gene	4402	4469	.	+	.	ID=gene-TRNM;Dbxref=GeneID:4569,HGNC:HGNC:7492,MIM:590065;Name=TRNM;gbkey=Gene;gene=TRNM;gene_biotype=tRNA;gene_synonym=MTTM
 28 | NC_012920.1	RefSeq	tRNA	4402	4469	.	+	.	ID=rna-TRNM;Parent=gene-TRNM;Dbxref=GeneID:4569,HGNC:HGNC:7492,MIM:590065;Note=NAR: 1297;anticodon=(pos:4432..4434);codons=35;gbkey=tRNA;gene=TRNM;product=tRNA-Met
 29 | NC_012920.1	RefSeq	exon	4402	4469	.	+	.	ID=exon-TRNM-1;Parent=rna-TRNM;Dbxref=GeneID:4569,HGNC:HGNC:7492,MIM:590065;Note=NAR: 1297;anticodon=(pos:4432..4434);codons=35;gbkey=tRNA;gene=TRNM;product=tRNA-Met
 30 | NC_012920.1	RefSeq	gene	4470	5511	.	+	.	ID=gene-ND2;Dbxref=GeneID:4536,HGNC:HGNC:7456,MIM:516001;Name=ND2;gbkey=Gene;gene=ND2;gene_biotype=protein_coding;gene_synonym=MTND2
 31 | NC_012920.1	RefSeq	CDS	4470	5511	.	+	0	ID=cds-YP_003024027.1;Parent=gene-ND2;Dbxref=Genbank:YP_003024027.1,GeneID:4536,HGNC:HGNC:7456,MIM:516001;Name=YP_003024027.1;Note=TAA stop codon is completed by the addition of 3' A residues to the mRNA;gbkey=CDS;gene=ND2;product=NADH dehydrogenase subunit 2;protein_id=YP_003024027.1;transl_except=(pos:5511..5511%2Caa:TERM);transl_table=2
 32 | NC_012920.1	RefSeq	gene	5512	5579	.	+	.	ID=gene-TRNW;Dbxref=GeneID:4578,HGNC:HGNC:7501,MIM:590095;Name=TRNW;gbkey=Gene;gene=TRNW;gene_biotype=tRNA;gene_synonym=MTTW
 33 | NC_012920.1	RefSeq	tRNA	5512	5579	.	+	.	ID=rna-TRNW;Parent=gene-TRNW;Dbxref=GeneID:4578,HGNC:HGNC:7501,MIM:590095;Note=NAR: 1897;anticodon=(pos:5544..5546);codons=14;gbkey=tRNA;gene=TRNW;product=tRNA-Trp
 34 | NC_012920.1	RefSeq	exon	5512	5579	.	+	.	ID=exon-TRNW-1;Parent=rna-TRNW;Dbxref=GeneID:4578,HGNC:HGNC:7501,MIM:590095;Note=NAR: 1897;anticodon=(pos:5544..5546);codons=14;gbkey=tRNA;gene=TRNW;product=tRNA-Trp
 35 | NC_012920.1	RefSeq	gene	5587	5655	.	-	.	ID=gene-TRNA;Dbxref=GeneID:4553,HGNC:HGNC:7475,MIM:590000;Name=TRNA;gbkey=Gene;gene=TRNA;gene_biotype=tRNA;gene_synonym=MTTA
 36 | NC_012920.1	RefSeq	tRNA	5587	5655	.	-	.	ID=rna-TRNA;Parent=gene-TRNA;Dbxref=GeneID:4553,HGNC:HGNC:7475,MIM:590000;Note=NAR: 0097;anticodon=(pos:complement(5623..5625));codons=54;gbkey=tRNA;gene=TRNA;product=tRNA-Ala
 37 | NC_012920.1	RefSeq	exon	5587	5655	.	-	.	ID=exon-TRNA-1;Parent=rna-TRNA;Dbxref=GeneID:4553,HGNC:HGNC:7475,MIM:590000;Note=NAR: 0097;anticodon=(pos:complement(5623..5625));codons=54;gbkey=tRNA;gene=TRNA;product=tRNA-Ala
 38 | NC_012920.1	RefSeq	gene	5657	5729	.	-	.	ID=gene-TRNN;Dbxref=GeneID:4570,HGNC:HGNC:7493,MIM:590010;Name=TRNN;gbkey=Gene;gene=TRNN;gene_biotype=tRNA;gene_synonym=MTTN
 39 | NC_012920.1	RefSeq	tRNA	5657	5729	.	-	.	ID=rna-TRNN;Parent=gene-TRNN;Dbxref=GeneID:4570,HGNC:HGNC:7493,MIM:590010;Note=NAR: 0297;anticodon=(pos:complement(5694..5696));codons=41;gbkey=tRNA;gene=TRNN;product=tRNA-Asn
 40 | NC_012920.1	RefSeq	exon	5657	5729	.	-	.	ID=exon-TRNN-1;Parent=rna-TRNN;Dbxref=GeneID:4570,HGNC:HGNC:7493,MIM:590010;Note=NAR: 0297;anticodon=(pos:complement(5694..5696));codons=41;gbkey=tRNA;gene=TRNN;product=tRNA-Asn
 41 | NC_012920.1	RefSeq	gene	5761	5826	.	-	.	ID=gene-TRNC;Dbxref=GeneID:4511,HGNC:HGNC:7477,MIM:590020;Name=TRNC;gbkey=Gene;gene=TRNC;gene_biotype=tRNA;gene_synonym=MTTC
 42 | NC_012920.1	RefSeq	tRNA	5761	5826	.	-	.	ID=rna-TRNC;Parent=gene-TRNC;Dbxref=GeneID:4511,HGNC:HGNC:7477,MIM:590020;Note=NAR: 0497;anticodon=(pos:complement(5796..5798));codons=13;gbkey=tRNA;gene=TRNC;product=tRNA-Cys
 43 | NC_012920.1	RefSeq	exon	5761	5826	.	-	.	ID=exon-TRNC-1;Parent=rna-TRNC;Dbxref=GeneID:4511,HGNC:HGNC:7477,MIM:590020;Note=NAR: 0497;anticodon=(pos:complement(5796..5798));codons=13;gbkey=tRNA;gene=TRNC;product=tRNA-Cys
 44 | NC_012920.1	RefSeq	gene	5826	5891	.	-	.	ID=gene-TRNY;Dbxref=GeneID:4579,HGNC:HGNC:7502,MIM:590100;Name=TRNY;gbkey=Gene;gene=TRNY;gene_biotype=tRNA;gene_synonym=MTTY
 45 | NC_012920.1	RefSeq	tRNA	5826	5891	.	-	.	ID=rna-TRNY;Parent=gene-TRNY;Dbxref=GeneID:4579,HGNC:HGNC:7502,MIM:590100;Note=NAR: 1997;anticodon=(pos:complement(5860..5862));codons=9;gbkey=tRNA;gene=TRNY;product=tRNA-Tyr
 46 | NC_012920.1	RefSeq	exon	5826	5891	.	-	.	ID=exon-TRNY-1;Parent=rna-TRNY;Dbxref=GeneID:4579,HGNC:HGNC:7502,MIM:590100;Note=NAR: 1997;anticodon=(pos:complement(5860..5862));codons=9;gbkey=tRNA;gene=TRNY;product=tRNA-Tyr
 47 | NC_012920.1	RefSeq	gene	5904	7445	.	+	.	ID=gene-COX1;Dbxref=GeneID:4512,HGNC:HGNC:7419,MIM:516030;Name=COX1;gbkey=Gene;gene=COX1;gene_biotype=protein_coding;gene_synonym=COI,MTCO1
 48 | NC_012920.1	RefSeq	CDS	5904	7445	.	+	0	ID=cds-YP_003024028.1;Parent=gene-COX1;Dbxref=Genbank:YP_003024028.1,GeneID:4512,HGNC:HGNC:7419,MIM:516030;Name=YP_003024028.1;Note=cytochrome c oxidase I;gbkey=CDS;gene=COX1;product=cytochrome c oxidase subunit I;protein_id=YP_003024028.1;transl_table=2
 49 | NC_012920.1	RefSeq	gene	7446	7514	.	-	.	ID=gene-TRNS1;Dbxref=GeneID:4574,HGNC:HGNC:7497,MIM:590080;Name=TRNS1;gbkey=Gene;gene=TRNS1;gene_biotype=tRNA;gene_synonym=MTTS1
 50 | NC_012920.1	RefSeq	tRNA	7446	7514	.	-	.	ID=rna-TRNS1;Parent=gene-TRNS1;Dbxref=GeneID:4574,HGNC:HGNC:7497,MIM:590080;Note=NAR: 1697;anticodon=(pos:complement(7482..7484));codons=4%2C5%2C6%2C7;gbkey=tRNA;gene=TRNS1;product=tRNA-Ser
 51 | NC_012920.1	RefSeq	exon	7446	7514	.	-	.	ID=exon-TRNS1-1;Parent=rna-TRNS1;Dbxref=GeneID:4574,HGNC:HGNC:7497,MIM:590080;Note=NAR: 1697;anticodon=(pos:complement(7482..7484));codons=4%2C5%2C6%2C7;gbkey=tRNA;gene=TRNS1;product=tRNA-Ser
 52 | NC_012920.1	RefSeq	gene	7518	7585	.	+	.	ID=gene-TRND;Dbxref=GeneID:4555,HGNC:HGNC:7478,MIM:590015;Name=TRND;gbkey=Gene;gene=TRND;gene_biotype=tRNA;gene_synonym=MTTD
 53 | NC_012920.1	RefSeq	tRNA	7518	7585	.	+	.	ID=rna-TRND;Parent=gene-TRND;Dbxref=GeneID:4555,HGNC:HGNC:7478,MIM:590015;Note=NAR: 0397;anticodon=(pos:7548..7550);codons=57;gbkey=tRNA;gene=TRND;product=tRNA-Asp
 54 | NC_012920.1	RefSeq	exon	7518	7585	.	+	.	ID=exon-TRND-1;Parent=rna-TRND;Dbxref=GeneID:4555,HGNC:HGNC:7478,MIM:590015;Note=NAR: 0397;anticodon=(pos:7548..7550);codons=57;gbkey=tRNA;gene=TRND;product=tRNA-Asp
 55 | NC_012920.1	RefSeq	gene	7586	8269	.	+	.	ID=gene-COX2;Dbxref=GeneID:4513,HGNC:HGNC:7421,MIM:516040;Name=COX2;gbkey=Gene;gene=COX2;gene_biotype=protein_coding;gene_synonym=COII,MTCO2
 56 | NC_012920.1	RefSeq	CDS	7586	8269	.	+	0	ID=cds-YP_003024029.1;Parent=gene-COX2;Dbxref=Genbank:YP_003024029.1,GeneID:4513,HGNC:HGNC:7421,MIM:516040;Name=YP_003024029.1;Note=cytochrome c oxidase II;gbkey=CDS;gene=COX2;product=cytochrome c oxidase subunit II;protein_id=YP_003024029.1;transl_table=2
 57 | NC_012920.1	RefSeq	gene	8295	8364	.	+	.	ID=gene-TRNK;Dbxref=GeneID:4566,HGNC:HGNC:7489,MIM:590060;Name=TRNK;gbkey=Gene;gene=TRNK;gene_biotype=tRNA;gene_synonym=MTTK
 58 | NC_012920.1	RefSeq	tRNA	8295	8364	.	+	.	ID=rna-TRNK;Parent=gene-TRNK;Dbxref=GeneID:4566,HGNC:HGNC:7489,MIM:590060;Note=NAR: 1197;anticodon=(pos:8323..8325);codons=42;gbkey=tRNA;gene=TRNK;product=tRNA-Lys
 59 | NC_012920.1	RefSeq	exon	8295	8364	.	+	.	ID=exon-TRNK-1;Parent=rna-TRNK;Dbxref=GeneID:4566,HGNC:HGNC:7489,MIM:590060;Note=NAR: 1197;anticodon=(pos:8323..8325);codons=42;gbkey=tRNA;gene=TRNK;product=tRNA-Lys
 60 | NC_012920.1	RefSeq	gene	8366	8572	.	+	.	ID=gene-ATP8;Dbxref=GeneID:4509,HGNC:HGNC:7415,MIM:516070;Name=ATP8;gbkey=Gene;gene=ATP8;gene_biotype=protein_coding;gene_synonym=ATPase8,MTATP8
 61 | NC_012920.1	RefSeq	CDS	8366	8572	.	+	0	ID=cds-YP_003024030.1;Parent=gene-ATP8;Dbxref=Genbank:YP_003024030.1,GeneID:4509,HGNC:HGNC:7415,MIM:516070;Name=YP_003024030.1;Note=ATP synthase 8%3B ATPase subunit 8;gbkey=CDS;gene=ATP8;product=ATP synthase F0 subunit 8;protein_id=YP_003024030.1;transl_table=2
 62 | NC_012920.1	RefSeq	gene	8527	9207	.	+	.	ID=gene-ATP6;Dbxref=GeneID:4508,HGNC:HGNC:7414,MIM:516060;Name=ATP6;gbkey=Gene;gene=ATP6;gene_biotype=protein_coding;gene_synonym=ATPase6,MTATP6
 63 | NC_012920.1	RefSeq	CDS	8527	9207	.	+	0	ID=cds-YP_003024031.1;Parent=gene-ATP6;Dbxref=Genbank:YP_003024031.1,GeneID:4508,HGNC:HGNC:7414,MIM:516060;Name=YP_003024031.1;Note=ATP synthase 6%3B ATPase subunit 6;gbkey=CDS;gene=ATP6;product=ATP synthase F0 subunit 6;protein_id=YP_003024031.1;transl_table=2
 64 | NC_012920.1	RefSeq	gene	9207	9990	.	+	.	ID=gene-COX3;Dbxref=GeneID:4514,HGNC:HGNC:7422,MIM:516050;Name=COX3;gbkey=Gene;gene=COX3;gene_biotype=protein_coding;gene_synonym=COIII,MTCO3
 65 | NC_012920.1	RefSeq	CDS	9207	9990	.	+	0	ID=cds-YP_003024032.1;Parent=gene-COX3;Dbxref=Genbank:YP_003024032.1,GeneID:4514,HGNC:HGNC:7422,MIM:516050;Name=YP_003024032.1;Note=TAA stop codon is completed by the addition of 3' A residues to the mRNA;gbkey=CDS;gene=COX3;product=cytochrome c oxidase subunit III;protein_id=YP_003024032.1;transl_except=(pos:9990..9990%2Caa:TERM);transl_table=2
 66 | NC_012920.1	RefSeq	gene	9991	10058	.	+	.	ID=gene-TRNG;Dbxref=GeneID:4563,HGNC:HGNC:7486,MIM:590035;Name=TRNG;gbkey=Gene;gene=TRNG;gene_biotype=tRNA;gene_synonym=MTTG
 67 | NC_012920.1	RefSeq	tRNA	9991	10058	.	+	.	ID=rna-TRNG;Parent=gene-TRNG;Dbxref=GeneID:4563,HGNC:HGNC:7486,MIM:590035;Note=NAR: 0797;anticodon=(pos:10021..10023);codons=62;gbkey=tRNA;gene=TRNG;product=tRNA-Gly
 68 | NC_012920.1	RefSeq	exon	9991	10058	.	+	.	ID=exon-TRNG-1;Parent=rna-TRNG;Dbxref=GeneID:4563,HGNC:HGNC:7486,MIM:590035;Note=NAR: 0797;anticodon=(pos:10021..10023);codons=62;gbkey=tRNA;gene=TRNG;product=tRNA-Gly
 69 | NC_012920.1	RefSeq	gene	10059	10404	.	+	.	ID=gene-ND3;Dbxref=GeneID:4537,HGNC:HGNC:7458,MIM:516002;Name=ND3;gbkey=Gene;gene=ND3;gene_biotype=protein_coding;gene_synonym=MTND3
 70 | NC_012920.1	RefSeq	CDS	10059	10404	.	+	0	ID=cds-YP_003024033.1;Parent=gene-ND3;Dbxref=Genbank:YP_003024033.1,GeneID:4537,HGNC:HGNC:7458,MIM:516002;Name=YP_003024033.1;Note=TAA stop codon is completed by the addition of 3' A residues to the mRNA;gbkey=CDS;gene=ND3;product=NADH dehydrogenase subunit 3;protein_id=YP_003024033.1;transl_except=(pos:10404..10404%2Caa:TERM);transl_table=2
 71 | NC_012920.1	RefSeq	gene	10405	10469	.	+	.	ID=gene-TRNR;Dbxref=GeneID:4573,HGNC:HGNC:7496,MIM:590005;Name=TRNR;gbkey=Gene;gene=TRNR;gene_biotype=tRNA;gene_synonym=MTTR
 72 | NC_012920.1	RefSeq	tRNA	10405	10469	.	+	.	ID=rna-TRNR;Parent=gene-TRNR;Dbxref=GeneID:4573,HGNC:HGNC:7496,MIM:590005;Note=NAR: 0197;anticodon=(pos:10435..10437);codons=30;gbkey=tRNA;gene=TRNR;product=tRNA-Arg
 73 | NC_012920.1	RefSeq	exon	10405	10469	.	+	.	ID=exon-TRNR-1;Parent=rna-TRNR;Dbxref=GeneID:4573,HGNC:HGNC:7496,MIM:590005;Note=NAR: 0197;anticodon=(pos:10435..10437);codons=30;gbkey=tRNA;gene=TRNR;product=tRNA-Arg
 74 | NC_012920.1	RefSeq	gene	10470	10766	.	+	.	ID=gene-ND4L;Dbxref=GeneID:4539,HGNC:HGNC:7460,MIM:516004;Name=ND4L;gbkey=Gene;gene=ND4L;gene_biotype=protein_coding;gene_synonym=MTND4L
 75 | NC_012920.1	RefSeq	CDS	10470	10766	.	+	0	ID=cds-YP_003024034.1;Parent=gene-ND4L;Dbxref=Genbank:YP_003024034.1,GeneID:4539,HGNC:HGNC:7460,MIM:516004;Name=YP_003024034.1;gbkey=CDS;gene=ND4L;product=NADH dehydrogenase subunit 4L;protein_id=YP_003024034.1;transl_table=2
 76 | NC_012920.1	RefSeq	gene	10760	12137	.	+	.	ID=gene-ND4;Dbxref=GeneID:4538,HGNC:HGNC:7459,MIM:516003;Name=ND4;gbkey=Gene;gene=ND4;gene_biotype=protein_coding;gene_synonym=MTND4
 77 | NC_012920.1	RefSeq	CDS	10760	12137	.	+	0	ID=cds-YP_003024035.1;Parent=gene-ND4;Dbxref=Genbank:YP_003024035.1,GeneID:4538,HGNC:HGNC:7459,MIM:516003;Name=YP_003024035.1;Note=TAA stop codon is completed by the addition of 3' A residues to the mRNA;gbkey=CDS;gene=ND4;product=NADH dehydrogenase subunit 4;protein_id=YP_003024035.1;transl_except=(pos:12137..12137%2Caa:TERM);transl_table=2
 78 | NC_012920.1	RefSeq	gene	12138	12206	.	+	.	ID=gene-TRNH;Dbxref=GeneID:4564,HGNC:HGNC:7487,MIM:590040;Name=TRNH;gbkey=Gene;gene=TRNH;gene_biotype=tRNA;gene_synonym=MTTH
 79 | NC_012920.1	RefSeq	tRNA	12138	12206	.	+	.	ID=rna-TRNH;Parent=gene-TRNH;Dbxref=GeneID:4564,HGNC:HGNC:7487,MIM:590040;Note=NAR: 0897;anticodon=(pos:12168..12170);codons=25;gbkey=tRNA;gene=TRNH;product=tRNA-His
 80 | NC_012920.1	RefSeq	exon	12138	12206	.	+	.	ID=exon-TRNH-1;Parent=rna-TRNH;Dbxref=GeneID:4564,HGNC:HGNC:7487,MIM:590040;Note=NAR: 0897;anticodon=(pos:12168..12170);codons=25;gbkey=tRNA;gene=TRNH;product=tRNA-His
 81 | NC_012920.1	RefSeq	gene	12207	12265	.	+	.	ID=gene-TRNS2;Dbxref=GeneID:4575,HGNC:HGNC:7498,MIM:590085;Name=TRNS2;gbkey=Gene;gene=TRNS2;gene_biotype=tRNA;gene_synonym=MTTS2
 82 | NC_012920.1	RefSeq	tRNA	12207	12265	.	+	.	ID=rna-TRNS2;Parent=gene-TRNS2;Dbxref=GeneID:4575,HGNC:HGNC:7498,MIM:590085;Note=NAR: 1656;anticodon=(pos:12226..12228);codons=44%2C45;gbkey=tRNA;gene=TRNS2;product=tRNA-Ser
 83 | NC_012920.1	RefSeq	exon	12207	12265	.	+	.	ID=exon-TRNS2-1;Parent=rna-TRNS2;Dbxref=GeneID:4575,HGNC:HGNC:7498,MIM:590085;Note=NAR: 1656;anticodon=(pos:12226..12228);codons=44%2C45;gbkey=tRNA;gene=TRNS2;product=tRNA-Ser
 84 | NC_012920.1	RefSeq	gene	12266	12336	.	+	.	ID=gene-TRNL2;Dbxref=GeneID:4568,HGNC:HGNC:7491,MIM:590055;Name=TRNL2;gbkey=Gene;gene=TRNL2;gene_biotype=tRNA;gene_synonym=MTTL2
 85 | NC_012920.1	RefSeq	tRNA	12266	12336	.	+	.	ID=rna-TRNL2;Parent=gene-TRNL2;Dbxref=GeneID:4568,HGNC:HGNC:7491,MIM:590055;Note=NAR: 1097;anticodon=(pos:12298..12300);codons=16%2C17%2C18%2C19;gbkey=tRNA;gene=TRNL2;product=tRNA-Leu
 86 | NC_012920.1	RefSeq	exon	12266	12336	.	+	.	ID=exon-TRNL2-1;Parent=rna-TRNL2;Dbxref=GeneID:4568,HGNC:HGNC:7491,MIM:590055;Note=NAR: 1097;anticodon=(pos:12298..12300);codons=16%2C17%2C18%2C19;gbkey=tRNA;gene=TRNL2;product=tRNA-Leu
 87 | NC_012920.1	RefSeq	gene	12337	14148	.	+	.	ID=gene-ND5;Dbxref=GeneID:4540,HGNC:HGNC:7461,MIM:516005;Name=ND5;gbkey=Gene;gene=ND5;gene_biotype=protein_coding;gene_synonym=MTND5
 88 | NC_012920.1	RefSeq	CDS	12337	14148	.	+	0	ID=cds-YP_003024036.1;Parent=gene-ND5;Dbxref=Genbank:YP_003024036.1,GeneID:4540,HGNC:HGNC:7461,MIM:516005;Name=YP_003024036.1;gbkey=CDS;gene=ND5;product=NADH dehydrogenase subunit 5;protein_id=YP_003024036.1;transl_table=2
 89 | NC_012920.1	RefSeq	gene	14149	14673	.	-	.	ID=gene-ND6;Dbxref=GeneID:4541,HGNC:HGNC:7462,MIM:516006;Name=ND6;gbkey=Gene;gene=ND6;gene_biotype=protein_coding;gene_synonym=MTND6
 90 | NC_012920.1	RefSeq	CDS	14149	14673	.	-	0	ID=cds-YP_003024037.1;Parent=gene-ND6;Dbxref=Genbank:YP_003024037.1,GeneID:4541,HGNC:HGNC:7462,MIM:516006;Name=YP_003024037.1;gbkey=CDS;gene=ND6;product=NADH dehydrogenase subunit 6;protein_id=YP_003024037.1;transl_table=2
 91 | NC_012920.1	RefSeq	gene	14674	14742	.	-	.	ID=gene-TRNE;Dbxref=GeneID:4556,HGNC:HGNC:7479,MIM:590025;Name=TRNE;gbkey=Gene;gene=TRNE;gene_biotype=tRNA;gene_synonym=MTTE
 92 | NC_012920.1	RefSeq	tRNA	14674	14742	.	-	.	ID=rna-TRNE;Parent=gene-TRNE;Dbxref=GeneID:4556,HGNC:HGNC:7479,MIM:590025;Note=NAR: 0697;anticodon=(pos:complement(14710..14712));codons=58;gbkey=tRNA;gene=TRNE;product=tRNA-Glu
 93 | NC_012920.1	RefSeq	exon	14674	14742	.	-	.	ID=exon-TRNE-1;Parent=rna-TRNE;Dbxref=GeneID:4556,HGNC:HGNC:7479,MIM:590025;Note=NAR: 0697;anticodon=(pos:complement(14710..14712));codons=58;gbkey=tRNA;gene=TRNE;product=tRNA-Glu
 94 | NC_012920.1	RefSeq	gene	14747	15887	.	+	.	ID=gene-CYTB;Dbxref=GeneID:4519,HGNC:HGNC:7427,MIM:516020;Name=CYTB;gbkey=Gene;gene=CYTB;gene_biotype=protein_coding;gene_synonym=MTCYB
 95 | NC_012920.1	RefSeq	CDS	14747	15887	.	+	0	ID=cds-YP_003024038.1;Parent=gene-CYTB;Dbxref=Genbank:YP_003024038.1,GeneID:4519,HGNC:HGNC:7427,MIM:516020;Name=YP_003024038.1;Note=TAA stop codon is completed by the addition of 3' A residues to the mRNA;gbkey=CDS;gene=CYTB;product=cytochrome b;protein_id=YP_003024038.1;transl_except=(pos:15887..15887%2Caa:TERM);transl_table=2
 96 | NC_012920.1	RefSeq	gene	15888	15953	.	+	.	ID=gene-TRNT;Dbxref=GeneID:4576,HGNC:HGNC:7499,MIM:590090;Name=TRNT;gbkey=Gene;gene=TRNT;gene_biotype=tRNA;gene_synonym=MTTT
 97 | NC_012920.1	RefSeq	tRNA	15888	15953	.	+	.	ID=rna-TRNT;Parent=gene-TRNT;Dbxref=GeneID:4576,HGNC:HGNC:7499,MIM:590090;Note=NAR: 1797;anticodon=(pos:15919..15921);codons=38;gbkey=tRNA;gene=TRNT;product=tRNA-Thr
 98 | NC_012920.1	RefSeq	exon	15888	15953	.	+	.	ID=exon-TRNT-1;Parent=rna-TRNT;Dbxref=GeneID:4576,HGNC:HGNC:7499,MIM:590090;Note=NAR: 1797;anticodon=(pos:15919..15921);codons=38;gbkey=tRNA;gene=TRNT;product=tRNA-Thr
 99 | NC_012920.1	RefSeq	gene	15956	16023	.	-	.	ID=gene-TRNP;Dbxref=GeneID:4571,HGNC:HGNC:7494,MIM:590075;Name=TRNP;gbkey=Gene;gene=TRNP;gene_biotype=tRNA;gene_synonym=MTTP
100 | NC_012920.1	RefSeq	tRNA	15956	16023	.	-	.	ID=rna-TRNP;Parent=gene-TRNP;Dbxref=GeneID:4571,HGNC:HGNC:7494,MIM:590075;Note=NAR: 1597;anticodon=(pos:complement(15990..15992));codons=22;gbkey=tRNA;gene=TRNP;product=tRNA-Pro
101 | NC_012920.1	RefSeq	exon	15956	16023	.	-	.	ID=exon-TRNP-1;Parent=rna-TRNP;Dbxref=GeneID:4571,HGNC:HGNC:7494,MIM:590075;Note=NAR: 1597;anticodon=(pos:complement(15990..15992));codons=22;gbkey=tRNA;gene=TRNP;product=tRNA-Pro
102 | NC_012920.1	RefSeq	D_loop	16024	17145	.	-	.	ID=id-NC_012920.1:1..16569;gbkey=D-loop
103 | 


--------------------------------------------------------------------------------
/tests/files/TREX1.gff:
--------------------------------------------------------------------------------
 1 | NC_000003.11	BestRefSeq	gene	48488183	48509044	.	+	.	ID=gene-ATRIP-TREX1;Dbxref=GeneID:111822955;Name=ATRIP-TREX1;description=ATRIP-TREX1 readthrough;gbkey=Gene;gene=ATRIP-TREX1;gene_biotype=lncRNA
 2 | NC_000003.11	BestRefSeq	lnc_RNA	48488183	48509044	.	+	.	ID=rna-NR_153405.1;Parent=gene-ATRIP-TREX1;Dbxref=GeneID:111822955,Genbank:NR_153405.1;Name=NR_153405.1;Note=The RefSeq transcript has 1 substitution compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=ncRNA;gene=ATRIP-TREX1;inference=similar to RNA sequence (same species):RefSeq:NR_153405.1;product=ATRIP-TREX1 readthrough;transcript_id=NR_153405.1
 3 | NC_000003.11	BestRefSeq	exon	48488183	48488496	.	+	.	ID=exon-NR_153405.1-1;Parent=rna-NR_153405.1;Dbxref=GeneID:111822955,Genbank:NR_153405.1;Note=The RefSeq transcript has 1 substitution compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=ncRNA;gene=ATRIP-TREX1;inference=similar to RNA sequence (same species):RefSeq:NR_153405.1;product=ATRIP-TREX1 readthrough;transcript_id=NR_153405.1
 4 | NC_000003.11	BestRefSeq	exon	48491443	48491576	.	+	.	ID=exon-NR_153405.1-2;Parent=rna-NR_153405.1;Dbxref=GeneID:111822955,Genbank:NR_153405.1;Note=The RefSeq transcript has 1 substitution compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=ncRNA;gene=ATRIP-TREX1;inference=similar to RNA sequence (same species):RefSeq:NR_153405.1;product=ATRIP-TREX1 readthrough;transcript_id=NR_153405.1
 5 | NC_000003.11	BestRefSeq	exon	48493135	48493305	.	+	.	ID=exon-NR_153405.1-3;Parent=rna-NR_153405.1;Dbxref=GeneID:111822955,Genbank:NR_153405.1;Note=The RefSeq transcript has 1 substitution compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=ncRNA;gene=ATRIP-TREX1;inference=similar to RNA sequence (same species):RefSeq:NR_153405.1;product=ATRIP-TREX1 readthrough;transcript_id=NR_153405.1
 6 | NC_000003.11	BestRefSeq	exon	48495700	48495818	.	+	.	ID=exon-NR_153405.1-4;Parent=rna-NR_153405.1;Dbxref=GeneID:111822955,Genbank:NR_153405.1;Note=The RefSeq transcript has 1 substitution compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=ncRNA;gene=ATRIP-TREX1;inference=similar to RNA sequence (same species):RefSeq:NR_153405.1;product=ATRIP-TREX1 readthrough;transcript_id=NR_153405.1
 7 | NC_000003.11	BestRefSeq	exon	48498659	48498816	.	+	.	ID=exon-NR_153405.1-5;Parent=rna-NR_153405.1;Dbxref=GeneID:111822955,Genbank:NR_153405.1;Note=The RefSeq transcript has 1 substitution compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=ncRNA;gene=ATRIP-TREX1;inference=similar to RNA sequence (same species):RefSeq:NR_153405.1;product=ATRIP-TREX1 readthrough;transcript_id=NR_153405.1
 8 | NC_000003.11	BestRefSeq	exon	48500758	48500853	.	+	.	ID=exon-NR_153405.1-6;Parent=rna-NR_153405.1;Dbxref=GeneID:111822955,Genbank:NR_153405.1;Note=The RefSeq transcript has 1 substitution compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=ncRNA;gene=ATRIP-TREX1;inference=similar to RNA sequence (same species):RefSeq:NR_153405.1;product=ATRIP-TREX1 readthrough;transcript_id=NR_153405.1
 9 | NC_000003.11	BestRefSeq	exon	48501186	48501315	.	+	.	ID=exon-NR_153405.1-7;Parent=rna-NR_153405.1;Dbxref=GeneID:111822955,Genbank:NR_153405.1;Note=The RefSeq transcript has 1 substitution compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=ncRNA;gene=ATRIP-TREX1;inference=similar to RNA sequence (same species):RefSeq:NR_153405.1;product=ATRIP-TREX1 readthrough;transcript_id=NR_153405.1
10 | NC_000003.11	BestRefSeq	exon	48501509	48502198	.	+	.	ID=exon-NR_153405.1-8;Parent=rna-NR_153405.1;Dbxref=GeneID:111822955,Genbank:NR_153405.1;Note=The RefSeq transcript has 1 substitution compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=ncRNA;gene=ATRIP-TREX1;inference=similar to RNA sequence (same species):RefSeq:NR_153405.1;product=ATRIP-TREX1 readthrough;transcript_id=NR_153405.1
11 | NC_000003.11	BestRefSeq	exon	48502760	48502844	.	+	.	ID=exon-NR_153405.1-9;Parent=rna-NR_153405.1;Dbxref=GeneID:111822955,Genbank:NR_153405.1;Note=The RefSeq transcript has 1 substitution compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=ncRNA;gene=ATRIP-TREX1;inference=similar to RNA sequence (same species):RefSeq:NR_153405.1;product=ATRIP-TREX1 readthrough;transcript_id=NR_153405.1
12 | NC_000003.11	BestRefSeq	exon	48505144	48505280	.	+	.	ID=exon-NR_153405.1-10;Parent=rna-NR_153405.1;Dbxref=GeneID:111822955,Genbank:NR_153405.1;Note=The RefSeq transcript has 1 substitution compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=ncRNA;gene=ATRIP-TREX1;inference=similar to RNA sequence (same species):RefSeq:NR_153405.1;product=ATRIP-TREX1 readthrough;transcript_id=NR_153405.1
13 | NC_000003.11	BestRefSeq	exon	48505440	48505531	.	+	.	ID=exon-NR_153405.1-11;Parent=rna-NR_153405.1;Dbxref=GeneID:111822955,Genbank:NR_153405.1;Note=The RefSeq transcript has 1 substitution compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=ncRNA;gene=ATRIP-TREX1;inference=similar to RNA sequence (same species):RefSeq:NR_153405.1;product=ATRIP-TREX1 readthrough;transcript_id=NR_153405.1
14 | NC_000003.11	BestRefSeq	exon	48505981	48506061	.	+	.	ID=exon-NR_153405.1-12;Parent=rna-NR_153405.1;Dbxref=GeneID:111822955,Genbank:NR_153405.1;Note=The RefSeq transcript has 1 substitution compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=ncRNA;gene=ATRIP-TREX1;inference=similar to RNA sequence (same species):RefSeq:NR_153405.1;product=ATRIP-TREX1 readthrough;transcript_id=NR_153405.1
15 | NC_000003.11	BestRefSeq	exon	48506230	48506482	.	+	.	ID=exon-NR_153405.1-13;Parent=rna-NR_153405.1;Dbxref=GeneID:111822955,Genbank:NR_153405.1;Note=The RefSeq transcript has 1 substitution compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=ncRNA;gene=ATRIP-TREX1;inference=similar to RNA sequence (same species):RefSeq:NR_153405.1;product=ATRIP-TREX1 readthrough;transcript_id=NR_153405.1
16 | NC_000003.11	BestRefSeq	exon	48506886	48507708	.	+	.	ID=exon-NR_153405.1-14;Parent=rna-NR_153405.1;Dbxref=GeneID:111822955,Genbank:NR_153405.1;Note=The RefSeq transcript has 1 substitution compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=ncRNA;gene=ATRIP-TREX1;inference=similar to RNA sequence (same species):RefSeq:NR_153405.1;product=ATRIP-TREX1 readthrough;transcript_id=NR_153405.1
17 | NC_000003.11	BestRefSeq	exon	48508029	48509044	.	+	.	ID=exon-NR_153405.1-15;Parent=rna-NR_153405.1;Dbxref=GeneID:111822955,Genbank:NR_153405.1;Note=The RefSeq transcript has 1 substitution compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=ncRNA;gene=ATRIP-TREX1;inference=similar to RNA sequence (same species):RefSeq:NR_153405.1;product=ATRIP-TREX1 readthrough;transcript_id=NR_153405.1
18 | NC_000003.11	BestRefSeq	gene	48507229	48509044	.	+	.	ID=gene-TREX1;Dbxref=GeneID:11277,HGNC:HGNC:12269,MIM:606609;Name=TREX1;description=three prime repair exonuclease 1;gbkey=Gene;gene=TREX1;gene_biotype=protein_coding;gene_synonym=AGS1,CRV,DRN3,HERNS
19 | NC_000003.11	BestRefSeq	mRNA	48507229	48509044	.	+	.	ID=rna-NM_007248.5;Parent=gene-TREX1;Dbxref=GeneID:11277,Genbank:NM_007248.5,HGNC:HGNC:12269,MIM:606609;Name=NM_007248.5;Note=The RefSeq transcript has 1 substitution compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=mRNA;gene=TREX1;inference=similar to RNA sequence%2C mRNA (same species):RefSeq:NM_007248.5;product=three prime repair exonuclease 1%2C transcript variant 5;transcript_id=NM_007248.5
20 | NC_000003.11	BestRefSeq	exon	48507229	48507568	.	+	.	ID=exon-NM_007248.5-1;Parent=rna-NM_007248.5;Dbxref=GeneID:11277,Genbank:NM_007248.5,HGNC:HGNC:12269,MIM:606609;Note=The RefSeq transcript has 1 substitution compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=mRNA;gene=TREX1;inference=similar to RNA sequence%2C mRNA (same species):RefSeq:NM_007248.5;product=three prime repair exonuclease 1%2C transcript variant 5;transcript_id=NM_007248.5
21 | NC_000003.11	BestRefSeq	exon	48508067	48509044	.	+	.	ID=exon-NM_007248.5-2;Parent=rna-NM_007248.5;Dbxref=GeneID:11277,Genbank:NM_007248.5,HGNC:HGNC:12269,MIM:606609;Note=The RefSeq transcript has 1 substitution compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=mRNA;gene=TREX1;inference=similar to RNA sequence%2C mRNA (same species):RefSeq:NM_007248.5;product=three prime repair exonuclease 1%2C transcript variant 5;transcript_id=NM_007248.5
22 | NC_000003.11	BestRefSeq	CDS	48508085	48508999	.	+	0	ID=cds-NP_009179.2;Parent=rna-NM_007248.5;Dbxref=CCDS:CCDS59451.1,GeneID:11277,Genbank:NP_009179.2,HGNC:HGNC:12269,MIM:606609;Name=NP_009179.2;Note=isoform c is encoded by transcript variant 5;gbkey=CDS;gene=TREX1;product=three-prime repair exonuclease 1 isoform c;protein_id=NP_009179.2
23 | NC_000003.11	BestRefSeq	mRNA	48507629	48509044	.	+	.	ID=rna-NM_033629.6;Parent=gene-TREX1;Dbxref=GeneID:11277,Genbank:NM_033629.6,HGNC:HGNC:12269,MIM:606609;Name=NM_033629.6;Note=The RefSeq transcript has 1 substitution compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=mRNA;gene=TREX1;inference=similar to RNA sequence%2C mRNA (same species):RefSeq:NM_033629.6;product=three prime repair exonuclease 1%2C transcript variant 4;tag=MANE Select;transcript_id=NM_033629.6
24 | NC_000003.11	BestRefSeq	exon	48507629	48507708	.	+	.	ID=exon-NM_033629.6-1;Parent=rna-NM_033629.6;Dbxref=GeneID:11277,Genbank:NM_033629.6,HGNC:HGNC:12269,MIM:606609;Note=The RefSeq transcript has 1 substitution compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=mRNA;gene=TREX1;inference=similar to RNA sequence%2C mRNA (same species):RefSeq:NM_033629.6;product=three prime repair exonuclease 1%2C transcript variant 4;tag=MANE Select;transcript_id=NM_033629.6
25 | NC_000003.11	BestRefSeq	exon	48508029	48509044	.	+	.	ID=exon-NM_033629.6-2;Parent=rna-NM_033629.6;Dbxref=GeneID:11277,Genbank:NM_033629.6,HGNC:HGNC:12269,MIM:606609;Note=The RefSeq transcript has 1 substitution compared to this genomic sequence;exception=annotated by transcript or proteomic data;gbkey=mRNA;gene=TREX1;inference=similar to RNA sequence%2C mRNA (same species):RefSeq:NM_033629.6;product=three prime repair exonuclease 1%2C transcript variant 4;tag=MANE Select;transcript_id=NM_033629.6
26 | NC_000003.11	BestRefSeq	CDS	48508055	48508999	.	+	0	ID=cds-NP_338599.1;Parent=rna-NM_033629.6;Dbxref=CCDS:CCDS2769.1,GeneID:11277,Genbank:NP_338599.1,HGNC:HGNC:12269,MIM:606609;Name=NP_338599.1;Note=isoform b is encoded by transcript variant 4;gbkey=CDS;gene=TREX1;product=three-prime repair exonuclease 1 isoform b;protein_id=NP_338599.1;tag=MANE Select
27 | NC_000023.10	BestRefSeq	gene	122734420	122866902	.	-	.	ID=gene-THOC2;Dbxref=GeneID:57187,HGNC:HGNC:19073,MIM:300395;Name=THOC2;description=THO complex 2;gbkey=Gene;gene=THOC2;gene_biotype=protein_coding;gene_synonym=CXorf3,dJ506G2.1,hTREX120,MRX12,MRX35,THO2
28 | 


--------------------------------------------------------------------------------
/tests/files/hgnc_toy.tsv:
--------------------------------------------------------------------------------
  1 | Approved symbol	Alias symbols	NCBI Gene ID(supplied by NCBI)
  2 | BRCA2	FAD, FAD1, BRCC2, XRCC11	675
  3 | TMEM127	FLJ20507, FLJ22257	55654
  4 | TRPV4	OTRPC4, TRP12, VROAC, VRL-2, VR-OAC, CMT2C	59341
  5 | GLB1	EBP	2720
  6 | GLB1L2		89944
  7 | GLB1L3	FLJ90231	112937
  8 | GLB1L	MGC10771	79411
  9 | PLGLB1	PRP-B	5343
 10 | SH3GLB1	CGI-61, KIAA0491, Bif-1, PPP1R70	51100
 11 | AIF1	IRT-1, AIF-1, Em:AF129756.17, IBA1	199
 12 | ARHGEF12	KIAA0382, LARG	23365
 13 | C1QTNF12	MGC105127, CTRP12, ADIPOLIN	388581
 14 | CD300A	Irp60, CMRF35H, CMRF-35-H9, IRC1, IRC2, IGSF12	11314
 15 | CYP4F12		66002
 16 | DCAF12	DKFZP434O125, MGC1058, CT102, TCC52	25853
 17 | DCAF12L1	KIAA1892L	139170
 18 | DCAF12L2		340578
 19 | F12		2161
 20 | FGF12-AS1		100873986
 21 | FGF12-AS2		100873987
 22 | FGF12-AS3		100873988
 23 | FGF12B		
 24 | FGF12	FHF1	2257
 25 | KIF12		113220
 26 | KLF12	AP-2rep, HSPC122, AP2REP	11278
 27 | LINC00588	DKFZP434F122	26138
 28 | MAPK10	JNK3, p493F12, p54bSAPK	5602
 29 | MKRN3-AS1	FNZ127, NCRNA00009, ZNF127-AS	10108
 30 | NBPF12	COAS1	149013
 31 | OR1F12	hs6M1-35P, OR1F12Q	442179
 32 | OR4F12		
 33 | PHF12	PF1, KIAA1523	57649
 34 | PRAMEF12	OTTHUMG00000001927	390999
 35 | PSLNR	LA16c-83F12.6	106146148
 36 | RET	PTC, CDHF12, RET51, CDHR16	5979
 37 | RFPL3	RNF120	10738
 38 | RNF121	FLJ11099	55298
 39 | RNF122	FLJ12526	79845
 40 | RNF123	FLJ12565, KPC1	63891
 41 | RNF125	FLJ20456	54941
 42 | RNF126	FLJ20552	55658
 43 | RNF126P1		376412
 44 | RNF128	FLJ23516, GRAIL	79589
 45 | SPINT1-AS1	RP11-532F12.5	102724362
 46 | SRSF12	SRrp35, SFRS19	135295
 47 | TAF12	TAFII20	6883
 48 | TCF12	HEB, HTF4, HsT17266, bHLHb20, p64	6938
 49 | TNFRSF12A	FN14, TweakR, CD266	51330
 50 | TNFRSF12L	DR3L	
 51 | TNFSF12-TNFSF13	TWE-PRIL	407977
 52 | TNFSF12	TWEAK, DR3LG, APO3L	8742
 53 | TRAF3IP1	MIP-T3, DKFZP434F124, MIPT3, IFT54, FAP116	26146
 54 | UQCRHP1	Em:AF129756.18	100130756
 55 | ZBTB20	ODA-8S, DKFZp566F123, DPZF	26137
 56 | ZNF101	HZF12, DKFZp570I0164	94039
 57 | ZNF120		
 58 | ZNF121	ZHC32, ZNF20	7675
 59 | ZNF122		
 60 | ZNF123P	HZF-1	100188891
 61 | ZNF124	HZF16, HZF-16	7678
 62 | ZNF125	HZF3, HZF-3	
 63 | ZNF126	HZF2, HZF-2	
 64 | ZNF128		
 65 | ZNF129		
 66 | ZNF12	KOX3, GIOT-3	7559
 67 | ZNF92	HPF12, TF12	168374
 68 | SLC34A1	NAPI-3, NPTIIa, SLC11	6569
 69 | CFTR-AS1	BGas	111082987
 70 | CFTR	MRP7, ABC35, TNR-CFTR, dJ760C5.1, CFTR/MRP	1080
 71 | CFTRP1	dJ760C5.1	140871
 72 | CFTRP2		107080633
 73 | CFTRP3		106481718
 74 | TAS2R16	T2R16	50833
 75 | APAH1		
 76 | CPA6	CPAH	57094
 77 | PAH	PH	5053
 78 | PHYHIP	KIAA0273, PAHX-AP	9796
 79 | PHYH	PAHX, RD, PHYH1	5264
 80 | PTLAH	FPAH	
 81 | SLC22A6	ROAT1, PAHT, OAT1	9356
 82 | ADHFE1	FLJ32430	137872
 83 | HAMP	LEAP-1, HEPC, HFE2B, LEAP1	57817
 84 | HFE-AS1		
 85 | HFE	HLA-H, HFE1	3077
 86 | HJV	JH, HFE2A, RGMC, hemojuvelin, haemojuvelin	148738
 87 | SLC40A1	MTP1, IREG1, FPN1, HFE4	30061
 88 | TFR2	HFE3, TFRC2	7036
 89 | TREX1	DRN3	11277
 90 | CFTR-AS1	BGas	111082987
 91 | CFTR	MRP7, ABC35, TNR-CFTR, dJ760C5.1, CFTR/MRP	1080
 92 | CFTRP1	dJ760C5.1	140871
 93 | CFTRP2		107080633
 94 | CFTRP3		106481718
 95 | ADORA2A-AS1	FLJ34651	646023
 96 | ADORA2A	RDC8	135
 97 | SPECC1L-ADORA2A		101730217
 98 | FTCD		10841
 99 | FTCD-AS1		100861507
100 | FTCDNL1	FONG	348751
101 | CYP2D6	CPD6, P450-DB1, CYP2D, P450C2D	1565
102 | MUTYH	MYH	4595
103 | MSH6		2956
104 | ABHD15-AS1	linc-TP53I13, lnc-TP53I13	104355133
105 | EI24	PIG8, TP53I8, EPG4	9538
106 | ENC1	PIG10, ENC-1, TP53I10, KLHL37	8507
107 | GAMT	PIG2, TP53I2	2593
108 | LGALS7	GAL7, PIG1, TP53I1, LGALS7A	3963
109 | LITAF	PIG7, SIMPLE, FLJ38636, TP53I7	9516
110 | MAD1L1	HsMAD1, TXBP181, MAD1, PIG9, TP53I9	8379
111 | PRODH	HSPOX2, PRODH1, PIG6, PRODH2, TP53I6	5625
112 | PTGES	MGST-IV, PIG12, MGST1-L1, TP53I12	9536
113 | SAA1	PIG4, TP53I4	6288
114 | TOPORS	TP53BPL, LUN	10210
115 | TP53AIP1	p53AIP1	63970
116 | TP53BP1	53BP1, p202, TDRD30	7158
117 | TP53BP2P1		94299
118 | TP53BP2	PPP1R13A, ASPP2, 53BP2	7159
119 | TP53COR1	linc-p21, lincRNA-p21, Trp53cor1	102800311
120 | TP53CP	p53CP	
121 | TP53I11	PIG11	9537
122 | TP53I13	DSCP1	90313
123 | TP53I3	PIG3	9540
124 | TP53INP1	DKFZp434M1317, FLJ22139, P53DINP1, SIP, TP53INP1A, TP53INP1B, Teap	94241
125 | TP53INP2	FLJ21759, FLJ23500, DKFZp434B2411, DKFZp434O0827, dJ1181N3.1, PINH, DOR	58476
126 | TP53L	NBP	
127 | TP53	p53, LFS1	7157
128 | TP53RK	dJ101A2.2, prpk, Nori-2p, BUD32, TPRKB	112858
129 | TP53TG1	H_RG012D21.9, LINC00096	11257
130 | TP53TG3B		729355
131 | TP53TG3C		653550
132 | TP53TG3D		729264
133 | TP53TG3E		102724101
134 | TP53TG3F		102724127
135 | TP53TG3GP		106660619
136 | TP53TG3HP		100130700
137 | TP53TG3	P53TG3, TP53TG3A	24150
138 | TP53TG5	CLG01, dJ453C12.5	27296
139 | TMEM127	FLJ20507, FLJ22257	55654
140 | 


--------------------------------------------------------------------------------
/tests/functional-tests.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | # Functional tests for the clinvcf executable, driven by the ssshtest framework.
  3 | # Each `run` executes clinvcf on one XML fixture; the assert_* calls that follow
  4 | # check its exit code and captured standard output.
  5 | 
  6 | # Fetch ssshtest on first use, then load its helpers (run, assert_*).
  7 | test -e ssshtest || wget -q https://raw.githubusercontent.com/ryanlayer/ssshtest/master/ssshtest
  8 | 
  9 | . ssshtest
 10 | 
 11 | # Build a debug binary with runtime checks enabled; every test below runs it.
 12 | nim c -d:debug  -d:useSysAssert -d:useGcAssert --lineDir:on --debuginfo --boundChecks:on -x:on src/clinvcf
 13 | grch37_version="--genome GRCh37"
 14 | exe=./src/clinvcf
 15 | 
 16 | run simple_parsing $exe --hgnc tests/files/hgnc_toy.tsv --gff tests/files/BRCA2.gff $grch37_version tests/files/37785.xml
 17 | assert_exit_code 0
 18 | assert_in_stdout "##fileDate=2019-12-31"
 19 | assert_in_stdout "13	32893387	37785	T	A"
 20 | assert_in_stdout "CLNSIG=Conflicting_interpretations_of_pathogenicity"
 21 | assert_in_stdout "ALLELEID=46341"
 22 | assert_in_stdout "GENEINFO=BRCA2:675"
 23 | assert_in_stdout "CLNREVSTAT=criteria_provided,_conflicting_interpretations"
 24 | assert_in_stdout "MC=SO:0001583|missense_variant"
 25 | assert_in_stdout "RS=80358507"
 26 | assert_in_stdout "VARIANTTYPE=single_nucleotide_variant"
 27 | assert_in_stdout "VARIANTLENGTH=1"
 28 | 
 29 | # Check integration of NCBI clinsig conversion
 30 | run ncbi_clnsig_conversion $exe --hgnc tests/files/hgnc_toy.tsv $grch37_version tests/files/109.xml
 31 | assert_exit_code 0
 32 | assert_in_stdout "CLNSIG=Likely_pathogenic,_risk_factor"
 33 | assert_in_stdout "VARIANTTYPE=single_nucleotide_variant"
 34 | assert_in_stdout "VARIANTLENGTH=1"
 35 | 
 36 | # Multiple submissions from the same submitter
 37 | run mutli_subs_from_same_submitter $exe --hgnc tests/files/hgnc_toy.tsv $grch37_version tests/files/307134.xml
 38 | assert_exit_code 0
 39 | assert_in_stdout "CLNREVSTAT=criteria_provided,_single_submitter"
 40 | assert_in_stdout "VARIANTTYPE=single_nucleotide_variant"
 41 | assert_in_stdout "VARIANTLENGTH=1"
 42 | 
 43 | # Compound heterozygous case (the test name suggests such submissions are skipped; TODO confirm)
 44 | run skip_het_compound $exe --hgnc tests/files/hgnc_toy.tsv $grch37_version tests/files/928.xml
 45 | assert_exit_code 0
 46 | assert_in_stdout "CLNSIG=Likely_pathogenic"
 47 | 
 48 | # Conflicting variants should always have a conflicting ReviewStatus,
 49 | # even if all submissions are from the same submitter
 50 | run same_submitter_conflict $exe --hgnc tests/files/hgnc_toy.tsv $grch37_version tests/files/1166.xml
 51 | assert_exit_code 0
 52 | assert_in_stdout "CLNREVSTAT=criteria_provided,_conflicting_interpretations"
 53 | assert_in_stdout "VARIANTTYPE=single_nucleotide_variant"
 54 | assert_in_stdout "VARIANTLENGTH=1"
 55 | 
 56 | # Multiple 3-4 stars subs, take them all !!! (see case 7108)
 57 | run run_multiple_3_4_star_subs $exe --hgnc tests/files/hgnc_toy.tsv $grch37_version tests/files/7108.xml
 58 | assert_exit_code 0
 59 | assert_in_stdout "CLNSIG=Pathogenic,_drug_response"
 60 | assert_in_stdout "CLNREVSTAT=reviewed_by_expert_panel"
 61 | assert_in_stdout "VARIANTTYPE=single_nucleotide_variant"
 62 | assert_in_stdout "VARIANTLENGTH=1"
 63 | 
 64 | # Sort non-ACMG clnsig lexicographically
 65 | run sort_non_acmg_cnlsig_tags $exe --hgnc tests/files/hgnc_toy.tsv $grch37_version tests/files/5333.xml
 66 | assert_exit_code 0
 67 | assert_in_stdout "CLNSIG=Affects,_risk_factor"
 68 | assert_in_stdout "VARIANTTYPE=single_nucleotide_variant"
 69 | assert_in_stdout "VARIANTLENGTH=1"
 70 | 
 71 | # Expert panel submission: the trailing ';' ensures no further term follows Pathogenic
 72 | run expert_panel $exe --hgnc tests/files/hgnc_toy.tsv $grch37_version tests/files/582.xml
 73 | assert_exit_code 0
 74 | assert_in_stdout "CLNSIG=Pathogenic;"
 75 | assert_in_stdout "VARIANTTYPE=single_nucleotide_variant"
 76 | assert_in_stdout "VARIANTLENGTH=1"
 77 | 
 78 | # Correction of conflicting interpretation
 79 | run conflict_deciphering $exe --hgnc tests/files/hgnc_toy.tsv $grch37_version tests/files/9.xml
 80 | assert_exit_code 0
 81 | assert_in_stdout "CLNSIG=Pathogenic"
 82 | assert_in_stdout "OLD_CLNSIG=Conflicting_interpretations_of_pathogenicity"
 83 | assert_in_stdout "VARIANTTYPE=single_nucleotide_variant"
 84 | assert_in_stdout "VARIANTLENGTH=1"
 85 | 
 86 | # Handle multiple genes and select the preferred one from HGVS
 87 | run multi_gene_selection $exe --hgnc tests/files/hgnc_toy.tsv --gff tests/files/TREX1.gff $grch37_version tests/files/225499.xml
 88 | assert_exit_code 0
 89 | assert_in_stdout "GENEINFO=TREX1:11277"
 90 | assert_in_stdout "VARIANTTYPE=Duplication"
 91 | assert_in_stdout "VARIANTLENGTH=1"
 92 | 
 93 | # Handle multiple genes and select the preferred one based on submissions (HGVS has no gene)
 94 | run multi_gene_selection_from_submissions $exe --hgnc tests/files/hgnc_toy.tsv --gff tests/files/CFTR.gff $grch37_version tests/files/618897_2019-05.xml
 95 | assert_exit_code 0
 96 | assert_in_stdout "GENEINFO=CFTR:1080"
 97 | assert_in_stdout "VARIANTTYPE=Deletion"
 98 | assert_in_stdout "VARIANTLENGTH=1"
 99 | 
100 | # Mitochondrial annotations
101 | run mito_anno $exe --hgnc tests/files/hgnc_toy.tsv --gff tests/files/MT.gff $grch37_version tests/files/9618.xml
102 | assert_exit_code 0
103 | assert_in_stdout "GENEINFO=TRNE:4556"
104 | assert_in_stdout "VARIANTTYPE=single_nucleotide_variant"
105 | assert_in_stdout "VARIANTLENGTH=1"
106 | 
107 | # Coding-first option for gene annotation, to force the use of protein-coding annotation
108 | run coding_first_control $exe --hgnc tests/files/hgnc_toy.tsv --gff tests/files/ADORA2A.gff $grch37_version tests/files/225974.xml
109 | assert_exit_code 0
110 | assert_in_stdout "GENEINFO=ADORA2A:135"
111 | assert_in_stdout "VARIANTTYPE=single_nucleotide_variant"
112 | assert_in_stdout "VARIANTLENGTH=1"
113 | run coding_first_option $exe --hgnc tests/files/hgnc_toy.tsv --gff tests/files/ADORA2A.gff --coding-first $grch37_version tests/files/225974.xml
114 | assert_exit_code 0
115 | assert_in_stdout "GENEINFO=ADORA2A:135"
116 | assert_in_stdout "VARIANTTYPE=single_nucleotide_variant"
117 | assert_in_stdout "VARIANTLENGTH=1"
118 | # Consider close exonic regions (20bp padding) as exonic in the gene prioritization module
119 | # In this case the variant is in an FTCD-AS1 (protein-coding) exon but 9bp away from FTCD.
120 | # We discriminate between these two genes using the gene_id of FTCD, which is smaller than that of FTCD-AS1
121 | run close_exonic_region $exe --hgnc tests/files/hgnc_toy.tsv --gff tests/files/FTCD.gff $grch37_version tests/files/340430.xml
122 | assert_exit_code 0
123 | assert_in_stdout "GENEINFO=FTCD:10841"
124 | assert_in_stdout "VARIANTTYPE=single_nucleotide_variant"
125 | assert_in_stdout "VARIANTLENGTH=1"
126 | 
127 | # For an antivariant (alternate allele identical to the reference),
128 | # we use "." as the alternate allele representation
129 | run antivariant $exe --hgnc tests/files/hgnc_toy.tsv $grch37_version tests/files/242771.xml
130 | assert_exit_code 0
131 | assert_in_stdout "22	42523943	242771	A	."
132 | assert_in_stdout "VARIANTTYPE=single_nucleotide_variant"
133 | assert_in_stdout "VARIANTLENGTH=1"
134 | 
135 | # Haplotypes should not be exported in the VCF
136 | run haplotype $exe --hgnc tests/files/hgnc_toy.tsv $grch37_version tests/files/16895.xml
137 | assert_exit_code 0
138 | assert_equal "$(grep -v '^#' "$STDOUT_FILE")" ""
139 | 
140 | # 3-stars reclassification system
141 | run three_star_reclassification $exe --hgnc tests/files/hgnc_toy.tsv $grch37_version tests/files/184976.xml
142 | assert_exit_code 0
143 | assert_in_stdout "CLNSIG=Pathogenic/Likely_pathogenic"
144 | assert_in_stdout "CLNRECSTAT=3"
145 | assert_in_stdout "VARIANTTYPE=single_nucleotide_variant"
146 | assert_in_stdout "VARIANTLENGTH=1"
147 | 
148 | run two_star_reclassification $exe --hgnc tests/files/hgnc_toy.tsv $grch37_version tests/files/140866.xml
149 | assert_exit_code 0
150 | assert_in_stdout "CLNSIG=Likely_pathogenic"
151 | assert_in_stdout "CLNRECSTAT=2"
152 | assert_in_stdout "VARIANTTYPE=single_nucleotide_variant"
153 | assert_in_stdout "VARIANTLENGTH=1"
154 | 
155 | run one_star_reclassification $exe --hgnc tests/files/hgnc_toy.tsv $grch37_version tests/files/182965.xml
156 | assert_exit_code 0
157 | assert_in_stdout "CLNRECSTAT=1"
158 | assert_in_stdout "VARIANTTYPE=single_nucleotide_variant"
159 | assert_in_stdout "VARIANTLENGTH=1"
160 | 
161 | # Pathology parsing
162 | run pathology_field_parsing $exe --hgnc tests/files/hgnc_toy.tsv $grch37_version tests/files/109.xml
163 | assert_exit_code 0
164 | assert_in_stdout "CLNDISEASE=pheochromocytoma_susceptibility_to|pheochromocytoma"
165 | assert_in_stdout "VARIANTTYPE=single_nucleotide_variant"
166 | assert_in_stdout "VARIANTLENGTH=1"


--------------------------------------------------------------------------------
/tests/gff_tests.nim:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | import clinvcfpkg/gff
 3 | 
 4 | suite "test GFF functions":
 5 |   # Exercises Region overlap detection, equality and in-place merging.
 6 |   test "test interval":
 7 |     var
 8 |       spanning = Region(chrom: "A", start: 1, stop: 8)
 9 |       later = Region(chrom: "A", start: 5, stop: 12)
10 |       earlier = Region(chrom: "A", start: 1, stop: 4)
11 |       earlierCopy = Region(chrom: "A", start: 1, stop: 4)
12 |     # 1-8 overlaps both 5-12 and 1-4, whereas 5-12 and 1-4 do not overlap
13 |     check spanning.isOverlapping(later)
14 |     check spanning.isOverlapping(earlier)
15 |     check not later.isOverlapping(earlier)
16 |     # Regions built from identical fields compare equal
17 |     check earlierCopy == earlier
18 |     spanning.merge(later)
19 |     check spanning.start == 1
20 |     check spanning.stop == 12
21 |     check later.start == 5
22 |     check later.stop == 12
23 | 


--------------------------------------------------------------------------------
/tests/hgnc_tests.nim:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | import tables
 3 | import clinvcfpkg/hgnc
 4 | 
 5 | suite "test HGNC functions":
 6 | 
 7 |   test "test initHgncDbfromFile":
 8 |     let toyTsv = "tests/files/hgnc_toy.tsv"
 9 |     let db = initHgncDbfromFile(toyTsv)
10 |     # An alias must resolve to its approved symbol (the toy file lists LFS1 as an alias of TP53)
11 |     check db.alias["LFS1"] == "TP53"
12 | 


--------------------------------------------------------------------------------
/tests/nim.cfg:
--------------------------------------------------------------------------------
1 | path = "../src"


--------------------------------------------------------------------------------