├── Build Announcements ├── 151 │ └── README.md └── 152 │ └── README.md ├── Frequency Project ├── README.md └── graf-pop │ ├── AncInferSNPs.txt │ ├── AncestrySnps.cpp │ ├── AncestrySnps.h │ ├── BedFileSnpGeno.cpp │ ├── BedFileSnpGeno.h │ ├── BimFileAncestrySnps.cpp │ ├── BimFileAncestrySnps.h │ ├── ExtractAncSnpsFromVcfGz.pl │ ├── FamFileSamples.cpp │ ├── FamFileSamples.h │ ├── GrafPop.cpp │ ├── GrafPop.h │ ├── GrafPopFiles.pm │ ├── GrafPop_README.md │ ├── GraphColors.pm │ ├── GraphParameters.pm │ ├── GraphTransformation.pm │ ├── Makefile │ ├── PlotGrafPopResults.pl │ ├── PopulationCutoffs.pm │ ├── README.md │ ├── SampleGenoAncestry.cpp │ ├── SampleGenoAncestry.h │ ├── SampleGenoDist.cpp │ ├── SampleGenoDist.h │ ├── SaveSamples.pl │ ├── SubjectAncestry.pm │ ├── Util.cpp │ ├── Util.h │ ├── VcfSampleAncestrySnpGeno.cpp │ ├── VcfSampleAncestrySnpGeno.h │ └── readme.md ├── README.md ├── lib └── python │ ├── README.md │ ├── navs.py │ └── rsatt.py ├── requirement.txt ├── specs ├── README.md ├── eSummary.xml ├── eSummary.xsd └── refsnp_specification_deprecated.yaml ├── tests └── unittest │ ├── test_navs.py │ └── test_snpjsonparser.py └── tutorials ├── Json_tutorial.txt ├── MafGraph.ipynb ├── README.md ├── Variation Services ├── Jupyter_Notebook │ ├── .gitignore │ ├── .library.json │ ├── Data │ │ ├── test_hgvs.txt │ │ ├── test_rs.txt │ │ └── test_vcf.vcf │ ├── README.md │ ├── by_gene.ipynb │ ├── by_rsid.ipynb │ ├── frequencies_for_vcf.ipynb │ ├── metadata_as_hash.ipynb │ ├── navs_spdi_demo.ipynb │ ├── plot.ipynb │ ├── querying_subsets_ftp.ipynb │ └── spdi_batch.ipynb ├── README.md ├── spdi_batch.py ├── test_hgvs.txt ├── test_rs.txt ├── test_variation.py └── test_vcf.vcf ├── eUtils.ipynb ├── extract_flank.sh ├── get_rs_flank.ipynb ├── hadoop_json_annotation.py ├── hadoop_json_clinical.py ├── hadoop_json_merge.py ├── hadoop_json_placement.py ├── refsnp-sample.json.gz ├── rsjson_allele_info_demo.py ├── rsjson_att_demo.ipynb ├── rsjson_demo.py └── rsjson_getss_info_demo.py /Build Announcements/151/README.md: -------------------------------------------------------------------------------- 1 | 2 | # ANNOUNCEMENT: dbSNP Human Build 151 Doubles in Size to 660 Million Reference SNPs (rs) 3 | 4 | 5 | ## April 24, 2018 6 | 7 | 8 | 9 | ## REMINDER: Important dbSNP changes and notifications 10 | https://www.ncbi.nlm.nih.gov/mailman/pipermail/dbsnp-announce/2018q2/000186.html 11 | 12 | 13 | dbSNP human build 151 for both GRCh38.p7 and GRCh37.p13 assemblies is now available. This build includes new submissions from TopMed (https://www.nhlbi.nih.gov/research/resources/nhlbi-precision-medicine-initiative/topmed) and GnomAD (http://gnomad.broadinstitute.org/about), which more than double the number of available dbSNP reference SNPs (rs) from 324 million to 660 million. Allele frequency data are available for more than 500 million rs (see summary below), with most being rare (MAF <= 0.001).
14 | 15 | ### Build Summary: 16 | 17 | |dbSNP ID|Build Total| 18 | |-------------------------------------|-------------| 19 | |Total Submitted SNP (ss) - redundant |1,803,358,848| 20 | |Total Reference SNP (rs) - non-redundant|660,773,127| 21 | 22 | |Genomic mapping|GRCh37.p13|GRCh38.p7| 23 | |--------|-----------|-----------| 24 | |Assembly|648,992,551|660,440,048| 25 | 26 | |Refseq Annotation|GRCh37.p13|GRCh38.p7| 27 | |-----------------|----------|---------| 28 | |Gene ID|30,194|38,811| 29 | |mRNA Accession|106,113|163,679| 30 | |Protein Accession|82,936|115,774| 31 | 32 | |Function Class|GRCh37.p13|GRCh38.p7| 33 | |--------------|----------|---------| 34 | |CDS-INDEL|273,510|313,439| 35 | |CDS-SYNON|3,366,422|3,650,883| 36 | |FRAMESHIFT|411,981|424,833| 37 | |INTRON|250,243,824|348,938,103| 38 | |MISSENSE|6,938,964|7,506,129| 39 | |NCRNA|5,669,033|13,602,650| 40 | |NEARGENE-3|3,818,163|6,400,653| 41 | |NEARGENE-5|15,808,046|25,853,318| 42 | |SPLICE-3|95,699|117,225| 43 | |SPLICE-5|111,212|141,498| 44 | |STOP-GAIN|244,372|279,741| 45 | |STOP-LOSS|10,913|13,661| 46 | |UTR-3|6,129,919|8,668,133| 47 | |UTR-5|2,419,529|4,640,915| 48 | 49 | 50 | ### RS Allele Frequency Counts: 51 | 52 | |Minor Allele Frequency (MAF)|TOPMED|GnomAD|1000 Genomes|ExAC|GO-ESP| 53 | |-----------------------|-----------|-----------|----------|---------|---------| 54 | |<=0.001|470,535,424|198,960,749|54,686,241|8,667,575|1,527,303| 55 | |>0.001 and <=0.01|20,069,273|15,967,086|15,944,038|276,851|197,816| 56 | |>0.01 and <0.1|11,410,633|7,940,223|7,852,416|99,605|99,694| 57 | |>=0.1|8,225,418|5,799,895|6,365,867|85,773|63,191| 58 | |Total RS with Frequency|510,240,748|228,667,953|84,848,562|9,129,804|1,888,004| 59 | 60 | ### Entrez Search 61 | https://www.ncbi.nlm.nih.gov/snp 62 | 63 | ### FTP 64 | 65 | GRCh37.p13 66 | [ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606_b151_GRCh37p13](ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606_b151_GRCh37p13) 67 | 68 | 69 | GRCh38.p7 70 | [ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606_b151_GRCh38p7](ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606_b151_GRCh38p7) 71 | 72 | 73 | 74 | Please email snp-admin@ncbi.nlm.nih.gov for any questions, suggestions, and comments. 75 | 76 | Regards, 77 | 78 | dbSNP Production Team 79 | -------------------------------------------------------------------------------- /Build Announcements/152/README.md: -------------------------------------------------------------------------------- 1 | ## dbSNP build 152 release notes 2 | https://www.ncbi.nlm.nih.gov/mailman/pipermail/dbsnp-announce/2018q4/000193.html 3 | ```text 4 | Organism name: Homo sapiens 5 | Taxonomy ID: 9606 6 | 7 | 1. RefSnp (RS) 8 | 9 | Total RS: 683309324 10 | 11 | 1.1 RS counts by location: 12 | chr1 51344938 13 | chr2 54963502 14 | chr3 44984330 15 | chr4 43257612 16 | chr5 40588494 17 | chr6 37950204 18 | chr7 36319227 19 | chr8 34468936 20 | chr9 28512286 21 | chr10 30298369 22 | chr11 31059211 23 | chr12 30016334 24 | chr13 22168655 25 | chr14 20193282 26 | chr15 18891124 27 | chr16 20788663 28 | chr17 18415589 29 | chr18 17534420 30 | chr19 14060284 31 | chr20 14403548 32 | chr21 8635154 33 | chr22 8971776 34 | chrX 25569389 35 | chrY 1275185 36 | chrM 3164 37 | Alt Only 29553 38 | Patch 7655 39 | Not On 39879 40 | PAR 798063 41 | Unplaced 93094 42 | Unlocalized 184778 43 | 44 | NOTE: Assembly term (ALT, PAR, etc.) definitions (https://www.ncbi.nlm.nih.gov/grc/help/definitions/). 
45 | 46 | 1.2 RS counts by type: 47 | Live RS 655379774 48 | Unsupported RS 103739 49 | Withdrawn RS 6854924 50 | Locationless RS 1286 51 | Merged RS 20969601 52 | 53 | RS Type Definitions: 54 | Live = RS has location on reference sequence 55 | Merged = RS merged to existing RS due to improved clustering algorithm or possibly a change to the reference sequence that would result in identical canonical alleles (e.g. updated repeat regions). 56 | Unsupported = No Submitted SNP (SS) matched any of the RS alleles. (Same causes as merging.) 57 | Locationless = An older RS where the location couldn't be obtained from SS and was not available for the build. 58 | Withdrawn = All SS that belong to the RS cluster were withdrawn. 59 | 60 | All above RS including non-Live records have history for traceability. 61 | 62 | 2. SubSnp (SS) 63 | 64 | Total SS: 1828331768 65 | Unmapped SS: 60193 66 | ``` 67 | -------------------------------------------------------------------------------- /Frequency Project/README.md: -------------------------------------------------------------------------------- 1 | # dbSNP (https://www.ncbi.nlm.nih.gov/snp) 2 | ## ****This project is subject to change due to work in progress. Please follow this space for updates.**** 3 | 4 | dbSNP aggregate allele frequency data from multiple sources including: 5 | 6 | * HapMap 7 | * 1000 Genomes 8 | * GO-ESP 9 | * ExAC 10 | * GnomAD 11 | * TOPMED 12 | * and many others 13 | 14 | Example: (https://www.ncbi.nlm.nih.gov/snp/rs328#frequency_tab) 15 | 16 | dbSNP is currently designing new services to allow searching and retrieving frequency data. Please send your comments and suggestions to snp-admin@ncbi.nlm.nih.gov or submit a request on GitHub (https://github.com/ncbi/dbsnp/issues). 17 | 18 | Thank you for your interest. 19 | 20 | Regards, 21 | 22 | dbSNP Team 23 | 24 | 25 | 26 | --------------------------------------------------------------------------------------------------- 27 | # ASHG 2019 Presentation 28 | 29 | ### Open access to dbGaP new aggregated allele frequency for variant interpretation. 30 | 31 | The slides are available on the dbSNP homepage (https://www.ncbi.nlm.nih.gov/snp/) 32 | 33 | dbSNP Frequency content: https://ftp.ncbi.nlm.nih.gov/pub/factsheets/CoLabs_dbGaP_Frequency.pdf 34 | 35 | ## Abstract: 36 | 37 | NCBI database of Genotypes and Phenotypes (dbGaP) contains the results of over 1,200 studies investigating the interaction of genotype and phenotype. The database has over two million subjects and hundreds of millions of variants along with thousands of phenotypes and molecular assay data. This unprecedented volume and variety of data promise huge opportunities to identify genetic factors that influence health and disease. With this possibility, NIH has recently updated the Genomic Summary Results (GSR) access restriction to allow responsible sharing and use of the dbGaP GSR data (https://grants.nih.gov/grants/guide/notice-files/NOT-OD-19-023.html). 38 | 39 | In fulfilling the updated GSR policy and to improve variant interpretation for health and disease, NCBI has undertaken the challenging task to compute allele frequency for variants in dbGaP across approved un-restricted studies and provide the data as ‘open-access’ to the public. The work involved harmonizing and normalizing heterogeneous data and file formats either from GWAS chip array or direct sequencing. 
Using dbSNP and dbGaP workflows the data were QA/QC and were transformed to standard VCF format as input into an automated pipeline to aggregate, remap and cluster to existing dbSNP rs, and compute allele frequency. Allele frequencies are calculated for 12 major populations including European, Hispanic, African, Asian, and others that were computed using GRAF-pop (Jin et al., 2019). 40 | 41 | The initially released data (pending) included MAF for about 500M sites with data in dbSNP and +20M novel sites from +150 thousand subjects across more than 60 studies. dbGaP MAF data are consistent with MAF data previously reported in GnomAD for the same variants. Moreover, dbGaP has frequency data for novel and existing variants in dbSNP and ClinVar but not reported in 1000Genomes, GnomAD, ExAC, or TopMed. The data volume will grow and can potentially reach over a billion variants from millions of subjects combined across all dbGaP studies. New studies will be added to future dbSNP build release for ‘de novo’ allele frequency calculation across all studies. This presentation will describe the available resources (Web, FTP, and API) and how researchers, clinicians, and developers can incorporate these data into their workflows and applications to understanding human variation and disease. 42 | 43 | ## Acknowledgments: 44 | Work at NCBI is supported by the NIH Intramural Research Program and the National Library of Medicine. 45 | -------------------------------------------------------------------------------- /Frequency Project/graf-pop/AncestrySnps.cpp: -------------------------------------------------------------------------------- 1 | #include "AncestrySnps.h" 2 | 3 | AncestrySnp::AncestrySnp(int id, int rsNum, int ch, int g37, int g38, char a1, char a2, float* refPops, float *vtxPops) 4 | { 5 | snpId = id; 6 | rs = rsNum; 7 | chr = ch; 8 | posG37 = g37; 9 | posG38 = g38; 10 | ref = a1; 11 | alt = a2; 12 | 13 | for (int i = 0; i < numRefPops; i++) refPopAfs[i] = refPops[i]; 14 | for (int i = 0; i < numVtxPops; i++) vtxPopAfs[i] = vtxPops[i]; 15 | } 16 | 17 | AncestrySnps::AncestrySnps() 18 | { 19 | 20 | refPopNames[0] = "African"; 21 | refPopNames[1] = "European"; 22 | refPopNames[2] = "Asian"; 23 | refPopNames[3] = "Mexican"; 24 | refPopNames[4] = "Indian-Pakistani"; 25 | 26 | snps = {}; 27 | } 28 | 29 | AncestrySnps::~AncestrySnps() 30 | { 31 | snps.clear(); 32 | rsToAncSnpId.clear(); 33 | pos37ToAncSnpId.clear(); 34 | pos38ToAncSnpId.clear(); 35 | } 36 | 37 | int AncestrySnps::ReadAncestrySnpsFromFile(string ancSnpFile) 38 | { 39 | ASSERT(FileExists(ancSnpFile.c_str()), "File " << ancSnpFile << " does not exist!\n"); 40 | 41 | double popExpPfSums[numRefPops]; 42 | double popExpPaSums[numRefPops]; 43 | double popExpPeSums[numRefPops]; 44 | 45 | for (int popId = 0; popId < numRefPops; popId++) { 46 | popExpPeSums[popId] = 0; 47 | popExpPfSums[popId] = 0; 48 | popExpPaSums[popId] = 0; 49 | } 50 | 51 | int lineLen = 300; 52 | char snpLine[lineLen]; 53 | 54 | FILE *ifp = fopen(ancSnpFile.c_str(), "r"); 55 | ASSERT(ifp, "Couldn't open file " << ancSnpFile << ".\n"); 56 | 57 | int lineNo = 0; 58 | bool fileIsValid = true; 59 | 60 | int numSnps = 0; 61 | int rsNum, chr, g37, g38; 62 | char a1, a2; 63 | float rfEur, rfAfa, rfAsn, rfLat, rfSas, vtEur, vtAfr, vtEas; 64 | 65 | while (fgets(snpLine, lineLen, ifp) != NULL && fileIsValid == true) { 66 | if (lineNo == 0) { 67 | if (snpLine[0] != 'c' || snpLine[1] != 'h' || snpLine[2] != 'r') { 68 | fileIsValid = false; 69 | } 70 | } 71 | else { 72 | 
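// Each data line of the ancestry SNP file is whitespace-delimited:
// chromosome, GRCh37 position, GRCh38 position, rs number, ref allele, alt allele,
// five reference-population allele frequencies (rfEur, rfAfa, rfAsn, rfLat, rfSas)
// and three vertex-population frequencies (vtEur, vtAfr, vtEas), read below in that order.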
sscanf(snpLine, "%d %d %d %d %c %c %f %f %f %f %f %f %f %f", 73 | &chr, &g37, &g38, &rsNum, &a1, &a2, &rfEur, &rfAfa, &rfAsn, &rfLat, &rfSas, &vtEur, &vtAfr, &vtEas); 74 | 75 | float* refPopAfs = new float[numRefPops]; 76 | float* vtxPopAfs = new float[numVtxPops]; 77 | 78 | refPopAfs[0] = rfEur; 79 | refPopAfs[1] = rfAfa; 80 | refPopAfs[2] = rfAsn; 81 | refPopAfs[3] = rfLat; 82 | refPopAfs[4] = rfSas; 83 | 84 | vtxPopAfs[0] = vtEur; 85 | vtxPopAfs[1] = vtAfr; 86 | vtxPopAfs[2] = vtEas; 87 | 88 | AncestrySnp ancSnp(numSnps, rsNum, chr, g37, g38, a1, a2, refPopAfs, vtxPopAfs); 89 | 90 | snps.push_back(ancSnp); 91 | 92 | long int chrPos37 = (long)chr * 1000000000 + g37; 93 | long int chrPos38 = (long)chr * 1000000000 + g38; 94 | 95 | rsToAncSnpId[rsNum] = numSnps; 96 | pos37ToAncSnpId[chrPos37] = numSnps; 97 | pos38ToAncSnpId[chrPos38] = numSnps; 98 | 99 | double pev = refPopAfs[0]; 100 | double pfv = refPopAfs[1]; 101 | double pav = refPopAfs[2]; 102 | 103 | double qev = 1 - pev; 104 | double qfv = 1 - pfv; 105 | double qav = 1 - pav; 106 | 107 | for (int vtxId = 0; vtxId < 3; vtxId++) { 108 | double pv = vtxPopAfs[vtxId]; 109 | double qv = 1 - pv; 110 | 111 | double aaPev = log(pev) * 2; 112 | double bbPev = log(qev) * 2; 113 | double abPev = log(pev) + log(qev) + log(2); 114 | 115 | double aaPfv = log(pfv) * 2; 116 | double bbPfv = log(qfv) * 2; 117 | double abPfv = log(pfv) + log(qfv) + log(2); 118 | 119 | double aaPav = log(pav) * 2; 120 | double bbPav = log(qav) * 2; 121 | double abPav = log(pav) + log(qav) + log(2); 122 | 123 | double eGd = aaPev * pv * pv + bbPev * qv * qv + abPev * 2 * pv * qv; 124 | double fGd = aaPfv * pv * pv + bbPfv * qv * qv + abPfv * 2 * pv * qv; 125 | double aGd = aaPav * pv * pv + bbPav * qv * qv + abPav * 2 * pv * qv; 126 | 127 | vtxExpGenoDists[vtxId][0][numSnps] = eGd; 128 | vtxExpGenoDists[vtxId][1][numSnps] = fGd; 129 | vtxExpGenoDists[vtxId][2][numSnps] = aGd; 130 | 131 | popExpPeSums[vtxId] += eGd; 132 | popExpPfSums[vtxId] += fGd; 133 | popExpPaSums[vtxId] += aGd; 134 | } 135 | 136 | delete refPopAfs; 137 | delete vtxPopAfs; 138 | 139 | numSnps++; 140 | } 141 | 142 | lineNo++; 143 | } 144 | fclose(ifp); 145 | 146 | ASSERT(numSnps == numAncSnps, "numSnps = " << numAncSnps << ".\n"); 147 | 148 | for (int vtxId = 0; vtxId < 3; vtxId++) { 149 | vtxPopExpGds[vtxId].e = -1 * popExpPeSums[vtxId]/numSnps; 150 | vtxPopExpGds[vtxId].f = -1 * popExpPfSums[vtxId]/numSnps; 151 | vtxPopExpGds[vtxId].a = -1 * popExpPaSums[vtxId]/numSnps; 152 | } 153 | 154 | cout << "Read " << numSnps << " ancestry SNPs from file " << ancSnpFile << "\n\n"; 155 | if (0) { 156 | cout << "Expected vertex genetic distances\n"; 157 | for (int vtxId = 0; vtxId < 3; vtxId++) { 158 | cout << "\tVertex " << vtxId << "\n"; 159 | cout << "\t\tEUR: " << vtxPopExpGds[vtxId].e << "\n"; 160 | cout << "\t\tAFR: " << vtxPopExpGds[vtxId].f << "\n"; 161 | cout << "\t\tEAS: " << vtxPopExpGds[vtxId].a << "\n"; 162 | } 163 | } 164 | } 165 | 166 | int AncestrySnps::FindSnpIdGivenRs(int rsNum) 167 | { 168 | int snpId = -1; 169 | 170 | if (rsToAncSnpId.find(rsNum) != rsToAncSnpId.end()) { 171 | snpId = rsToAncSnpId[rsNum]; 172 | } 173 | 174 | return snpId; 175 | } 176 | 177 | int AncestrySnps::FindSnpIdGivenChrPos(int chr, int pos, int build) 178 | { 179 | int snpId = -1; 180 | 181 | long int chrPos = long(chr) * 1000000000 + pos; 182 | 183 | if (build == 37) { 184 | if (pos37ToAncSnpId.find(chrPos) != pos37ToAncSnpId.end()) { 185 | snpId = pos37ToAncSnpId[chrPos]; 186 | } 187 | } 188 | else if (build == 
38) { 189 | if (pos38ToAncSnpId.find(chrPos) != pos38ToAncSnpId.end()) { 190 | snpId = pos38ToAncSnpId[chrPos]; 191 | } 192 | } 193 | 194 | return snpId; 195 | } 196 | 197 | AncestrySnp AncestrySnps::GetAncestrySnp(int snpId) 198 | { 199 | return snps[snpId]; 200 | } 201 | 202 | void AncestrySnps::ShowAncestrySnps() 203 | { 204 | int numAncSnps = snps.size(); 205 | 206 | cout << "Total " << numAncSnps << " Ancestry SNPs.\n"; 207 | bool debug = 0; 208 | 209 | if (debug) { 210 | for (int i = 0; i < 20; i++) { 211 | int snpId = i * 5000; 212 | AncestrySnp snp = snps[snpId]; 213 | cout << "SNP " << snpId << " rs " << snp.rs << " chr " << snp.chr << " pos " << snp.posG37 214 | << " ref " << snp.ref << " alt " << snp.alt; 215 | for (int j = 0; j < numRefPops; j++) cout << " Ref " << j << " = " << snp.refPopAfs[j]; 216 | for (int j = 0; j < numVtxPops; j++) cout << " Vtx " << j << " = " << snp.vtxPopAfs[j]; 217 | cout << "\n"; 218 | } 219 | } 220 | 221 | cout << "Positions (x, y, z coordinates) of the three vertices when all SNPs have genotypes:\n"; 222 | printf("\tE: %5.4f %5.4f %5.4f\n", vtxPopExpGds[0].e, vtxPopExpGds[0].f, vtxPopExpGds[0].a); 223 | printf("\tF: %5.4f %5.4f %5.4f\n", vtxPopExpGds[1].e, vtxPopExpGds[1].f, vtxPopExpGds[1].a); 224 | printf("\tA: %5.4f %5.4f %5.4f\n", vtxPopExpGds[2].e, vtxPopExpGds[2].f, vtxPopExpGds[2].a); 225 | 226 | cout << "\nExpected genetic distances\n"; 227 | for (int i = 0; i < 3; i++) { 228 | for (int j = 0; j < 3; j++) { 229 | cout << i << "-" << j << ": "; 230 | for (int k = 0; k < 5; k++) { 231 | cout << vtxExpGenoDists[i][j][k] << " "; 232 | } 233 | cout << "\n"; 234 | } 235 | } 236 | } 237 | -------------------------------------------------------------------------------- /Frequency Project/graf-pop/AncestrySnps.h: -------------------------------------------------------------------------------- 1 | #ifndef ANCESTRY_SNPS_H 2 | #define ANCESTRY_SNPS_H 3 | 4 | #include "Util.h" 5 | 6 | static const int numAncSnps = 100437; 7 | static const int numRefPops = 5; 8 | static const int numVtxPops = 3; 9 | 10 | class AncestrySnp 11 | { 12 | public: 13 | int snpId; 14 | int rs; 15 | int chr; 16 | int posG37; 17 | int posG38; 18 | char ref; 19 | char alt; 20 | float vtxPopAfs[numVtxPops]; // E, F, A, or EUR, AFR, EAS 21 | float refPopAfs[numRefPops]; // EUR, AFA, ASN, LAT, SAS 22 | 23 | public: 24 | AncestrySnp(int, int, int, int, int, char, char, float*, float*); 25 | }; 26 | 27 | class AncestrySnps 28 | { 29 | map rsToAncSnpId; 30 | map pos37ToAncSnpId; 31 | map pos38ToAncSnpId; 32 | 33 | public: 34 | AncestrySnps(); 35 | ~AncestrySnps(); 36 | vector snps; 37 | // For each SNP, keeps the expected genetic distance from the 3 vertices to the 3 ref population 38 | double vtxExpGenoDists[numVtxPops][numVtxPops][numAncSnps]; 39 | // Vertex genetic distances summed up using all ancestry SNPs 40 | GenoDist vtxPopExpGds[numVtxPops]; 41 | 42 | string refPopNames[numRefPops]; 43 | 44 | int ReadAncestrySnpsFromFile(string); 45 | int FindSnpIdGivenRs(int); 46 | int FindSnpIdGivenChrPos(int, int, int); 47 | AncestrySnp GetAncestrySnp(int); 48 | void SetVertexExpecteGeneticDists(); 49 | int GetNumAncestrySnps() { return snps.size(); }; 50 | void ShowAncestrySnps(); 51 | }; 52 | 53 | #endif 54 | -------------------------------------------------------------------------------- /Frequency Project/graf-pop/BedFileSnpGeno.cpp: -------------------------------------------------------------------------------- 1 | #include "BedFileSnpGeno.h" 2 | 3 | 
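// BedFileSnpGeno reads the genotypes of the ancestry SNPs from a binary PLINK .bed file.
// PLINK packs genotypes two bits per sample (four samples per byte); baseNums holds the
// single-bit masks used to pull those bits apart, and RecodeBedSnpGeno converts each byte
// into per-sample codes 0, 1, 2 (3 = missing), swapping 0 and 2 when the bim alleles are
// reversed relative to the ancestry SNP table.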
BedFileSnpGeno::BedFileSnpGeno(string bFile, AncestrySnps *aSnps, BimFileAncestrySnps *bSnps, FamFileSamples *fSmps) 4 | { 5 | bedFile = bFile; 6 | ancSnps = aSnps; 7 | bimSnps = bSnps; 8 | famSmps = fSmps; 9 | 10 | unsigned long base = 1; 11 | for (int i = 0; i < 64; i++) { 12 | baseNums[i] = base; 13 | base = base << 1; 14 | } 15 | 16 | numAncSnps = ancSnps->GetNumAncestrySnps(); 17 | numBimSnps = bimSnps->GetNumBimSnps(); 18 | numBimAncSnps = bimSnps->GetNumBimAncestrySnps(); 19 | numSamples = famSmps->GetNumFamSamples(); 20 | 21 | ancSnpSmpGenos = {}; 22 | ancSnpSnpIds = {}; 23 | 24 | vtxExpGd0 = new SampleGenoDist(&aSnps->vtxPopExpGds[0], &aSnps->vtxPopExpGds[1], 25 | &aSnps->vtxPopExpGds[2], &aSnps->vtxPopExpGds[0]); 26 | vtxExpGd0->TransformAllDists(); 27 | vtxExpGd0->CalculateBaryCenters(); 28 | } 29 | 30 | BedFileSnpGeno::~BedFileSnpGeno() 31 | { 32 | for (int i = 0; i < ancSnpSmpGenos.size(); i++) { 33 | delete ancSnpSmpGenos[i]; 34 | } 35 | ancSnpSmpGenos.clear(); 36 | ancSnpSnpIds.clear(); 37 | } 38 | 39 | char BedFileSnpGeno::GetCompAllele(char a) 40 | { 41 | char c = '0'; 42 | if (a == 'A') c = 'T'; 43 | else if (a == 'T') c = 'A'; 44 | else if (a == 'G') c = 'C'; 45 | else if (a == 'C') c = 'G'; 46 | 47 | return c; 48 | } 49 | 50 | char* BedFileSnpGeno::RecodeBedSnpGeno(char *snpBedGenos, int numBytes, bool swap) 51 | { 52 | char *snpGenos = new char[numSamples]; // char only takes one byte 53 | for (int i = 0; i < numSamples; i++) snpGenos[i] = 3; 54 | 55 | int smpNo = 0; 56 | int byteNo = 0; 57 | 58 | for (byteNo = 0; byteNo < numBytes; byteNo++) { 59 | char genoByte = snpBedGenos[byteNo]; 60 | int val = int(genoByte); 61 | 62 | for (int byteSmpNo = 0; byteSmpNo < 4; byteSmpNo++) { 63 | int bit1Pos = byteSmpNo * 2; 64 | int bit2Pos = bit1Pos + 1;; 65 | 66 | int bit1 = genoByte & baseNums[bit1Pos]; 67 | int bit2 = genoByte & baseNums[bit2Pos]; 68 | 69 | int intGeno = 3; 70 | if ( bit1 && bit2) intGeno = 2; 71 | else if (!bit1 && bit2) intGeno = 1; 72 | else if (!bit1 && !bit2) intGeno = 0; 73 | 74 | if (swap) { 75 | if (intGeno == 0) intGeno = 2; 76 | else if (intGeno == 2) intGeno = 0; 77 | } 78 | 79 | if (smpNo < numSamples) snpGenos[smpNo] = intGeno; 80 | smpNo++; 81 | } 82 | } 83 | 84 | return snpGenos; 85 | } 86 | 87 | bool BedFileSnpGeno::ReadGenotypesFromBedFile() 88 | { 89 | bool hasErr = false; 90 | 91 | char header[2]; 92 | char mode[1]; 93 | 94 | ifstream bedFilePtr (bedFile, ios::in | ios::binary); 95 | 96 | long snpNumBytes = (numSamples - 1) / 4 + 1; 97 | long expFileLen = snpNumBytes * numBimSnps + 3; 98 | 99 | bedFilePtr.seekg (0, bedFilePtr.end); 100 | long fileLen = bedFilePtr.tellg(); 101 | bedFilePtr.seekg(0, bedFilePtr.beg); 102 | 103 | bedFilePtr.read (header, 2); 104 | bedFilePtr.read (mode, 1); 105 | 106 | if (header[0] != 108 || header[1] != 27) { 107 | cout << "ERROR: File " << bedFile << " is not a valid PLINK bed file!\n"; 108 | hasErr = true; 109 | } 110 | else if (mode[0] != 1) { 111 | cout << "ERROR: File " << bedFile << " is not in SNP mode!\n"; 112 | hasErr = true; 113 | } 114 | 115 | if (fileLen != expFileLen) { 116 | cout << "ERROR: Number of genotypes in bed file doesn't match fam and bim File!\n"; 117 | cout << "\tFam file has " << numSamples << " samples. Bim file has " 118 | << numBimSnps << " SNPs. Each SNP should have " 119 | << snpNumBytes << " bytes. 
Expected total " << expFileLen << " bytes.\n"; 120 | cout << "\tBed file has " << fileLen << " bytes.\n"; 121 | hasErr = true; 122 | } 123 | 124 | if (hasErr) return hasErr; 125 | cout << "Reading genotypes from " << bedFile << "\n"; 126 | 127 | char buff[snpNumBytes]; // Reusable memory to keep the genotypes 128 | int bimAncSnpNo = 0; 129 | 130 | for (int i = 0; i < numBimSnps; i++) { 131 | bedFilePtr.read (buff, snpNumBytes); 132 | int ancSnpId = bimSnps->GetAncSnpIdGivenBimSnpPos(i); 133 | int match = bimSnps->GetAlleleMatchGivenBimSnpPos(i); 134 | bool swap = match == 2 || match == -2 ? true : false; 135 | 136 | if (ancSnpId >= 0) { 137 | char* snpGenoStr = new char[snpNumBytes]; 138 | for (int j = 0; j < snpNumBytes; j++) snpGenoStr[j] = buff[j]; 139 | ASSERT(bimAncSnpNo < numAncSnps, "bim ancestry SNP ID " << bimAncSnpNo << " not less than " << numAncSnps << "\n"); 140 | 141 | char *snpSmpGeno = RecodeBedSnpGeno(snpGenoStr, snpNumBytes, swap); 142 | 143 | ancSnpSmpGenos.push_back(snpSmpGeno); 144 | ancSnpSnpIds.push_back(ancSnpId); 145 | 146 | bimAncSnpNo++; 147 | } 148 | } 149 | 150 | bedFilePtr.close(); 151 | numBimAncSnps = bimAncSnpNo; 152 | 153 | cout << "Read genotypes of " << bimAncSnpNo << " Ancestry SNPs from total " << numBimSnps << " SNPs.\n"; 154 | cout << "Bed file has genotypes of " << numBimSnps << " SNPs. Read genotypes of " 155 | << numBimAncSnps << " ancestry SNPs for " << numSamples << " samples.\n"; 156 | 157 | return 0; 158 | } 159 | 160 | 161 | int BedFileSnpGeno::GetSnpGenoInt(bool b1, bool b2) 162 | { 163 | int g = 3; 164 | 165 | if (!b1 && !b2) { 166 | g = 0; 167 | } 168 | else if (!b1 && b2) { 169 | g = 1; 170 | } 171 | else if (b1 && b2) { 172 | g = 2; 173 | } 174 | 175 | return g; 176 | } 177 | 178 | void BedFileSnpGeno::ShowSummary() 179 | { 180 | cout << "\n"; 181 | cout << "Total " << numSamples << " samples\n"; 182 | cout << "Total " << numAncSnps << " Ancestry SNPs\n"; 183 | cout << "Total " << numBimAncSnps << " Ancestry SNPs in bim file\n\n"; 184 | } 185 | -------------------------------------------------------------------------------- /Frequency Project/graf-pop/BedFileSnpGeno.h: -------------------------------------------------------------------------------- 1 | #ifndef BED_FILE_SNP_GENO_H 2 | #define BED_FILE_SNP_GENO_H 3 | 4 | #include 5 | #include "Util.h" 6 | #include "AncestrySnps.h" 7 | #include "BimFileAncestrySnps.h" 8 | #include "FamFileSamples.h" 9 | #include "SampleGenoDist.h" 10 | 11 | static const int BYTE1_IN_BED_FILE = 108; 12 | static const int BYTE2_IN_BED_FILE = 27; 13 | static const int BYTE_OF_SNP_MODE = 1; 14 | 15 | class BedFileSnpGeno 16 | { 17 | public: 18 | unsigned long baseNums[64]; // bits 1, 10, 100 ... 
for decoding genos in bed file 19 | 20 | int numAncSnps; 21 | int numSamples; 22 | int numBimSnps; 23 | int numBimAncSnps; 24 | 25 | string bedFile; 26 | AncestrySnps *ancSnps; 27 | BimFileAncestrySnps *bimSnps; 28 | FamFileSamples *famSmps; 29 | SampleGenoDist *vtxExpGd0; // Genetic distances from 3 vertices to ref populations when all SNPs have genotypes 30 | 31 | public: 32 | vector ancSnpSmpGenos; // Genotypes of Ancestry SNPs in an array of chars (0 = AA, 1 = AB; 2 = BB) of chars 33 | vector ancSnpSnpIds; // Genotypes of Ancestry SNPs in an array of SNP IDs 34 | 35 | BedFileSnpGeno(string, AncestrySnps*, BimFileAncestrySnps*, FamFileSamples*); 36 | ~BedFileSnpGeno(); 37 | bool ReadGenotypesFromBedFile(); 38 | void ShowSummary(); 39 | void InitPopPvalues(); 40 | 41 | private: 42 | int genoFileLineLen; // Max length of one sample geno line (with sample info) 43 | int smpNameLen; 44 | 45 | char GetCompAllele(char); 46 | int GetSnpGenoInt(bool, bool); 47 | char* RecodeBedSnpGeno(char*, int, bool); 48 | }; 49 | 50 | #endif 51 | -------------------------------------------------------------------------------- /Frequency Project/graf-pop/BimFileAncestrySnps.cpp: -------------------------------------------------------------------------------- 1 | #include "BimFileAncestrySnps.h" 2 | 3 | BimFileAncestrySnps::BimFileAncestrySnps() 4 | { 5 | numDupAncSnps = 0; 6 | filename = ""; 7 | numBimSnps = 0; 8 | } 9 | 10 | BimFileAncestrySnps::BimFileAncestrySnps(int totSnps) 11 | { 12 | totAncSnps = totSnps; 13 | numDupAncSnps = 0; 14 | filename = ""; 15 | numBimSnps = 0; 16 | numBimAncSnps = 0; 17 | numGoodAncSnps = 0; 18 | numRsAncSnps = 0; 19 | numPos37Snps = 0; 20 | numPos38Snps = 0; 21 | } 22 | 23 | BimFileAncestrySnps::~BimFileAncestrySnps() 24 | { 25 | bimSnpAncSnpIds.clear(); 26 | } 27 | 28 | char BimFileAncestrySnps::FlipAllele(char allele) 29 | { 30 | char flipAllele = '0'; 31 | 32 | switch(allele) { 33 | case 'A': flipAllele = 'T'; break; 34 | case 'T': flipAllele = 'A'; break; 35 | case 'G': flipAllele = 'C'; break; 36 | case 'C': flipAllele = 'G'; break; 37 | } 38 | 39 | return flipAllele; 40 | } 41 | 42 | int BimFileAncestrySnps::CompareAncestrySnpAlleles(const char a1, const char a2, const char expA1, const char expA2) 43 | { 44 | int match = 0; 45 | 46 | char fa1 = FlipAllele(a1); 47 | char fa2 = FlipAllele(a2); 48 | 49 | if (a1 == expA1 && a2 == expA2) { 50 | match = 1; 51 | } 52 | else if (a1 == expA2 && a2 == expA1) { 53 | match = 2; // swap 54 | } 55 | else if (fa1 == expA1 && fa2 == expA2) { 56 | match = -1; // flip 57 | } 58 | else if (fa1 == expA2 && fa2 == expA1) { 59 | match = -2; // swap and flip 60 | } 61 | 62 | return match; 63 | } 64 | 65 | int BimFileAncestrySnps::ReadAncestrySnpsFromFile(string bimFile, AncestrySnps* ancSnps) 66 | { 67 | cout << "Reading SNPs from file " << bimFile << "\n"; 68 | 69 | ASSERT(FileExists(bimFile.c_str()), "File " << bimFile << " does not exist."); 70 | 71 | filename = bimFile; 72 | 73 | int lineLen = 1048675; 74 | char fpLine[lineLen]; 75 | 76 | FILE *ifp = fopen(bimFile.c_str(), "r"); 77 | ASSERT(ifp, "Could not open " << bimFile << "\n"); 78 | 79 | bool fileIsValid = true; 80 | 81 | int i; 82 | int rs, pos; 83 | char chrStr[128], rsStr[128], cm[64]; 84 | char refStr[524288], altStr[524288]; // In case there are very, very long refs or alts 85 | 86 | // Read the bed file and save lines with potential ancestry SNPs into memory 87 | vector bimSnpIds; 88 | vector rsAncSnpIds; 89 | vector pos37SnpIds; 90 | vector pos38SnpIds; 91 | vector refs; 92 
| vector alts; 93 | 94 | int numSaveSnps = 0; 95 | int numRsAncSnps = 0; 96 | int numPos37Snps = 0; 97 | int numPos38Snps = 0; 98 | 99 | numBimSnps = 0; 100 | while (fgets(fpLine, lineLen, ifp) != NULL && fileIsValid == true) { 101 | sscanf(fpLine, "%s %s %s %d %s %s", chrStr, rsStr, cm, &pos, refStr, altStr); 102 | 103 | int chr = GetChromosomeFromString(chrStr); 104 | int rsNum = GetRsNumFromString(rsStr); 105 | 106 | int rsAncSnpId = ancSnps->FindSnpIdGivenRs(rsNum); 107 | int pos37SnpId = ancSnps->FindSnpIdGivenChrPos(chr, pos, 37); 108 | int pos38SnpId = ancSnps->FindSnpIdGivenChrPos(chr, pos, 38); 109 | 110 | if (rsAncSnpId > -1 || pos37SnpId > -1 || pos38SnpId > -1) { 111 | char ref = 0, alt = 0; 112 | if (strlen(refStr) == 1) ref = refStr[0]; 113 | if (strlen(altStr) == 1) alt = altStr[0]; 114 | 115 | bimSnpIds.push_back(numBimSnps); 116 | rsAncSnpIds.push_back(rsAncSnpId); 117 | pos37SnpIds.push_back(pos37SnpId); 118 | pos38SnpIds.push_back(pos38SnpId); 119 | refs.push_back(ref); 120 | alts.push_back(alt); 121 | 122 | if (rsAncSnpId > -1) numRsAncSnps++; 123 | if (pos37SnpId > -1) numPos37Snps++; 124 | if (pos38SnpId > -1) numPos38Snps++; 125 | 126 | numSaveSnps++; 127 | } 128 | 129 | numBimSnps++; 130 | } 131 | 132 | fclose(ifp); 133 | 134 | // Rs ID, GB37, or GB38, use whichever returns the most ancestry SNPs to find these SNPs 135 | ancSnpType = AncestrySnpType::RSID; 136 | int maxBimAncSnps = numRsAncSnps; 137 | 138 | if (numPos37Snps > maxBimAncSnps) { 139 | ancSnpType = AncestrySnpType::GB37; 140 | maxBimAncSnps = numPos37Snps; 141 | } 142 | 143 | if (numPos38Snps > maxBimAncSnps) { 144 | ancSnpType = AncestrySnpType::GB38; 145 | maxBimAncSnps = numPos38Snps; 146 | } 147 | 148 | for (i = 0; i < numBimSnps; i++) { 149 | bimSnpAncSnpIds.push_back(-1); 150 | bimSnpAlleleMatches.push_back(0); 151 | } 152 | 153 | // Avoid adding same SNP more than once 154 | numDupAncSnps = 0; 155 | bool ancIdAdded[totAncSnps]; 156 | for (i = 0; i < totAncSnps; i++) ancIdAdded[i] = false; 157 | 158 | numBimAncSnps = 0; 159 | int numSwaps = 0; 160 | for (i = 0; i < numSaveSnps; i++) { 161 | int bimSnpId = bimSnpIds[i]; 162 | char ref = refs[i]; 163 | char alt = alts[i]; 164 | 165 | int ancSnpId = -1; 166 | if (ancSnpType == AncestrySnpType::RSID) { 167 | ancSnpId = rsAncSnpIds[i]; 168 | } 169 | else if (ancSnpType == AncestrySnpType::GB37) { 170 | ancSnpId = pos37SnpIds[i]; 171 | } 172 | else if (ancSnpType == AncestrySnpType::GB38) { 173 | ancSnpId = pos38SnpIds[i]; 174 | } 175 | 176 | if (ancSnpId > -1) { 177 | AncestrySnp ancSnp = ancSnps->GetAncestrySnp(ancSnpId); 178 | int match = CompareAncestrySnpAlleles(ref, alt, ancSnp.ref, ancSnp.alt); 179 | 180 | // Only save SNPs with expected alleles 181 | if (match) { 182 | if (ancIdAdded[ancSnpId]) { 183 | numDupAncSnps++; 184 | } 185 | else { 186 | ancIdAdded[ancSnpId] = true; 187 | bimSnpAncSnpIds[bimSnpId] = ancSnpId; 188 | bimSnpAlleleMatches[bimSnpId] = match; 189 | if (match == -2 || match == 2) numSwaps++; 190 | numGoodAncSnps++; 191 | } 192 | } 193 | numBimAncSnps++; 194 | } 195 | } 196 | 197 | return numBimSnps; 198 | } 199 | 200 | void BimFileAncestrySnps::ShowSummary() 201 | { 202 | int numBadAncSnps = numBimAncSnps - numGoodAncSnps; 203 | 204 | string showSnpType = "RS IDs"; 205 | if (ancSnpType == AncestrySnpType::GB37) showSnpType = "GRCh 37 chromosome positions"; 206 | else if (ancSnpType == AncestrySnpType::GB38) showSnpType = "GRCh 38 chromosome positions"; 207 | 208 | cout << "Total " << numBimSnps << " SNPs in bim file. 
" << numBimAncSnps << " SNPs are ancestry SNPs.\n"; 209 | cout << "\t" << showSnpType << " are used to find ancestry SNPs.\n"; 210 | cout << "\t" << numGoodAncSnps << " SNPs have expected alleles and will be used for ancestry inference.\n"; 211 | if (numDupAncSnps > 0) cout << "\t" << numDupAncSnps << " ancestry SNPs have multiple entries.\n"; 212 | 213 | if (numBadAncSnps > 0) { 214 | cout << "\t" << numBadAncSnps << " ancestry SNPs do not have expected alleles.\n"; 215 | } 216 | cout << "\n"; 217 | 218 | // int snpId = GetAncSnpIdGivenBimSnpPos(i*100); 219 | // int match = GetAlleleMatchGivenBimSnpPos(i*100); 220 | // cout << "No. " << i << ": snpID " << snpId << " match " << match << "\n"; 221 | //} 222 | } 223 | -------------------------------------------------------------------------------- /Frequency Project/graf-pop/BimFileAncestrySnps.h: -------------------------------------------------------------------------------- 1 | #ifndef BIM_FILE_ANCESTRY_SNPS_H 2 | #define BIM_FILE_ANCESTRY_SNPS_H 3 | 4 | #include "Util.h" 5 | #include "AncestrySnps.h" 6 | 7 | class BimFileAncestrySnps 8 | { 9 | int totAncSnps; 10 | 11 | string filename; 12 | int numBimSnps; 13 | int numBimAncSnps; 14 | int numGoodAncSnps; 15 | int numDupAncSnps = 0; 16 | 17 | int numRsAncSnps; 18 | int numPos37Snps; 19 | int numPos38Snps; 20 | 21 | AncestrySnpType ancSnpType; 22 | 23 | // For each bim SNP, if it is an Ancestry SNP, the SNP ID is saved here. 24 | // Ancestry SNP ID is 0-based. If a bim SNP is not an Ancestry SNP, the SNP ID is -1 25 | vector bimSnpAncSnpIds; 26 | vector bimSnpAlleleMatches; // SNP allele matches: 0 = not match; -1, -2: flip; 2, -2: swap 27 | 28 | private: 29 | char FlipAllele(char); 30 | 31 | public: 32 | BimFileAncestrySnps(); 33 | BimFileAncestrySnps(int); 34 | ~BimFileAncestrySnps(); 35 | void SetTotalAncestrySnps(int totSnps) { totAncSnps = totSnps; }; 36 | char* RecodeBedSnpGeno(char*, bool); 37 | int ReadAncestrySnpsFromFile(string, AncestrySnps*); 38 | int CompareAncestrySnpAlleles(const char, const char, const char, const char); 39 | int GetNumBimSnps() { return numBimSnps; }; 40 | int GetNumBimAncestrySnps() { return numBimAncSnps; }; 41 | int GetAncSnpIdGivenBimSnpPos(int bimSnpPos) { 42 | return bimSnpPos >= 0 && bimSnpPos < numBimSnps ? bimSnpAncSnpIds[bimSnpPos] : -1; 43 | }; 44 | int GetAlleleMatchGivenBimSnpPos(int bimSnpPos) { 45 | return bimSnpPos >= 0 && bimSnpPos < numBimSnps ? bimSnpAlleleMatches[bimSnpPos] : 1000; 46 | }; 47 | void ShowSummary(); 48 | }; 49 | 50 | #endif 51 | -------------------------------------------------------------------------------- /Frequency Project/graf-pop/ExtractAncSnpsFromVcfGz.pl: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/perl -w 2 | 3 | my $disclaim = << "EOF"; 4 | ==================================================================================== 5 | PUBLIC DOMAIN NOTICE 6 | National Center for Biotechnology Information 7 | 8 | This software/database is a "United States Government Work" under the 9 | terms of the United States Copyright Act. It was written as part of 10 | the author's official duties as a United States Government employee and 11 | thus cannot be copyrighted. This software/database is freely available 12 | to the public for use. The National Library of Medicine and the U.S. 13 | Government have not placed any restriction on its use or reproduction. 
14 | Although all reasonable efforts have been taken to ensure the accuracy 15 | and reliability of the software and data, the NLM and the U.S. 16 | Government do not and cannot warrant the performance or results that 17 | may be obtained by using this software or data. The NLM and the U.S. 18 | Government disclaim all warranties, express or implied, including 19 | warranties of performance, merchantability or fitness for any particular 20 | purpose. 21 | 22 | Please cite the author in any work or product based on this material. 23 | 24 | Author: Yumi (Jimmy) Jin (jinyu\@ncbi.nlm.nih.gov) 25 | File Description: script to extract genotypes of Ancestry SNPs from one or multiple vcf or vcf.gz files. 26 | Date: 05/06/2021 27 | ==================================================================================== 28 | EOF 29 | 30 | use strict; 31 | use warnings; 32 | use Carp; 33 | use Time::HiRes qw(gettimeofday); 34 | use Cwd 'abs_path'; 35 | use File::Path qw(make_path remove_tree); 36 | 37 | if (@ARGV < 2) { 38 | print "\n$disclaim\n"; 39 | print "Usage: ExtractAncSnpsFromVcfGz.pl [keyword]\n\n"; 40 | print "Note: When 'keyword' is provided, and the name of the input file includes the keyword followed\n"; 41 | print " by an integer, the script searches the directory for all files that have similar\n"; 42 | print " name as the input file, with the only difference being the integer after the keyword,\n"; 43 | print " and extract genotypes from all these files.\n"; 44 | print "\n"; 45 | exit; 46 | } 47 | 48 | my $script = $0; 49 | my $pathToScript = abs_path($script); 50 | if ($pathToScript =~ /(.+)\/(.+)/) { 51 | $pathToScript = $1; 52 | $script = $2; 53 | } 54 | 55 | my $vcfFile = $ARGV[0]; 56 | my $outFile = $ARGV[1]; 57 | my $keyWord = @ARGV > 2 ? $ARGV[2] : ""; 58 | 59 | my @t1 = gettimeofday; 60 | 61 | unless (-e $vcfFile) { 62 | print "\nERROR: didn't find file $vcfFile!\n\n"; 63 | exit; 64 | } 65 | 66 | my %allVcfFiles = (); 67 | 68 | if ($keyWord) { 69 | my $shortFile = $vcfFile; 70 | my $fileDir = "."; 71 | if ($vcfFile =~ /(.+)\/(.+)/) { 72 | $fileDir = $1; 73 | $shortFile = $2; 74 | } 75 | 76 | # Make sure the keyword is included in the file name 77 | my $fileNo = 0; 78 | my ($prevStr, $postStr) = ("", ""); 79 | 80 | if ($shortFile =~ /(\S+)$keyWord(\d+)(\S+)/) { 81 | $prevStr = $1; 82 | $fileNo = $2 + 0; 83 | $postStr = $3; 84 | } 85 | else { 86 | print "\nERROR: didn't find keyword '$keyWord' before a number in file name!\n\n"; 87 | exit; 88 | } 89 | 90 | opendir DIR, $fileDir or die "Couldn't open directory $fileDir!\n"; 91 | my @outFiles = readdir DIR; 92 | for my $file (@outFiles) { 93 | if ($file =~ /^$prevStr$keyWord(\d+)$postStr$/) { 94 | my $fileNo = $1 + 0; 95 | $allVcfFiles{$1} = "$fileDir/$file"; 96 | } 97 | } 98 | closedir DIR; 99 | } 100 | else { 101 | $allVcfFiles{1} = $vcfFile; 102 | } 103 | 104 | my $numVcfFiles = keys %allVcfFiles; 105 | print "Found $numVcfFiles vcf files with key word '$keyWord' before an integer.\n\n" if ($keyWord); 106 | 107 | my $ancSnpFile = "AncInferSNPs.txt"; 108 | $ancSnpFile = "$pathToScript/data/$ancSnpFile"; 109 | unless (-e $ancSnpFile) { 110 | print "\nERROR: didn't find $ancSnpFile\n"; 111 | exit; 112 | } 113 | 114 | my ($totAncSnps, $rsSnpRef, $gb37Ref, $gb38Ref) = GetAncestrySnps($ancSnpFile); 115 | my %rsAncSnpIds = %$rsSnpRef; 116 | my %gb37AncSnpIds = %$gb37Ref; 117 | my %gb38AncSnpIds = %$gb38Ref; 118 | 119 | print "Found $totAncSnps ancestry SNPs\n"; 120 | print "Extracting ancestry SNP genos from $numVcfFiles vcf files ...\n"; 
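# Process the VCF files in numeric order of the keyword index: header lines are written
# once, and every record whose rs ID or chromosome/position (GRCh37 or GRCh38) matches an
# ancestry SNP is appended to the single output file.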
121 | 122 | my ($totVcfSnps, $totSaveSnps) = (0, 0); 123 | open OUTFILE, ">$outFile" or die "\nERROR: couldn't open $outFile for writing!\n"; 124 | my $saveHead = 1; 125 | foreach my $fileNo (sort {$a <=> $b} keys %allVcfFiles) { 126 | print "File with $keyWord$fileNo: $allVcfFiles{$fileNo}\n" if ($numVcfFiles > 1); 127 | my ($numVcfSnps, $numVcfAncSnps) = ExtractAncGenoFromFile($allVcfFiles{$fileNo}, $saveHead); 128 | $saveHead = 0; 129 | $totVcfSnps += $numVcfSnps; 130 | $totSaveSnps += $numVcfAncSnps; 131 | } 132 | close OUTFILE; 133 | 134 | print "\nExtracted genotypes of total $totSaveSnps SNPs from $totVcfSnps SNPs in $numVcfFiles files.\n"; 135 | print "Results saved to $outFile\n\n"; 136 | 137 | my @t2 = gettimeofday; 138 | my $time = GetTimeDifference(\@t1, \@t2); 139 | print "Time used $time\n"; 140 | 141 | 142 | sub GetAncestrySnps 143 | { 144 | my $baseAncFile = shift; 145 | 146 | my $ancFile = $baseAncFile; 147 | 148 | my @allPaths = split /:/, $ENV{PATH}; 149 | unshift @allPaths, "data"; 150 | unless (-e $baseAncFile) { 151 | for my $path (@allPaths) { 152 | print "Checking $path\n"; 153 | if (-e "$path/$baseAncFile") { 154 | $ancFile = "$path/$baseAncFile"; 155 | last; 156 | } 157 | } 158 | } 159 | die "\nERROR: didn't find ancestry SNP file $baseAncFile!\n" unless (-e $ancFile); 160 | 161 | my %rsSnpIds = (); 162 | my %gb37SnpIds = (); 163 | my %gb38SnpIds = (); 164 | 165 | my $snpId = 0; 166 | open FILE, $ancFile or die "Couldn't open $ancFile\n"; 167 | my $head = ; 168 | while() { 169 | chomp; 170 | 171 | my ($chr, $gb37, $gb38, $rs, $ref, $alt, @freqs) = split /\s+/, $_; 172 | if ($chr && $gb37 && $gb38 && $rs) { 173 | $rsSnpIds{"rs$rs"} = $snpId; 174 | $gb37SnpIds{"$chr\t$gb37"} = $snpId; 175 | $gb38SnpIds{"$chr\t$gb38"} = $snpId; 176 | } 177 | 178 | $snpId++; 179 | } 180 | close FILE; 181 | 182 | return ($snpId, \%rsSnpIds, \%gb37SnpIds, \%gb38SnpIds); 183 | } 184 | 185 | sub ExtractAncGenoFromFile 186 | { 187 | my ($file, $saveHead) = @_; 188 | 189 | my @saveLines = (); 190 | 191 | if ($file =~ /vcf\.gz/ || $file =~ /vcf\.bgz/) { 192 | open FILE, "zcat $file |" or die "gunzip $file: $!"; 193 | } 194 | elsif ($file =~ /\.vcf$/) { 195 | open FILE, $file or die "Couldn't open $file\n"; 196 | } 197 | 198 | my $rowNo = 0; 199 | my $numAncSnps = 0; 200 | my $numVcfSnps = 0; 201 | while() { 202 | chomp; 203 | $rowNo++; 204 | 205 | if ($_ =~ /^#/) { 206 | push @saveLines, $_ if ($saveHead); 207 | next; 208 | } 209 | 210 | my ($chr, $pos, $snp) = split /\t/, $_; 211 | $chr =~ s/chr//; 212 | 213 | if (defined $rsAncSnpIds{$snp} || 214 | defined $gb37AncSnpIds{"$chr\t$pos"} || 215 | defined $gb38AncSnpIds{"$chr\t$pos"}) { 216 | push @saveLines, $_; 217 | $numAncSnps++; 218 | } 219 | 220 | $numVcfSnps++; 221 | print "\tChecked $numVcfSnps SNPs. Found $numAncSnps ancestry SNPs\n" if ($numVcfSnps % 5000000 == 0); 222 | } 223 | close FILE; 224 | print "\tChecked $numVcfSnps SNPs. 
Found $numAncSnps ancestry SNPs\n"; 225 | 226 | if ($numAncSnps > 0) { 227 | for my $line (@saveLines) { 228 | print OUTFILE "$line\n"; 229 | } 230 | print "\tSaved $numAncSnps FP SNPs\n"; 231 | } 232 | 233 | return ($numVcfSnps, $numAncSnps); 234 | } 235 | 236 | sub GetTimeDifference 237 | { 238 | my ($t1, $t2) = @_; 239 | 240 | my $msg = ""; 241 | 242 | my $t1sec = $$t1[0]; 243 | my $t1us = $$t1[1]; 244 | my $t2sec = $$t2[0]; 245 | my $t2us = $$t2[1]; 246 | 247 | my $ds = $t2sec - $t1sec; 248 | my $dus = $t2us - $t1us; 249 | 250 | if ($dus < 0) { 251 | $dus += 1000000; 252 | $ds -= 1; 253 | } 254 | 255 | if ($ds < 0 || ($ds == 0 && $dus < 0)) { 256 | print "Error: T1 > T2\n"; 257 | return $msg; 258 | } 259 | 260 | my $dds = $ds % 60; 261 | $dds += $dus/1000000; 262 | 263 | if ($ds == 0) { 264 | $msg .= "$dus micro seconds"; 265 | return $msg; 266 | } 267 | 268 | my $dh = int($ds/3600); 269 | $ds = $ds % 3600; 270 | my $dm = int($ds/60); 271 | 272 | $msg .= "$dh hour" if ($dh > 0); 273 | $msg .= "s" if ($dh > 1); 274 | $msg .= " $dm minute" if ($dh > 0 || $dm > 0); 275 | $msg .= "s" if ($dm > 1); 276 | $msg .= " $dds seconds"; 277 | 278 | return $msg; 279 | } 280 | -------------------------------------------------------------------------------- /Frequency Project/graf-pop/FamFileSamples.cpp: -------------------------------------------------------------------------------- 1 | #include "FamFileSamples.h" 2 | 3 | using namespace std; 4 | 5 | 6 | FamSample::FamSample(string smp, string dad, string mom, int gender) 7 | { 8 | name = smp; 9 | father = dad; 10 | mother = mom; 11 | sex = gender; 12 | } 13 | 14 | 15 | FamFileSamples::FamFileSamples(string file) 16 | { 17 | filename = file; 18 | numFamSmps = 0; 19 | numMales = 0; 20 | numFemales = 0; 21 | 22 | ReadSamplesFromFile(); 23 | } 24 | 25 | bool FamFileSamples::Summarize() 26 | { 27 | bool success = false; 28 | int arraySize = samples.size(); 29 | 30 | if (arraySize == numFamSmps) { 31 | for (int i = 0; i < numFamSmps; i++) { 32 | FamSample smp = samples[i]; 33 | if (smp.sex == 1) { 34 | numMales++; 35 | } 36 | else if (smp.sex == 2) { 37 | numFemales++; 38 | } 39 | } 40 | 41 | success = true; 42 | } 43 | 44 | return success; 45 | } 46 | 47 | void FamFileSamples::ShowSummary() 48 | { 49 | bool success = false; 50 | cout << "Total " << numFamSmps << " samples in fam file " << filename << ".\n"; 51 | cout << "\t" << numMales << " males\n"; 52 | cout << "\t" << numFemales << " females\n"; 53 | cout << "\n"; 54 | } 55 | 56 | int FamFileSamples::ReadSamplesFromFile() 57 | { 58 | int numFileSmps = 0; 59 | 60 | ASSERT(FileExists(filename.c_str()), "File " << filename << " does not exist."); 61 | 62 | int lineLen = 300; 63 | char fpLine[lineLen]; 64 | 65 | FILE *ifp = fopen(filename.c_str(), "r"); 66 | ASSERT(ifp, "Couldn't open file " << filename << ".\n"); 67 | 68 | int lineNo = 0; 69 | bool fileIsValid = true; 70 | 71 | int numSmps = 0; 72 | int smpSex, pheno; 73 | char famId[80], smpId[80], dadId[80], momId[80]; 74 | 75 | while (fgets(fpLine, lineLen, ifp) != NULL && fileIsValid == true) { 76 | sscanf(fpLine, "%s %s %s %s %d %d", famId, smpId, dadId, momId, &smpSex, &pheno); 77 | 78 | if (!smpSex) smpSex = 0; 79 | 80 | if (smpId) { 81 | FamSample sample(smpId, dadId, momId, smpSex); 82 | samples.push_back(sample); 83 | numFileSmps++; 84 | } 85 | 86 | lineNo++; 87 | } 88 | fclose(ifp); 89 | 90 | numFamSmps = numFileSmps; 91 | 92 | Summarize(); 93 | 94 | return numFileSmps; 95 | } 96 | 97 | 
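// Note: ReadSamplesFromFile above expects the standard six-column PLINK .fam layout
// (family ID, sample ID, father ID, mother ID, sex, phenotype), keeping the sample,
// parent and sex fields; sex is coded 1 = male, 2 = female, 0 = unknown.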
-------------------------------------------------------------------------------- /Frequency Project/graf-pop/FamFileSamples.h: -------------------------------------------------------------------------------- 1 | #ifndef FAM_FILE_SAMPLES_H 2 | #define FAM_FILE_SAMPLES_H 3 | 4 | #include "Util.h" 5 | 6 | class FamSample 7 | { 8 | public: 9 | // Info from the fam file 10 | string name; 11 | string father; 12 | string mother; 13 | int sex; 14 | 15 | public: 16 | FamSample(string, string, string, int); 17 | }; 18 | 19 | class FamFileSamples 20 | { 21 | string filename; 22 | int numFamSmps; 23 | int numMales; 24 | int numFemales; 25 | 26 | bool Summarize(); 27 | 28 | private: 29 | int ReadSamplesFromFile(); 30 | 31 | public: 32 | vector samples; 33 | 34 | FamFileSamples(string); 35 | int GetNumFamSamples() {return numFamSmps;}; 36 | void ShowSummary(); 37 | }; 38 | 39 | 40 | #endif 41 | -------------------------------------------------------------------------------- /Frequency Project/graf-pop/GrafPop.cpp: -------------------------------------------------------------------------------- 1 | #include "GrafPop.h" 2 | 3 | SampleGenoAncestry *smpGenoAnc = NULL; 4 | 5 | int main(int argc, char* argv[]) 6 | { 7 | string usage = "Usage: grafpop \n"; 8 | 9 | string disclaimer = 10 | "\n *===========================================================================" 11 | "\n * GrafPop: Software to infer subject ancestry from genotypes quickly" 12 | "\n * Yumi (Jimmy) Jin, PhD" 13 | "\n * jinyu@ncbi.nlm.nih.gov" 14 | "\n * 04/05/2021" 15 | "\n *" 16 | "\n * PUBLIC DOMAIN NOTICE" 17 | "\n * National Center for Biotechnology Information" 18 | "\n *" 19 | "\n * This software/database is a \"United States Government Work\" under the" 20 | "\n * terms of the United States Copyright Act. It was written as part of" 21 | "\n * the author's official duties as a United States Government employee and" 22 | "\n * thus cannot be copyrighted. This software/database is freely available" 23 | "\n * to the public for use. The National Library of Medicine and the U.S." 24 | "\n * Government have not placed any restriction on its use or reproduction." 25 | "\n *" 26 | "\n * Although all reasonable efforts have been taken to ensure the accuracy" 27 | "\n * and reliability of the software and data, the NLM and the U.S." 28 | "\n * Government do not and cannot warrant the performance or results that" 29 | "\n * may be obtained by using this software or data. The NLM and the U.S." 30 | "\n * Government disclaim all warranties, express or implied, including" 31 | "\n * warranties of performance, merchantability or fitness for any particular" 32 | "\n * purpose." 33 | "\n *" 34 | "\n * Please cite the author in any work or product based on this material." 35 | "\n *" 36 | "\n *==========================================================================="; 37 | 38 | if (argc < 3) { 39 | cout << disclaimer << "\n\n"; 40 | cout << usage << "\n"; 41 | exit(0); 42 | } 43 | 44 | struct timeval t1, t2; 45 | gettimeofday(&t1, NULL); 46 | 47 | string genoDs, outputFile; 48 | genoDs = argv[1]; 49 | outputFile = argv[2]; 50 | 51 | string fileBase = ""; 52 | GenoDatasetType fileType = CheckGenoDataFile(genoDs, &fileBase); 53 | 54 | if (fileType == GenoDatasetType::NOT_EXISTS) { 55 | cout << "\nERROR: Genotype file " << genoDs << " doesn't exist!\n\n"; 56 | return 0; 57 | } 58 | else if (fileType == GenoDatasetType::IS_PLINK_GZ) { 59 | cout << "\nERROR: PLINK set " << genoDs << " is zipped. 
Please unzip it.\n\n"; 60 | return 0; 61 | } 62 | else if (fileType == GenoDatasetType::IS_OTHER) { 63 | cout << "\nERROR: Genotype file " << genoDs << " should be a binary PLINK set or vcf or vcf.gz file..\n\n"; 64 | return 0; 65 | } 66 | 67 | string ancSnpFile = FindFile("AncInferSNPs.txt"); 68 | if (ancSnpFile == "") { 69 | cout << "\nERROR: didn't find file AncInferSNPs.txt. Please put the file under 'data' directory.\n\n"; 70 | return 0; 71 | } 72 | AncestrySnps *ancSnps = new AncestrySnps(); 73 | ancSnps->ReadAncestrySnpsFromFile(ancSnpFile); 74 | //ancSnps->ShowAncestrySnps(); 75 | 76 | int totAncSnps = ancSnps->GetNumAncestrySnps(); 77 | int minAncSnps = 100; 78 | 79 | int numThreads = thread::hardware_concurrency(); 80 | numThreads--; 81 | 82 | smpGenoAnc = new SampleGenoAncestry(ancSnps, minAncSnps); 83 | 84 | if (fileType == GenoDatasetType::IS_VCF || fileType == GenoDatasetType::IS_VCF_GZ) { 85 | VcfSampleAncestrySnpGeno *vcfGeno = new VcfSampleAncestrySnpGeno(genoDs, ancSnps); 86 | bool dataRead = vcfGeno->ReadDataFromFile(); 87 | if (!dataRead) { 88 | cout << "\nFailed to read genotype data from " << genoDs << "\n\n"; 89 | return 0; 90 | } 91 | vcfGeno->ShowSummary(); 92 | vcfGeno->RecodeSnpGenotypes(); 93 | 94 | int numAncSnps = vcfGeno->vcfAncSnpIds.size(); 95 | int numVcfSmps = vcfGeno->GetNumSamples(); 96 | 97 | if (smpGenoAnc->HasEnoughAncestrySnps(numAncSnps)) { 98 | smpGenoAnc->SetGenoSamples(vcfGeno->vcfSamples); 99 | smpGenoAnc->SetSnpGenoData(&vcfGeno->vcfAncSnpIds, &vcfGeno->vcfAncSnpCodedGenos); 100 | } 101 | else { 102 | cout << "\nWARNING: Ancestry inference not done due to lack of genotyped ancestry SNPs " 103 | << "(at least " << minAncSnps << " ancestry SNPs are needed).\n\n"; 104 | return 0; 105 | } 106 | } 107 | else if (fileType == GenoDatasetType::IS_PLINK) { 108 | string bedFile = fileBase + ".bed"; 109 | string bimFile = fileBase + ".bim"; 110 | string famFile = fileBase + ".fam"; 111 | 112 | if ( !FileExists(bedFile.c_str()) || 113 | !FileExists(bimFile.c_str()) || 114 | !FileExists(famFile.c_str()) ) { 115 | if (!FileExists(bedFile.c_str())) cout << "\nERROR: didn't find " << bedFile << "\n"; 116 | if (!FileExists(bimFile.c_str())) cout << "\nERROR: didn't find " << bimFile << "\n"; 117 | if (!FileExists(famFile.c_str())) cout << "\nERROR: didn't find " << famFile << "\n"; 118 | cout << "\n"; 119 | return 0; 120 | } 121 | 122 | FamFileSamples *famSmps = new FamFileSamples(famFile); 123 | famSmps->ShowSummary(); 124 | 125 | smpGenoAnc->SetGenoSamples(famSmps->samples); 126 | int numSmps = smpGenoAnc->GetNumSamples(); 127 | 128 | BimFileAncestrySnps *bimSnps = new BimFileAncestrySnps(totAncSnps); 129 | bimSnps->ReadAncestrySnpsFromFile(bimFile, ancSnps); 130 | int numBimAncSnps = bimSnps->GetNumBimAncestrySnps(); 131 | bimSnps->ShowSummary(); 132 | 133 | if (smpGenoAnc->HasEnoughAncestrySnps(numBimAncSnps)) { 134 | BedFileSnpGeno *bedGenos = new BedFileSnpGeno(bedFile, ancSnps, bimSnps, famSmps); 135 | bool hasErr = bedGenos->ReadGenotypesFromBedFile(); 136 | if (hasErr) return 0; 137 | bedGenos->ShowSummary(); 138 | 139 | smpGenoAnc->SetSnpGenoData(&bedGenos->ancSnpSnpIds, &bedGenos->ancSnpSmpGenos); 140 | } 141 | else { 142 | cout << "Ancestry inference not done due to lack of genotyped ancestry SNPs.\n\n"; 143 | return 0; 144 | } 145 | } 146 | 147 | cout << "\nLaunching " << numThreads << " threads to calculate ancestry scores.\n"; 148 | smpGenoAnc->SetNumThreads(numThreads); 149 | 150 | mutex iomutex; 151 | vector threads(numThreads); 152 | 153 | 
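// Each worker thread calls SetAncestryPvalues(i) to compute ancestry scores for its share
// of the samples; the threads are joined below before SaveAncestryResults writes the
// combined output.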
for (unsigned i = 0; i < numThreads; ++i) { 154 | threads[i] = thread([&iomutex, i] { 155 | { 156 | lock_guard iolock(iomutex); 157 | } 158 | 159 | smpGenoAnc->SetAncestryPvalues(i); 160 | }); 161 | } 162 | 163 | for (auto& t : threads) { 164 | t.join(); 165 | } 166 | 167 | smpGenoAnc->SaveAncestryResults(outputFile); 168 | 169 | gettimeofday(&t2, NULL); 170 | cout << "\n"; 171 | ShowTimeDiff(t1, t2); 172 | 173 | return 1; 174 | } 175 | 176 | string GetExecutablePath() 177 | { 178 | char rawPathName[PATH_MAX]; 179 | realpath(PROC_SELF_EXE, rawPathName); 180 | 181 | string exePath = string(rawPathName); 182 | size_t slashPos = exePath.find_last_of("/\\"); 183 | string exeDir = exePath.substr(0, slashPos); 184 | 185 | return exeDir; 186 | } 187 | 188 | string FindFile(string filename) 189 | { 190 | string fullFile = filename; 191 | 192 | if (FileExists(fullFile.c_str())) return fullFile; 193 | 194 | string exeDir = GetExecutablePath(); 195 | fullFile = exeDir + "/data/" + filename; 196 | if (FileExists(fullFile.c_str())) return fullFile; 197 | 198 | fullFile = exeDir + "/" + filename; 199 | if (FileExists(fullFile.c_str())) return fullFile; 200 | 201 | if(const char* grafPath = getenv("GRAFPATH")) { 202 | string grafDir = string(grafPath); 203 | fullFile = grafDir + "/data/" + filename; 204 | if (FileExists(fullFile.c_str())) return fullFile; 205 | 206 | fullFile = grafDir + "/" + filename; 207 | if (FileExists(fullFile.c_str())) return fullFile; 208 | } 209 | 210 | return ""; 211 | } 212 | -------------------------------------------------------------------------------- /Frequency Project/graf-pop/GrafPop.h: -------------------------------------------------------------------------------- 1 | using namespace std; 2 | 3 | #ifndef GRAFPOP_H 4 | #define GRAFPOP_H 5 | 6 | #include "Util.h" 7 | #include "AncestrySnps.h" 8 | #include "VcfSampleAncestrySnpGeno.h" 9 | #include "FamFileSamples.h" 10 | #include "BimFileAncestrySnps.h" 11 | #include "BedFileSnpGeno.h" 12 | #include "SampleGenoDist.h" 13 | #include "SampleGenoAncestry.h" 14 | #include 15 | #include 16 | 17 | #if defined(__sun) 18 | #define PROC_SELF_EXE "/proc/self/path/a.out" 19 | #else 20 | #define PROC_SELF_EXE "/proc/self/exe" 21 | #endif 22 | 23 | string GetExecutablePath(void); 24 | string FindFile(string); 25 | 26 | #endif 27 | -------------------------------------------------------------------------------- /Frequency Project/graf-pop/GrafPopFiles.pm: -------------------------------------------------------------------------------- 1 | package GrafPopFiles; 2 | 3 | use strict; 4 | use Carp; 5 | 6 | # 7 | # Read scores of subjects from result file generated by the C++ program 8 | # 9 | sub ReadGrafPopResults 10 | { 11 | my ($inFile, $minSnps, $maxSnps) = @_; 12 | 13 | my $sbjNo = 0; 14 | my $totSbjs = 0; 15 | my @sbjPopScores = (); 16 | my %allPopSbjs = (); 17 | my ($minAncSnps, $maxAncSnps, $totAncSnps, $meanAncSnps) = (100000, 0, 0, 0); 18 | my $error = ""; 19 | 20 | open FILE, $inFile or die "ERROR: Couldn't open file $inFile!\n\n"; 21 | my $header = ; 22 | while ($header =~ /^\#/) { 23 | $header = ; 24 | } 25 | if ($header !~ /^Sample\t\#SNPs\tGD1.*\tGD2.*\tGD3.*\tGD4.*/) { 26 | $error = "Invalid input file. Expected following columns:\n" . 
27 | "\tSample\n\t#SNPs\n\tGD1 (x)\n\tGD2 (y)\n\tGD3 (z)\n\tGD4\n\tE(%)\n\tF(%)\n\tA(%)\n\n"; 28 | } 29 | else { 30 | while() { 31 | chomp; 32 | my @vals = split /\t/, $_; 33 | if (@vals >= 9) { 34 | my ($sbj, $numSnps, $xVal, $yVal, $zVal, $gd4, $ePct, $fPct, $aPct) = @vals; 35 | my %info = (subject => $sbj, race => "", raceNo => 0, color => "", snps => $numSnps, 36 | x => $xVal, y => $yVal, z => $zVal, gd4 => $gd4, fPct => $fPct, ePct => $ePct, aPct => $aPct); 37 | if ($numSnps >= $minSnps && $numSnps <= $maxSnps && !$allPopSbjs{$sbj}) { 38 | push @sbjPopScores, \%info; 39 | 40 | $minAncSnps = $numSnps if ($numSnps < $minAncSnps); 41 | $maxAncSnps = $numSnps if ($numSnps > $maxAncSnps); 42 | $totAncSnps += $numSnps; 43 | $allPopSbjs{$sbj} = 1; 44 | $sbjNo++; 45 | } 46 | 47 | $totSbjs++; 48 | } 49 | } 50 | close FILE; 51 | 52 | my $numSbjs = @sbjPopScores; 53 | $meanAncSnps = $numSbjs > 0 ? $totAncSnps * 1.0 / $numSbjs : 0; 54 | 55 | if ($sbjNo < 1) { 56 | $error = "No sample found in $inFile"; 57 | } 58 | 59 | print "Found $numSbjs samples with population scores in file $inFile. Total $totSbjs samples.\n"; 60 | if ($minSnps > 0 || $maxSnps < 100438) { 61 | if ($numSbjs > 0) { 62 | print "\t$numSbjs samples have $minSnps to $maxSnps genotyped Ancestry SNPs.\n"; 63 | } 64 | else { 65 | $error = "No samples with genotype ancestry SNPs between $minSnps and $maxSnps found in the input file.\n"; 66 | } 67 | } 68 | } 69 | 70 | return (\@sbjPopScores, \%allPopSbjs, $minAncSnps, $maxAncSnps, $meanAncSnps, $error); 71 | } 72 | 73 | # 74 | # Read races from a two-column file without header line for all subjects in a set 75 | # 76 | sub ReadSubjectRaces 77 | { 78 | my ($file, $allSbjs) = @_; 79 | 80 | my $err = ""; 81 | my $hasRace = 0; 82 | my %sbjRaces = (); 83 | my %allRaces = (); 84 | my $totSbjs = 0; 85 | 86 | unless (-e $file) { 87 | $err = "didn't find subject race file $file!\n"; 88 | return (\%sbjRaces, \%allRaces, $hasRace, $err); 89 | } 90 | 91 | my $unkRace = "NOT REPORTED"; 92 | open FILE, $file or die "\nERROR: Couldn't open $file!\n\n"; 93 | while () { 94 | chomp; 95 | next if ($_ !~ /\S/); 96 | 97 | my ($sbj, $race) = split /\t/, $_; 98 | $race =~ s/\s*$//; 99 | $race = $1 if ($race =~ /^\s*\"(.+)\"\s*$/); 100 | 101 | if ($sbj && $race) { 102 | $totSbjs++; 103 | 104 | if ($allSbjs->{$sbj}) { 105 | $hasRace = 1 if ($race && $race !~ /^unknown$/i); 106 | $race = $unkRace if (!$race || $race !~ /\S/); 107 | $sbjRaces{$sbj} = $race; 108 | $allRaces{$race} = 1; 109 | } 110 | } 111 | } 112 | close FILE; 113 | 114 | my $numSbjs = keys %sbjRaces; 115 | my $numRaces = keys %allRaces; 116 | if ($totSbjs == 0) { 117 | print "\nWARNING: No subject races found in $file.\n"; 118 | } 119 | elsif ($numRaces > 0) { 120 | print "\nRead $numRaces populations from $numSbjs subjects in $file\n"; 121 | } 122 | else { 123 | print "\nWARNING: No race values found in $file for subjects included in input GrafPop result file.\n\n"; 124 | } 125 | 126 | return (\%sbjRaces, \%allRaces, $hasRace, $err); 127 | } 128 | 129 | 1; 130 | -------------------------------------------------------------------------------- /Frequency Project/graf-pop/GraphColors.pm: -------------------------------------------------------------------------------- 1 | package GraphColors; 2 | 3 | use strict; 4 | use Carp; 5 | use GD; 6 | 7 | sub new 8 | { 9 | my ($class, $img) = @_; 10 | 11 | my $white = $img->colorAllocate(255, 255, 255); 12 | my $red = $img->colorAllocate(255, 0, 0); 13 | my $maroon = $img->colorAllocate(255, 0, 0); 14 | my 
$green = $img->colorAllocate(255, 128, 0); # not green anymore 15 | my $blue = $img->colorAllocate( 0, 0, 255); 16 | my $navy = $img->colorAllocate( 0, 0, 128); 17 | my $black = $img->colorAllocate( 0, 0, 0); 18 | my $gray = $img->colorAllocate(128, 128, 128); 19 | my $yellow = $img->colorAllocate(255, 255, 0); 20 | my $olive = $img->colorAllocate(128, 128, 0); 21 | my $purple = $img->colorAllocate(128, 0, 128); 22 | my $magenta = $img->colorAllocate(255, 0, 255); 23 | my $orange = $img->colorAllocate(255, 165, 0); 24 | my $cyan = $img->colorAllocate( 0, 255, 255); 25 | my $teal = $img->colorAllocate( 0, 128, 128); 26 | my $gold = $img->colorAllocate(204, 153, 80); 27 | my $expAreaColor = $img->colorAllocate( 0, 150, 150); 28 | 29 | # Colors for self-reported ancestries 30 | my $maxShowRaces = 10; 31 | my @raceColors = (); 32 | for my $raceNo (1 .. $maxShowRaces) { 33 | push @raceColors, ""; 34 | } 35 | $raceColors[0] = $yellow; 36 | $raceColors[1] = $blue; 37 | $raceColors[2] = $red; 38 | $raceColors[3] = $olive; 39 | $raceColors[4] = $purple; 40 | $raceColors[5] = $cyan; 41 | $raceColors[6] = $green; 42 | $raceColors[7] = $teal; 43 | $raceColors[8] = $navy; 44 | $raceColors[9] = $magenta; 45 | $raceColors[10] = $gold; 46 | 47 | bless { 48 | white => $white, 49 | red => $red, 50 | maroon => $maroon, 51 | green => $green, 52 | blue => $blue, 53 | navy => $navy, 54 | black => $black, 55 | gray => $gray, 56 | olive => $olive, 57 | magenta => $magenta, 58 | orange => $orange, 59 | cyan => $cyan, 60 | teal => $teal, 61 | gold => $gold, 62 | expAreaColor => $expAreaColor, 63 | maxShowRaces => $maxShowRaces, 64 | raceColors => \@raceColors 65 | }, $class; 66 | } 67 | 68 | 1; 69 | -------------------------------------------------------------------------------- /Frequency Project/graf-pop/GraphTransformation.pm: -------------------------------------------------------------------------------- 1 | package GraphTransformation; 2 | 3 | use strict; 4 | use Carp; 5 | 6 | my $pi = 3.1415926535; 7 | 8 | # 9 | # Move and rotate a 2D point (given x, y coordinates 10 | # 11 | sub TransformPoint2D 12 | { 13 | my ($x1, $y1, $dx, $dy, $alpha) = @_; 14 | 15 | my $xt = $x1 + $dx; 16 | my $yt = $y1 + $dy; 17 | 18 | my $x = $xt * cos($alpha) - $yt * sin($alpha); 19 | my $y = $yt * cos($alpha) + $xt * sin($alpha); 20 | 21 | $x = $x - $dx; 22 | $y = $y - $dy; 23 | 24 | return ($x, $y); 25 | } 26 | 27 | # 28 | # Rotate a 3D point (array reference) around an axis 29 | # 30 | sub RotatePoint3D 31 | { 32 | my ($pRef, $deg, $axis) = @_; # deg = angle in degree; axis = x, y or z 33 | 34 | my $rad = $deg * $pi / 180; 35 | my $sind = sin($rad); 36 | my $cosd = cos($rad); 37 | 38 | my $t1 = []; 39 | if ($axis =~ /^x/) { 40 | $t1 = [ 41 | [1, 0, 0, 0], 42 | [0, $cosd, -$sind, 0], 43 | [0, $sind, $cosd, 0], 44 | [0, 0, 0, 1] ]; 45 | } 46 | elsif ($axis =~ /^y/) { 47 | $t1 = [ 48 | [ $cosd, 0, $sind, 0], 49 | [ 0, 1, 0, 0], 50 | [-$sind, 0, $cosd, 0], 51 | [ 0, 0, 0, 1] ]; 52 | } 53 | elsif ($axis =~ /^z/) { 54 | $t1 = [ 55 | [$cosd, -$sind, 0, 0], 56 | [$sind, $cosd, 0, 0], 57 | [0, 0, 1, 0], 58 | [0, 0, 0, 1] ]; 59 | } 60 | 61 | my @p2 = TransformPoint3D($pRef, $t1); 62 | 63 | return @p2; 64 | } 65 | 66 | # 67 | # Move a 3D point 68 | # 69 | sub MovePoint3D 70 | { 71 | my ($pRef, $dx, $dy, $dz) = @_; 72 | 73 | my $t1 = [ [1, 0, 0, $dx], 74 | [0, 1, 0, $dy], 75 | [0, 0, 1, $dz], 76 | [0, 0, 0, 1 ] ]; 77 | 78 | my @p2 = TransformPoint3D($pRef, $t1); 79 | 80 | return @p2; 81 | } 82 | 83 | # 84 | # Transform a 3D point given a 
transformation matrix 85 | # 86 | sub TransformPoint3D 87 | { 88 | my ($pRef, $tRef) = @_; 89 | 90 | my @p = @$pRef; 91 | my @t = @$tRef; 92 | 93 | my @q = (0, 0, 0, 0); 94 | 95 | for my $i (0 .. 3) { 96 | for my $j (0 .. 3) { 97 | $q[$i] += $t[$i]->[$j] * $p[$j]; 98 | } 99 | } 100 | 101 | return @q; 102 | } 103 | 104 | # 105 | # Rotate all points in the global array around an axis 106 | # 107 | sub RotateShape3D 108 | { 109 | my ($deg, $axis, $dataPts) = @_; 110 | 111 | my $rad = $deg * $pi / 180; 112 | my $sind = sin($rad); 113 | my $cosd = cos($rad); 114 | 115 | my $t1 = []; 116 | if ($axis =~ /x/) { 117 | $t1 = [ 118 | [1, 0, 0, 0], 119 | [0, $cosd, -$sind, 0], 120 | [0, $sind, $cosd, 0], 121 | [0, 0, 0, 1] ]; 122 | } 123 | elsif ($axis =~ /y/) { 124 | $t1 = [ 125 | [ $cosd, 0, $sind, 0], 126 | [ 0, 1, 0, 0], 127 | [-$sind, 0, $cosd, 0], 128 | [ 0, 0, 0, 1] ]; 129 | } 130 | elsif ($axis =~ /z/) { 131 | $t1 = [ 132 | [$cosd, -$sind, 0, 0], 133 | [$sind, $cosd, 0, 0], 134 | [0, 0, 1, 0], 135 | [0, 0, 0, 1] ]; 136 | } 137 | 138 | TransformShape3D($t1, $dataPts); 139 | } 140 | 141 | # 142 | # Scale all points in the global array 143 | # 144 | sub ScaleShape3D 145 | { 146 | my ($sx, $sy, $sz, $dataPts) = @_; 147 | $sy = $sx unless ($sy); 148 | $sz = $sx unless ($sz); 149 | 150 | my $t1 = [ [$sx, 0, 0, 0], 151 | [0, $sy, 0, 0], 152 | [0, 0, $sz, 0], 153 | [0, 0, 0, 1] ]; 154 | 155 | TransformShape3D($t1, $dataPts); 156 | } 157 | 158 | # 159 | # Move all points in the global array 160 | # 161 | sub MoveShape3D 162 | { 163 | my ($dx, $dy, $dz, $dataPts) = @_; 164 | my $t1 = [ [1, 0, 0, $dx], 165 | [0, 1, 0, $dy], 166 | [0, 0, 1, $dz], 167 | [0, 0, 0, 1 ] ]; 168 | 169 | TransformShape3D($t1, $dataPts); 170 | } 171 | 172 | 173 | # 174 | # Transform a list of points 175 | # 176 | sub TransformShape3D 177 | { 178 | my ($tRef, $dataPts) = @_; 179 | 180 | for my $i (0 .. $#$dataPts) { 181 | my @p = @{$dataPts->[$i]}; 182 | my @q = TransformPoint3D(\@p, $tRef); 183 | $dataPts->[$i] = \@q; 184 | } 185 | } 186 | 187 | 188 | 1; 189 | -------------------------------------------------------------------------------- /Frequency Project/graf-pop/Makefile: -------------------------------------------------------------------------------- 1 | BASEDIR = ./ 2 | INCLUDES = -I. 
-I$(BASEDIR) 3 | 4 | #------ Compiler and options ----------------- 5 | CXX = /usr/bin/g++ 6 | CXXFLAGS = -std=c++11 -pthread -g -lm -lz $(INCLUDES) 7 | 8 | HDIR = ./ 9 | SRCDIR = ./ 10 | 11 | 12 | #----- Suffix Rules --------------------------- 13 | .SUFFIXES: .cpp .C .cc 14 | 15 | .cpp.o: 16 | $(CXX) $(CXXFLAGS) -c $< 17 | 18 | #----- File Dependencies ---------------------- 19 | 20 | SRC = Util.cpp AncestrySnps.cpp VcfSampleAncestrySnpGeno.cpp FamFileSamples.cpp BimFileAncestrySnps.cpp BedFileSnpGeno.cpp SampleGenoDist.cpp SampleGenoAncestry.cpp GrafPop.cpp 21 | 22 | OBJ = $(addsuffix .o, $(basename $(SRC))) 23 | 24 | grafpop: $(OBJ) 25 | $(CXX) $(CXXFLAGS) -o $@ $(OBJ) 26 | 27 | Util.o: $(HDIR)Util.h 28 | $(CXX) $(CXXFLAGS) -c Util.cpp 29 | AncestrySnps.o: $(HDIR)AncestrySnps.h 30 | $(CXX) $(CXXFLAGS) -c AncestrySnps.cpp 31 | VcfSampleAncestrySnpGeno.o: $(HDIR)VcfSampleAncestrySnpGeno.h 32 | $(CXX) $(CXXFLAGS) -c VcfSampleAncestrySnpGeno.cpp 33 | FamFileSamples.o: $(HDIR)FamFileSamples.h 34 | $(CXX) $(CXXFLAGS) -c FamFileSamples.cpp 35 | BimFileAncestrySnps.o: $(HDIR)BimFileAncestrySnps.h 36 | $(CXX) $(CXXFLAGS) -c BimFileAncestrySnps.cpp 37 | BedFileSnpGeno.o: $(HDIR)BedFileSnpGeno.h 38 | $(CXX) $(CXXFLAGS) -c BedFileSnpGeno.cpp 39 | SampleGenoDist.o: $(HDIR)SampleGenoDist.h 40 | $(CXX) $(CXXFLAGS) -c SampleGenoDist.cpp 41 | SampleGenoAncestry.o:$(HDIR)SampleGenoAncestry.h 42 | $(CXX) $(CXXFLAGS) -c SampleGenoAncestry.cpp 43 | 44 | depend: 45 | makedepend $(CXXFLAGS) -Y $(SRC) 46 | 47 | clean: 48 | rm -f $(OBJ) *~ 49 | 50 | -------------------------------------------------------------------------------- /Frequency Project/graf-pop/README.md: -------------------------------------------------------------------------------- 1 | # GrafPop Source Code 2 | 3 | The GrafPop source code includes C++ programs and Perl scripts. See xxx (to be added) for the GrafPop software documentation. 4 | 5 | ### Make C++ binary `grafpop` 6 | Under the root directory, execute: 7 | ```sh 8 | $ make 9 | ``` 10 | 11 | To regenerate the C++ binary after editing the code, execute: 12 | ```sh 13 | $ make clean 14 | $ make 15 | ``` 16 | 17 | ### Run medium tests 18 | 19 | Test scripts and test cases are placed under the `medium_testing` directory. Test cases are saved in `test_manifest.txt`. The Perl script `test_grafpop.pl` is used to run these test cases manually, as sketched in the example after this list. 20 | 1. Make sure the environment variable `PATH` includes the current directory `.`. 21 | 1. If necessary, set the environment variable `GRAFPATH` to the directory where the GrafPop binary and Perl scripts are located. 22 | 1. Under the `medium_testing` directory, execute: `test_grafpop.pl test_manifest.txt`. 23 | 1. If the source code is updated, update `test_manifest.txt` to add new test cases or modify existing ones, and execute `test_grafpop.pl test_manifest.txt 1` to update the baseline. 24 | 1. Check the baseline files and make sure they are all correct, then execute `test_grafpop.pl test_manifest.txt` (without the second parameter) again. 25 | 1. Make sure all test cases pass.
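
A minimal sketch of the steps above, assuming a bash-style shell; the graf-pop path is a placeholder, and `GRAFPATH` is the variable read by `FindFile()` in `GrafPop.cpp`:
```sh
# Build grafpop and make the binary and Perl scripts findable
$ make
$ export PATH=.:$PATH
$ export GRAFPATH=/path/to/graf-pop   # placeholder: directory holding grafpop, the *.pl/*.pm scripts, and data/AncInferSNPs.txt

# Run the medium tests against the saved baseline
$ cd medium_testing
$ test_grafpop.pl test_manifest.txt

# After a code change: regenerate the baseline, review it, then re-run the tests
$ test_grafpop.pl test_manifest.txt 1
$ test_grafpop.pl test_manifest.txt
```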
-------------------------------------------------------------------------------- /Frequency Project/graf-pop/SampleGenoAncestry.h: -------------------------------------------------------------------------------- 1 | #ifndef SAMPLE_GENO_ANCESTRY_H 2 | #define SAMPLE_GENO_ANCESTRY_H 3 | 4 | #include 5 | #include "Util.h" 6 | #include "AncestrySnps.h" 7 | #include "FamFileSamples.h" 8 | #include "SampleGenoDist.h" 9 | 10 | class GenoSample 11 | { 12 | public: 13 | string name; 14 | string father; 15 | string mother; 16 | int sex; 17 | 18 | // Ancestry results calculated from genotypes 19 | int numAncSnps; 20 | bool ancIsSet; 21 | float gd1, gd2, gd3, gd4; 22 | float ePct, fPct, aPct; // Ancestry (EUR, AFR, EAS) components of the sample 23 | 24 | public: 25 | GenoSample(string); 26 | void SetAncestryScores(int, float, float, float, float, float, float, float, bool); 27 | }; 28 | 29 | 30 | class SampleGenoAncestry 31 | { 32 | private: 33 | int numSamples; 34 | int numAncSmps; 35 | 36 | int minAncSnps; 37 | int totAncSnps; 38 | int numAncSnps; 39 | int numThreads; // Number of threads for parallel computing 40 | 41 | AncestrySnps *ancSnps; 42 | SampleGenoDist *vtxExpGd0; // Genetic distances from 3 vertices to ref populations when all SNPs have genotypes 43 | 44 | public: 45 | vector samples; 46 | vector *ancSnpIds; 47 | vector *ancSnpCodedGenos; // Use char, instead of int, to save space 48 | 49 | SampleGenoAncestry(AncestrySnps*, int=100); 50 | ~SampleGenoAncestry(); 51 | 52 | void SetGenoSamples(const vector&); 53 | void SetGenoSamples(const vector&); 54 | int SaveAncestryResults(string); 55 | void SetAncestryPvalues(int); 56 | void SetSnpGenoData(vector*, vector*); 57 | void SetNumThreads(int); 58 | void InitPopPvalues(); 59 | 60 | int GetNumSamples() { return numSamples; }; 61 | int GetNumAncSamples() { return numAncSmps; }; 62 | bool HasEnoughAncestrySnps(int numSnps) { return numSnps >= minAncSnps; } 63 | 64 | void ShowSummary(); 65 | }; 66 | 67 | #endif 68 | -------------------------------------------------------------------------------- /Frequency Project/graf-pop/SampleGenoDist.cpp: -------------------------------------------------------------------------------- 1 | #include "SampleGenoDist.h" 2 | 3 | SampleGenoDist::SampleGenoDist(GenoDist *ec, GenoDist *fc, GenoDist *ac, GenoDist *sp) 4 | { 5 | CopyGenoDist(&aDist, ac); 6 | CopyGenoDist(&eDist, ec); 7 | CopyGenoDist(&fDist, fc); 8 | CopyGenoDist(&sDist, sp); 9 | 10 | afrPosition.x = 1.05; 11 | afrPosition.y = 1.10; 12 | afrPosition.z = 0.00; 13 | } 14 | 15 | void SampleGenoDist::CopyGenoDist(GenoDist *p, GenoDist *d) 16 | { 17 | p->a = d->a; 18 | p->e = d->e; 19 | p->f = d->f; 20 | } 21 | 22 | void SampleGenoDist::SetPointWithDist(Point *p, GenoDist *d) 23 | { 24 | p->x = d->a; 25 | p->y = d->e; 26 | p->z = d->f; 27 | } 28 | 29 | void SampleGenoDist::CopyPoint(Point *p, Point *s) 30 | { 31 | p->x = s->x; 32 | p->y = s->y; 33 | p->z = s->z; 34 | } 35 | 36 | void SampleGenoDist::MovePoint(Point *p, double dx, double dy, double dz) 37 | { 38 | p->x += dx; 39 | p->y += dy; 40 | p->z += dz; 41 | } 42 | 43 | void SampleGenoDist::RotatePointOnx(Point *p, double theta) 44 | { 45 | theta = theta * pi / 180; 46 | double y = p->y; 47 | double z = p->z; 48 | p->y = y * cos(theta) - z * sin(theta); 49 | p->z = y * sin(theta) + z * cos(theta); 50 | } 51 | 52 | void SampleGenoDist::RotatePointOny(Point *p, double theta) 53 | { 54 | theta = theta * pi / 180; 55 | double x = p->x; 56 | double z = p->z; 57 | p->x = x * cos(theta) + z * sin(theta); 58 
| p->z = x * -1 * sin(theta) + z * cos(theta); 59 | } 60 | 61 | void SampleGenoDist::RotatePointOnz(Point *p, double theta) 62 | { 63 | theta = theta * pi / 180; 64 | double x = p->x; 65 | double y = p->y; 66 | p->x = x * cos(theta) - y * sin(theta); 67 | p->y = x * sin(theta) + y * cos(theta); 68 | } 69 | 70 | void SampleGenoDist::TransformAllDists() 71 | { 72 | SetPointWithDist(&ePt, &eDist); 73 | SetPointWithDist(&fPt, &fDist); 74 | SetPointWithDist(&aPt, &aDist); 75 | SetPointWithDist(&sPt, &sDist); 76 | 77 | // Move points so that the f point is at the origin 78 | double dx = fPt.x; 79 | double dy = fPt.y; 80 | double dz = fPt.z; 81 | 82 | MovePoint(&ePt, -dx, -dy, -dz); 83 | MovePoint(&fPt, -dx, -dy, -dz); 84 | MovePoint(&aPt, -dx, -dy, -dz); 85 | MovePoint(&sPt, -dx, -dy, -dz); 86 | 87 | // Rotate on z-axis 88 | double theta1 = atan2(aPt.y, aPt.x) * -180 / pi; 89 | RotatePointOnz(&ePt, theta1); 90 | RotatePointOnz(&fPt, theta1); 91 | RotatePointOnz(&aPt, theta1); 92 | RotatePointOnz(&sPt, theta1); 93 | 94 | // Rotate on y-axis 95 | double theta2 = atan2(aPt.z, aPt.x) * 180 / pi; 96 | RotatePointOny(&ePt, theta2); 97 | RotatePointOny(&fPt, theta2); 98 | RotatePointOny(&aPt, theta2); 99 | RotatePointOny(&sPt, theta2); 100 | 101 | // Rotate on y-axis 102 | double theta3 = atan2(ePt.y, ePt.z) * 180 / pi; 103 | theta3 -= 90; 104 | RotatePointOnx(&ePt, theta3); 105 | RotatePointOnx(&fPt, theta3); 106 | RotatePointOnx(&aPt, theta3); 107 | RotatePointOnx(&sPt, theta3); 108 | 109 | // Move points back 110 | MovePoint(&ePt, dx, dy, dz); 111 | MovePoint(&fPt, dx, dy, dz); 112 | MovePoint(&aPt, dx, dy, dz); 113 | MovePoint(&sPt, dx, dy, dz); 114 | 115 | // Move aPos to afrPosition 116 | dx = afrPosition.x - fPt.x; 117 | dy = afrPosition.y - fPt.y; 118 | dz = afrPosition.z - fPt.z; 119 | MovePoint(&ePt, dx, dy, dz); 120 | MovePoint(&fPt, dx, dy, dz); 121 | MovePoint(&aPt, dx, dy, dz); 122 | MovePoint(&sPt, dx, dy, dz); 123 | 124 | int debug = 0; 125 | if (debug) { 126 | printf("E: x: %6.4f y: %6.4f z: %6.4f\n", ePt.x, ePt.y, ePt.z); 127 | printf("F: x: %6.4f y: %6.4f z: %6.4f\n", fPt.x, fPt.y, fPt.z); 128 | printf("A: x: %6.4f y: %6.4f z: %6.4f\n", aPt.x, aPt.y, aPt.z); 129 | printf("J: x: %6.4f y: %6.4f z: %6.4f\n", sPt.x, sPt.y, sPt.z); 130 | 131 | assert(0); 132 | } 133 | } 134 | 135 | void SampleGenoDist::CalculateBaryCenters() 136 | { 137 | double x1 = ePt.x; 138 | double y1 = ePt.y; 139 | double x2 = fPt.x; 140 | double y2 = fPt.y; 141 | double x3 = aPt.x; 142 | double y3 = aPt.y; 143 | double xp = sPt.x; 144 | double yp = sPt.y; 145 | 146 | double det = (y2 - y3)*(x1 - x3) + (x3 - x2)*(y1 - y3); 147 | 148 | eWt = ((y2 - y3)*(xp - x3) + (x3 - x2)*(yp - y3))/det; 149 | fWt = ((y3 - y1)*(xp - x3) + (x1 - x3)*(yp - y3))/det; 150 | aWt = 1 - eWt - fWt; 151 | } 152 | 153 | void SampleGenoDist::ShowPositions(string title, bool showOrig) 154 | { 155 | cout << "\n" << title << "\n"; 156 | 157 | if (showOrig) { 158 | cout << "\nOriginal positions of " << title << "\n"; 159 | 160 | printf("\tE: %6.4f %6.4f %6.4f\n", eDist.e, eDist.f, eDist.a); 161 | printf("\tF: %6.4f %6.4f %6.4f\n", fDist.e, fDist.f, fDist.a); 162 | printf("\tA: %6.4f %6.4f %6.4f\n", aDist.e, aDist.f, aDist.a); 163 | printf("\tS: %6.4f %6.4f %6.4f\n", sDist.e, sDist.f, sDist.a); 164 | 165 | cout << "\nPositions of " << title << " after transformation\n"; 166 | } 167 | 168 | printf("\tE: %6.4f %6.4f %6.4f\n", ePt.x, ePt.y, ePt.z); 169 | printf("\tF: %6.4f %6.4f %6.4f\n", fPt.x, fPt.y, fPt.z); 170 | printf("\tA: %6.4f %6.4f %6.4f\n", 
aPt.x, aPt.y, aPt.z); 171 | //printf("\tS: %6.4f %6.4f %6.4f\n", sPt.x, sPt.y, sPt.z); 172 | 173 | //cout << "\nWeights\n"; 174 | //printf("\tE: %6.4f %6.4f %6.4f\n", eWt, fWt, aWt); 175 | printf("\n"); 176 | } 177 | -------------------------------------------------------------------------------- /Frequency Project/graf-pop/SampleGenoDist.h: -------------------------------------------------------------------------------- 1 | #ifndef SAMPLE_GENO_DIST_H 2 | #define SAMPLE_GENO_DIST_H 3 | #include "Util.h" 4 | 5 | // Given positions of the 3 vertices a, e, f and that of sample s: 6 | // 1. Rotate all points so that the triangle is on the z=0 plane with f-a side parallel to x-axis 7 | // 2. Calulate the Barycentric weights of the sample 8 | class SampleGenoDist 9 | { 10 | private: 11 | // Genetic distances to the three reference populaton of the three vertex populations and the sample 12 | GenoDist aDist; // EAS 13 | GenoDist eDist; // EUR 14 | GenoDist fDist; // AFR 15 | GenoDist sDist; // Sample being checked 16 | 17 | public: 18 | // The positions in the space of:the three vertex populations and the sample 19 | Point aPt; 20 | Point ePt; 21 | Point fPt; 22 | Point sPt; 23 | 24 | // The barycentric weights of the sample to the three vertices 25 | double aWt; 26 | double eWt; 27 | double fWt; 28 | 29 | // Position of fPt after transformation 30 | Point afrPosition; 31 | 32 | SampleGenoDist(GenoDist*, GenoDist*, GenoDist*, GenoDist*); 33 | 34 | void CopyGenoDist(GenoDist*, GenoDist*); 35 | void SetPointWithDist(Point*, GenoDist*); 36 | void CopyPoint(Point*, Point*); 37 | void MovePoint(Point*, double, double, double); 38 | void RotatePointOnx(Point*, double); 39 | void RotatePointOny(Point*, double); 40 | void RotatePointOnz(Point*, double); 41 | Point TransformGenoDist(GenoDist); 42 | void TransformAllDists(); 43 | void CalculateBaryCenters(); 44 | void ShowPositions(string, bool=false); 45 | }; 46 | 47 | #endif 48 | -------------------------------------------------------------------------------- /Frequency Project/graf-pop/SaveSamples.pl: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/perl 2 | 3 | my $disclaim = << "EOF"; 4 | ==================================================================================== 5 | PUBLIC DOMAIN NOTICE 6 | National Center for Biotechnology Information 7 | 8 | This software/database is a "United States Government Work" under the 9 | terms of the United States Copyright Act. It was written as part of 10 | the author's official duties as a United States Government employee and 11 | thus cannot be copyrighted. This software/database is freely available 12 | to the public for use. The National Library of Medicine and the U.S. 13 | Government have not placed any restriction on its use or reproduction. 14 | Although all reasonable efforts have been taken to ensure the accuracy 15 | and reliability of the software and data, the NLM and the U.S. 16 | Government do not and cannot warrant the performance or results that 17 | may be obtained by using this software or data. The NLM and the U.S. 18 | Government disclaim all warranties, express or implied, including 19 | warranties of performance, merchantability or fitness for any particular 20 | purpose. 21 | 22 | Please cite the author in any work or product based on this material. 23 | 24 | Author: Yumi (Jimmy) Jin (jinyu\@ncbi.nlm.nih.gov) 25 | File Description: script to save samples and ancestry scores into a file. 
26 | Date: 04/05/2021 27 | ==================================================================================== 28 | EOF 29 | 30 | BEGIN { 31 | use Cwd 'abs_path'; 32 | my ($scriptName, $scriptDir) = ("", ""); 33 | my $scriptFullname = abs_path($0); 34 | ($scriptDir, $scriptName) = ($1, $2) if ($scriptFullname =~ /(\S+)\/(\S+)/); 35 | push ( @INC, $scriptDir); 36 | } 37 | 38 | use strict; 39 | use GrafPopFiles; 40 | use GraphParameters; 41 | use PopulationCutoffs; 42 | use GraphTransformation; 43 | use SubjectAncestry; 44 | 45 | if (@ARGV < 2) { 46 | my $usage = GetScriptUsage(); 47 | print "$disclaim\n$usage\n\n"; 48 | exit; 49 | } 50 | 51 | my $inFile = $ARGV[0]; 52 | my $outFile = $ARGV[1]; 53 | 54 | my $param = new GraphParameters(); 55 | exit unless ($param); 56 | 57 | #--------------------------- Read subject GrafPop scores ---------------------------# 58 | my ($sbjPopScores, $allPopSbjs, $minSbjSnps, $maxSbjSnps, $meanSbjSnps, $error) 59 | = GrafPopFiles::ReadGrafPopResults($inFile, $param->{minSnps}, $param->{maxSnps}); 60 | if ($error) { 61 | print "\nERROR: $error\n\n"; 62 | exit; 63 | } 64 | 65 | #--------------------------- Read subject races from file ---------------------------# 66 | my %sbjRaces = (); 67 | my %allRaces = (); 68 | my $hasRaceInfo = 0; 69 | my $numRaces = 0; 70 | if ($param->{raceFile}) { 71 | my ($sbjRaceRef, $allRaceRef, $hasRace, $err) = GrafPopFiles::ReadSubjectRaces($param->{raceFile}, $allPopSbjs); 72 | %sbjRaces = %$sbjRaceRef; 73 | %allRaces = %$allRaceRef; 74 | $hasRaceInfo = $hasRace; 75 | if ($err) { 76 | print "\nERROR: $err\n"; 77 | exit; 78 | } 79 | } 80 | 81 | my $ancSbjs = new SubjectAncestry($param, $sbjPopScores, \%sbjRaces); 82 | $ancSbjs->SetSubjectGenoPopulations(); 83 | $ancSbjs->SaveResults($outFile); 84 | $ancSbjs->ShowPopulationComparison() if ($param->{raceFile}); 85 | 86 | sub GetScriptUsage 87 | { 88 | my $usage = "Usage: SaveSamples.pl [Options] 89 | 90 | Note: 91 | Input file is the file generated by the C++ grafpop program that includes subject ancestry scores. 92 | Samples and ancestry scores will be saved to the output file as plain texts. 93 | 94 | Options: 95 | Set a rectangle area to retrieve subjects from graph of GD2 (y) vs. GD1 (x) 96 | -xcmin min x value 97 | -xcmax max x value 98 | -ycmin min y value 99 | -ycmax max y value 100 | -isByd: retrieve subjects whose values are beyond the above rectangle 101 | 102 | Set minimum and maximum numbers of genotyped fingerprint SNPs for samples to be processed 103 | -minsnp minimum number of SNPs with genotypes 104 | -maxsnp maximum number of SNPs with genotypes 105 | 106 | Set population cutoff lines 107 | -ecut proportion: cutoff European proportion dividing Europeans from other populations. Default 90%. 108 | -fcut proportion: cutoff African proportion dividing Africans from other populations. Default 95%. 109 | Set it to -1 to combine African and African American populations 110 | -acut proportion: cutoff East Asian proportion dividing East Asians from other populations. Default 95%. 111 | Set it to -1 to combine East Asian and Other Asian populations 112 | -ohcut proportion: cutoff African proportion dividing Latin Americans from Other population. Default 13%. 113 | -fhcut proportion: cutoff African proportion dividing Latin Americans from African Americans. Default 40%. 
114 | 115 | The input file with self-reported subject race information 116 | -spf a file with two columns: subject and self-reported population"; 117 | 118 | return $usage; 119 | } 120 | -------------------------------------------------------------------------------- /Frequency Project/graf-pop/SubjectAncestry.pm: -------------------------------------------------------------------------------- 1 | package SubjectAncestry; 2 | 3 | use strict; 4 | use Carp; 5 | use GraphParameters; 6 | 7 | sub new 8 | { 9 | my ($class, $param, $sbjScores, $sbjRaces) = @_; 10 | my $refType = ref($sbjScores); 11 | 12 | die "\nERROR: failed to create SubjectAncestry: sbjScores is not an array\n" if ($refType ne "ARRAY"); 13 | my $numSbjs = $#$sbjScores + 1; 14 | print "\nWARNING: sbjScores is an empty\n" if ($numSbjs < 1); 15 | if ($sbjRaces) { 16 | die "\nERROR: SubjectAncestry: sbjRaces is not a hash\n" if (ref($sbjRaces) ne "HASH"); 17 | foreach my $sbjNo (0 .. $#$sbjScores) { 18 | my $sbj = $sbjScores->[$sbjNo]->{subject}; 19 | $sbjScores->[$sbjNo]->{race} = $sbjRaces->{$sbj} if ($sbjRaces->{$sbj}); 20 | } 21 | } 22 | 23 | bless { 24 | param => $param, 25 | sbjGenoPopIds => [], 26 | numSubjects => $numSbjs, 27 | sbjScores => $sbjScores 28 | }, $class; 29 | } 30 | 31 | sub ShowPopulationComparison 32 | { 33 | my $self = shift; 34 | 35 | my %raceGenoIds = (); 36 | my @sbjPopScores = @{$self->{sbjScores}}; 37 | my %raceSbjs = (); 38 | my %raceGenoPopCnts = (); 39 | 40 | for my $sbjNo (0 .. $#sbjPopScores) { 41 | my %info = %{$sbjPopScores[$sbjNo]}; 42 | my $sbj = $info{subject}; 43 | my $race = $info{race}; 44 | my $genoPopId = $self->{sbjGenoPopIds}->[$sbjNo]; 45 | if ($raceSbjs{$race}) { 46 | $raceSbjs{$race}++; 47 | $raceGenoPopCnts{$race}->[$genoPopId]++; 48 | } 49 | else { 50 | $raceSbjs{$race} = 1; 51 | my @popCnts = (); 52 | for my $popId (0 .. 9) { 53 | push @popCnts, 0; 54 | } 55 | $popCnts[$genoPopId] = 1; 56 | $raceGenoPopCnts{$race} = \@popCnts; 57 | } 58 | } 59 | 60 | my $maxRaceLen = 0; 61 | foreach my $race (keys %raceSbjs) { 62 | my $len = length($race); 63 | $maxRaceLen = $len if ($len > $maxRaceLen); 64 | } 65 | $maxRaceLen = 5 if ($maxRaceLen < 5); 66 | 67 | print "\nThe following table shows the self-reported races/ethnicities (column 'Race')\n" . 68 | "and population IDs assigned by GrafPop (other columns, Pop9 = Other)\n\n"; 69 | my $cntLen = 6; 70 | my $raceFmt = "\%$maxRaceLen" . "s"; 71 | my $cntFmt = "\%$cntLen" . "d"; 72 | 73 | my $header = sprintf($raceFmt, "Race"); 74 | for my $popId (1 .. 9) { 75 | $header = sprintf("%s \%$cntLen" . "s", $header, "Pop$popId"); 76 | } 77 | $header = sprintf("%s \%$cntLen" . "s", $header, "Total"); 78 | print "$header\n"; 79 | my $headLen = length($header); 80 | my $dashLine = "-"; 81 | for my $i (1 .. $headLen) {$dashLine .= "-"}; 82 | print "$dashLine\n"; 83 | foreach my $race (sort {$raceSbjs{$b} <=> $raceSbjs{$a}} keys %raceSbjs) { 84 | my @popCnts = @{$raceGenoPopCnts{$race}}; 85 | my $line = sprintf($raceFmt, $race); 86 | for my $popNo (1 .. 9) { 87 | $line = sprintf("%s $cntFmt", $line, $popCnts[$popNo]); 88 | } 89 | $line = sprintf("%s $cntFmt", $line, $raceSbjs{$race}); 90 | print "$line\n"; 91 | } 92 | print "$dashLine\n"; 93 | } 94 | 95 | # 96 | # Assign populations to subjects based on the ancestry scores 97 | # 98 | sub SetSubjectGenoPopulations 99 | { 100 | my ($self) = @_; 101 | 102 | my @sbjPopScores = @{$self->{sbjScores}}; 103 | my %subPopSbjs = (); 104 | $self->{sbjGenoPopIds} = []; 105 | 106 | for my $sbjNo (0 .. 
$#sbjPopScores) { 107 | my %info = %{$sbjPopScores[$sbjNo]}; 108 | my $gd1 = $info{x}; 109 | my $gd4 = $info{gd4}; 110 | my $ePct = $info{ePct}; 111 | my $fPct = $info{fPct}; 112 | my $aPct = $info{aPct}; 113 | my $numSnps = $info{snps}; 114 | 115 | # Calculate genotype populations based on ancestry proportions 116 | my $genoPopId = 0; 117 | if ($numSnps > 1000) { 118 | $genoPopId = $self->GetGenoPopId($gd1, $gd4, $ePct, $fPct, $aPct); 119 | } 120 | 121 | push @{$self->{sbjGenoPopIds}}, $genoPopId; 122 | } 123 | } 124 | 125 | # 126 | # Save subjects and ancestry results to the output file 127 | # 128 | sub SaveResults 129 | { 130 | my ($self, $outFile) = @_; 131 | 132 | my @sbjOutputLines = (); 133 | my $numShowSbjs = 0; 134 | my @sbjOutputLines = (); 135 | my @sbjPopScores = @{$self->{sbjScores}}; 136 | my @sbjGenoPopIds = @{$self->{sbjGenoPopIds}}; 137 | my %subPopSbjs = (); 138 | 139 | for my $sbjNo (0 .. $#sbjPopScores) { 140 | my %info = %{$sbjPopScores[$sbjNo]}; 141 | my $sbj = $info{subject}; 142 | my $snps = $info{snps}; 143 | my $race = $info{race}; 144 | my $gd1 = $info{x}; 145 | my $gd2 = $info{y}; 146 | my $gd3 = $info{z}; 147 | my $gd4 = $info{gd4}; 148 | my $ePct = $info{ePct}; 149 | my $fPct = $info{fPct}; 150 | my $aPct = $info{aPct}; 151 | my $numSnps = $info{snps}; 152 | my $genoPopId = $sbjGenoPopIds[$sbjNo]; 153 | 154 | # Check if the subject is within the selected area or not 155 | my $isWithin = 1; 156 | $isWithin = 0 if ($self->{param}->{xCutMin} && $gd1 < $self->{param}->{xCutMin}); 157 | $isWithin = 0 if ($self->{param}->{yCutMin} && $gd2 < $self->{param}->{yCutMin}); 158 | $isWithin = 0 if ($self->{param}->{xCutMax} && $gd1 > $self->{param}->{xCutMax}); 159 | $isWithin = 0 if ($self->{param}->{yCutMax} && $gd2 > $self->{param}->{yCutMax}); 160 | 161 | # Decide whether to include the subject or not based on the selected area 162 | my $showData = 1; 163 | $showData = 0 if (($self->{param}->{isBeyond} && $isWithin) || (!$self->{param}->{isBeyond} && !$isWithin)); 164 | if ($self->{param}->{isSelAll} || $showData) { 165 | my $showGd1 = sprintf("%5.4f", $gd1); 166 | my $showGd2 = sprintf("%5.4f", $gd2); 167 | my $showGd3 = sprintf("%6.5f", $gd3); 168 | my $showGd4 = sprintf("%6.5f", $gd4); 169 | my $showbPct = sprintf("%5.2f", $fPct); 170 | my $showwPct = sprintf("%5.2f", $ePct); 171 | my $showaPct = sprintf("%5.2f", $aPct); 172 | my $genoPop = ""; 173 | if ($genoPopId > 0 && $genoPopId < 9) { 174 | my $genoPopNo = $genoPopId - 1; 175 | $genoPop = $self->{param}->{alfaFullPops}->[$genoPopNo]; 176 | } 177 | push @sbjOutputLines, 178 | "$sbj\t$numSnps\t$race\t$showGd1\t$showGd2\t$showGd3\t$showGd4\t$showbPct\t$showwPct\t$showaPct\t$genoPopId\t$genoPop"; 179 | 180 | if ($subPopSbjs{$genoPopId}) { 181 | $subPopSbjs{$genoPopId}++; 182 | } 183 | else { 184 | $subPopSbjs{$genoPopId} = 1; 185 | } 186 | $numShowSbjs++; 187 | } 188 | } 189 | 190 | if ($numShowSbjs > 0) { 191 | open FILE, ">$outFile" or die "\nERROR: Couldn't open $outFile for writing!\n"; 192 | print FILE "Subject\t#SNPs\tSelf-reported ancestry\tGD1\tGD2\tGD3\tGD4\tP_f (%)\tP_e (%)\tP_a (%)\tPopID\tComputed population\n"; 193 | for my $line (@sbjOutputLines) { 194 | print FILE "$line\n"; 195 | } 196 | close FILE; 197 | print "\nTotal $self->{numSubjects} subjects. $numShowSbjs subjects were selected and saved to $outFile\n\n"; 198 | 199 | print "\tPopID\t#Subjs\tPopulation\n"; 200 | foreach my $genoPopId (1 .. 
8) { 201 | my $genoPop = $self->{param}->{alfaFullPops}->[$genoPopId-1]; 202 | my $numSbjs = $subPopSbjs{$genoPopId}; 203 | printf("\t%d\t%6d\t%s\n", $genoPopId, $numSbjs, $genoPop); 204 | } 205 | print "\n"; 206 | } 207 | else { 208 | print "\nNo subjects are found in the specified area.\n\n"; 209 | } 210 | } 211 | 212 | # 213 | # Assign population based on the calculated ancestry protortions and GD4 214 | # 215 | sub GetGenoPopId 216 | { 217 | my ($self, $gd1, $gd4, $ePct, $fPct, $aPct) = @_; 218 | my $eComp = $ePct / 100; 219 | my $fComp = $fPct / 100; 220 | my $aComp = $aPct / 100; 221 | my $eurVx = $self->{param}->{vtxCoords}->[0]->[0]; 222 | 223 | my $genoPopId = 0; 224 | 225 | my $isSas = 0; 226 | my $meanSasx = $self->{param}->{meanSasx}; 227 | my $meanAsny = $self->{param}->{meanAsny}; 228 | 229 | my $y = $self->{param}->{sasCutBasey} + $self->{param}->{sasCutaVal} * ($gd1-$meanSasx)**2; 230 | $isSas = 1 if ($gd4 > $y); 231 | 232 | my $isAsn = 0; 233 | my $x = $self->{param}->{asnCutBasex} + $self->{param}->{asnCutaVal} * ($gd4-$meanAsny)**2; 234 | $isAsn = 1 if ($gd1 > $x); 235 | 236 | if ($eComp > $self->{param}->{eurCut}) { 237 | $genoPopId = 1; 238 | } 239 | elsif ($fComp > $self->{param}->{afoCut}) { 240 | $genoPopId = 2; 241 | } 242 | elsif ($aComp > $self->{param}->{easCut}) { 243 | $genoPopId = 3; 244 | } 245 | elsif ($isSas) { 246 | $genoPopId = 8; 247 | } 248 | elsif ($fComp < $self->{param}->{othLatCut}) { 249 | if ($isAsn) { 250 | $genoPopId = 7; 251 | } 252 | else { 253 | if ($gd1 < $eurVx && $aComp < $self->{param}->{othLatCut}) { 254 | $genoPopId = 5; 255 | } 256 | else { 257 | if ($gd4 + $gd1 < $self->{param}->{asnLatCut}) { 258 | $genoPopId = 6; # Hispanic2 are on the left lower side 259 | } 260 | else { 261 | # Set pop ID to other. These are not Hispanics, 262 | # probably European/Asian admixtures. Set to Other. 
263 | $genoPopId = 9; 264 | } 265 | } 266 | } 267 | } 268 | elsif ($aComp < $self->{param}->{othLatCut}) { 269 | if ($fComp > $self->{param}->{afaLacCut}) { 270 | $genoPopId = 4; 271 | } 272 | else { 273 | $genoPopId = 5; 274 | } 275 | } 276 | else { 277 | $genoPopId = 9; 278 | } 279 | 280 | return $genoPopId; 281 | } 282 | 283 | 1; 284 | -------------------------------------------------------------------------------- /Frequency Project/graf-pop/Util.cpp: -------------------------------------------------------------------------------- 1 | #include "Util.h" 2 | 3 | bool FileExists (const char* filename) 4 | { 5 | FILE* fp = fopen(filename, "r"); 6 | if (fp) { 7 | fclose(fp); 8 | return true; 9 | } 10 | else { 11 | return false; 12 | } 13 | } 14 | 15 | bool FileWriteable (const char* filename) 16 | { 17 | FILE* fp = fopen(filename, "a"); 18 | if (fp) { 19 | fclose(fp); 20 | return true; 21 | } 22 | else { 23 | return false; 24 | } 25 | } 26 | 27 | string LowerString(const string& inStr) 28 | { 29 | int strLen = inStr.length(); 30 | string outStr = ""; 31 | for (int i = 0; i < strLen; i++) { 32 | char lowCh = tolower(inStr[i]); 33 | outStr.push_back(lowCh); 34 | } 35 | 36 | return outStr; 37 | } 38 | 39 | string UpperString(const string& inStr) 40 | { 41 | int strLen = inStr.length(); 42 | string outStr = ""; 43 | for (int i = 0; i < strLen; i++) { 44 | char upCh = toupper(inStr[i]); 45 | outStr.push_back(upCh); 46 | } 47 | 48 | return outStr; 49 | } 50 | 51 | GenoDatasetType CheckGenoDataFile(const string& file, string *baseName) 52 | { 53 | GenoDatasetType fileType = GenoDatasetType::IS_OTHER; 54 | 55 | // Check if this is a plink set basename 56 | bool plinkExists = false; 57 | string bedFile = file + ".bed"; 58 | string bimFile = file + ".bim"; 59 | string famFile = file + ".fam"; 60 | 61 | if (FileExists(bedFile.c_str()) && FileExists(bimFile.c_str()) && FileExists(famFile.c_str())) { 62 | *baseName = file; 63 | fileType = GenoDatasetType::IS_PLINK; 64 | return fileType; 65 | } 66 | 67 | // Check if file exists 68 | bool fileExists = false; 69 | if (FileExists(file.c_str())) { 70 | fileExists = true; 71 | } 72 | else { 73 | fileType = GenoDatasetType::NOT_EXISTS; 74 | } 75 | 76 | // Check if it is a .gz file 77 | int fileLen = file.length(); 78 | string gzFileBase = file; // File name. 
If it is .gz, stripped off ".gz" 79 | bool isGz = false; 80 | if (fileLen > 3 && file.substr(fileLen-3, 3).compare(".gz") == 0) { 81 | isGz = true; 82 | gzFileBase = file.substr(0, fileLen-3); 83 | fileLen -= 3; 84 | } 85 | 86 | // Check if it is a vcf or PLINK 87 | string fileBase = gzFileBase; 88 | string fileExt = ""; 89 | size_t dotPos = gzFileBase.find_last_of("."); 90 | 91 | if (dotPos != string::npos) { 92 | fileExt = gzFileBase.substr(dotPos+1, fileLen-dotPos-1); 93 | fileBase = gzFileBase.substr(0, dotPos); 94 | } 95 | 96 | bool isVcf = false; 97 | bool isPlink = false; 98 | if (fileExt.compare("vcf") == 0) { 99 | isVcf = true; 100 | } 101 | else if (fileExt.compare("bed") == 0 || 102 | fileExt.compare("bim") == 0 || 103 | fileExt.compare("fam") == 0 ) { 104 | isPlink = true; 105 | } 106 | else { 107 | fileBase = gzFileBase; 108 | } 109 | 110 | if (fileExists) { 111 | if (isPlink && !isGz) fileType = GenoDatasetType::IS_PLINK; 112 | else if (isPlink && isGz) fileType = GenoDatasetType::IS_PLINK_GZ; 113 | else if (isVcf && !isGz) fileType = GenoDatasetType::IS_VCF; 114 | else if (isVcf && isGz) fileType = GenoDatasetType::IS_VCF_GZ; 115 | } 116 | 117 | *baseName = fileBase; 118 | 119 | return fileType; 120 | } 121 | 122 | int GetChromosomeFromString(const char* chrStr) 123 | { 124 | int chrNum = 0; 125 | int i = 0; 126 | 127 | if (strlen(chrStr) > 3 && 128 | (chrStr[0] == 'c' || chrStr[0] == 'C') && 129 | (chrStr[1] == 'h' || chrStr[1] == 'H') && 130 | (chrStr[2] == 'r' || chrStr[2] == 'R') ) { 131 | i = 3; 132 | } 133 | 134 | while(chrStr[i] != 0) { 135 | int num = chrStr[i] - '0'; 136 | 137 | if (num >= 0 && num < 10) { 138 | chrNum = chrNum * 10 + num; 139 | } 140 | else { 141 | chrNum = 0; 142 | break; 143 | } 144 | 145 | i++; 146 | } 147 | 148 | if (chrNum < 1 || chrNum > 22) chrNum = 0; 149 | 150 | return chrNum; 151 | } 152 | 153 | int GetRsNumFromString(const char* rsStr) 154 | { 155 | int rsNum = 0; 156 | 157 | if (strlen(rsStr) > 2 && 158 | (rsStr[0] == 'r' || rsStr[0] == 'R') && 159 | (rsStr[1] == 's' || rsStr[1] == 'S') ) { 160 | int i = 2; 161 | 162 | while(rsStr[i] != 0) { 163 | int num = rsStr[i] - '0'; 164 | 165 | if (num >= 0 && num < 10) { 166 | rsNum = rsNum * 10 + num; 167 | } 168 | else { 169 | rsNum = 0; 170 | break; 171 | } 172 | 173 | i++; 174 | } 175 | } 176 | 177 | return rsNum; 178 | } 179 | 180 | char FlipAllele(char allele) 181 | { 182 | char flipAllele = '0'; 183 | 184 | switch(allele) { 185 | case 'A': flipAllele = 'T'; break; 186 | case 'T': flipAllele = 'A'; break; 187 | case 'G': flipAllele = 'C'; break; 188 | case 'C': flipAllele = 'G'; break; 189 | } 190 | 191 | return flipAllele; 192 | } 193 | 194 | vector SplitString(const string& str, const string& delim) 195 | { 196 | vector tokens; 197 | size_t prev = 0, pos = 0; 198 | 199 | while (pos < str.length() && prev < str.length()) { 200 | pos = str.find(delim, prev); 201 | if (pos == string::npos) pos = str.length(); 202 | string token = str.substr(prev, pos-prev); 203 | if (!token.empty()) tokens.push_back(token); 204 | prev = pos + delim.length(); 205 | } 206 | 207 | return tokens; 208 | } 209 | 210 | void ShowTimeDiff(const struct timeval &t1, const struct timeval &t2) 211 | { 212 | int usec = t2.tv_usec - t1.tv_usec; 213 | int sec = t2.tv_sec - t1.tv_sec; 214 | if (usec < 0) { 215 | usec += 1000000; 216 | sec += 1; 217 | } 218 | int min = 0; 219 | if (sec > 60) { 220 | min = sec / 60; 221 | sec = sec % 60; 222 | } 223 | int hour = 0; 224 | if (min > 60) { 225 | hour = min / 60; 226 | min = min 
% 60; 227 | } 228 | 229 | printf("Time used: "); 230 | if (hour > 0) { printf("%d hours ", hour); } 231 | if (min > 0) { printf("%d minutes ", min); } 232 | if (sec > 0) { printf("%d seconds ", sec); } 233 | printf("%d microseconds\n\n", usec); 234 | } 235 | -------------------------------------------------------------------------------- /Frequency Project/graf-pop/Util.h: -------------------------------------------------------------------------------- 1 | #ifndef NDEBUG 2 | # define ASSERT(condition, message) \ 3 | if (!(condition)) { \ 4 | cerr << "Assertion `" #condition "` failed in " << __FILE__ << " line " << __LINE__ << ": " << message << "\n"; \ 5 | terminate(); \ 6 | } 7 | #else 8 | # define ASSERT(condition, message) {} 9 | #endif 10 | 11 | #ifndef UTIL_H 12 | #define UTIL_H 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | 27 | const double pi = 3.1415926; 28 | 29 | using namespace std; 30 | 31 | enum class AncestrySnpType 32 | { 33 | RSID = 0, 34 | GB37 = 1, 35 | GB38 = 2 36 | }; 37 | 38 | enum class GenoDatasetType 39 | { 40 | NOT_EXISTS = 0, 41 | IS_PLINK = 1, 42 | IS_PLINK_GZ = 2, 43 | IS_VCF = 3, 44 | IS_VCF_GZ = 4, 45 | IS_OTHER = 5 46 | }; 47 | 48 | // Define Genetic Distances to the three reference populations 49 | struct GenoDist 50 | { 51 | double e; // To European 52 | double f; // To African 53 | double a; // To East Asian 54 | }; 55 | 56 | // A 3-D point in space 57 | struct Point 58 | { 59 | double x; 60 | double y; 61 | double z; 62 | }; 63 | 64 | bool FileExists (const char*); 65 | bool FileWriteable (const char*); 66 | int GetChromosomeFromString(const char*); 67 | int GetRsNumFromString(const char*); 68 | void ShowTimeDiff(const struct timeval&, const struct timeval&); 69 | char FlipAllele(char); 70 | vector SplitString(const string&, const string&); 71 | string LowerString(const string&); 72 | string UpperString(const string&); 73 | GenoDatasetType CheckGenoDataFile(const string&, string*); 74 | 75 | 76 | #endif 77 | -------------------------------------------------------------------------------- /Frequency Project/graf-pop/VcfSampleAncestrySnpGeno.h: -------------------------------------------------------------------------------- 1 | #ifndef VCF_SAMPLE_ANCESTRY_SNP_GENO_H 2 | #define VCF_SAMPLE_ANCESTRY_SNP_GENO_H 3 | 4 | #include 5 | #include 6 | #include "Util.h" 7 | #include "AncestrySnps.h" 8 | 9 | #define BUFFERLEN 0x0010 10 | #define WORDLEN 10000 11 | 12 | class VcfSampleAncestrySnpGeno 13 | { 14 | private: 15 | string vcfFile; 16 | AncestrySnps *ancSnps; 17 | 18 | // For all the following arrays: 19 | // One record for each putative ancestry SNP (checked using rs ID, Build 37 and 38 positions) 20 | 21 | // Saves the two coded alleles (two GT integers like 1/0, 1|1, etc.) 
for each sample 22 | // It is more readable to save the refs and alts in two arrays than in one array 23 | vector> vcfAncSnpGtRefs; 24 | vector> vcfAncSnpGtAlts; 25 | 26 | vector vcfAncSnpChrs; // chr value from The CHROM string 27 | vector vcfAncSnpPoss; // pos value from POS string 28 | vector vcfAncSnpSnps; // The ID string 29 | vector vcfAncSnpRefs; // The REF string 30 | vector vcfAncSnpAlts; // The ALT string 31 | vector vcfRsIdAncSnpIds; // Ancestry SNP ID derived using RS ID 32 | vector vcfGb37AncSnpIds; // Ancestry SNP ID derived using Build 37 chr + pos 33 | vector vcfGb38AncSnpIds; // Ancestry SNP ID derived using Build 38 chr + pos 34 | 35 | int totAncSnps; 36 | int numSamples; 37 | int totVcfSnps; 38 | int putativeAncSnps; 39 | int numRsIdAncSnps; 40 | int numGb37AncSnps; 41 | int numGb38AncSnps; 42 | int numVcfAncSnps; 43 | AncestrySnpType ancSnpType; 44 | 45 | void CompareAncestrySnpAlleles(const string, const string, const char, const char, int*, int*); 46 | int RecodeGenotypeGivenString(const int, const int, const string); 47 | int RecodeGenotypeGivenIntegers(const int, const int, const int, const int); 48 | 49 | public: 50 | // Each enotype (per SNP and sample) is coded with number of alts, i.e., 0 = RR, 1 = RA, 2 = AA, 3 = unknown 51 | // Each row in the array is the genotypes of one SNP, coded as an array of integers for all samples 52 | // Ancestry SNP ID of each row is saved in the array of vcfAncSnpIds 53 | vector vcfSamples; 54 | vector vcfAncSnpIds; 55 | vector vcfAncSnpCodedGenos; // Use char, instead of int, to save space 56 | 57 | VcfSampleAncestrySnpGeno(string, AncestrySnps*); 58 | ~VcfSampleAncestrySnpGeno(); 59 | 60 | int GetNumVcfSnps() { return totVcfSnps; }; 61 | int GetNumSamples() { return numSamples; }; 62 | int GetNumVcfAncestrySnps() { return numVcfAncSnps; }; 63 | bool ReadDataFromFile(); 64 | void RecodeSnpGenotypes(); 65 | 66 | void ShowSummary(); 67 | void DeleteAncSnpGtValues(); 68 | void DeleteAncSnpCodedGenos(); 69 | }; 70 | 71 | 72 | #endif 73 | -------------------------------------------------------------------------------- /Frequency Project/graf-pop/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # dbSNP (https://www.ncbi.nlm.nih.gov/snp) 2 | ## The database of single nucleotide polymorphisms (SNPs) and multiple small-scale variations that include insertions/deletions, microsatellites, and non-polymorphic variants. 3 | ============================ 4 | 5 | ### directory layout 6 | 7 | . 
8 | ├── specs # dbSNP Design and Schema Specifications 9 | ├── tutorials # Scripts and tutorials for using dbSNP data 10 | └── README.md 11 | -------------------------------------------------------------------------------- /lib/python/README.md: -------------------------------------------------------------------------------- 1 | Python libraries and modules 2 | -------------------------------------------------------------------------------- /lib/python/navs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # =========================================================================== 3 | # 4 | # PUBLIC DOMAIN NOTICE 5 | # National Center for Biotechnology Information 6 | # 7 | # This software/database is a "United States Government Work" under the 8 | # terms of the United States Copyright Act. It was written as part of 9 | # the author's official duties as a United States Government employee and 10 | # thus cannot be copyrighted. This software/database is freely available 11 | # to the public for use. The National Library of Medicine and the U.S. 12 | # Government have not placed any restriction on its use or reproduction. 13 | # 14 | # Although all reasonable efforts have been taken to ensure the accuracy 15 | # and reliability of the software and data, the NLM and the U.S. 16 | # Government do not and cannot warrant the performance or results that 17 | # may be obtained by using this software or data. The NLM and the U.S. 18 | # Government disclaim all warranties, express or implied, including 19 | # warranties of performance, merchantability or fitness for any particular 20 | # purpose. 21 | # 22 | # Please cite the author in any work or product based on this material. 23 | # 24 | # =========================================================================== 25 | # Module name: navs (NCBI API Variation Services) 26 | # Description: dbSNP data retrieval using NCBI Variation Services API 27 | # 28 | # Authors: Eugene M. 
Shekhtman 29 | # Lon Phan lonphan@ncbi.nlm.nih.gov 30 | # For help please contact: tkt-varhd@ncbi.nlm.nih.gov 31 | # 32 | # --------------------------------------------------------------------------- 33 | import requests 34 | import json 35 | import re 36 | import sys 37 | import urllib 38 | from itertools import islice, chain 39 | from collections import defaultdict, OrderedDict, namedtuple 40 | 41 | api_rootURL = 'https://api.ncbi.nlm.nih.gov/variation/v0/' 42 | spdi_fields = ['seq_id', 'position', 'deleted_sequence', 'inserted_sequence'] 43 | TVcf = namedtuple('TVcf', ['chrom', 'pos', 'id', 'ref', 'alt', 'qual', 'filter', 'info']) 44 | TVcf.__new__.__defaults__ = ('.',) * len(TVcf._fields) 45 | VcfT1 = namedtuple('VcfT1', ['chrom', 'pos', 'id', 'ref']) 46 | VcfT2 = namedtuple('VcfT2', ['chrom', 'pos', 'ref', 'alt']) 47 | TVcfX = namedtuple('TVcfX', ['qual', 'filter', 'info']) 48 | 49 | class Variation: 50 | # Description: perform queries to ncbi spdi service and convert 51 | # between VCF, HGVS, and dbSNP RSIDs 52 | 53 | def __init__(self, init_val=''): 54 | self.rsid = OrderedDict() 55 | self.req = None 56 | self.rs = dict() 57 | self.spdi = OrderedDict() 58 | self.hgvs = OrderedDict() 59 | self.vcf = OrderedDict() 60 | self.vcfx = TVcfX(*['.'] * 3) 61 | 62 | rsid = None 63 | if isinstance(init_val, int): 64 | rsid = init_val 65 | else: 66 | m_rsid = re.fullmatch('(rs)?(?P[1-9]\d*)', init_val) 67 | if m_rsid: 68 | rsid = int(m_rsid.group('rsid')) 69 | 70 | if rsid: 71 | self.__init_from_rsid(rsid) 72 | else: 73 | m_other = re.fullmatch('(?P([^:]+:){3}[^:]+)?(?P[^:]+:[gcmnrp]\.[^:]+)?(?P([^\t]+\t){4}[^t].*)?', init_val) 74 | if m_other: 75 | if m_other.group('spdi'): 76 | self.__init_from_spdi(m_other.group('spdi')) 77 | elif m_other.group('hgvs'): 78 | self.__init_from_hgvs(m_other.group('hgvs')) 79 | elif m_other.group('vcf'): 80 | self.__init_from_vcf(m_other.group('vcf')) 81 | else: 82 | raise ValueError('Variation format not recognized: '+init_val) 83 | else: 84 | raise ValueError('Variation format not recognized: '+init_val) 85 | 86 | 87 | 88 | def __init_from_rsid(self, rsid): 89 | self.rsid[rsid] = 1 90 | url = api_rootURL + 'beta/refsnp/' + str(rsid) 91 | self.req = requests.get(url) 92 | self.rs = json.loads(self.req.text) 93 | 94 | ptlp = self.__find_ptlp(self.rs['primary_snapshot_data']['placements_with_allele']) 95 | novar_spdi = dict() 96 | novar_hgvs = None 97 | novar_vcf_key = None 98 | vcf4rs = defaultdict(list) 99 | for a in ptlp['alleles']: 100 | s = a['allele']['spdi'] 101 | if a['hgvs'].endswith('='): 102 | novar_spdi = ':'.join([str(s[i]) for i in spdi_fields]) 103 | novar_hgvs = a['hgvs'] 104 | url = api_rootURL + 'spdi/' + urllib.parse.quote(novar_spdi) + '/vcf_fields' 105 | n_vcf = json.loads(requests.get(url).text)['data'] # {chrom, pos, ref, alt} 106 | n_vcf['id'] = 'rs' + str(rsid) # {chrom, pos, id, ref, alt} 107 | novar_vcf_key = VcfT1(**{f: str(n_vcf[f]) for f in VcfT1._fields}) # (chrom, pos, id, ref) 108 | else: 109 | spdi = ':'.join([str(s[i]) for i in spdi_fields]) 110 | self.spdi[spdi] = 1 111 | self.hgvs[a['hgvs']] = 1 112 | url = api_rootURL + 'spdi/' + urllib.parse.quote(spdi) + '/vcf_fields' 113 | a_vcf = json.loads(requests.get(url).text)['data'] # {chrom, pos, ref, alt} 114 | a_vcf['id'] = 'rs' + str(rsid) # {chrom, pos, id, ref, alt} 115 | vcf_key = VcfT1(**{f: str(a_vcf[f]) for f in VcfT1._fields}) # (chrom, pos, id, ref) 116 | vcf4rs[vcf_key].append(a_vcf['alt']) # {(chrom, pos, id, ref):[alt,]} 117 | 118 | if not self.hgvs and novar_hgvs: 
119 | self.spdi[novar_spdi] = 1 120 | self.hgvs[novar_hgvs] = 1 121 | vcf4rs[novar_vcf_key].append('.') # creates pair (chrom, pos, id, ref):['.'] 122 | 123 | for vcf_key in vcf4rs: 124 | alt = ','.join(sorted(vcf4rs[vcf_key])) 125 | vcf_key2 = VcfT2(**{f: getattr(vcf_key, f) for f in ['chrom', 'pos', 'ref']}, alt = alt) 126 | vcf = TVcf(**vcf_key._asdict(), alt = alt, **self.vcfx._asdict()) 127 | self.vcf[vcf_key2] = vcf 128 | 129 | 130 | 131 | def __init_from_spdi(self, spdi): 132 | self.spdi[spdi] = 1 133 | 134 | url = api_rootURL + 'spdi/' + urllib.parse.quote(spdi) + '/contextual' 135 | cona = json.loads(requests.get(url).text)['data'] 136 | cona_spdi = ':'.join([str(cona[i]) for i in spdi_fields]) 137 | self.spdi[cona_spdi] = 1 138 | 139 | url = api_rootURL + 'spdi/' + urllib.parse.quote(spdi) + '/canonical_representative' 140 | cana = json.loads(requests.get(url).text)['data'] 141 | cana_spdi = ':'.join([str(cana[i]) for i in spdi_fields]) 142 | self.spdi[cana_spdi] = 1 143 | 144 | for spdi in self.spdi.keys(): 145 | self.hgvs[self._spdi2hgvs(spdi)] = 1 146 | url = api_rootURL + 'spdi/' + urllib.parse.quote(cona_spdi) + '/rsids' 147 | rsid_rsp = json.loads(requests.get(url).text) 148 | if 'data' in rsid_rsp: 149 | for rs in rsid_rsp['data']['rsids']: 150 | self.__init_from_rsid(rs) 151 | else: 152 | for spdi in self.spdi.keys(): 153 | url = api_rootURL + 'spdi/' + urllib.parse.quote(spdi) + '/vcf_fields' 154 | vcfd = json.loads(requests.get(url).text)['data'] 155 | if vcfd['alt'] == vcfd['ref']: 156 | vcfd['alt'] = '.' 157 | vcf_key = VcfT2(**{f: str(vcfd[f]) for f in vcfd}) 158 | vcft = TVcf(**vcf_key._asdict(), **self.vcfx._asdict()) 159 | self.vcf[vcf_key] = vcft 160 | 161 | 162 | def __init_from_hgvs(self, hgvs): 163 | self.hgvs[hgvs] = 1 164 | 165 | url = api_rootURL + 'hgvs/' + urllib.parse.quote(hgvs) + '/contextuals' 166 | conas = json.loads(requests.get(url).text)['data']['spdis'] 167 | for a in conas: 168 | cona_spdi = ':'.join([str(a[i]) for i in spdi_fields]) 169 | self.spdi[cona_spdi] = 1 170 | self.__init_from_spdi(cona_spdi) 171 | 172 | 173 | 174 | def __init_from_vcf(self, vcf): 175 | #self.vcf[vcf] = 1 176 | vcft = TVcf(*vcf.split("\t")) 177 | vcf_key = VcfT2(**{f: getattr(vcft, f) for f in VcfT2._fields}) 178 | self.vcf[vcf_key] = vcft 179 | self.vcfx = TVcfX(**{f: getattr(vcft, f) for f in TVcfX._fields}) 180 | url = api_rootURL + 'vcf/' + '/'.join(urllib.parse.quote(v) for v in list(vcf_key)) + '/contextuals' 181 | conas_rsp = json.loads(requests.get(url).text) 182 | if 'data' in conas_rsp: 183 | conas = conas_rsp['data']['spdis'] 184 | for a in conas: 185 | cona_spdi = ':'.join([str(a[i]) for i in spdi_fields]) 186 | self.spdi[cona_spdi] = 1 187 | self.__init_from_spdi(cona_spdi) 188 | 189 | 190 | 191 | def _spdi2hgvs(self, spdi): 192 | url = api_rootURL + 'spdi/' + urllib.parse.quote(spdi) + '/hgvs' 193 | return json.loads(requests.get(url).text)['data']['hgvs'] 194 | 195 | 196 | 197 | def __str__(self): 198 | return json.dumps(self.rs) 199 | 200 | 201 | 202 | def asSpdiList(self): 203 | return self.spdi.keys() 204 | 205 | 206 | 207 | def asHgvsList(self): 208 | return self.hgvs.keys() 209 | 210 | 211 | 212 | def asRsidList(self): 213 | return self.rsid.keys() 214 | 215 | 216 | 217 | def asJson(self): 218 | if len(self.rsid): 219 | return self.req.text 220 | else: 221 | return None 222 | 223 | 224 | 225 | def asVcfList(self): 226 | return ["\t".join(list(self.vcf[v])) for v in self.vcf] 227 | 228 | 229 | 230 | def __find_ptlp(self, placements): 231 | for p in 
placements: 232 | if p['is_ptlp']: 233 | return p 234 | 235 | return None 236 | -------------------------------------------------------------------------------- /lib/python/rsatt.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | import sys 4 | import os 5 | sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../lib/python')) 6 | from navs import * 7 | #get RS attribute (rsat) from JSON 8 | class rsatt(object): 9 | 10 | acc_chr = { 11 | 'NC_012920': 'MT', 12 | 'NC_000024': 'Y', 13 | 'NC_000023': 'X', 14 | 'NC_000022': '22', 15 | 'NC_000021': '21', 16 | 'NC_000020': '20', 17 | 'NC_000019': '19', 18 | 'NC_000018': '18', 19 | 'NC_000017': '17', 20 | 'NC_000016': '16', 21 | 'NC_000015': '15', 22 | 'NC_000014': '14', 23 | 'NC_000013': '13', 24 | 'NC_000012': '12', 25 | 'NC_000011': '11', 26 | 'NC_000010': '10', 27 | 'NC_000009': '9', 28 | 'NC_000008': '8', 29 | 'NC_000007': '7', 30 | 'NC_000006': '6', 31 | 'NC_000005': '5', 32 | 'NC_000004': '4', 33 | 'NC_000003': '3', 34 | 'NC_000002': '2', 35 | 'NC_000001': '1', 36 | } 37 | 38 | 39 | def pmids(self, rs_obj): 40 | 41 | return(rs_obj['citations']) 42 | 43 | 44 | def ss(self, rs_obj): 45 | 46 | if 'primary_snapshot_data' in rs_obj: 47 | # columns: rs, handle, type, ss_or_RCV 48 | ss_set = [] 49 | for ss in rs_obj['primary_snapshot_data']['support']: 50 | id = ss['id'] 51 | ss_set.append([rs_obj['refsnp_id'], ss['submitter_handle'], id['type'], id['value']]) 52 | 53 | return(ss_set) 54 | 55 | 56 | def gene_allele_annot(self, rs_obj): 57 | 58 | rs = {} 59 | rs['id'] = rs_obj['refsnp_id'] 60 | allele_info = [] 61 | if 'primary_snapshot_data' in rs_obj: 62 | self.genomic_placements(rs_obj) 63 | 64 | rsa =self.refseq_annot(rs_obj) 65 | 66 | for a in rsa['alleles']: 67 | if 'refseq_annot' in a: 68 | rnas = a['refseq_annot']['rnas'] 69 | gene_symbol = a['refseq_annot']['locus'] 70 | gene_name = a['refseq_annot']['name'] 71 | for r in rnas: 72 | if 'codon_aligned_transcript_change' in r: 73 | mrna = r['codon_aligned_transcript_change'] 74 | protein = r['protein']['variant']['spdi'] 75 | allele_info.append([rs['id'], a['allele'], gene_name, 76 | gene_symbol, mrna['seq_id'], 77 | mrna['deleted_sequence'], 78 | str(mrna['position']), 79 | mrna['deleted_sequence'], 80 | protein['seq_id'], 81 | protein['deleted_sequence'], 82 | str(protein['position']), 83 | protein['deleted_sequence']]) 84 | 85 | return(allele_info) 86 | 87 | 88 | def clinical_significance(self, rs_obj): 89 | ''' 90 | rs clinical significance 91 | ''' 92 | allele_annot = [] 93 | primary_refsnp = rs_obj['primary_snapshot_data'] 94 | for annot in primary_refsnp['allele_annotations']: 95 | for clininfo in annot['clinical']: 96 | allele_annot.append(clininfo['clinical_significances']) 97 | 98 | return(allele_annot) 99 | 100 | 101 | def genomic_placements(self, info): 102 | ''' 103 | rs genomic positions 104 | ''' 105 | rs = {} 106 | rs['alleles'] = [] # holder for one or more variant alleles 107 | for alleleinfo in info['primary_snapshot_data']['placements_with_allele']: 108 | # has top level placement (ptlp) and assembly info 109 | if alleleinfo['is_ptlp'] and \ 110 | len(alleleinfo['placement_annot']['seq_id_traits_by_assembly']) > 0: 111 | # get genomic placement and alleles 112 | for a in alleleinfo['alleles']: 113 | spdi = a['allele']['spdi'] 114 | if spdi['inserted_sequence'] == spdi['deleted_sequence']: 115 | rs['alleles'].append({'allele': spdi['deleted_sequence']}) 116 | rs['seq_id'] = 
spdi['seq_id'] 117 | rs['position'] = spdi['position'] 118 | else: 119 | # spdi['inserted_sequence'] != spdi['deleted_sequence']: 120 | rs['alleles'].append({'allele': spdi['inserted_sequence']}) 121 | return rs 122 | 123 | def refseq_annot(self, rsobj): 124 | ''' 125 | rs refseq info 126 | ''' 127 | rs = self.genomic_placements(rsobj) 128 | info = rsobj['primary_snapshot_data']['allele_annotations'] 129 | for idx in range(0, len(rs['alleles'])): 130 | allele_annotation = info[idx]['assembly_annotation'][0] 131 | # get only RefSeq annotation on NC 132 | if (re.match('^NC_', allele_annotation['seq_id'])): 133 | for g in allele_annotation['genes']: 134 | # allele and annotation have same ordering 135 | rs['alleles'][idx]['refseq_annot'] = g 136 | return rs 137 | 138 | 139 | def mafs(self, rs_obj): 140 | 141 | mafs = {} 142 | if 'primary_snapshot_data' in rs_obj: 143 | for allele in rs_obj['primary_snapshot_data']['allele_annotations']: 144 | for freq in allele['frequency']: 145 | if freq['study_name'] not in mafs: 146 | mafs[freq['study_name']] = {} 147 | mafs[freq['study_name']]['an'] = freq['total_count'] 148 | mafs[freq['study_name']]['ac'] = {} 149 | 150 | mafs[freq['study_name']]['ac'][freq['observation']['inserted_sequence']] \ 151 | = freq['allele_count'] 152 | 153 | for study, maf in mafs.items(): 154 | sorted_ac = sorted(maf['ac'].items(), key=lambda kv: kv[1]) 155 | idx = 0 156 | # minor allele is the 2nd least abundant allele 157 | if len(sorted_ac) > 2: 158 | idx = 1 159 | maf['maf_count'] = sorted_ac[idx][1] 160 | maf['maf'] = float(sorted_ac[idx][1])/maf['an'] 161 | maf['maf_allele'] = sorted_ac[idx][0] 162 | maf.pop('ac') 163 | 164 | return(mafs) 165 | 166 | 167 | def gene_consequence(self, rs_obj): 168 | 169 | pass 170 | 171 | 172 | def variation_type(self, rs_obj): 173 | 174 | if 'primary_snapshot_data' in rs_obj: 175 | return(rs_obj['primary_snapshot_data']['variant_type']) 176 | 177 | 178 | 179 | def alleles(self, rs_obj): 180 | alleles = [] 181 | if 'primary_snapshot_data' in rs_obj: 182 | ptlp = self.__find_ptlp(rs_obj['primary_snapshot_data']['placements_with_allele']) 183 | if ptlp is not None: 184 | for allele in ptlp['alleles']: 185 | alleles.append(allele['allele']['spdi']['inserted_sequence']) 186 | 187 | return(alleles) 188 | 189 | 190 | def chr_pos(self, rs_obj): 191 | 192 | pos = {} 193 | if 'primary_snapshot_data' in rs_obj: 194 | ptlp = self.__find_ptlp(rs_obj['primary_snapshot_data']['placements_with_allele']) 195 | if ptlp is not None: 196 | for seq_id_trait in ptlp['placement_annot']['seq_id_traits_by_assembly']: 197 | if seq_id_trait['is_top_level'] and seq_id_trait['is_chromosome']: 198 | pos['assembly'] = seq_id_trait['assembly_name'] 199 | pos['assembly_accession'] = seq_id_trait['assembly_accession'] 200 | pos['pos'] = ptlp['alleles'][0]['allele']['spdi']['position'] 201 | seq_id = ptlp['alleles'][0]['allele']['spdi']['seq_id'] 202 | acc = seq_id.split('.')[0] 203 | pos['chr'] = seq_id 204 | if acc in self.acc_chr: 205 | pos['chr'] = self.acc_chr[acc] 206 | 207 | return(pos) 208 | 209 | 210 | def __find_ptlp(self, placements): 211 | for p in placements: 212 | if p['is_ptlp']: 213 | return p 214 | 215 | return None 216 | -------------------------------------------------------------------------------- /requirement.txt: -------------------------------------------------------------------------------- 1 | requests 2 | -------------------------------------------------------------------------------- /specs/README.md: 
-------------------------------------------------------------------------------- 1 | dbSNP Design and Schema Specifications 2 | ============================ 3 | 4 | The refsnp_specification.yaml on GitHub is deprecated. The latest spec version is here: [https://api.ncbi.nlm.nih.gov/variation/v0/var_service.yaml](https://api.ncbi.nlm.nih.gov/variation/v0/var_service.yaml) 5 | -------------------------------------------------------------------------------- /specs/eSummary.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | Build181212-2039.1 4 | 5 | 268 6 | 7 | 8 | 9 | TWINSUK 10 | G=0.0200/74 11 | 12 | 13 | GnomAD 14 | G=0.0145/450 15 | 16 | 17 | 1000Genomes 18 | G=0.0052/26 19 | 20 | 21 | TOPMED 22 | G=0.0109/1370 23 | 24 | 25 | ExAC 26 | G=0.0134/1622 27 | 28 | 29 | Estonian 30 | G=0.0239/107 31 | 32 | 33 | GnomAD_exomes 34 | G=0.0129/3172 35 | 36 | 37 | ALSPAC 38 | G=0.0187/72 39 | 40 | 41 | 42 | 0 43 | 44 | pathogenic 45 | 46 | 47 | LPL 48 | 4023 49 | 50 | 51 | NC_000008.11 52 | 8 53 | 54 | DEBNICK,PERLEGEN,MGC_GENOME_DIFF,APPLERA_GI,PERLEGEN,AFFY,ILLUMINA,BCMHGSC_JDW,ENSEMBL,ILLUMINA,PAGE_STUDY,BCM-HGSC-SUB,1000GENOMES,1000GENOMES,1000GENOMES,OMICIA,ILLUMINA,OMIM-CURATED-RECORDS,1000GENOMES,NHLBI-ESP,1000GENOMES,EXOME_CHIP,CLINSEQ_SNP,ILLUMINA,ILLUMINA,ILLUMINA,EVA-GONL,1000GENOMES,EVA_GENOME_DK,EVA_FINRISK,EVA_DECODE,EVA_UK10K_ALSPAC,EVA_UK10K_TWINSUK,EVA_EXAC,EVA_MGP,EVA_SVP,ILLUMINA,ILLUMINA,ILLUMINA,ILLUMINA,ILLUMINA,ILLUMINA,JJLAB,HUMAN_LONGEVITY,TOPMED,ILLUMINA,ILLUMINA,GNOMAD,GNOMAD,GNOMAD,AFFY,AFFY,SWEGEN,ILLUMINA,ILLUMINA,CSHL,TOPMED,ILLUMINA,ILLUMINA,ILLUMINA,ILLUMINA,ILLUMINA,ILLUMINA,ILLUMINA,ILLUMINA,ILLUMINA,ILLUMINA,ILLUMINA,ILLUMINA 55 | 56 | coding_sequence_variant,missense_variant 57 | by-frequency,by-cluster 58 | 59 | HGVS=NC_000008.11:g.19956018A>G,NC_000008.10:g.19813529A>G,NG_008855.1:g.21948A>G,NM_000237.2:c.953A>G,NP_000228.1:p.Asn318Ser|SEQ=[A/G]|GENE=LPL:4023 60 | 61 | 9606 62 | 36 63 | 152 64 | 2000/09/19 17:02 65 | 2018/10/12 12:02 66 | 67 | 268,23712753,28505039,48420135,69043148,74819581,74905524,93851529,143320833,173427532,181341869,206439351,217321419,217397623,217418282,244238715,244291490,252841599,334734988,342253784,490960919,491410891,491921986,537115398,780867933,783552867,985272617,1328915147,1582593752,1584057346,1594862271,1620133702,1663127735,1689111573,1711194704,1713021090,1752723237,1917826322,1946231540,1946231541,1959093877,1959093878,2024980549,2301287943,2470945845,2634720445,2711132106,2737022388,2748007727,2864092854,2985433050,2986076176,3002804432,3022826073,3022826074,3348082024,3555882704,3625947308,3630013637,3630013638,3635162175,3638748368,3640869465,3643680166,3644964714,3644964715,3653367027,3653367028,3654194864 68 | 69 | R 70 | snv 71 | 8:19956018 72 | 73 | 0000000268 74 | 1 75 | 76 | 0019956018 77 | 0 78 | 79 | 80 | 328 81 | 82 | 83 | 84 | TWINSUK 85 | G=0.1079/400 86 | 87 | 88 | GnomAD 89 | G=0.0872/2696 90 | 91 | 92 | 1000Genomes 93 | G=0.0925/463 94 | 95 | 96 | TOPMED 97 | G=0.0897/11265 98 | 99 | 100 | ExAC 101 | G=0.0935/11340 102 | 103 | 104 | Estonian 105 | G=0.0670/300 106 | 107 | 108 | GnomAD_exomes 109 | G=0.0919/22602 110 | 111 | 112 | ALSPAC 113 | G=0.1066/411 114 | 115 | 116 | 117 | 0 118 | 119 | likely-benign,benign 120 | 121 | 122 | LPL 123 | 4023 124 | 125 | 126 | NC_000008.11 127 | 8 128 | 129 | 
DEBNICK,WIAF-CSNP,YUSUKE,BCM_SSAHASNP,IMCJ-GDT,PERLEGEN,APPLERA_GI,PERLEGEN,SI_EXO,AFFY,PAGE_STUDY,PAGE_STUDY,PAGE_STUDY,PAGE_STUDY,BUSHMAN,1000GENOMES,1000GENOMES,1000GENOMES,1000GENOMES,1000GENOMES,1000GENOMES,1000GENOMES,1000GENOMES,1000GENOMES,1000GENOMES,OMICIA,ILLUMINA,OMIM-CURATED-RECORDS,BL,GMI,NHLBI-ESP,ILLUMINA,ILLUMINA,ILLUMINA,1000GENOMES,EXOME_CHIP,CLINSEQ_SNP,ILLUMINA,SSMP,ILLUMINA,ILLUMINA,ILLUMINA,ILLUMINA,ILLUMINA,EVA-GONL,JMKIDD_LAB,JMKIDD_LAB,1000GENOMES,HAMMER_LAB,EVA_GENOME_DK,EVA_FINRISK,EVA_DECODE,EVA_UK10K_ALSPAC,EVA_UK10K_TWINSUK,EVA_EXAC,EVA_MGP,ILLUMINA,ILLUMINA,WEILL_CORNELL_DGM,ILLUMINA,ILLUMINA,JJLAB,ILLUMINA,ILLUMINA,USC_VALOUEV,HUMAN_LONGEVITY,TOPMED,ILLUMINA,ILLUMINA,ILLUMINA,GRF,GNOMAD,GNOMAD,GNOMAD,AFFY,AFFY,SWEGEN,ILLUMINA,BIOINF_KMB_FNS_UNIBA,CSHL,TOPMED,ILLUMINA,ILLUMINA,ILLUMINA,ILLUMINA,ILLUMINA,ILLUMINA,ILLUMINA,OMUKHERJEE_ADBS,ILLUMINA,ILLUMINA,ILLUMINA 130 | 131 | coding_sequence_variant,stop_gained 132 | by-frequency,by-cluster 133 | 134 | HGVS=NC_000008.11:g.19962213C>G,NC_000008.10:g.19819724C>G,NG_008855.1:g.28143C>G,NM_000237.2:c.1421C>G,NP_000228.1:p.Ser474Ter|SEQ=[C/G]|GENE=LPL:4023 135 | 136 | 9606 137 | 36 138 | 152 139 | 2000/09/19 17:02 140 | 2018/10/12 12:02 141 | 142 | 329,3173350,4921960,10467174,16343000,24648907,48420139,69043156,71648660,74808885,181341878,181834342,181835906,182258758,198888197,217321427,217397633,217399174,217407233,217418296,217419027,217422429,223585670,234352504,241227319,244238714,244294501,252841585,254171515,279724045,342253806,410878568,484193264,485584695,490960926,491410902,491921994,536381272,655035593,779528090,780867941,782542564,783552875,834998628,985272683,1067495952,1075340057,1328915353,1397520212,1582593788,1584057350,1594862344,1620133815,1663127848,1689111743,1711194717,1752723251,1917826331,1928562440,1946231552,1959093919,2024980591,2094987020,2095209247,2153202052,2301288406,2470946277,2634720469,2634720470,2634720471,2708962560,2737022600,2748007794,2864093419,2985433067,2986076219,3002804512,3022826115,3026281130,3348082059,3555884012,3625947311,3630013668,3630013669,3632621006,3635162189,3640869479,3644964726,3646373026,3653367072,3653367073,3654194881 143 | 144 | S 145 | snv 146 | 8:19962213 147 | 148 | 0000000328 149 | 1 150 | 151 | 0019962213 152 | 0 153 | 154 | 155 | 156 | -------------------------------------------------------------------------------- /tests/unittest/test_navs.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import sys 3 | import os 4 | import json 5 | 6 | sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../lib/python')) 7 | from navs import * 8 | 9 | class NvasTestCase(unittest.TestCase): 10 | 11 | def __init__(self, *args, **kwargs): 12 | super(NvasTestCase, self).__init__(*args, **kwargs) 13 | 14 | 15 | def test_init_from_rsid(self): 16 | 17 | tester = Variation(328) 18 | self.assertTrue(tester, 'initiation from rsid test failed.') 19 | 20 | 21 | def test_init_from_hgvs(self): 22 | tester = Variation('NC_000007.14:g.8644051C>G') 23 | self.assertTrue(tester, 'initiation from HGVS test failed.') 24 | 25 | 26 | def test_init_from_spdi(self): 27 | tester = Variation('NC_000008.10:19813528:1:G') 28 | self.assertTrue(tester, 'initiation from SPDI test failed.') 29 | 30 | 31 | def test_init_from_vcf(self): 32 | tester = Variation('NC_000007.14\t8644051\t.\tC\tG,T\t.\t.\tINFO') 33 | self.assertTrue(tester, 'initiation from VCF test failed.') 34 | 35 | 36 | def 
test_invalid_spdi(self): 37 | 38 | with self.assertRaises(ValueError) as cm: 39 | Variation('NC_000008.11:19956017:1.G') 40 | 41 | 42 | def test_unknown_variation(self): 43 | 44 | # spdi NC_000008.10:19813529:1:G doesn't have a known rs 45 | tester = Variation('NC_000008.10:19813529:1:G') 46 | self.assertFalse(tester.asJson(), 'unknown variation test failed.') 47 | 48 | 49 | def test_json(self): 50 | tester = Variation(328) 51 | parsed = json.loads(tester.asJson()) 52 | expected = '328' 53 | self.assertEqual(parsed['refsnp_id'], expected, 'retrieving json test failed.') 54 | 55 | 56 | def test_hgvs(self): 57 | tester = Variation(328) 58 | parsed = set(tester.asHgvsList()) 59 | expected = set(['NC_000008.11:g.19962213C>G']) 60 | self.assertEqual(parsed, expected, 'retrieving hgvs test failed.') 61 | 62 | 63 | def test_spdi(self): 64 | 65 | tester = Variation(328) 66 | parsed = set(tester.asSpdiList()) 67 | expected = set(['NC_000008.11:19962212:C:G']) 68 | self.assertEqual(parsed, expected, 'retrieving spdi test failed.') 69 | 70 | 71 | def test_vcf(self): 72 | tester = Variation(328) 73 | parsed = set(tester.asVcfList()) 74 | expected = set(['NC_000008.11 19962213 rs328 C G . . .']) 75 | self.assertEqual(parsed, expected, 'retrieving hgvs test failed.') 76 | 77 | 78 | if __name__ == '__main__': 79 | 80 | unittest.main() 81 | 82 | -------------------------------------------------------------------------------- /tests/unittest/test_snpjsonparser.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import json 3 | import sys, os 4 | sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../tutorials')) 5 | from snp2_json import * 6 | sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../tutorials')) 7 | 8 | class SnpJsonParserTestCase(unittest.TestCase): 9 | 10 | def __init__(self, *args, **kwargs): 11 | super(SnpJsonParserTestCase, self).__init__(*args, **kwargs) 12 | self.a_var = Variation(328) 13 | self.rs_obj = json.loads(self.a_var.asJson()) 14 | self.driver = SnpJsonParser() 15 | 16 | def test_get_ss(self): 17 | 18 | ss_set = self.driver.get_ss_info(self.rs_obj) 19 | self.assertTrue(len(ss_set) == 102, 'parse ss record failed.') 20 | 21 | 22 | def test_get_pubmed(self): 23 | pmid = self.driver.get_pubmed(self.rs_obj) 24 | self.assertTrue(len(pmid) == 113, 'parse pubmed id failed.') 25 | 26 | 27 | def test_get_allele_info(self): 28 | allele_info = self.driver.get_allele_info(self.rs_obj) 29 | self.assertTrue(len(allele_info) == 2, 'get allele info test failed.') 30 | 31 | 32 | def test_get_Allele_annotations(self): 33 | allele_annot = self.driver.get_Allele_annotations(self.rs_obj['primary_snapshot_data']) 34 | expected = [['benign'], ['likely-benign']] 35 | 36 | self.assertTrue(allele_annot == expected, 'parse allele info failed.') 37 | 38 | 39 | def test_getPlacements(self): 40 | rs = {} 41 | self.driver.getPlacements(self.rs_obj['primary_snapshot_data']['placements_with_allele'], rs) 42 | self.assertTrue('alleles' in rs, 'getPlacements test failed.') 43 | 44 | # def test_getRefseqAnnot(self): 45 | self.driver.getRefSeqAnnot(self.rs_obj['primary_snapshot_data']['allele_annotations'], rs) 46 | self.assertTrue('alleles' in rs, 'getRefseqAnnot test failed.') 47 | 48 | 49 | def test_get_mafs(self): 50 | mafs = self.driver.get_mafs(self.rs_obj) 51 | self.assertTrue(mafs['1000Genomes']['maf_count'] == 463, 'get mafs failed.') 52 | 53 | 54 | def test_get_variation_type(self): 55 | 
self.assertEqual(self.driver.get_variation_type(self.rs_obj), 'snv', 'get variant type failed.') 56 | 57 | 58 | def test_get_alleles(self): 59 | alleles = self.driver.get_alleles(self.rs_obj) 60 | expected = ['C', 'G'] 61 | self.assertEqual(alleles, expected, 'get alleles failed.') 62 | 63 | 64 | def test_get_pos(self): 65 | pos = self.driver.get_chr_pos(self.rs_obj) 66 | self.assertEqual(int(pos['pos']), 19962212, 'get position failed.') 67 | 68 | 69 | 70 | if __name__ == '__main__': 71 | 72 | unittest.main() 73 | -------------------------------------------------------------------------------- /tutorials/Json_tutorial.txt: -------------------------------------------------------------------------------- 1 | Tutorials for querying and processing dbSNP JSON 2 | 3 | This document shows practical solutions for working with JSON format and retrieving data from the dbSNP2-built JSON files for typical inquiries. 4 | 5 | 6 | Setup 7 | Basic questions and basic setup to start working with JSON file. There are tools that work with general JSON objects in small or medium scale. 8 | 9 | 1. Online tools for working with JSON 10 | http://www.jsoneditoronline.org/ for browsing JSON objects 11 | https://github.com/mattgodbolt/zindex.git for Indexing JSON file. Zindex search is 100X faster for system time and 33X faster for total time) 12 | https://stedolan.github.io/jq/ for query-like output of selected JSON elements 13 | 14 | 2. JSON object 15 | JSON object files can be downloaded through dbSNP FTP site under .redesign: ftp://ftp.ncbi.nlm.nih.gov/snp/.redesign/latest_release/JSON/ 16 | 17 | 3. JSON schema 18 | The schema for the RefSNP JSON can be found at https://api.ncbi.nlm.nih.gov/variation/v0/, under /beta/refsnp/{rsid}, also at https://api.ncbi.nlm.nih.gov/variation/v0/var_service.yaml. 19 | 20 | 4. Browse and navigate JSON file 21 | http://www.jsoneditoronline.org/ provides a free resource for navigating hierarchical JSON objects. Start by copy-pasting rs268 to the editor, and clicking the 'right arrow'. Click the 'left arrow' to get a pretty print of the object. 22 | 23 | 5. Index on JSON file for fast retrieval 24 | Install zindex tool from https://github.com/mattgodbolt/zindex.git 25 | Run 26 | ./zindex --regex '' --numeric 27 | 28 | Examples: 29 | ./zindex refsnp-chr1.json.gz --regex 'refsnp_id":"([0-9]+)"' --numeric 30 | ./zindex refsnp-chr1.genes.json.gz --regex 'genes":\[\{".+?"id":([0-9]+),' --numeric 31 | 32 | 33 | 34 | 35 | Inquiries 36 | In this section, we demonstrate the use of JSON files with a few typical inquiries for the dbSNP data. 37 | 38 | 1. Retrieve full JSON object for a single rs (e.g. rs268) 39 | Use the /beta/refsnp/{rsid} API at https://api.ncbi.nlm.nih.gov/variation/v0/ 40 | Run on zindex-ed : 41 | zq 42 | Example: 43 | zq refsnp-chr1.json.gz 268 44 | 45 | 2. Retrieve all rs reported on a gene (e.g. LPL) 46 | Since LPL is on chromosome 8, we can use zgrep on the chr8 file: 47 | zgrep LPL JSON/refsnp-chr8.json.gz > LPL.json 48 | zq refsnp-chr1.genes.json.gz --raw 'select line from index_default where key = 3586' > /dev/null 49 | 50 | 3. Retrieve descriptions from JSON for an arbitrary set of rs 51 | Create index for rs number on JSON file if needed: 52 | zindex refsnp-chr1.json.gz --regex 'refsnp_id":"([0-9]+)"' –numeric 53 | Get example script rsjson_allele_info_demo.py from here: https://github.com/ncbi/dbsnp/tree/master/tutorials 54 | Run query to retrieve a set of rs (ie. rs171, rs6025, rs6336). The 'rs' prefix is not required. 
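(Aside, not part of the original tutorial: if zindex is not installed, a slower but dependency-free filter gives a similar stream of matching records. This is only a sketch; it assumes one RefSNP JSON object per line, as the indexing regex above implies, and the file name and rs numbers are just the examples used in this section. Note that refsnp_id is stored as a JSON string, so the lookup values are strings without the 'rs' prefix.)

    import gzip
    import json

    wanted = {'171', '6025', '6336'}                   # rs numbers, no 'rs' prefix
    with gzip.open('refsnp-chr1.json.gz', 'rt') as fh:
        for line in fh:                                # one RefSNP JSON object per line
            rs = json.loads(line)
            if rs.get('refsnp_id') in wanted:
                print(json.dumps(rs))                  # same stream zq would pipe to the demo script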
55 | zq refsnp-chr1.json.gz 171 6025 6336|python ./rsjson_allele_info_demo.py 56 | Output 57 | 6025 C coagulation factor V F5 XM_017000660.1 CGA 1753 CGA XP_016856149.1 R 396 R 58 | 6025 C coagulation factor V F5 NM_000130.4 CGA 1744 CGA NP_000121.2 R 533 R 59 | 6025 T coagulation factor V F5 XM_017000660.1 CGA 1753 CAA XP_016856149.1 R 396 Q 60 | 6025 T coagulation factor V F5 NM_000130.4 CGA 1744 CAA NP_000121.2 R 533 Q 61 | 6336 C neurotrophic receptor tyrosine kinase 1 NTRK1 NM_001007792.1 CAT 1781 CAT NP_001007793.1 H 567 H 62 | 6336 C neurotrophic receptor tyrosine kinase 1 NTRK1 NM_002529.3 CAT 1865 CAT NP_002520.2 H 603 H 63 | 6336 C neurotrophic receptor tyrosine kinase 1 NTRK1 NM_001012331.1 CAT 1847 CAT NP_001012331.1 H 597 H 64 | 6336 T neurotrophic receptor tyrosine kinase 1 NTRK1 NM_001007792.1 CAT 1781 TAT NP_001007793.1 H 567 Y 65 | 6336 T neurotrophic receptor tyrosine kinase 1 NTRK1 NM_002529.3 CAT 1865 TAT NP_002520.2 H 603 Y 66 | 6336 T neurotrophic receptor tyrosine kinase 1 NTRK1 NM_001012331.1 CAT 1847 TAT NP_001012331.1 H 597 Y 67 | 68 | 4. Retrieve supporting ss for a selected rs rs268 69 | Smaple query: get rs, handle, type, and supported ss or scv ID. 70 | Create index for rs number on JSON file 71 | zindex refsnp-chr1.json.gz --regex 'refsnp_id":"([0-9]+)"' --numeric 72 | Get example script rsjson_getss_info_demo.py from: https://github.com/ncbi/dbsnp/tree/master/tutorials 73 | zq refsnp-chr1.json.gz 80356763 | python ./rsjson_getss_info_demo.py 74 | Output: 75 | rs ID handle type ss or RCV 76 | 80356763 ILLUMINA subsnp ss161109467 77 | 80356763 GENEREVIEWS subsnp ss184955986 78 | 80356763 OMIM-CURATED-RECORDS subsnp ss262861249 79 | 80356763 ILLUMINA subsnp ss482077077 80 | 80356763 ILLUMINA subsnp ss483025937 81 | 80356763 ILLUMINA subsnp ss485814752 82 | 80356763 ILLUMINA subsnp ss485827447 83 | 80356763 ILLUMINA subsnp ss537652827 84 | 80356763 ILLUMINA subsnp ss778654271 85 | 80356763 ILLUMINA subsnp ss783353220 86 | 80356763 ILLUMINA subsnp ss784304347 87 | 80356763 ILLUMINA subsnp ss832615292 88 | 80356763 ILLUMINA subsnp ss834112140 89 | 80356763 EVA_EXAC subsnp ss1685815952 90 | 80356763 ILLUMINA subsnp ss1751872718 91 | 80356763 clinvar RCV000004574.3 92 | 80356763 clinvar RCV000020155.1 93 | 94 | 5. Retrieve a set of attributes across all rs (entire JSON) 95 | Due to the large size of data, this task is suggested to be done with Hadoop MapReduce. Please see the 96 | Steps to take 97 | Step 1: You will need access to a hadoop cluster. Either request permission to use hadoop cluster with your system admin or set up your own (https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-common/ClusterSetup.html). 98 | Step 2: Install mrjob module (https://pythonhosted.org/mrjob/) 99 | Step 3: If the input files are not in hadoop file system yet, need to transfer: $>hdfs dfs -put path/to/your/input path/to/hadoop 100 | Step 4: Sample code can be downloaded from in the tutorial directory : https://github.com/ncbi/dbsnp/edit/master/tutorials/ 101 | Step 5: Follow the README to check out and install the sample code. 102 | Step 6: To run: json_mr.py -r hadoop hdfs://path/to/input -o hdfs://path/to/output --no-output --jobconf mapreduce.job.name=test.mrjob --jobconf mapreduce.job.reduces=100 103 | 104 | 6. 
Retrieve list of attributes (rs, attribute) from full JSON object 105 | Install jq tool from https://stedolan.github.io/jq/ 106 | Run on zindex-ed file: zq refsnp-chr1.genes.json.gz --raw 'select line from index_default where key = 3586' | ./jq '.refsnp_id + " " + .status' 107 | 108 | 7. Select the first N rs from JSON file 109 | Index file using zindex 110 | Example for N=1000: zq refsnp-chr1.json.gz --raw 'select line from index_default limit 10000' > /dev/null 111 | 112 | 8. Retrieve annotations for a given allele from a selected rs 113 | JSON structure for a set of alleles (list) is separate from JSON structure for the allele annotations (list), and they need to be paired up according to position in the respective lists. We need a tool to accomplish that. 114 | rebuild index if needed as previously described. 115 | ./zindex refsnp-chr1.json.gz --regex 'refsnp_id":"([0-9]+)"' --numeric 116 | Get example script ./rsjson_allele_info_demo.py from here: https://github.com/ncbi/dbsnp/tree/master/tutorials 117 | 118 | Sample query for rs80356763: zq refsnp-chr1.json.gz 80356763 | python ./rsjson_allele_info_demo.py 119 | Output: 120 | 80356763 C glucosidase, beta, acid GBA NM_001171811.1 CGC 749 CGC NP_001165282.1 R 82 R 121 | 80356763 C glucosidase, beta, acid GBA NM_001005742.2 CGC 913 CGC NP_001005742.1 R 169 R 122 | 80356763 C glucosidase, beta, acid GBA NM_001005741.2 CGC 932 CGC NP_001005741.1 R 169 R 123 | 80356763 C glucosidase, beta, acid GBA NM_000157.3 CGC 673 CGC NP_000148.2 R 169 R 124 | 80356763 C glucosidase, beta, acid GBA NM_001171812.1 CGC 523 CGC NP_001165283.1 R 120 R 125 | 80356763 A glucosidase, beta, acid GBA NM_001171811.1 CGC 749 CTC NP_001165282.1 R 82 L 126 | 80356763 A glucosidase, beta, acid GBA NM_001005742.2 CGC 913 CTC NP_001005742.1 R 169 L 127 | 80356763 A glucosidase, beta, acid GBA NM_001005741.2 CGC 932 CTC NP_001005741.1 R 169 L 128 | 80356763 A glucosidase, beta, acid GBA NM_000157.3 CGC 673 CTC NP_000148.2 R 169 L 129 | 80356763 A glucosidase, beta, acid GBA NM_001171812.1 CGC 523 CTC NP_001165283.1 R 120 L 130 | 80356763 G glucosidase, beta, acid GBA NM_001171811.1 CGC 749 CCC NP_001165282.1 R 82 P 131 | 80356763 G glucosidase, beta, acid GBA NM_001005742.2 CGC 913 CCC NP_001005742.1 R 169 P 132 | 80356763 G glucosidase, beta, acid GBA NM_001005741.2 CGC 932 CCC NP_001005741.1 R 169 P 133 | 80356763 G glucosidase, beta, acid GBA NM_000157.3 CGC 673 CCC NP_000148.2 R 169 P 134 | 80356763 G glucosidase, beta, acid GBA NM_001171812.1 CGC 523 CCC NP_001165283.1 R 120 P 135 | 80356763 T glucosidase, beta, acid GBA NM_001171811.1 CGC 749 CAC NP_001165282.1 R 82 H 136 | 80356763 T glucosidase, beta, acid GBA NM_001005742.2 CGC 913 CAC NP_001005742.1 R 169 H 137 | 80356763 T glucosidase, beta, acid GBA NM_001005741.2 CGC 932 CAC NP_001005741.1 R 169 H 138 | 80356763 T glucosidase, beta, acid GBA NM_000157.3 CGC 673 CAC NP_000148.2 R 169 H 139 | 80356763 T glucosidase, beta, acid GBA NM_001171812.1 CGC 523 CAC NP_001165283.1 R 120 H 140 | -------------------------------------------------------------------------------- /tutorials/README.md: -------------------------------------------------------------------------------- 1 | Scripts and tutorials for using dbSNP data 2 | 3 | dbSNP build release JSON files are available on the FTP site ([ftp://ftp.ncbi.nih.gov/snp/latest_release/JSON](ftp://ftp.ncbi.nih.gov/snp/latest_release/JSON)). 4 | ============================ 5 | 6 | ### directory layout 7 | 8 | . 
9 | ├── Variation Services # Tutorial for working with SPDI Variation Service 10 | ├── eUtils.ipynb # Sample dbSNP eUtils query 11 | ├── extract_flank.sh # Script using eUtils to get rs flanking sequences 12 | ├── MafGraph.ipynb # eUtils query and MAF parsing and graphing 13 | ├── hadoop_json_annotation.py # parse dbSNP RS JSON object and extract the rs annotation using Hadoop 14 | ├── hadoop_json_clinical.py # parse dbSNP RS JSON object and extract clinical rs data using Hadoop 15 | ├── hadoop_json_merge.py # parse dbSNP RS JSON object and extract rs merge history using Hadoop 16 | ├── hadoop_json_placement.py # parse dbSNP RS JSON object and extract rs mapping information (ie. position) 17 | ├── refsnp-sample.json.gz # Sample data containing one RefSNP JSON example for rs268 for testing rsjson_demo.py 18 | ├── rsjson_demo.py # Sample Python script to parse RefSNP (rs) JSON object. The script 19 | | produces a tab-delimited output containing the assembly version, sequence ID, 20 | | position, reference allele, variant allele and ClinVar clinical significance, 21 | | if available. NOTE: this script was tested using Python 2.7.12. 22 | ├── rsjson_allele_info_demo.py # Extract allele information position, mrna and protein SPDI reference allele (inserted) and variant (deleted) sequence 23 | ├── rsjson_getss_info_demo.py # Extract submission information (ss, local_snp_id, etc.) 24 | 25 | └── README.md 26 | 27 | ## Run and explore notebook interactively on Binder server. It may take a few minutes for Binder server to start up. 28 | 29 | |Notebook|Description|Binder| 30 | |---|---|---| 31 | |eUtils.ipynb|dbSNP eUtils query|[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/ncbi/dbsnp/master?filepath=%2Ftutorials%2FeUtils.ipynb)| 32 | |MafGraph.ipynb|eUtils query and MAF parsing and graphing|[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/ncbi/dbsnp/master?filepath=%2Ftutorials%2FMafGraph.ipynb)| 33 | -------------------------------------------------------------------------------- /tutorials/Variation Services/Jupyter_Notebook/.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints 2 | -------------------------------------------------------------------------------- /tutorials/Variation Services/Jupyter_Notebook/.library.json: -------------------------------------------------------------------------------- 1 | {"name":"test","id":"test1","created":"1/3/2019 6:55:12 PM +00:00","modified":"1/3/2019 6:55:12 PM +00:00","lastBackedUp":"","accessed":"1/3/2019 7:10:00 PM +00:00","clonedFrom":null,"cloneCount":0,"gitRepositoryUrl":null,"public":"True","starCount":0,"setupSteps":[]} -------------------------------------------------------------------------------- /tutorials/Variation Services/Jupyter_Notebook/Data/test_hgvs.txt: -------------------------------------------------------------------------------- 1 | NC_000008.10:g.19819724C>G 2 | NC_000008.11:g.19962213C>G 3 | NG_008855.1:g.28143C>G 4 | NM_000237.2:c.1421C>G 5 | -------------------------------------------------------------------------------- /tutorials/Variation Services/Jupyter_Notebook/Data/test_rs.txt: -------------------------------------------------------------------------------- 1 | 328 2 | rs775809821 3 | rs1052373574 4 | -------------------------------------------------------------------------------- /tutorials/Variation Services/Jupyter_Notebook/Data/test_vcf.vcf: 
-------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.2 2 | ##fileDate=20180518 3 | ##source=dbSNP 4 | ##dbSNP_BUILD_ID=151 5 | ##reference=GRCh38.p7 6 | ##phasing=partial 7 | ##INFO= 8 | ##INFO= 9 | ##INFO= 10 | ##INFO= 11 | ##INFO= 12 | ##INFO= 13 | ##INFO= 14 | ##INFO= 15 | ##INFO= 16 | ##INFO= 17 | ##INFO= 18 | ##INFO= 19 | ##INFO= 20 | ##INFO= 21 | ##INFO= 22 | ##INFO= 23 | ##INFO= 24 | ##INFO= 25 | ##INFO= 26 | ##INFO= 27 | ##INFO= 28 | ##INFO= 29 | ##INFO== 1% and for which 2 or more founders contribute to that minor allele frequency."> 30 | ##INFO= 31 | ##INFO= 32 | ##INFO= 33 | ##INFO= 34 | ##INFO= 35 | ##INFO= 36 | ##INFO= 37 | ##INFO= 38 | #CHROM POS ID REF ALT QUAL FILTER INFO 39 | NC_000001.11 10019 . TA T . . RS=775809821;dbSNPBuildID=144;SSR=0;PSEUDOGENEINFO=DDX11L1:100287102;VC=INDEL 40 | NC_000001.11 10039 . A C . . RS=978760828;dbSNPBuildID=150;SSR=0;PSEUDOGENEINFO=DDX11L1:100287102;VC=SNV 41 | NC_000001.11 10043 . T A . . RS=1008829651;dbSNPBuildID=150;SSR=0;PSEUDOGENEINFO=DDX11L1:100287102;VC=SNV 42 | NC_000001.11 10051 . A G . . RS=1052373574;dbSNPBuildID=150;SSR=0;PSEUDOGENEINFO=DDX11L1:100287102;VC=SNV 43 | NC_000001.11 10051 . A AC . . RS=1326880612;dbSNPBuildID=151;SSR=0;PSEUDOGENEINFO=DDX11L1:100287102;VC=INDEL 44 | NC_000001.11 10055 . T TA . . RS=768019142;dbSNPBuildID=144;SSR=0;PSEUDOGENEINFO=DDX11L1:100287102;VC=INDEL 45 | NC_000001.11 10055 . T A . . RS=892501864;dbSNPBuildID=150;SSR=0;PSEUDOGENEINFO=DDX11L1:100287102;VC=SNV 46 | NC_000001.11 10063 . A C . . RS=1010989343;dbSNPBuildID=150;SSR=0;PSEUDOGENEINFO=DDX11L1:100287102;VC=SNV 47 | NC_000001.11 10067 . T TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCC . . RS=1489251879;dbSNPBuildID=151;SSR=0;PSEUDOGENEINFO=DDX11L1:100287102;VC=INDEL 48 | NC_000001.11 10077 . C G . . RS=1022805358;dbSNPBuildID=150;SSR=0;PSEUDOGENEINFO=DDX11L1:100287102;VC=SNV 49 | NC_000001.11 10108 . C T . . RS=62651026;dbSNPBuildID=129;SSR=0;PSEUDOGENEINFO=DDX11L1:100287102;VC=SNV 50 | NC_000001.11 10108 . C CT . . RS=1322538365;dbSNPBuildID=151;SSR=0;PSEUDOGENEINFO=DDX11L1:100287102;VC=INS 51 | -------------------------------------------------------------------------------- /tutorials/Variation Services/Jupyter_Notebook/README.md: -------------------------------------------------------------------------------- 1 | # Variation services Notebooks 2 | 3 | You can launch most of the Jupyter Notebooks here in your web browser using 4 | `Binder` (beta) to test and experiment. The experimental `Binder` server may 5 | take a few minutes to launch and can be slow, run out of memory, and may not 6 | always work. 7 | 8 | ## `SPDI` notebooks 9 | 10 | * Launch `spdi_batch.ipynb` notebook on Binder: 11 | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/ncbi/dbsnp/master?filepath=%2Ftutorials%2FVariation%20Services%2FJupyter_Notebook%2Fspdi_batch.ipynb) 12 | 13 | Batch rs ID annotation and conversion HGVS to SPDI, HGVS to RS ID, and retrieve 14 | RS JSON objects using Variation Services. 15 | 16 | * Launch `navs_spdi_demo.ipynb` notebook on Binder: 17 | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/ncbi/dbsnp/master?filepath=tutorials%2FVariation%20Services%2FJupyter_Notebook%2Fnavs_spdi_demo.ipynb) 18 | 19 | Use NCBI API Variation Service (navs) module to convert variant notations (HGVS, 20 | RS, and SPDI), remap variants between sequences, and normalized variants. 
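A minimal sketch of the call pattern the notebook demonstrates (this is not part of the notebook itself; it assumes the repository's `lib/python` directory is on `sys.path` and uses the `Variation` methods defined in `navs.py`, with example values taken from the repository's unit tests):

```python
import sys
sys.path.append('lib/python')            # adjust to your checkout location
from navs import Variation

v = Variation(328)                       # also accepts an HGVS, SPDI, or VCF string
print(list(v.asSpdiList()))              # e.g. ['NC_000008.11:19962212:C:G']
print(list(v.asHgvsList()))              # e.g. ['NC_000008.11:g.19962213C>G']
print(list(v.asVcfList()))               # tab-delimited VCF-style rows
```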
21 | 22 | ## The Allele Frequency Aggregator (ALFA) project's notebooks 23 | 24 | * Launch `by_gene.ipynb` notebook on Binder: 25 | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/ncbi/dbsnp/master?filepath=tutorials%2FVariation%20Services%2FJupyter_Notebook%2Fby_gene.ipynb) 26 | 27 | Retrieve frequency data by gene using eUtils and Variation Service 28 | 29 | * Launch `by_rsid.ipynb` notebook on Binder: 30 | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/ncbi/dbsnp/master?filepath=tutorials%2FVariation%20Services%2FJupyter_Notebook%2Fby_rsid.ipynb) 31 | 32 | Retrieve frequency data by rsid in JSON format 33 | 34 | * Launch `frequencies_for_vcf.ipynb` notebook on Binder: 35 | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/ncbi/dbsnp/master?filepath=tutorials%2FVariation%20Services%2FJupyter_Notebook%2Ffrequencies_for_vcf.ipynb) 36 | 37 | Compare frequencies for two populations for variations from a VCF file 38 | 39 | * Launch `metadata_as_hash.ipynb` notebook on Binder: 40 | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/ncbi/dbsnp/master?filepath=tutorials%2FVariation%20Services%2FJupyter_Notebook%2Fmetadata_as_hash.ipynb) 41 | 42 | Retrieve and transform ALFA project and population meta data 43 | 44 | * Launch `plot.ipynb` notebook on Binder: 45 | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/ncbi/dbsnp/master?filepath=tutorials%2FVariation%20Services%2FJupyter_Notebook%2Fplot.ipynb) 46 | 47 | Plot population minor allele frequencies (MAFs) for variants in a gene location 48 | 49 | * Launch `querying_subsets_ftp.ipynb` notebook on [Google 50 | Colab](https://colab.research.google.com/github/ncbi/dbsnp/blob/master/tutorials/Variation%20Services/Jupyter_Notebook/querying_subsets_ftp.ipynb) 51 | 52 | Retrieve subsets of VCF data from the NCBI FTP site. 53 | 54 | ## Execution environments 55 | 56 | Unless otherwise noted, the notebooks can be executed in `Binder` as mentioned 57 | earlier. But `querying_subsets_ftp.ipynb` needs to be executed in Google Colab 58 | because it needs FTP access and `Binder` has a policy of not allowing FTP 59 | connections: 60 | 61 | 62 | A (free) Google account is required to run a notebook in Google Colab. 63 | -------------------------------------------------------------------------------- /tutorials/Variation Services/Jupyter_Notebook/by_rsid.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Retrieving frequency data by RSID" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "If you have an RefSNP ID, you can get the data from our API very simply. First, you need to install the `requests` package." 
15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "%pip install -q requests\n", 24 | "%pip install -q ratelimit" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "from requests import get, codes as http_code" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": {}, 39 | "source": [ 40 | "Then you contact the \"frequency by rsid\" endpoint [`refsnp//frequency`][freq] (docs at our [Variation Services page][v0]).\n", 41 | "\n", 42 | "[freq]: https://api.ncbi.nlm.nih.gov/variation/v0/refsnp/16/frequency\n", 43 | "[v0]: https://api.ncbi.nlm.nih.gov/variation/v0/])." 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "reply=get(\"https://api.ncbi.nlm.nih.gov/variation/v0/refsnp/{}/frequency\".format(16))\n", 53 | "reply.json()" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "The reply's contents are of type JSON. So, we use the `json` method of `requests` to convert it into a dictionary. You can use that dictionary directly." 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "rj = reply.json()\n", 70 | "rj['results']['1@11563271']['counts']['PRJNA507278']['allele_counts']['SAMN10492695']['C']" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "This means that study `PRJNA507278` (the dbGaP Allele Frequency Aggregation, ALFA, project) counted 160122 `C`s for population `SAMN10492695`, although the result can be different for new ALFA releases. You can translate those into English with the [metadata endpoint][metadata]. The `1@11563271` is an interval of length 1 starting after nucleotide 11563271 if you number nucleotides starting at 0.\n", 78 | "\n", 79 | "[metadata]: https://api.ncbi.nlm.nih.gov/variation/v0/metadata/frequency" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "md_json=get(\"https://api.ncbi.nlm.nih.gov/variation/v0/metadata/frequency\").json()\n", 89 | "md = {}\n", 90 | "for project_json in md_json:\n", 91 | " p = {}\n", 92 | " p['json']=project_json\n", 93 | " p['pops']={}\n", 94 | " md[project_json['bioproject_id']] = p\n", 95 | "\n", 96 | "def add_all_pops(populations, project):\n", 97 | " for p in populations:\n", 98 | " project['pops'][p['biosample_id']] = p\n", 99 | " if 'subs' in p:\n", 100 | " add_all_pops(p['subs'], project)\n", 101 | "\n", 102 | "for prj_id, prj in md.items():\n", 103 | " add_all_pops(prj['json']['populations'], prj)\n", 104 | "\n", 105 | "print(md['PRJNA507278']['json']['short_name'])\n", 106 | "print(md['PRJNA507278']['pops']['SAMN10492695']['name'])" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": {}, 112 | "source": [ 113 | "So, those were the counts for people with European ancestry from the ALFA project." 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": {}, 119 | "source": [ 120 | "However, for programmatic use, we'll want to wrap this in a function because the API is currently limited to one call per second. We can also take care of error conditions." 
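(A short aside, not part of the notebook: `@limits(calls=1, period=1)` from the `ratelimit` package raises a `ratelimit.RateLimitException` when a call arrives before the one-second window has elapsed, rather than waiting. If you would rather have the call block and retry automatically, the same package provides a `sleep_and_retry` decorator; a rough sketch:)

```python
from requests import get
from ratelimit import limits, sleep_and_retry

@sleep_and_retry                  # sleep until the rate window reopens, then retry
@limits(calls=1, period=1)        # at most one call per second
def polite_get(url):
    return get(url)
```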
121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [ 129 | "from requests import get, codes as http_code\n", 130 | "from ratelimit import limits\n", 131 | "from typing import Any\n", 132 | "\n", 133 | "@limits(calls=1, period=1) # Only one call per second\n", 134 | "def get_frequency_for(rs_id: str) -> Any:\n", 135 | " \"\"\"\n", 136 | " Retrieve frequency data by rsid in JSON format\n", 137 | " \"\"\"\n", 138 | " BYRSID_URL = (\"https://api.ncbi.nlm.nih.gov/variation/v0/\"\n", 139 | " \"refsnp/{}/frequency\".format(rs_id))\n", 140 | "\n", 141 | " reply = get(BYRSID_URL)\n", 142 | " if reply.status_code != http_code.ok:\n", 143 | " raise Exception(\"Request failed: {}\\n{}\".format(\n", 144 | " reply.status_code, BYRSID_URL))\n", 145 | "\n", 146 | " content_type = reply.headers['content-type']\n", 147 | " if content_type != 'application/json':\n", 148 | " raise Exception(\"Unexpected content type: {}\\n{}\".format(\n", 149 | " content_type, BYRSID_URL))\n", 150 | "\n", 151 | " return reply.json()" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": {}, 157 | "source": [ 158 | "The reply we obtained is represented with a Python dictionary whose element with key `results` contains the frequency data. That data inside that element is also a Python dictionary. Its elements consist of intervals, each of which is keyed by a combination of its `length` and its `start` position. \n", 159 | "\n", 160 | "The elements of each interval are keyed by the id of the study that the frequency data comes from. Inside each study element, the data consists of the reference allele of the interval (`ref` element) and its frequency counts (`counts` key)." 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "metadata": {}, 167 | "outputs": [], 168 | "source": [ 169 | "def print_study_counts(study_id: str, study_counts: Any) -> None:\n", 170 | " \"\"\"\n", 171 | " Print counts per study\n", 172 | "\n", 173 | " At present, we only offer counts per allele,\n", 174 | " not yet per genotype\n", 175 | " \"\"\"\n", 176 | " print(\"\\tAllele counts for study: {}\".format(study_id))\n", 177 | " allele_counts = study_counts[\"allele_counts\"]\n", 178 | "\n", 179 | " for pop_id, pop_counts in allele_counts.items():\n", 180 | " print(\"\\t\\tAllele counts for population {}\".format(pop_id))\n", 181 | " for allele, count in pop_counts.items():\n", 182 | " print(\"\\t\\t\\tAllele: {}. Count: {}\".format(\n", 183 | " allele, count))" 184 | ] 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "metadata": {}, 189 | "source": [ 190 | "In the example code below, we start by retrieving the frequency data for RSID 16. We then iterate over the intervals, and print for each their `start` and `length` positions and their `ref` alelle. Then we iterate over each study and print its allele counts using the function `print_study_counts` above. Inside that function we can see that the allele counts are broken down first by population and then by allele." 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": null, 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "frequency_data = get_frequency_for(rs_id=16)\n", 200 | "for interval, freq_by_pop in frequency_data[\"results\"].items():\n", 201 | " # Each key describes an interval\n", 202 | " # in @ format\n", 203 | " length, start = interval.split(\"@\")\n", 204 | " print(\"Start: {}. Length: {}. Ref. 
Allele: {}\".format(\n", 205 | " start, length, freq_by_pop[\"ref\"]))\n", 206 | " counts_per_study = freq_by_pop[\"counts\"]\n", 207 | "\n", 208 | " # Print counts per study\n", 209 | " for study_id, study_counts in counts_per_study.items():\n", 210 | " print_study_counts(study_id, study_counts)" 211 | ] 212 | } 213 | ], 214 | "metadata": { 215 | "colab": { 216 | "collapsed_sections": [], 217 | "name": "by_rsid.ipynb", 218 | "provenance": [], 219 | "version": "0.3.2" 220 | }, 221 | "kernelspec": { 222 | "display_name": "Python 3", 223 | "language": "python", 224 | "name": "python3" 225 | }, 226 | "language_info": { 227 | "codemirror_mode": { 228 | "name": "ipython", 229 | "version": 3 230 | }, 231 | "file_extension": ".py", 232 | "mimetype": "text/x-python", 233 | "name": "python", 234 | "nbconvert_exporter": "python", 235 | "pygments_lexer": "ipython3", 236 | "version": "3.6.2" 237 | } 238 | }, 239 | "nbformat": 4, 240 | "nbformat_minor": 1 241 | } 242 | -------------------------------------------------------------------------------- /tutorials/Variation Services/spdi_batch.py: -------------------------------------------------------------------------------- 1 | #!/opt/python-3.4/bin/python 2 | # =========================================================================== 3 | # 4 | # PUBLIC DOMAIN NOTICE 5 | # National Center for Biotechnology Information 6 | # 7 | # This software/database is a "United States Government Work" under the 8 | # terms of the United States Copyright Act. It was written as part of 9 | # the author's official duties as a United States Government employee and 10 | # thus cannot be copyrighted. This software/database is freely available 11 | # to the public for use. The National Library of Medicine and the U.S. 12 | # Government have not placed any restriction on its use or reproduction. 13 | # 14 | # Although all reasonable efforts have been taken to ensure the accuracy 15 | # and reliability of the software and data, the NLM and the U.S. 16 | # Government do not and cannot warrant the performance or results that 17 | # may be obtained by using this software or data. The NLM and the U.S. 18 | # Government disclaim all warranties, express or implied, including 19 | # warranties of performance, merchantability or fitness for any particular 20 | # purpose. 21 | # 22 | # Please cite the author in any work or product based on this material. 
23 | # 24 | # =========================================================================== 25 | # Script name: spdi_batch.py 26 | # Description: a demo script to perform batch query to ncbi spdi service using 27 | # VCF, HGVS, or dbSNP rs IDs 28 | # 29 | # Sample use: 30 | # ---Annotate VCF with RS ID and INFO 31 | # python spdi_batch.py -i test_vcf.vcf -t VCF 32 | # 33 | # ---Retrieve RS JSON objects 34 | # python spdi_batch.py -i test_rs.txt -t RS 35 | # 36 | # ---Convert HGVS to SPDI 37 | # python spdi_batch.py -i test_hgvs.txt -t HGVS 38 | # 39 | # ---Convert HGVS to RS 40 | # python spdi_batch.py -i test_hgvs.txt -t HGVS_RS 41 | # 42 | # Author: Lon Phan lonphan@ncbi.nlm.nih.gov 43 | # For help please contact: tkt-varhd@ncbi.nlm.nih.gov 44 | # 45 | # --------------------------------------------------------------------------- 46 | import requests 47 | import json 48 | import argparse 49 | import re 50 | import sys 51 | from itertools import islice, chain 52 | 53 | 54 | parser = argparse.ArgumentParser(description='batch process SPDI requests') 55 | parser.add_argument( 56 | '-i', dest='input_file', required=True, 57 | help='The name of the input file to parse (VCF, HGVS or rs list, etc.)') 58 | parser.add_argument( 59 | '-t', dest='input_format', required=True, 60 | help='The input file format (VCF, HGVS, or RS') 61 | api_rootURL = 'https://api.ncbi.nlm.nih.gov/variation/v0/' 62 | 63 | 64 | def apiRequest(url): 65 | try: 66 | r = requests.get(url) 67 | except requests.exceptions.Timeout: 68 | # Maybe set up for a retry, or continue in a retry loop 69 | print("ERROR: Timeout") 70 | except requests.exceptions.TooManyRedirects: 71 | # Tell the user their URL was bad and try a different one 72 | print("ERROR: bad url =" + url) 73 | except requests.exceptions.RequestException as e: 74 | # catastrophic error. bail. 
75 | print(e) 76 | sys.exit(1) 77 | if (r.status_code == 200): 78 | return r 79 | else: 80 | print("ERROR: status code = " + str(r.status_code)) 81 | return None 82 | 83 | 84 | def batch(iterable, n): 85 | i = iter(iterable) 86 | piece = list(islice(i, n)) 87 | while piece: 88 | yield piece 89 | piece = list(islice(i, n)) 90 | 91 | 92 | def batchRS(infile): 93 | for rs in infile: 94 | rs = re.sub('rs', '', rs.rstrip()) 95 | if rs.isdigit(): 96 | url = api_rootURL + 'beta/refsnp/' + rs 97 | print(url) 98 | req = requests.get(url) 99 | print(req.text) 100 | 101 | 102 | def batchHGVS(infile, handler=0): 103 | for hgvs in infile: 104 | hgvs = hgvs.rstrip() 105 | url = api_rootURL + 'hgvs/' + hgvs + '/contextuals' 106 | req = apiRequest(url) 107 | if req and handler: 108 | handler(hgvs, req) 109 | elif req: 110 | spdi = req2spdi(req) 111 | print(hgvs + "\t" + spdi) 112 | 113 | 114 | def hgvs2rs(hgvs, req): 115 | spdi = req2spdi(req) 116 | rslist = spdi2rs(spdi) 117 | print("\t".join([hgvs, spdi, ",".join(map(str, rslist))])) 118 | 119 | 120 | def batchHGVS2RS(infile): 121 | batchHGVS(infile, hgvs2rs) 122 | 123 | 124 | def spdi2rs(spdi): 125 | url = api_rootURL + 'spdi/' + spdi + '/rsids' 126 | req = apiRequest(url) 127 | if req: 128 | return json.loads(req.text)['data']['rsids'] 129 | else: 130 | return ["no rs found"] 131 | 132 | 133 | def req2spdi(req): 134 | reqjson = json.loads(req.text) 135 | spdiobj = reqjson['data']['spdis'][0] 136 | spdi = ':'.join([ 137 | spdiobj['seq_id'], 138 | str(spdiobj['position']), 139 | spdiobj['deleted_sequence'], 140 | spdiobj['inserted_sequence']]) 141 | return spdi 142 | 143 | 144 | def batchVCF(infile): 145 | vcfbatchsize = 1000 146 | for batchiter in batch(infile, vcfbatchsize): 147 | rowcount = 0 148 | rowdata = '' 149 | for row in batchiter: 150 | if not row.startswith("#"): 151 | rowcount += 1 152 | rowdata += row 153 | if rowcount > 0: 154 | req = requests.post( 155 | api_rootURL + 'vcf/file/set_rsids?assembly=GCF_000001405.25', 156 | data=rowdata) 157 | print(req.text) 158 | 159 | 160 | batchfunctions = { 161 | 'VCF': batchVCF, 162 | 'RS': batchRS, 163 | 'HGVS': batchHGVS, 164 | 'HGVS_RS': batchHGVS2RS} 165 | args = parser.parse_args() 166 | infile = open(args.input_file, "r") 167 | batchfunctions[args.input_format](infile) 168 | -------------------------------------------------------------------------------- /tutorials/Variation Services/test_hgvs.txt: -------------------------------------------------------------------------------- 1 | NC_000008.10:g.19819724C>G 2 | NC_000008.11:g.19962213C>G 3 | NG_008855.1:g.28143C>G 4 | NM_000237.2:c.1421C>G 5 | -------------------------------------------------------------------------------- /tutorials/Variation Services/test_rs.txt: -------------------------------------------------------------------------------- 1 | 328 2 | rs775809821 3 | rs1052373574 4 | -------------------------------------------------------------------------------- /tutorials/Variation Services/test_variation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os, sys 3 | sys.path.append(os.path.join(os.path.dirname(__file__), "../../lib/python")) 4 | 5 | from navs import Variation 6 | 7 | test_cases = [ 8 | 'rs328', 9 | 338, 10 | "NC_000007.14\t8644051\t.\tC\tG,T\t.\t.\tINFO", 11 | "NC_000007.14\t8644051\t.\tC\tT\t.\t.\tINFO", 12 | 'NC_000007.14:g.8644051C>G', 13 | 'NC_000007.14:g.8644052C>G', 14 | 'NC_000008.10:19813528:1:G', 15 | 'NC_000008.10:19813529:1:G', 16 | 
'NC_000008.10:19813529:1:T', 17 | 'NC_000008.11:19956017:1.G', 18 | ] 19 | 20 | 21 | for tc in test_cases: 22 | print() 23 | print('Input: ' + str(tc)) 24 | print('-------------------------------------------') 25 | 26 | v = Variation(tc) 27 | print("RSID:\n" + "\n".join([str(rsid) for rsid in v.asRsidList()])) 28 | print() 29 | rsAsJson = v.asJson() 30 | if rsAsJson: 31 | print(rsAsJson[0:400] + '...') 32 | else: 33 | print('') 34 | print() 35 | #print(v) 36 | print("SPDI:\n" + "\n".join(v.asSpdiList())) 37 | print() 38 | print("HGVS:\n" + "\n".join(v.asHgvsList())) 39 | print() 40 | print("VCF:\n" + "\n".join(v.asVcfList())) 41 | print() 42 | 43 | 44 | -------------------------------------------------------------------------------- /tutorials/Variation Services/test_vcf.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.2 2 | ##fileDate=20180518 3 | ##source=dbSNP 4 | ##dbSNP_BUILD_ID=151 5 | ##reference=GRCh38.p7 6 | ##phasing=partial 7 | ##INFO= 8 | ##INFO= 9 | ##INFO= 10 | ##INFO= 11 | ##INFO= 12 | ##INFO= 13 | ##INFO= 14 | ##INFO= 15 | ##INFO= 16 | ##INFO= 17 | ##INFO= 18 | ##INFO= 19 | ##INFO= 20 | ##INFO= 21 | ##INFO= 22 | ##INFO= 23 | ##INFO= 24 | ##INFO= 25 | ##INFO= 26 | ##INFO= 27 | ##INFO= 28 | ##INFO= 29 | ##INFO== 1% and for which 2 or more founders contribute to that minor allele frequency."> 30 | ##INFO= 31 | ##INFO= 32 | ##INFO= 33 | ##INFO= 34 | ##INFO= 35 | ##INFO= 36 | ##INFO= 37 | ##INFO= 38 | #CHROM POS ID REF ALT QUAL FILTER INFO 39 | NC_000001.11 10019 . TA T . . RS=775809821;dbSNPBuildID=144;SSR=0;PSEUDOGENEINFO=DDX11L1:100287102;VC=INDEL 40 | NC_000001.11 10039 . A C . . RS=978760828;dbSNPBuildID=150;SSR=0;PSEUDOGENEINFO=DDX11L1:100287102;VC=SNV 41 | NC_000001.11 10043 . T A . . RS=1008829651;dbSNPBuildID=150;SSR=0;PSEUDOGENEINFO=DDX11L1:100287102;VC=SNV 42 | NC_000001.11 10051 . A G . . RS=1052373574;dbSNPBuildID=150;SSR=0;PSEUDOGENEINFO=DDX11L1:100287102;VC=SNV 43 | NC_000001.11 10051 . A AC . . RS=1326880612;dbSNPBuildID=151;SSR=0;PSEUDOGENEINFO=DDX11L1:100287102;VC=INDEL 44 | NC_000001.11 10055 . T TA . . RS=768019142;dbSNPBuildID=144;SSR=0;PSEUDOGENEINFO=DDX11L1:100287102;VC=INDEL 45 | NC_000001.11 10055 . T A . . RS=892501864;dbSNPBuildID=150;SSR=0;PSEUDOGENEINFO=DDX11L1:100287102;VC=SNV 46 | NC_000001.11 10063 . A C . . RS=1010989343;dbSNPBuildID=150;SSR=0;PSEUDOGENEINFO=DDX11L1:100287102;VC=SNV 47 | NC_000001.11 10067 . T TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCC . . RS=1489251879;dbSNPBuildID=151;SSR=0;PSEUDOGENEINFO=DDX11L1:100287102;VC=INDEL 48 | NC_000001.11 10077 . C G . . RS=1022805358;dbSNPBuildID=150;SSR=0;PSEUDOGENEINFO=DDX11L1:100287102;VC=SNV 49 | NC_000001.11 10108 . C T . . RS=62651026;dbSNPBuildID=129;SSR=0;PSEUDOGENEINFO=DDX11L1:100287102;VC=SNV 50 | NC_000001.11 10108 . C CT . . RS=1322538365;dbSNPBuildID=151;SSR=0;PSEUDOGENEINFO=DDX11L1:100287102;VC=INS 51 | -------------------------------------------------------------------------------- /tutorials/extract_flank.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -norc 2 | #retrieve RS flanking sequences 3 | #required Entrez EDirect E-utilities to be installed on your computer (https://www.ncbi.nlm.nih.gov/books/NBK179288/) 4 | 5 | flank_length=100 #user specified flank sequence length (ie. 
100bp) 6 | efetch -db snp -id 19,268,12516 -format xml | #retrieve one or more dbSNP rs id comma-delimited 7 | xtract -pattern DocumentSummary -element CHRPOS ACC SNP_ID ALLELE -block GENES/GENE_E -element NAME | #extract xml elements 8 | tr -s ':' '\t' | 9 | while read chr pos accn snp_id allele gene 10 | do 11 | flank_start=$((pos-flank_length)) 12 | flank_end=$((pos+flank_length)) 13 | 14 | lft=$(efetch -db nuccore -format fasta -id "$accn" \ 15 | -seq_start "$flank_start" -seq_stop "$((pos-1))" < /dev/null | 16 | grep -v '>' | tr -d '\n') 17 | 18 | rgt=$(efetch -db nuccore -format fasta -id "$accn" \ 19 | -seq_start "$((pos+1))" -seq_stop "$flank_end" < /dev/null | 20 | grep -v '>' | tr -d '\n') 21 | 22 | echo ">gnl|dbSNP|rs=$snp_id|allele=$allele|gene=$gene|chr=$chr|chr_acc=$accn|flank_start=$flank_start|rs_pos=$pos|flank_end=$flank_end" 23 | echo "$lft" 24 | echo "$allele" 25 | echo "$rgt" 26 | echo "||" #FASTA delimiter 27 | 28 | done 29 | -------------------------------------------------------------------------------- /tutorials/get_rs_flank.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "get_rs_flank.ipynb", 7 | "provenance": [] 8 | }, 9 | "kernelspec": { 10 | "name": "python3", 11 | "display_name": "Python 3" 12 | }, 13 | "language_info": { 14 | "name": "python" 15 | } 16 | }, 17 | "cells": [ 18 | { 19 | "cell_type": "markdown", 20 | "source": [ 21 | "# **Retrieve flanking sequencing and output FASTA format for input RS numbers**" 22 | ], 23 | "metadata": { 24 | "id": "ZWQMMwpIN6mX" 25 | } 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "source": [ 30 | "### Import required Python modules" 31 | ], 32 | "metadata": { 33 | "id": "Jxze83rR4eiq" 34 | } 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 36, 39 | "metadata": { 40 | "id": "2awJRgNz1krO" 41 | }, 42 | "outputs": [], 43 | "source": [ 44 | "\n", 45 | "from urllib.request import urlopen\n", 46 | "import json\n", 47 | "import re\n", 48 | "import requests\n", 49 | "import time" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "source": [ 55 | "### Defined a function named 'get_rs_flank' to retrieve flanks of given rs IDs." 
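(A small worked example of the flank arithmetic the function below performs; it is not part of the notebook, and the position and flank length are taken from the rs328 example call further down.)

```python
pos, flank_len = 19962213, 25              # rs328 on chr8, 25 bp flanks
five_prime  = (pos - flank_len, pos - 1)   # (19962188, 19962212) for an SNV
three_prime = (pos + 1, pos + flank_len)   # (19962214, 19962238) for an SNV
print(five_prime, three_prime)
```

For insertions, deletions, and MNVs the function adjusts these bounds so the two flanks sit on either side of the variant's reference span.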
56 | ], 57 | "metadata": { 58 | "id": "RniYuJ3_4dKn" 59 | } 60 | }, 61 | { 62 | "cell_type": "code", 63 | "source": [ 64 | "def get_rs_flank(rsList, len_of_flank=100): #Get RefSNP(rs) flanks and output FASTA format; \n", 65 | " for id in rsList:\n", 66 | " rsid=str(id)\n", 67 | " summary_url='https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=snp&id='+rsid+'&retmode=json' #eSummary to get RS docsum including chr/pos.\n", 68 | " response = urlopen(summary_url)\n", 69 | " data_json = json.loads(response.read()) #load eSummary results \n", 70 | "\n", 71 | " chr_gi={'1': 568815597, '2': 568815596,'3': 568815595,'4': 568815594,'5': 568815593,'6': 568815592,'7': 568815591,'8': 568815590,\n", 72 | " '9': 568815589,'10': 568815588,'11': 568815587,'12': 568815586,'13': 568815585,'14': 568815584,'15': 568815583,'16': 568815582,\n", 73 | " '17': 568815581,'18': 568815580,'19': 568815579,'20': 568815578,'21': 568815577,'22': 568815576,'X': 568815575,'Y': 568815574, 'MT': 251831106}\n", 74 | "\n", 75 | " snp_class=data_json['result'][rsid]['snp_class'] #get variant type (SNV, DELETIONs, etc.)\n", 76 | " [chr, pos]=data_json['result'][rsid]['chrpos'].split(':') #get chromosome and position\n", 77 | " docsum=data_json['result'][rsid]['docsum']\n", 78 | " allele=re.findall('\\|SEQ=\\[(.*)\\]\\|',docsum)[0] #get alleles\n", 79 | "\n", 80 | " seq=str(chr_gi[chr])\n", 81 | " seq_start=0\n", 82 | " seq_stop=0\n", 83 | "\n", 84 | " if snp_class=='snv': # true SNP\n", 85 | " seq_start_5=int(pos)-len_of_flank\n", 86 | " seq_stop_5=int(pos)-1\n", 87 | " seq_start_3=int(pos)+1\n", 88 | " seq_stop_3=int(pos)+len_of_flank\n", 89 | " elif allele.split('/')[0]=='-': #INSERTIONs\n", 90 | " seq_start_5=int(pos)-len_of_flank+1\n", 91 | " seq_stop_5=int(pos)\n", 92 | " seq_start_3=int(pos)+1\n", 93 | " seq_strop_3=int(pos)+len_of_flank\n", 94 | " else: #DELETIONs or MNVs\n", 95 | " seq_start_5=int(pos)-len_of_flank\n", 96 | " seq_stop_5=int(pos)-1\n", 97 | " seq_start_3=int(pos)+len(allele.split('/')[0])\n", 98 | " seq_stop_3=int(pos)+len(allele.split('/')[0])+len_of_flank\n", 99 | " \n", 100 | " #retrieve 5' flanks from nucleotide database using eFetch\n", 101 | " seq_url='https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id='+seq+'&seq_start='+str(seq_start_5)+'&seq_stop='+str(seq_stop_5)+'&rettype=fasta'\n", 102 | " response = requests.get(seq_url)\n", 103 | " data = response.text\n", 104 | " five_prime_flank=''.join(data.split('\\n')[1:]).strip()\n", 105 | "\n", 106 | " #retrieve 3' flanks from nucleotide database using eFetch\n", 107 | " seq_url='https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id='+seq+'&seq_start='+str(seq_start_3)+'&seq_stop='+str(seq_stop_3)+'&rettype=fasta'\n", 108 | " response = requests.get(seq_url)\n", 109 | " data = response.text\n", 110 | " three_prime_flank=''.join(data.split('\\n')[1:]).strip()\n", 111 | " \n", 112 | " #format and print FASTA results\n", 113 | " fasta_header='>rs'+rsid+'|'+snp_class+'|'+chr+':'+pos+'|'+allele\n", 114 | "\n", 115 | " print(fasta_header)\n", 116 | " print(five_prime_flank)\n", 117 | " print('['+allele+']')\n", 118 | " print(three_prime_flank)\n", 119 | "\n", 120 | " time.sleep(1) # set (1 request/sec) to fix HTTP Error 429: Too Many Requests; for faster requests get eUtils API key https://ncbiinsights.ncbi.nlm.nih.gov/2017/11/02/new-api-keys-for-the-e-utilities/)" 121 | ], 122 | "metadata": { 123 | "id": "fVxC14jp__6y" 124 | }, 125 | "execution_count": 37, 126 | "outputs": [] 127 | }, 128 | { 129 | 
"cell_type": "markdown", 130 | "source": [ 131 | "### An example of calling the function" 132 | ], 133 | "metadata": { 134 | "id": "MQgwkGDH5FGW" 135 | } 136 | }, 137 | { 138 | "cell_type": "code", 139 | "source": [ 140 | "flank_len = 25 #flank length default is 100\n", 141 | "get_rs_flank([328,1639546602], flank_len)" 142 | ], 143 | "metadata": { 144 | "colab": { 145 | "base_uri": "https://localhost:8080/" 146 | }, 147 | "id": "X9kWw5AhocE8", 148 | "outputId": "f7961507-3550-493a-bc0f-8edca28b07a1" 149 | }, 150 | "execution_count": 38, 151 | "outputs": [ 152 | { 153 | "output_type": "stream", 154 | "name": "stdout", 155 | "text": [ 156 | ">rs328|snv|8:19962213|C/A/G\n", 157 | "CATGACAAGTCTCTGAATAAGAAGT\n", 158 | "[C/A/G]\n", 159 | "AGGCTGGTGAGCATTCTGGGCTAAA\n", 160 | ">rs1639546602|delins|1:10130|TAACC/-\n", 161 | "ACCCAACCCTAACCCTAACCCTAAC\n", 162 | "[TAACC/-]\n", 163 | "CCCTAACCCTAACCCCTAACCCTAAC\n" 164 | ] 165 | } 166 | ] 167 | } 168 | ] 169 | } -------------------------------------------------------------------------------- /tutorials/hadoop_json_clinical.py: -------------------------------------------------------------------------------- 1 | 2 | # =========================================================================== 3 | # 4 | # PUBLIC DOMAIN NOTICE 5 | # National Center for Biotechnology Information 6 | # 7 | # This software/database is a "United States Government Work" under the 8 | # terms of the United States Copyright Act. It was written as part of 9 | # the author's official duties as a United States Government employee and 10 | # thus cannot be copyrighted. This software/database is freely available 11 | # to the public for use. The National Library of Medicine and the U.S. 12 | # Government have not placed any restriction on its use or reproduction. 13 | # 14 | # Although all reasonable efforts have been taken to ensure the accuracy 15 | # and reliability of the software and data, the NLM and the U.S. 16 | # Government do not and cannot warrant the performance or results that 17 | # may be obtained by using this software or data. The NLM and the U.S. 18 | # Government disclaim all warranties, express or implied, including 19 | # warranties of performance, merchantability or fitness for any particular 20 | # purpose. 21 | # 22 | # Please cite the author in any work or product based on this material. 23 | # 24 | # =========================================================================== 25 | # Script name: hadoop_json_annotation.py 26 | # Description: a demo script to parse dbSNP RS JSON object and extract clinical 27 | # rs data. The script will produce tab-delimited output containing 28 | # accession_version, allele_id, measure_set_id, organization, accession, 29 | # snp_id, create_date, update_date, last_evaluated_date, review_status, 30 | # disease_names, clinical_significances, disease_ids_organization, 31 | # disease_ids_accession, origins, collection_method, citations, and gene_ids. 
32 | # 33 | # Sample use: 34 | # python hadoop_json_clinical.py 35 | # -r hadoop hdfs:///path/to/input \ 36 | # -o hdfs:///path/to/output \ 37 | # --no-output \ 38 | # --jobconf mapreduce.job.name=test.mrjob \ 39 | # --jobconf mapreduce.job.reduces=100 \ 40 | # 41 | # Author: Qiang Wang wangq2@ncbi.nlm.nih.gov 42 | # For help please contact: tkt-varhd@ncbi.nlm.nih.gov 43 | # 44 | # 45 | # --------------------------------------------------------------------------- 46 | 47 | import json 48 | 49 | from mrjob.job import MRJob 50 | 51 | 52 | class MRJsonProcessor(MRJob): 53 | 54 | def mapper(self, _, line): 55 | 56 | data = json.loads(line) 57 | 58 | snp_id = data["refsnp_id"] 59 | 60 | annotations = data["primary_snapshot_data"]["allele_annotations"] 61 | 62 | for a in annotations: 63 | clinical = a["clinical"] 64 | 65 | for c in clinical: 66 | accession_version = c["accession_version"] 67 | allele_id = str(c["allele_id"]) 68 | measure_set_id = str(c["measure_set_id"]) 69 | variant_identifiers = c["variant_identifiers"] 70 | organization = ";".join( 71 | [vi["organization"] for vi in variant_identifiers]) 72 | accession = ";".join( 73 | [vi["accession"] for vi in variant_identifiers]) 74 | snp_id = c["refsnp_id"] 75 | create_date = c["create_date"] 76 | update_date = c["update_date"] 77 | 78 | if "last_evaluated_date" in c: 79 | last_evaluated_date = c["last_evaluated_date"] 80 | else: 81 | last_evaluated_date = "" 82 | 83 | review_status = c["review_status"] 84 | disease_names = ";".join(c["disease_names"]) 85 | clinical_significances = ";".join(c["clinical_significances"]) 86 | disease_ids = c["disease_ids"] 87 | disease_ids_organization = ";".join( 88 | [di["organization"] for di in disease_ids]) 89 | disease_ids_accession = ";".join( 90 | [di["accession"] for di in disease_ids]) 91 | origins = ";".join(c["origins"]) 92 | collection_method = ";".join(c["collection_method"]) 93 | citations = ";".join([str(i) for i in c["citations"]]) 94 | gene_ids = ";".join(c["gene_ids"]) 95 | values = [accession_version, allele_id, measure_set_id, 96 | organization, accession, str(snp_id), create_date, 97 | update_date, last_evaluated_date, review_status, 98 | disease_names, clinical_significances, 99 | disease_ids_organization, disease_ids_accession, 100 | origins, collection_method, citations, gene_ids] 101 | print("\t".join(values)) 102 | 103 | 104 | if __name__ == '__main__': 105 | MRJsonProcessor.run() 106 | -------------------------------------------------------------------------------- /tutorials/hadoop_json_merge.py: -------------------------------------------------------------------------------- 1 | 2 | # =========================================================================== 3 | # 4 | # PUBLIC DOMAIN NOTICE 5 | # National Center for Biotechnology Information 6 | # 7 | # This software/database is a "United States Government Work" under the 8 | # terms of the United States Copyright Act. It was written as part of 9 | # the author's official duties as a United States Government employee and 10 | # thus cannot be copyrighted. This software/database is freely available 11 | # to the public for use. The National Library of Medicine and the U.S. 12 | # Government have not placed any restriction on its use or reproduction. 13 | # 14 | # Although all reasonable efforts have been taken to ensure the accuracy 15 | # and reliability of the software and data, the NLM and the U.S. 
16 | # Government do not and cannot warrant the performance or results that 17 | # may be obtained by using this software or data. The NLM and the U.S. 18 | # Government disclaim all warranties, express or implied, including 19 | # warranties of performance, merchantability or fitness for any particular 20 | # purpose. 21 | # 22 | # Please cite the author in any work or product based on this material. 23 | # 24 | # =========================================================================== 25 | # Script name: hadoop_json_merge.py 26 | # Description: a demo script to parse dbSNP RS JSON object and obtain the 27 | # records of rs merge history. The script will produce tab-delimited output 28 | # containing current snp_id, merged snp_id, build_id, and merge date. 29 | # 30 | # Sample use: 31 | # python hadoop_json_merge.py \ 32 | # -r hadoop hdfs:///path/to/input \ 33 | # -o hdfs:///path/to/output \ 34 | # --no-output \ 35 | # --jobconf mapreduce.job.name=test.mrjob \ 36 | # --jobconf mapreduce.job.reduces=100 37 | # 38 | # Author: Qiang Wang wangq2@ncbi.nlm.nih.gov 39 | # For help please contact: tkt-varhd@ncbi.nlm.nih.gov 40 | # 41 | # 42 | # --------------------------------------------------------------------------- 43 | 44 | 45 | import json 46 | 47 | from mrjob.job import MRJob 48 | 49 | 50 | class MRJsonProcessor(MRJob): 51 | 52 | def mapper(self, _, line): 53 | 54 | data = json.loads(line) 55 | 56 | snp_id = data["refsnp_id"] 57 | 58 | merges = data["dbsnp1_merges"] 59 | 60 | for m in merges: 61 | values = (snp_id, m["merged_rsid"], m["revision"], m["merge_date"]) 62 | print("\t".join(values)) 63 | 64 | 65 | if __name__ == '__main__': 66 | MRJsonProcessor.run() 67 | -------------------------------------------------------------------------------- /tutorials/hadoop_json_placement.py: -------------------------------------------------------------------------------- 1 | # =========================================================================== 2 | # 3 | # PUBLIC DOMAIN NOTICE 4 | # National Center for Biotechnology Information 5 | # 6 | # This software/database is a "United States Government Work" under the 7 | # terms of the United States Copyright Act. It was written as part of 8 | # the author's official duties as a United States Government employee and 9 | # thus cannot be copyrighted. This software/database is freely available 10 | # to the public for use. The National Library of Medicine and the U.S. 11 | # Government have not placed any restriction on its use or reproduction. 12 | # 13 | # Although all reasonable efforts have been taken to ensure the accuracy 14 | # and reliability of the software and data, the NLM and the U.S. 15 | # Government do not and cannot warrant the performance or results that 16 | # may be obtained by using this software or data. The NLM and the U.S. 17 | # Government disclaim all warranties, express or implied, including 18 | # warranties of performance, merchantability or fitness for any particular 19 | # purpose. 20 | # 21 | # Please cite the author in any work or product based on this material. 22 | # 23 | # =========================================================================== 24 | # Script name: hadoop_json_placement.py 25 | # Description: a demo script to parse dbSNP RS JSON object and extract the 26 | # records of rs placement. The script will produce tab-delimited output 27 | # containing snp_id, seq_id, is_ptlp, is_aln_opposite_orientation, 28 | # is_mismatch, position, deleted_sequence, inserted_sequence, and hgvs. 
29 | # 30 | # Sample use: 31 | # python hadoop_json_placement.py \ 32 | # -r hadoop hdfs:///path/to/input \ 33 | # -o hdfs:///path/to/output \ 34 | # --no-output \ 35 | # --jobconf mapreduce.job.name = test.mrjob \ 36 | # --jobconf mapreduce.job.reduces=100 \ 37 | # 38 | # Author: Qiang Wang wangq2@ncbi.nlm.nih.gov 39 | # For help please contact: tkt-varhd@ncbi.nlm.nih.gov 40 | # 41 | # 42 | # --------------------------------------------------------------------------- 43 | 44 | 45 | import json 46 | 47 | from mrjob.job import MRJob 48 | 49 | 50 | class MRJsonProcessor(MRJob): 51 | 52 | def mapper(self, _, line): 53 | 54 | data = json.loads(line) 55 | 56 | snp_id = data["refsnp_id"] 57 | 58 | placements = data["primary_snapshot_data"]["placements_with_allele"] 59 | 60 | for p in placements: 61 | is_ptlp = str(int(p["is_ptlp"] == "true")) 62 | is_aln_opposite_orientation = str(int( 63 | p["placement_annot"]["is_aln_opposite_orientation"] == "true")) 64 | is_mismatch = str( 65 | int(p["placement_annot"]["is_mismatch"] == "true")) 66 | for a in p["alleles"]: 67 | if "spdi" in a["allele"]: 68 | spdi = a["allele"]["spdi"] 69 | hgvs = a["hgvs"] 70 | values = ( 71 | snp_id, p["seq_id"], is_ptlp, 72 | is_aln_opposite_orientation, is_mismatch, 73 | str(spdi["position"]), 74 | spdi["deleted_sequence"], 75 | spdi["inserted_sequence"], hgvs) 76 | else: 77 | frameshift = a["allele"]["frameshift"] 78 | hgvs = a["hgvs"] 79 | values = ( 80 | snp_id, p["seq_id"], is_ptlp, 81 | is_aln_opposite_orientation, 82 | is_mismatch, 83 | str(frameshift["position"]), "", "", hgvs) 84 | print("\t".join(values)) 85 | 86 | 87 | if __name__ == '__main__': 88 | MRJsonProcessor.run() 89 | -------------------------------------------------------------------------------- /tutorials/refsnp-sample.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ncbi/dbsnp/d141385c10136048195110466612e99f5a97e805/tutorials/refsnp-sample.json.gz -------------------------------------------------------------------------------- /tutorials/rsjson_allele_info_demo.py: -------------------------------------------------------------------------------- 1 | #!/opt/python-3.4/bin/python 2 | # =========================================================================== 3 | # 4 | # PUBLIC DOMAIN NOTICE 5 | # National Center for Biotechnology Information 6 | # 7 | # This software/database is a "United States Government Work" under the 8 | # terms of the United States Copyright Act. It was written as part of 9 | # the author's official duties as a United States Government employee and 10 | # thus cannot be copyrighted. This software/database is freely available 11 | # to the public for use. The National Library of Medicine and the U.S. 12 | # Government have not placed any restriction on its use or reproduction. 13 | # 14 | # Although all reasonable efforts have been taken to ensure the accuracy 15 | # and reliability of the software and data, the NLM and the U.S. 16 | # Government do not and cannot warrant the performance or results that 17 | # may be obtained by using this software or data. The NLM and the U.S. 18 | # Government disclaim all warranties, express or implied, including 19 | # warranties of performance, merchantability or fitness for any particular 20 | # purpose. 21 | # 22 | # Please cite the author in any work or product based on this material. 
23 | # 24 | # =========================================================================== 25 | # Script name: rsjson_allele_info_demo.py 26 | # Description: a demo script to parse dbSNP RS JSON object. The script will 27 | # produce tab-delimited output containing tthe assembly version, sequence ID, 28 | # position, mrna and protein SPDI reference allele (inserted) and variant 29 | # (deleted) sequence, and ClinVar clinical significance if available. 30 | # Author: Lon Phan lonphan@ncbi.nlm.nih.gov 31 | # For help please contact: tkt-varhd@ncbi.nlm.nih.gov 32 | # 33 | # 34 | # --------------------------------------------------------------------------- 35 | 36 | 37 | import sys 38 | import json 39 | import re 40 | 41 | rs = {} 42 | 43 | 44 | def printAllele_annotations(primary_refsnp): 45 | ''' 46 | rs clinical significance 47 | ''' 48 | for annot in primary_refsnp['allele_annotations']: 49 | for clininfo in annot['clinical']: 50 | print(",".join(clininfo['clinical_significances'])) 51 | 52 | 53 | def getPlacements(info): 54 | ''' 55 | rs genomic positions 56 | ''' 57 | rs['alleles'] = [] # holder for one or more variant alleles 58 | for alleleinfo in info: 59 | # has top level placement (ptlp) and assembly info 60 | if alleleinfo['is_ptlp'] and len( 61 | alleleinfo['placement_annot']['seq_id_traits_by_assembly'] 62 | ) > 0: # get genomic placement and alleles 63 | 64 | for a in alleleinfo['alleles']: 65 | spdi = a['allele']['spdi'] 66 | if spdi['inserted_sequence'] == spdi['deleted_sequence']: 67 | rs['alleles'].append({'allele': spdi['deleted_sequence']}) 68 | rs['seq_id'] = spdi['seq_id'] 69 | rs['position'] = spdi['position'] 70 | else: # spdi['inserted_sequence'] != spdi['deleted_sequence']: 71 | rs['alleles'].append({'allele': spdi['inserted_sequence']}) 72 | 73 | 74 | def getRefSeqAnnot(info): 75 | ''' 76 | rs refseq info 77 | ''' 78 | for idx in range(0, len(rs['alleles'])): 79 | allele_annotation = info[idx]['assembly_annotation'][0] 80 | # get only RefSeq annotation on NC 81 | if (re.match('^NC_', allele_annotation['seq_id'])): 82 | for g in allele_annotation['genes']: 83 | # allele and annotation have same ordering 84 | rs['alleles'][idx]['refseq_annot'] = g 85 | 86 | 87 | for line in sys.stdin: 88 | rs_obj = json.loads(line) 89 | rs['id'] = rs_obj['refsnp_id'] 90 | if 'primary_snapshot_data' in rs_obj: 91 | getPlacements( 92 | rs_obj['primary_snapshot_data']['placements_with_allele']) 93 | getRefSeqAnnot( 94 | rs_obj['primary_snapshot_data']['allele_annotations']) 95 | idx = 0 96 | for a in rs['alleles']: 97 | if 'refseq_annot' in a: 98 | rnas = a['refseq_annot']['rnas'] 99 | gene_symbol = a['refseq_annot']['locus'] 100 | gene_name = a['refseq_annot']['name'] 101 | for r in rnas: 102 | if 'transcript_change' in r: 103 | mrna = r['transcript_change'] 104 | protein = r['protein']['variant']['spdi'] 105 | print("\t".join([rs['id'], a['allele'], gene_name, 106 | gene_symbol, mrna['seq_id'], 107 | mrna['deleted_sequence'], 108 | str(mrna['position']), 109 | mrna['deleted_sequence'], 110 | protein['seq_id'], 111 | protein['deleted_sequence'], 112 | str(protein['position']), 113 | protein['deleted_sequence']])) 114 | -------------------------------------------------------------------------------- /tutorials/rsjson_demo.py: -------------------------------------------------------------------------------- 1 | # =========================================================================== 2 | # 3 | # PUBLIC DOMAIN NOTICE 4 | # National Center for Biotechnology Information 5 | # 6 | # This 
software/database is a "United States Government Work" under the 7 | # terms of the United States Copyright Act. It was written as part of 8 | # the author's official duties as a United States Government employee and 9 | # thus cannot be copyrighted. This software/database is freely available 10 | # to the public for use. The National Library of Medicine and the U.S. 11 | # Government have not placed any restriction on its use or reproduction. 12 | # 13 | # Although all reasonable efforts have been taken to ensure the accuracy 14 | # and reliability of the software and data, the NLM and the U.S. 15 | # Government do not and cannot warrant the performance or results that 16 | # may be obtained by using this software or data. The NLM and the U.S. 17 | # Government disclaim all warranties, express or implied, including 18 | # warranties of performance, merchantability or fitness for any particular 19 | # purpose. 20 | # 21 | # Please cite the author in any work or product based on this material. 22 | # 23 | # =========================================================================== 24 | # Script name: rsjson_demo.py 25 | # Description: a demo script to parse dbSNP RS JSON object. The script will 26 | # produce tab-delimited output containing tthe assembly version, sequence ID, 27 | # position, reference allele, variant allele and ClinVar clinical significance 28 | # if available. 29 | # Author: Lon Phan lonphan@ncbi.nlm.nih.gov 30 | # For help please contact: tkt-varhd@ncbi.nlm.nih.gov 31 | # 32 | # 33 | # --------------------------------------------------------------------------- 34 | 35 | 36 | import argparse 37 | import json 38 | import gzip 39 | 40 | 41 | def printAllele_annotations(primary_refsnp): 42 | ''' 43 | rs clinical significance 44 | ''' 45 | for annot in primary_refsnp['allele_annotations']: 46 | for clininfo in annot['clinical']: 47 | print(",".join(clininfo['clinical_significances'])) 48 | 49 | 50 | def printPlacements(info): 51 | ''' 52 | rs genomic positions 53 | ''' 54 | 55 | for alleleinfo in info: 56 | # has top level placement (ptlp) and assembly info 57 | if alleleinfo['is_ptlp'] and \ 58 | len(alleleinfo['placement_annot'] 59 | ['seq_id_traits_by_assembly']) > 0: 60 | assembly_name = (alleleinfo['placement_annot'] 61 | ['seq_id_traits_by_assembly'] 62 | [0]['assembly_name']) 63 | 64 | for a in alleleinfo['alleles']: 65 | spdi = a['allele']['spdi'] 66 | if spdi['inserted_sequence'] != spdi['deleted_sequence']: 67 | (ref, alt, pos, seq_id) = (spdi['deleted_sequence'], 68 | spdi['inserted_sequence'], 69 | spdi['position'], 70 | spdi['seq_id']) 71 | break 72 | print("\t".join([assembly_name, seq_id, str(pos), ref, alt])) 73 | 74 | 75 | parser = argparse.ArgumentParser( 76 | description='Example of parsing JSON RefSNP Data') 77 | parser.add_argument( 78 | '-i', dest='input_fn', required=True, 79 | help='The name of the input file to parse') 80 | 81 | args = parser.parse_args() 82 | 83 | 84 | cnt = 0 85 | with gzip.open(args.input_fn, 'rb') as f_in: 86 | for line in f_in: 87 | rs_obj = json.loads(line.decode('utf-8')) 88 | print(rs_obj['refsnp_id'] + "\t") # rs ID 89 | 90 | if 'primary_snapshot_data' in rs_obj: 91 | printPlacements( 92 | rs_obj['primary_snapshot_data']['placements_with_allele']) 93 | printAllele_annotations(rs_obj['primary_snapshot_data']) 94 | print("\n") 95 | 96 | cnt = cnt + 1 97 | if (cnt > 1000): 98 | break 99 | -------------------------------------------------------------------------------- /tutorials/rsjson_getss_info_demo.py: 
-------------------------------------------------------------------------------- 1 | #!/opt/python-3.4/bin/python 2 | # =========================================================================== 3 | # 4 | # PUBLIC DOMAIN NOTICE 5 | # National Center for Biotechnology Information 6 | # 7 | # This software/database is a "United States Government Work" under the 8 | # terms of the United States Copyright Act. It was written as part of 9 | # the author's official duties as a United States Government employee and 10 | # thus cannot be copyrighted. This software/database is freely available 11 | # to the public for use. The National Library of Medicine and the U.S. 12 | # Government have not placed any restriction on its use or reproduction. 13 | # 14 | # Although all reasonable efforts have been taken to ensure the accuracy 15 | # and reliability of the software and data, the NLM and the U.S. 16 | # Government do not and cannot warrant the performance or results that 17 | # may be obtained by using this software or data. The NLM and the U.S. 18 | # Government disclaim all warranties, express or implied, including 19 | # warranties of performance, merchantability or fitness for any particular 20 | # purpose. 21 | # 22 | # Please cite the author in any work or product based on this material. 23 | # 24 | # =========================================================================== 25 | # Script name: rsjson_getss_info_demo.py 26 | # Description: a demo script to parse dbSNP RS JSON object. The script will 27 | # produce tab-delimited output containing rs number, handle (if available), 28 | # type (subsnp|clinvar), and id (dbSNP ss|clinvar rcv) 29 | # Author: Lon Phan lonphan@ncbi.nlm.nih.gov 30 | # For help please contact: tkt-varhd@ncbi.nlm.nih.gov 31 | # 32 | # 33 | # --------------------------------------------------------------------------- 34 | 35 | 36 | import sys 37 | import json 38 | 39 | 40 | def getSsInfo(rs, obj): 41 | for ss in obj: 42 | id = ss['id'] 43 | print("\t".join([str(rs), ss.get('submitter_handle', ''), 44 | id['type'], id['value']])) 45 | 46 | 47 | for line in sys.stdin: 48 | rs_obj = json.loads(line) 49 | if 'primary_snapshot_data' in rs_obj: 50 | print("\t".join(["rs", "handle", "type", "ss or RCV"])) 51 | getSsInfo(rs_obj['refsnp_id'], 52 | rs_obj['primary_snapshot_data']['support']) 53 | --------------------------------------------------------------------------------
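rsjson_allele_info_demo.py and rsjson_getss_info_demo.py read newline-delimited RS JSON from stdin, while rsjson_demo.py opens a gzipped file directly. As a minimal usage sketch against the bundled refsnp-sample.json.gz (the file name, output handling, and use of .get() for optional fields are illustrative assumptions, not part of the original scripts), the same support-record extraction can also be driven from a short standalone Python script:

```python
# Minimal sketch: stream the gzipped sample file and print one tab-delimited
# row per supporting submission, mirroring rsjson_getss_info_demo.py.
import gzip
import json

with gzip.open("refsnp-sample.json.gz", "rt") as f_in:
    for line in f_in:
        rs_obj = json.loads(line)
        if 'primary_snapshot_data' not in rs_obj:
            continue  # skip records without a current snapshot
        for ss in rs_obj['primary_snapshot_data']['support']:
            ss_id = ss['id']
            print("\t".join([str(rs_obj['refsnp_id']),
                             ss.get('submitter_handle', ''),  # not every support record carries a handle
                             ss_id['type'], ss_id['value']]))
```

Equivalently, the decompressed stream can be piped into the demo script itself, e.g. `zcat refsnp-sample.json.gz | python rsjson_getss_info_demo.py > ss_support.tsv`.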