├── vicckb
│   ├── __init__.py
│   ├── definitions.py
│   ├── tests
│   │   ├── test_harmonizers.py
│   │   └── test_model.py
│   ├── harmonizers.py
│   └── model.py
├── .gitignore
├── README.md
├── requirements.txt
├── LICENSE
└── supporting_scripts
    └── VICCdisease_graphs_by_database.R

/vicckb/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
out/
data/
.cache/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# vicckb
Python module for analyzing the VICC meta-Knowledgebase
--------------------------------------------------------------------------------
/vicckb/definitions.py:
--------------------------------------------------------------------------------
from pathlib import Path

PROJECT_ROOT = Path(__file__).resolve().parent
DATA_ROOT = PROJECT_ROOT / 'data'
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
pytest
pyupset==0.1.1.post7
pandas==0.25
matplotlib==2.0.2
scipy==1.4.1
jupyter
requests==2.22
networkx==1.11
obonet==0.2.2
seaborn==0.9.0
--------------------------------------------------------------------------------
/vicckb/tests/test_harmonizers.py:
--------------------------------------------------------------------------------
import pytest
from vicckb.definitions import DATA_ROOT
from vicckb.harmonizers import DiseaseHarmonizer


@pytest.fixture(scope="module")
def adh():
    """Aliased Disease Harmonizer"""
    return DiseaseHarmonizer(map_file=(DATA_ROOT / 'disease_alias.tsv'),
                             disease_ontology='DOID')


class TestDiseaseHarmonizer(object):

    def test_adh_init(self, adh):
        assert adh

    def test_adh_aliases(self, adh):
        assert 325 == len(adh._map)

    def test_search(self, adh):
        result = adh.harmonize('breast cancer')
        expected = {
            'ontology': 'DOID',
            'term': 'breast cancer',
            'id': 'DOID:1612',
            'resultEngine': 'ebi'
        }
        assert expected == result
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2018 Alex H. Wagner, PhD

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/vicckb/harmonizers.py:
--------------------------------------------------------------------------------
import requests
import os
import csv

BIOONTOLOGY_URL = 'http://data.bioontology.org/search'
EBI_URL = 'https://www.ebi.ac.uk/ols/api/search'
ONTOLOGIES = {
    'DOID': {'ebi': 'doid', 'bioontology': 'DOID'}
}


class DiseaseHarmonizer:

    def __init__(self, api_key=None, map_file=None, disease_ontology=None):
        if api_key is None:
            self.api_key = os.environ['BIOONTOLOGY_API_KEY']
        else:
            self.api_key = api_key
        if disease_ontology:
            assert disease_ontology in ONTOLOGIES
        self.disease_ontology = disease_ontology
        self._cache = dict()
        self._init_map(map_file)

    def _init_map(self, filename=None):
        """Subclasses may override this to preload source-specific terms.
        It may also be called with the optional filename to preload from a TSV.
        TSV is expected to be of format: source term, expanded term"""
        self._map = dict()
        if filename is None:
            return
        with open(filename, 'r', newline='') as f:
            reader = csv.reader(f, delimiter="\t")
            for line in reader:
                self._map[line[0].lower()] = line[1]

    def harmonize(self, term):
        assert isinstance(term, str)
        alias = self._map.get(term.lower(), False)
        if alias:
            term = alias
        if term.lower() in self._cache:
            return self._cache[term.lower()]
        result = self.query_ebi(term)
        if result:
            return self._prepare_result(term, result, 'ebi')
        result = self.query_bioontology(term)
        if result:
            return self._prepare_result(term, result, 'bioontology')
        self._cache[term.lower()] = None  # cache misses under the same lowercased key used for lookups
        return None

    def _prepare_result(self, term, result, engine):
        out = {k: v for k, v in result.items() if k in ['ontology', 'term', 'id']}
        out['resultEngine'] = engine
        self._cache[term.lower()] = out
        return out

    def _submit_query(self, url, payload):
        r = requests.get(url, params=payload)
        r.raise_for_status()
        return r.json()

    def query_ebi(self, term):
        payload = {
            'q': term,
            'groupField': 'iri',
            'exact': 'on',
            'start': '0'
        }
        if self.disease_ontology and ONTOLOGIES[self.disease_ontology]['ebi']:
            payload['ontology'] = ONTOLOGIES[self.disease_ontology]['ebi']
        j = self._submit_query(EBI_URL, payload)['response']
        if j['numFound'] == 0:
            return None
        match = j['docs'][0]
        return {
            'id': match['obo_id'],
            'ontology': match['ontology_prefix'],
            'term': match['label']
        }

    def query_bioontology(self, term):
        payload = {'q': term, 'apikey': self.api_key}
        if self.disease_ontology and ONTOLOGIES[self.disease_ontology]['bioontology']:
            payload['ontologies'] = ONTOLOGIES[self.disease_ontology]['bioontology']
        j = self._submit_query(BIOONTOLOGY_URL, payload)
        if j.get('collection', False):
            match = j['collection'][0]
            prefLabel = match['prefLabel']
            ontology, oid = match['@id'].split('/')[-2:]
            if ontology == 'obo':
                ontology = oid.split('_')[0]
            return {
                'ontology': ontology,
                'term': prefLabel,
                'id': oid.replace('_', ':'),
                'matchType': match['matchType']
            }
        return None
--------------------------------------------------------------------------------
/vicckb/tests/test_model.py:
--------------------------------------------------------------------------------
import pytest
from collections import Counter
from vicckb.model import ViccDb, GenomicFeature, Disease
from os import environ
import networkx


CACHE_PRESENT = ViccDb.DEFAULT_CACHE.exists()
PAPER_TESTING = bool(environ.get('VICC_PAPER_TESTING'))
if PAPER_TESTING:
    SOURCES = ['molecularmatch', 'civic', 'pmkb', 'oncokb', 'jax', 'cgi']
else:
    SOURCES = ['molecularmatch', 'civic', 'pmkb', 'brca', 'oncokb', 'jax', 'cgi']


@pytest.fixture(scope="module")
def vdb():
    vdb = ViccDb(load_cache=CACHE_PRESENT)
    return vdb


@pytest.fixture(scope="module", params=SOURCES)
def sourcedb(vdb, request):
    return ViccDb(vdb.by_source(request.param))


@pytest.fixture(scope="module")
def gfa():
    return GenomicFeature(1, 1, 1, 'GRCh37', 'Feature A', 'FAKE1', alt='G')


@pytest.fixture(scope="module")
def gfb():
    return GenomicFeature(1, 1, 1, 'GRCh37', 'Feature A', 'FAKE1', alt='C')


@pytest.fixture(scope="module")
def gfa2():
    return GenomicFeature(1, 1, 1, 'GRCh37', 'Feature A', 'FAKE1', alt='G', sequence_ontology={'soid': 141})


class TestViccDb(object):

    def test_len(self, vdb):
        assert len(vdb) > 5000

    def test_select(self, vdb):
        civicdb = vdb.by_source('civic')
        assert len(civicdb) == len(vdb.select(lambda x: x['source'] == 'civic'))
        assert len(civicdb) > 1000

    def test_iter(self, vdb):
        count = 0
        for _ in vdb:
            count += 1
        assert len(vdb) == count

    def test_subtraction(self, vdb):
        civicdb = vdb.by_source('civic')
        delta = vdb - civicdb
        assert len(delta) == len(vdb) - len(civicdb)
        assert len(delta) > 5000

    def test_search_features(self, vdb):
        hits = vdb.search_by_feature(chromosome=7, start=140453136, end=140453136,
                                     reference_name='GRCh37', name='V600E', gene_symbol='BRAF')
        assert len(hits) >= 500
        associations = [hit['association'] for hit in hits]
        results = ViccDb(associations)
        assert len(results.sources) == 6
        gf = GenomicFeature(chromosome=7, start=140453136, end=140453136,
                            reference_name='GRCh37', name='V600E', gene_symbol='BRAF')
        hits2 = list(vdb.search_by_features([gf]))
        assert len(hits2) == len(hits)

    def test_multisearch_features(self, vdb):
        TEST_SIZE = 500
        unique_features = set()
        x = [x.features for x in vdb]
        for fset in x:
            unique_features.update(fset)
        unique_features = list(unique_features)
        hits = vdb.search_by_features(unique_features[:TEST_SIZE])
        aset = {hit['association'] for hit in hits}
        assert len(aset) > TEST_SIZE
        qset = {hit['query'] for hit in hits}
        assert len(qset) == TEST_SIZE


class TestGenomicFeatures(object):

    def test_len(self, vdb):
        for association in vdb:
            for feature in association.features:
                try:
                    assert len(feature) >= 1
                except ValueError:
                    raise ValueError("Association {} has feature {} with invalid length".format(association, feature))

    def test_equality(self, gfa, gfb):
        assert gfa == gfb
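    # Note: GenomicFeature equality is positional (chromosome, start, end, reference
    # build), while its hash also folds in the alt allele, so two "equal" features
    # can still hash differently; the next test relies on exactly that behavior.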

    def test_hash(self, gfa, gfb, gfa2):
        assert hash(gfa) != hash(gfb)
        assert hash(gfa) == hash(gfa2)


class TestDisease(object):

    def test_ontology(self):
        graph = Disease.DISEASE_ONTOLOGY
        assert networkx.is_directed_acyclic_graph(graph)

    def test_uniqueness(self):
        s = set()
        disease_a = Disease('DOID:1', 'DOID', 'Disease_1')
        disease_b = Disease('DOID:1', 'DOID', 'Disease 1')
        disease_c = Disease('DOID:2', 'DOID', 'Disease_2')
        s.add(disease_a)
        s.add(disease_b)
        s.add(disease_c)
        assert 2 == len(s)


class TestOncokb(object):

    def test_types(self, vdb):
        okb = vdb.by_source('oncokb')
        for x in okb:
            assert 'clinical' in x['raw']  # Only clinical results


class TestSource(object):

    def test_hash(self, sourcedb):
        c = Counter(map(lambda x: hash(x), sourcedb))  # Implicit test that all elements hash
        assert len(c) == sum(c.values())  # Tests that hashes are unique

    def test_genes(self, sourcedb):
        count = 0
        for x in sourcedb:
            if len(x.genes) == 0:
                count += 1
        assert count < 0.01 * len(sourcedb)  # less than 1% of associations lacking genes

    def test_features(self, sourcedb):
        count = 0
        for x in sourcedb:
            if len(x.features) == 0:
                count += 1
        assert count < 0.02 * len(sourcedb)  # less than 2% of associations lacking features

    def test_diseases(self, sourcedb):
        count = 0
        for x in sourcedb:
            if x.disease is None:
                count += 1
        assert count < 5

    def test_evidence_level(self, sourcedb):
        count = 0
        for x in sourcedb:
            if not x.evidence_level:
                count += 1
        assert count == 0

    # def test_drugs(self, sourcedb):
    #     count = 0
    #     for x in sourcedb:
    #         if len(x.drugs) == 0:
    #             count += 1
    #     assert count == 0
--------------------------------------------------------------------------------
/supporting_scripts/VICCdisease_graphs_by_database.R:
--------------------------------------------------------------------------------
###################################################################################################
##################################### Plot disease enrichment #####################################
###################################################################################################

library(ggplot2)
library(plotrix)
library(viridis)
library(reshape2)
library(multtest)
library(stats)

setwd('~/Google Drive/MGI/VICC/Manuscript/Figures/misc_figures/')

###################################################################################################
#### Read in files
###################################################################################################
## Get disease count lists
data <- read.csv(file="~/Google Drive/MGI/VICC/Manuscript/Figures/misc_figures/Data/disease_counts.csv", header=T, stringsAsFactors = F)

###################################################################################################
#### Summarize top-level data for plotting
###################################################################################################
top_data <- aggregate(. ~ TopNode_disease + TopNode_doid, data=data[,3:ncol(data)], FUN=sum)

## Get a list of all of the database names
group_cols <- colnames(top_data)[!(colnames(top_data) %in% c("TopNode_disease","TopNode_doid"))]

## Get total counts for each cancer across databases
top_data$total <- apply(top_data[,colnames(top_data) %in% group_cols], 1, FUN = sum)
top_data$total_perc <- top_data$total/sum(top_data$total)*100

## Create top-level bins
top_data[(top_data$TopNode_disease %in% c("cancer")),c("TopNode_disease","TopNode_doid")] <- c("other cancers","other cancers")
top_data[(top_data$TopNode_disease %in% c("other")),c("TopNode_disease","TopNode_doid")] <- c("other disease","other disease")

## Figure out the max percent a disease is in any dataset
tmp_top <- top_data
tmp_top[,group_cols] <- apply(tmp_top[,group_cols], 2, function(x) x/sum(x)*100)
tmp_top$max <- apply(tmp_top[,group_cols], 1, FUN = max)
top_data <- merge(top_data, tmp_top[,c("TopNode_disease","TopNode_doid","max")])

# ## Get the top 10 cancers (includes top hit for all 6)
# top <- head(top_data[order(-top_data$total),], n = 5)
## Get diseases for plotting that make up more than 5% of the total
top <- top_data[which(top_data$max > 5),]

## Create a dataframe of only the top diseases + a binned "other cancers" category for all other cancers combined
top_data_scaled <- as.data.frame(top_data[order(-top_data$total),])
top_data_scaled[!(top_data_scaled$TopNode_disease %in% c(top$TopNode_disease, "benign neoplasm", "other disease")),c("TopNode_disease","TopNode_doid")] <- c("other cancers","other cancers")
top_data_scaled <- aggregate(. ~ TopNode_disease + TopNode_doid, data=top_data_scaled, FUN=sum)
top_data_scaled$TopNode_disease <- gsub(" ","\n",top_data_scaled$TopNode_disease)

###################################################################################################
#### Create pie charts of data
###################################################################################################
# pie(top_data_scaled[which(top_data_scaled$cgi > 0),"cgi"], labels=top_data_scaled[which(top_data_scaled$cgi > 0),"TopNode_disease"], main="CGI", col=viridis(length(top_data_scaled[,"cgi"]), option="B"))
# pie3D(top_data_scaled[which(top_data_scaled$cgi > 0),"cgi"], labels=top_data_scaled[which(top_data_scaled$cgi > 0),"TopNode_disease"], explode=0.1, main="CGI")

## Create pie charts for each database
create_a_pie <- function(x,y){
  p <- pie(x[which(x[,y] > 0),y], labels=x[which(x[,y] > 0),"TopNode_disease"], main=y, col=viridis(length(x[,y]), option="B"))
  return(p)
}
# png(file = paste(getwd(),"disease_by_database__piechart_cgi.png", sep = "/"), height=1200, width=1150, res=150)
# print(create_a_pie(top_data_scaled,"cgi"))
# dev.off()
create_a_pie(top_data_scaled,"civic")

###################################################################################################
#### Create bar charts of data
###################################################################################################
## Melt the df
top_data_scaled_long <- melt(top_data_scaled, id.vars = c("TopNode_disease", "TopNode_doid"))

## Refactor to desired order
top_data_scaled_long$TopNode_disease <- factor(top_data_scaled_long$TopNode_disease, levels = c((unique(top_data_scaled[order(top_data_scaled$total,decreasing=F),"TopNode_disease"]))), exclude = NULL)

## Plot the disease by database
png(file = paste(getwd(),"disease_by_database.png", sep = "/"), height=1200, width=1150, res=150)
ggplot(top_data_scaled_long[!(top_data_scaled_long$variable %in% c("total","total_perc","max")),], aes(x=TopNode_disease, y=value, fill = variable)) + geom_col(position = 'stack') + xlab("Disease") + ylab("Evidence") + guides(fill=guide_legend(title="Database")) + scale_fill_viridis(discrete = T, direction = -1)
dev.off()


## Spaces are more desirable for the following plots
top_data_scaled_long$TopNode_disease <- gsub("\n"," ",top_data_scaled_long$TopNode_disease)
## Change the disease order
top_data_scaled_long$TopNode_disease <- factor(top_data_scaled_long$TopNode_disease, levels = as.vector(c("other disease", "benign neoplasm",as.character(unique(top_data_scaled_long[!(top_data_scaled_long$TopNode_disease %in% c("benign neoplasm","other disease")),"TopNode_disease"])))), exclude = NULL)
## Create a custom color palette
plma <- rev(viridis::plasma(n=nrow(top)))
new_palette = c("grey87","grey66", plma)

## Plot the databases by disease proportions
png(file = paste(getwd(),"disease_by_database_proportion.png", sep = "/"), height=1200, width=1150, res=150)
ggplot(top_data_scaled_long[!(top_data_scaled_long$variable %in% c("total","total_perc")),], aes(x=variable, y=value, fill = TopNode_disease)) + geom_col(position = 'fill') + xlab("Database") + ylab("Proportion of Interpretations") + guides(fill=guide_legend(title="Disease")) + scale_fill_manual(values=new_palette) + theme_bw()
dev.off()

pdf(file = paste(getwd(),"disease_by_database_proportion.pdf", sep = "/"), height=9, width=8.625)
ggplot(top_data_scaled_long[!(top_data_scaled_long$variable %in% c("total","total_perc")),], aes(x=variable, y=value, fill = TopNode_disease)) + geom_col(position = 'fill') + xlab("Database") + ylab("Proportion of Interpretations") + guides(fill=guide_legend(title="Disease")) + scale_fill_manual(values=new_palette) + theme_bw()
dev.off()

###################################################################################################
#### Look for disease enrichment per database
###################################################################################################
top_data_scaled$TopNode_disease <- gsub("\n"," ",top_data_scaled$TopNode_disease)

## Get a list of cancer types to test
cancer_to_test <- top_data_scaled$TopNode_disease[!(top_data_scaled$TopNode_disease %in% c("other disease","other cancers"))]

## Restrict to desired columns and eliminate generic other disease or cancer rows
disease_vs_dataset <- top_data_scaled[!(top_data_scaled$TopNode_disease %in% c("other disease","other cancers")),c("TopNode_disease",group_cols)]

## Convert to table
rownames(disease_vs_dataset) <- disease_vs_dataset$TopNode_disease
disease_vs_dataset$TopNode_disease <- NULL
chisquared_all <- chisq.test(disease_vs_dataset)

## Restrict to desired columns
all_disease_vs_dataset <- top_data_scaled[,c("TopNode_disease",group_cols)]

## Create a results table
results = matrix(nrow=length(cancer_to_test), ncol=3)
colnames(results) <- c("disease","pvalue", "evidence_count")

## Create contingency tables
for(c in 1:length(cancer_to_test)){
  ctab <- all_disease_vs_dataset
  ctab[!grepl(cancer_to_test[c], ctab$TopNode_disease),"TopNode_disease"] <- paste("not",cancer_to_test[c])
  ctab <- aggregate(. ~ TopNode_disease, data=ctab, FUN=sum)
  rownames(ctab) <- ctab$TopNode_disease
  ctab$TopNode_disease <- NULL
  chisq_results = chisq.test(ctab)
  results[c,"disease"] <- cancer_to_test[c]
  results[c,"pvalue"] <- chisq_results$p.value
  results[c,"evidence_count"] <- sum(ctab[cancer_to_test[c],])
  #write.table(ctab, file=paste0(cancer_to_test[c],"_contingency_table.tsv"), sep="\t", row.names = F, quote = F)
}

#Correct p-values
pvalues=as.numeric(results[,"pvalue"])
pvalues_adj=mt.rawp2adjp(pvalues, proc=c("Bonferroni","BH"))
pvalues_adj_orig_order=pvalues_adj$adjp[order(pvalues_adj$index),]
results=cbind(results, pvalues_adj_orig_order[,2:3])
results <- as.data.frame(results)


## Create a results table
db_results = matrix(nrow=length(group_cols), ncol=3)
colnames(db_results) <- c("database","pvalue","evidence_count")

## Create contingency tables
for(d in 1:length(group_cols)){
  ctab <- all_disease_vs_dataset
  others <- group_cols[!(group_cols %in% c(group_cols[d]))]
  ctab$others <- rowSums(ctab[,others])
  ctab <- ctab[,c("TopNode_disease",group_cols[d],"others")]
  rownames(ctab) <- ctab$TopNode_disease
  ctab$TopNode_disease <- NULL
  chisq_db_results = chisq.test(ctab)
  db_results[d,"database"] <- group_cols[d]
  db_results[d,"pvalue"] <- chisq_db_results$p.value
  db_results[d,"evidence_count"] <- sum(ctab[,group_cols[d]])
  #write.table(ctab, file=paste0(group_cols[d],"_contingency_table.tsv"), sep="\t", row.names = F, quote = F)
}

#Correct p-values
pvalues=as.numeric(db_results[,"pvalue"])
pvalues_adj=mt.rawp2adjp(pvalues, proc=c("Bonferroni","BH"))
pvalues_adj_orig_order=pvalues_adj$adjp[order(pvalues_adj$index),]
db_results=cbind(db_results, pvalues_adj_orig_order[,2:3])
db_results <- as.data.frame(db_results)


## Create a results table
results = matrix(nrow=length(cancer_to_test)*length(group_cols), ncol=5)
colnames(results) <- c("disease", "database", "pvalue", "interpretation_freq", "interpretation_freq_other_databases")
n=0

## Create contingency tables
for(c in 1:length(cancer_to_test)){
  for(d in 1:length(group_cols)){
    n=n+1
    ctab <- all_disease_vs_dataset
    ctab[!grepl(cancer_to_test[c], ctab$TopNode_disease),"TopNode_disease"] <- paste("not",cancer_to_test[c])
    ctab <- aggregate(. ~ TopNode_disease, data=ctab, FUN=sum)
    others <- group_cols[!(group_cols %in% c(group_cols[d]))]
    ctab$others <- rowSums(ctab[,others])
    ctab <- ctab[,c("TopNode_disease",group_cols[d],"others")]
    rownames(ctab) <- ctab$TopNode_disease
    ctab$TopNode_disease <- NULL
    chisq_results = chisq.test(ctab)
    results[n,"disease"] <- cancer_to_test[c]
    results[n,"database"] <- group_cols[d]
    results[n,"pvalue"] <- chisq_results$p.value
    results[n,"interpretation_freq"] <- ctab[cancer_to_test[c],group_cols[d]]/sum(ctab[,group_cols[d]])
    results[n,"interpretation_freq_other_databases"] <- ctab[cancer_to_test[c],"others"]/sum(ctab[,"others"])
    write.table(ctab, file=paste0("contingency_tables/", group_cols[d], "_", gsub(" ", "_", cancer_to_test[c]), "_contingency_table.tsv"), sep="\t", row.names = F, quote = F)
  }
}

#Correct p-values
pvalues=as.numeric(results[,"pvalue"])
pvalues_adj=mt.rawp2adjp(pvalues, proc=c("Bonferroni","BH"))
pvalues_adj_orig_order=pvalues_adj$adjp[order(pvalues_adj$index),]
results=cbind(results, pvalues_adj_orig_order[,2:3])
results <- as.data.frame(results)
results[,3:ncol(results)] <- apply(results[,3:ncol(results)], 2, FUN=as.numeric)
results$representation <- NA
results[which(as.numeric(results$pvalue) > 0.05),"representation"] <- "not significant"
results[which(as.numeric(results$pvalue) < 0.05 & as.numeric(results$interpretation_freq) > as.numeric(results$interpretation_freq_other_databases)),"representation"] <- "overrepresented"
results[which(as.numeric(results$pvalue) < 0.05 & as.numeric(results$interpretation_freq) < as.numeric(results$interpretation_freq_other_databases)),"representation"] <- "underrepresented"

write.table(results, file="database_vs_disease.tsv", quote=FALSE, sep="\t", row.names=FALSE, col.names=TRUE)



library("gplots")
# 1. Convert the data to a table
rownames(all_disease_vs_dataset) <- all_disease_vs_dataset$TopNode_disease
all_disease_vs_dataset$TopNode_disease <- NULL
dt <- as.table(as.matrix(all_disease_vs_dataset))
# 2. Graph
png(file = paste(getwd(),"disease_by_database_balloonplot.png", sep = "/"), height=1200, width=1550, res=150)
balloonplot(t(dt), main = "Diseases by database", xlab ="", ylab="", label = FALSE, show.margins = FALSE)
dev.off()
png(file = paste(getwd(),"disease_by_database_balloonplot_noother.png", sep = "/"), height=1200, width=1550, res=150)
balloonplot(t(dt[cancer_to_test[!(cancer_to_test %in% "benign neoplasm")],]), main = "Diseases by database", xlab ="", ylab="", label = FALSE, label.size=0.5)
dev.off()
--------------------------------------------------------------------------------
/vicckb/model.py:
--------------------------------------------------------------------------------
from .definitions import *
import json
from collections import defaultdict, Counter
import pickle
import re
import pyupset as pyu
import pandas as pd
from math import ceil
import hashlib
from warnings import warn
import obonet
from operator import itemgetter


class Element:

    def __repr__(self):
        return "{}: {}".format(str(type(self)), str(self))

    def __hash__(self):
        return hash(str(self))

    def __eq__(self, other):
        return str(other) == str(self)

    def __lt__(self, other):
        return str(self) < str(other)

    def __gt__(self, other):
        return str(self) > str(other)

    def __str__(self):
        raise NotImplementedError


class Disease(Element):

    def __init__(self, id, source, term):
        self.id = id
        self.source = source
        self.term = term

    @property
    def name(self):
        return self.term

    def __str__(self):
        if self.id and self.source:
            return f'{self.source}: {self.id}'
        else:
            return f'No reference: {self.term}'

    _DISEASE_ONTOLOGY_URL = 'https://raw.githubusercontent.com/DiseaseOntology/HumanDiseaseOntology/master/src/ontology/HumanDO.obo'
    DISEASE_ONTOLOGY = obonet.read_obo(_DISEASE_ONTOLOGY_URL)


class Drug(Element):

    def __init__(self, id, source, term, **kwargs):
        self.id = id
        self.source = source
        self.term = term

    def __str__(self):
        return str(self.term)


class Gene(Element):

    SYMBOL_TABLE = dict()
    SYMBOL_ALIAS_TABLE = defaultdict(list)
    with open(str(DATA_ROOT / 'non_alt_loci_set.json'), 'r') as f:
        d = json.load(f)
    for doc in d['response']['docs']:
        SYMBOL_TABLE[doc['symbol']] = doc
        for alias in doc.get('alias_symbol', []):
            SYMBOL_ALIAS_TABLE[alias].append(doc['symbol'])
        for prev in doc.get('prev_symbol', []):
            SYMBOL_ALIAS_TABLE[prev].append(doc['symbol'])
    SYMBOL_ALIAS_TABLE = dict(SYMBOL_ALIAS_TABLE)

    def __init__(self, gene_symbol):
        self.gene_symbol = gene_symbol
        try:
            doc = Gene.SYMBOL_TABLE[gene_symbol]
        except KeyError:
            aliases = Gene.SYMBOL_ALIAS_TABLE[gene_symbol]
            # if len(aliases) > 1:
            #     raise KeyError("{} is an ambiguous gene symbol.".format(gene_symbol))
            assert len(aliases) <= 1, 'Ambiguous gene symbol {}'.format(gene_symbol)
            doc = Gene.SYMBOL_TABLE[aliases[0]]
        self.entrez_id = doc['entrez_id']
        self._doc = doc

    @property
    def symbol(self):
        return self.gene_symbol

    def __str__(self):
        return str(self.gene_symbol)

    def __bool__(self):
        return bool(self.entrez_id)

    def __hash__(self):
        return int(self.entrez_id)

    def __eq__(self, other):
        return self.entrez_id == other.entrez_id

    def __lt__(self, other):
        return self.entrez_id < other.entrez_id

    def __gt__(self, other):
        return self.entrez_id > other.entrez_id


class GenomicFeature(Element):

    CHROMOSOMES = [str(x) for x in range(1, 23)] + ['X', 'Y', 'MT']
    REFERENCE_BUILDS = ['GRCh37', 'GRCh38']

    def __init__(self, chromosome, start, end, referenceName='GRCh37', name='', geneSymbol='', sequence_ontology={}, alt=None, ref=None, **kwargs):
        chromosome = str(chromosome)
        if chromosome.lower().startswith('chr'):
            chromosome = chromosome[3:]
        if chromosome == '23':
            chromosome = 'X'
        if chromosome == '24':
            chromosome = 'Y'
        assert chromosome in GenomicFeature.CHROMOSOMES
        self.chromosome = chromosome
        self.start = int(start)
        self.end = int(end)
        assert self.start <= self.end
        self.so = sequence_ontology
        self.alt = alt
        self.ref = ref
        self.name = name
        self.gene_symbol = geneSymbol
        assert referenceName in GenomicFeature.REFERENCE_BUILDS
        self.reference_name = referenceName

    def __str__(self):
        return ':'.join([str(getattr(self, x)) for x in ['reference_name', 'chromosome', 'start', 'end', 'name']])

    def __eq__(self, other):
        return all([
            self.chromosome == other.chromosome,
            self.start == other.start,
            self.end == other.end,
            self.reference_name == other.reference_name
        ])

    def __hash__(self):
        return hash(tuple([str(getattr(self, x)) for x in ['reference_name', 'chromosome', 'start', 'end', 'alt']]))

    def issubfeature(self, other):
        return all([
            self.chromosome == other.chromosome,
            self.start >= other.start,
            self.end <= other.end,
            self.reference_name == other.reference_name
        ])

    def issuperfeature(self, other):
        return all([
            self.chromosome == other.chromosome,
            self.start <= other.start,
            self.end >= other.end,
            self.reference_name == other.reference_name
        ])

    def __lt__(self, other):
        if self.reference_name != other.reference_name:
            return self.reference_name < other.reference_name
        elif self.chromosome != other.chromosome:
            c = GenomicFeature.CHROMOSOMES
            return c.index(self.chromosome) < c.index(other.chromosome)
        elif self.start != other.start:
            return self.start < other.start
        elif self.end != other.end:
            return self.end < other.end
        else:
            return False

    def __gt__(self, other):
        return not self < other and self != other

    def __le__(self, other):
        return not self > other

    def __ge__(self, other):
        return not self < other

    def __contains__(self, item):
        return self.issuperfeature(item)

    def __len__(self):
        return self.end - self.start + 1


class Publication(Element):

    pmid_re = re.compile(r'https?://.*pubmed/(\d+)$')

    def __init__(self, publication_string):
        pmid_match = Publication.pmid_re.match(publication_string)
        self.pmid = None
        self.publication_string = publication_string
        if pmid_match:
            self.pmid = int(pmid_match[1])

    def __str__(self):
        if self.pmid:
            return str(self.pmid)
        else:
            return self.publication_string


class ViccAssociation(dict):

    def __str__(self):
        return str(hash(self))

    def __hash__(self):
        return self._stable_hash()

    def _stable_hash(self):
        raise NotImplementedError

    @property
    def publications(self):
        evidence = self['association']['evidence']
        all_pubs = list()
        for e in evidence:
            all_pubs += [Publication(p) for p in e['info']['publications'] if p]
        return all_pubs

    @property
    def evidence_level(self):
        return self['association']['evidence_label']

    @property
    def genes(self):
        if getattr(self, '_genes', None):
            return self._genes
        out = list()
        for g in self['genes']:
            if not g:
                continue
            try:
                out.append(Gene(g))
            except KeyError:
                continue
            except AssertionError:
                warn('Ambiguous gene symbol {} in assertion {}'.format(g, self))
                continue
        self._genes = out
        return out

    @property
    def source(self):
        return self['source']

    @property
    def features(self):
        if getattr(self, '_features', None):
            return self._features
        out = list()
        for f in self['features']:
            try:
                f2 = GenomicFeature(**f)
            except:
                continue
            out.append(f2)
        self._features = sorted(out)
        return sorted(out)

    @property
    def disease(self):
        try:
            return Disease(**self['association']['phenotype']['type'])
        except KeyError:
            return None

    @property
    def drugs(self):
        out = list()
        for d in self['association'].get('environmentalContexts', []):
            try:
                d2 = Drug(**d)
            except:
                continue
            out.append(d2)
        return out

    def __eq__(self, other):
        return hash(self) == hash(other)

    @property
    def description(self):
        return self['association'].get('description', None)


class RawAssociation(ViccAssociation):

    def _stable_hash(self):
        source = self['source']
        if source == 'civic':
            assert len(self['association']['evidence']) == 1  # we currently import 1 evidence per association.
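            # Each branch below builds a source-specific natural key string, which is
            # then SHA-256 hashed to give a stable, cross-run association identifier.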
            k = 'civic:{}'.format(self['association']['evidence'][0]['evidenceType']['id'])
        elif source == 'molecularmatch':
            k = 'mm:{}'.format(self['raw']['hashKey'])
        elif source == 'brca':
            k = 'brca:{}'.format(self['raw']['id'])
        elif source == 'pmkb':
            t = [self['raw']['variant']['id'], self['raw']['tumor']['id']] + [x['id'] for x in self['raw']['tissues']]
            k = 'pmkb:{}'.format('-'.join([str(x) for x in t]))  # There's no interpretation ID, made compound ID from components
        elif source == 'oncokb':
            try:
                pre = self['raw']['clinical']
                t = [pre['cancerType'], pre['drug'], pre['gene'], pre['variant']['name'], pre['level']]
                k = 'oncokb_clinical:{}'.format('-'.join(t))
            except KeyError:
                pre = self['raw']['biological']
                t = [pre['gene'], pre['variant']['name'], pre['oncogenic'], pre['mutationEffectPmids']]
                k = 'oncokb_biological:{}'.format('-'.join(t))
        elif source == 'jax':
            k = 'jax:{}'.format(self['raw']['id'])
        elif source == 'cgi':
            t = [self['raw']['Drug full name'], self['raw']['Primary Tumor type'],
                 self['raw']['Alteration'], self['raw']['Source']] + self['raw']['individual_mutation']
            k = 'cgi:{}'.format('-'.join(t))
        else:
            raise NotImplementedError("No hash routine defined for source '{}'".format(source))
        b = k.encode()
        m = hashlib.sha256()
        m.update(b)
        return int(m.hexdigest(), 16) % 10**16  # Last 16 digits of sha256 hash are sufficient


class ViccDb:

    DEFAULT_CACHE = DATA_ROOT / 'association_cache.pkl'

    def __init__(self, associations=None, load_cache=False,
                 save_cache=False, cache_path=DEFAULT_CACHE,
                 data_dir=(DATA_ROOT / '0.10')
                 ):
        if load_cache and save_cache:
            raise ValueError('Can only load or save cache, not both.')
        if load_cache:
            with open(str(cache_path), 'rb') as f:
                self.associations = pickle.load(f)
        elif associations is not None:
            self.associations = associations
        else:
            self.load_data(data_dir=data_dir)
        self._index_associations()
        if save_cache:
            self.cache_data(cache_path)

    def load_data(self, data_dir):
        resource_paths = list(data_dir.glob('*.json'))
        if resource_paths:
            self._load_local(resource_paths)
        else:
            self._load_s3()

    def _load_local(self, resource_paths):
        self.associations = list()
        for path in resource_paths:
            source = path.parts[-1].split('.')[0]
            with path.open() as json_data:
                for line in json_data:
                    association = RawAssociation(json.loads(line))  # TODO: Move to ViccAssociation after RawAssociation checks pass
                    association['raw'] = association.pop(source)
                    self.associations.append(association)

    def _load_s3(self):
        raise NotImplementedError

    def _index_associations(self):
        features = []
        associations_by_source = defaultdict(list)
        hashed = defaultdict(list)
        for association in self.associations:
            source = association['source']
            associations_by_source[source].append(association)
            h = hash(association)
            hashed[h].append(association)
            for feature in association.features:
                features.append((feature, h))
        self._features = features
        self._features_sorted = False
        self.associations_by_source = dict(associations_by_source)
        self._hashed = hashed
        self._element_by_source = dict()

    @property
    def features(self):
        if self._features_sorted:
            return self._features
        else:
            self._features = sorted(self._features, key=itemgetter(0))
            self._features_sorted = True
            return self._features

    def select(self, filter_function):
        associations = filter(filter_function, self.associations)
        return ViccDb(list(associations))

    def by_source(self, source):
        return ViccDb(self.associations_by_source[source])

    def report_groups(self, superset=None):
        if superset is None:
            total = len(self)
            for group in sorted(self.associations_by_source):
                count = len(self.associations_by_source[group])
                print("{}: {} ({:.1f}% of total)".format(group, count, count / total * 100))
            print("{} total associations".format(total))
        else:
            for group in sorted(self.associations_by_source):
                count = len(self.associations_by_source[group])
                # intended: below will raise error if key doesn't exist in superset, which should be an actual superset of self.
                superset_count = len(superset.associations_by_source[group])
                print("{}: {} ({:.1f}% of superset)".format(group, count, count / superset_count * 100))
            print("Total: {} ({:.1f}% of superset)".format(len(self.associations),
                                                           len(self.associations) / len(superset.associations) * 100))

    def cache_data(self, cache_path=DEFAULT_CACHE):
        with open(str(cache_path), 'wb') as f:
            pickle.dump(self.associations, f)

    def __len__(self):
        return len(self.associations)

    def __iter__(self):
        return iter(self.associations)

    def __contains__(self, item):
        return hash(item) in self._hashed

    def __getitem__(self, item):
        return self.associations[item]

    def __sub__(self, other):
        for h, associations in other._hashed.items():
            error_msg = "Cannot perform set subtraction, association hash not unique."
            assert len(associations) == 1, error_msg
            assert len(self._hashed.get(h, [])) <= 1, error_msg
            # these assertions assume that hash uniquely identifies an association.
            # Currently not true, but should be with harvester changes.
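        # Membership below goes through __contains__, i.e. the stable per-association hash.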
        return ViccDb([x for x in self if x not in other])

    def plot_element_by_source(self, element, filter_func=lambda x: bool(x), min_bound=1, max_bound=1000000000):
        element_by_source = self.get_element_by_source(element)

        df_dict = dict()
        column_name = ['attribute']
        for source in element_by_source:
            filtered_elements = list(filter(filter_func, element_by_source[source]))
            df_dict[source] = pd.DataFrame(filtered_elements, columns=column_name)
        x = pyu.plot(df_dict, unique_keys=column_name, inters_size_bounds=(min_bound, max_bound))
        x['input_data'] = element_by_source
        return x

    def element_by_source_stats(self, element, filter_func=lambda x: bool(x)):
        element_by_source = self.get_element_by_source(element)
        for source, elements in element_by_source.items():
            element_by_source[source] = set(list(filter(filter_func, elements)))
        ubiquitous_elements = set.intersection(*(element_by_source.values()))
        total_elements = set.union(*(element_by_source.values()))
        count = Counter()
        for source in element_by_source:
            count.update(element_by_source[source])
        majority_size = ceil(len(element_by_source) / 2)
        majority_elements = set([element for element in count if count[element] >= majority_size])
        unique_elements = set([element for element in count if count[element] == 1])
        out = {
            'total': total_elements,
            'ubiquitous': ubiquitous_elements,
            'majority': majority_elements,
            'majority_size': majority_size,
            'unique_elements': unique_elements
        }
        a = len(unique_elements)
        b = len(total_elements)
        print("{} / {} ({:.2%}) of {} are represented in only 1 resource."
              .format(a, b, a / b, element))

        a = len(majority_elements)
        print("{} / {} ({:.2%}) of {} are represented in the majority of ({}) resources."
              .format(a, b, a / b, element, majority_size))

        a = len(ubiquitous_elements)
        print("{} / {} ({:.2%}) of {} are represented across all resources."
              .format(a, b, a / b, element))
        return out

    def get_element_by_source(self, element):
        try:
            e = self._element_by_source[element]
        except KeyError:
            element_by_source = defaultdict(set)
            for association in self:
                association_element = None
                association_element = getattr(association, element)
                if hasattr(association_element, '__iter__') and not isinstance(association_element, str):
                    element_by_source[association.source].update(association_element)
                elif association_element is None:
                    continue
                else:
                    element_by_source[association.source].add(association_element)
            self._element_by_source[element] = dict(element_by_source)
            e = self._element_by_source[element]
        return e

    MATCH_RANKING = ['exact', 'positional', 'focal', 'regional']

    def search_by_feature(self, chromosome=None, start=None, end=None, reference_name=None,
                          name=None, alt=None, gene_symbol=None, genomic_feature=None):
        """
        Returns a list of hits, each corresponding to a single query / association match.
        All features of an association matching a query are stored in the matches attribute.
        The best match between a query and association is stored in the best_match attribute.
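
        Example (illustrative; assumes a populated database, e.g. one built from a
        cached association pickle):

            db = ViccDb(load_cache=True)
            hits = db.search_by_feature(chromosome=7, start=140453136, end=140453136,
                                        reference_name='GRCh37', name='V600E', gene_symbol='BRAF')
            best = hits[0]['best_match']  # 'type' is one of 'exact', 'positional', 'focal', 'regional'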
        """
        if not isinstance(genomic_feature, GenomicFeature):
            query = GenomicFeature(chromosome, start, end, reference_name, name, gene_symbol, alt=alt)
        else:
            query = genomic_feature
        return self.search_by_features([query])

    @staticmethod
    def _get_best_match(matches):
        if len(matches) == 1:
            best_match = matches[0]
        else:
            s1 = sorted(matches, key=lambda x: x['p'], reverse=True)
            s2 = sorted(s1, key=lambda x: ViccDb.MATCH_RANKING.index(x['type']))
            best_match = s2[0]
        return best_match

    @staticmethod
    def _get_match_type(query, feature):
        match = {'feature': feature}
        if query == feature:
            if query.alt and feature.alt and query.alt == feature.alt:
                match['type'] = 'exact'
            else:
                match['type'] = 'positional'
            match['p'] = 1
        else:
            if query.issubfeature(feature):
                p = len(query) / len(feature)
            elif query.issuperfeature(feature):
                p = len(feature) / len(query)
            else:
                # When query and feature are only partially overlapping
                if query < feature:
                    num = query.end - feature.start + 1
                else:
                    num = feature.end - query.start + 1
                den = max(len(feature), len(query))
                p = num / den
                assert p < 1, f'{p} should be less than 1 for {query} on {feature}'
            match['p'] = p
            if p >= 0.1:
                match['type'] = 'focal'
            elif p > 0:
                match['type'] = 'regional'
            else:
                raise ValueError(f'Expected an overlap between {query} and {feature}')
        return match

    def search_by_features(self, genomic_features):
        assert isinstance(genomic_features, list)
        db_features_pointer = 0
        query_features_pointer = 0
        match_start = None
        last_query_features_pointer = -1
        c = GenomicFeature.CHROMOSOMES
        query_features = sorted(genomic_features)
        hit_index = dict()
        while query_features_pointer < len(query_features) and db_features_pointer < len(self.features):
            if last_query_features_pointer != query_features_pointer:
                q = query_features[query_features_pointer]
                if match_start is not None:
                    db_features_pointer = match_start
                    match_start = None
                last_query_features_pointer = query_features_pointer
            d, association_hash = self.features[db_features_pointer]
            if q.reference_name != d.reference_name:
                raise NotImplementedError('All records in query and datastore currently must match same reference')
            if c.index(q.chromosome) < c.index(d.chromosome):
                query_features_pointer += 1
                continue
            if c.index(q.chromosome) > c.index(d.chromosome):
                db_features_pointer += 1
                continue
            if q.start > d.end:
                db_features_pointer += 1
                continue
            if q.end < d.start:
                query_features_pointer += 1
                continue
            m = ViccDb._get_match_type(q, d)
            key = (q, association_hash)
            matches = hit_index.get(key, list())
            matches.append(m)
            hit_index[key] = matches
            if match_start is None:
                match_start = db_features_pointer
            db_features_pointer += 1
        hits = list()
        for key, matches in hit_index.items():
            q, association_hash = key
            best_match = ViccDb._get_best_match(matches)
            a = self.get_association_by_hash(association_hash)
            hit = {
                'query': q,
                'association': a,
                'matches': matches,
                'best_match': best_match
            }
            hits.append(hit)
        return hits

    def get_association_by_hash(self, key):
        a = self._hashed[key]
        assert len(a) == 1
        return a[0]

    @property
    def sources(self):
        return self.associations_by_source.keys()
--------------------------------------------------------------------------------
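
Minimal usage sketch, assuming the harvested source JSON files are present under
vicckb/data/0.10/ (or a prebuilt vicckb/data/association_cache.pkl is available):

    from vicckb.model import ViccDb

    db = ViccDb()                  # reads vicckb/data/0.10/*.json; pass load_cache=True to use the pickle instead
    db.report_groups()             # per-source association counts
    civic = db.by_source('civic')  # restrict to a single knowledgebase
    hits = db.search_by_feature(chromosome=7, start=140453136, end=140453136,
                                reference_name='GRCh37', name='V600E', gene_symbol='BRAF')
    for hit in hits[:3]:
        association = hit['association']
        print(association.source, association.evidence_level, association.disease,
              hit['best_match']['type'])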