├── vicckb
│   ├── __init__.py
│   ├── definitions.py
│   ├── tests
│   │   ├── test_harmonizers.py
│   │   └── test_model.py
│   ├── harmonizers.py
│   └── model.py
├── .gitignore
├── README.md
├── requirements.txt
├── LICENSE
└── supporting_scripts
    └── VICCdisease_graphs_by_database.R

/vicckb/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
out/
data/
.cache/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# vicckb
Python module for analyzing the VICC meta-Knowledgebase
--------------------------------------------------------------------------------
/vicckb/definitions.py:
--------------------------------------------------------------------------------
from pathlib import Path

PROJECT_ROOT = Path(__file__).resolve().parent
DATA_ROOT = PROJECT_ROOT / 'data'
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
pytest
pyupset==0.1.1.post7
pandas==0.25
matplotlib==2.0.2
scipy==1.4.1
jupyter
requests==2.22
networkx==1.11
obonet==0.2.2
seaborn==0.9.0
--------------------------------------------------------------------------------
/vicckb/tests/test_harmonizers.py:
--------------------------------------------------------------------------------
import pytest
from vicckb.definitions import DATA_ROOT
from vicckb.harmonizers import DiseaseHarmonizer


@pytest.fixture(scope="module")
def adh():
    """Aliased Disease Harmonizer"""
    return DiseaseHarmonizer(map_file=(DATA_ROOT / 'disease_alias.tsv'),
                             disease_ontology='DOID')


class TestDiseaseHarmonizer(object):

    def test_adh_init(self, adh):
        assert adh

    def test_adh_aliases(self, adh):
        assert 325 == len(adh._map)

    def test_search(self, adh):
        result = adh.harmonize('breast cancer')
        expected = {
            'ontology': 'DOID',
            'term': 'breast cancer',
            'id': 'DOID:1612',
            'resultEngine': 'ebi'
        }
        assert expected == result
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2018 Alex H. Wagner, PhD

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/vicckb/harmonizers.py:
--------------------------------------------------------------------------------
import requests
import os
import csv

BIOONTOLOGY_URL = 'http://data.bioontology.org/search'
EBI_URL = 'https://www.ebi.ac.uk/ols/api/search'
ONTOLOGIES = {
    'DOID': {'ebi': 'doid', 'bioontology': 'DOID'}
}


class DiseaseHarmonizer:

    def __init__(self, api_key=None, map_file=None, disease_ontology=None):
        if api_key is None:
            self.api_key = os.environ['BIOONTOLOGY_API_KEY']
        else:
            self.api_key = api_key
        if disease_ontology:
            assert disease_ontology in ONTOLOGIES
        self.disease_ontology = disease_ontology
        self._cache = dict()
        self._init_map(map_file)

    def _init_map(self, filename=None):
        """Subclasses may override this to preload source-specific terms.
        It may also be called with the optional filename to preload from a TSV.
        TSV is expected to be of format: source term, expanded term"""
        self._map = dict()
        if filename is None:
            return
        with open(filename, 'r', newline='') as f:
            reader = csv.reader(f, delimiter="\t")
            for line in reader:
                self._map[line[0].lower()] = line[1]

    def harmonize(self, term):
        assert isinstance(term, str)
        alias = self._map.get(term.lower(), False)
        if alias:
            term = alias
        if term.lower() in self._cache:
            return self._cache[term.lower()]
        result = self.query_ebi(term)
        if result:
            return self._prepare_result(term, result, 'ebi')
        result = self.query_bioontology(term)
        if result:
            return self._prepare_result(term, result, 'bioontology')
        self._cache[term.lower()] = None  # cache misses under the same lowercased key used for lookups
        return None

    def _prepare_result(self, term, result, engine):
        out = {k: v for k, v in result.items() if k in ['ontology', 'term', 'id']}
        out['resultEngine'] = engine
        self._cache[term.lower()] = out
        return out

    def _submit_query(self, url, payload):
        r = requests.get(url, params=payload)
        r.raise_for_status()
        return r.json()

    def query_ebi(self, term):
        payload = {
            'q': term,
            'groupField': 'iri',
            'exact': 'on',
            'start': '0'
        }
        if self.disease_ontology and ONTOLOGIES[self.disease_ontology]['ebi']:
            payload['ontology'] = ONTOLOGIES[self.disease_ontology]['ebi']
        j = self._submit_query(EBI_URL, payload)['response']
        if j['numFound'] == 0:
            return None
        match = j['docs'][0]
        return {
            'id': match['obo_id'],
            'ontology': match['ontology_prefix'],
            'term': match['label']
        }

    def query_bioontology(self, term):
        payload = {'q': term, 'apikey': self.api_key}
        if self.disease_ontology and ONTOLOGIES[self.disease_ontology]['bioontology']:
            payload['ontologies'] = ONTOLOGIES[self.disease_ontology]['bioontology']
        j = self._submit_query(BIOONTOLOGY_URL, payload)
        if j.get('collection', False):
            match = j['collection'][0]
            prefLabel = match['prefLabel']
            ontology, oid = match['@id'].split('/')[-2:]
            if ontology == 'obo':
                ontology = oid.split('_')[0]
            return {
                'ontology': ontology,
                'term': prefLabel,
                'id': oid.replace('_', ':'),
                'matchType': match['matchType']
            }
        return None
--------------------------------------------------------------------------------
/vicckb/tests/test_model.py:
--------------------------------------------------------------------------------
import pytest
from collections import Counter
from vicckb.model import ViccDb, GenomicFeature, Disease
from os import environ
import networkx


CACHE_PRESENT = ViccDb.DEFAULT_CACHE.exists()
PAPER_TESTING = bool(environ.get('VICC_PAPER_TESTING'))
if PAPER_TESTING:
    SOURCES = ['molecularmatch', 'civic', 'pmkb', 'oncokb', 'jax', 'cgi']
else:
    SOURCES = ['molecularmatch', 'civic', 'pmkb', 'brca', 'oncokb', 'jax', 'cgi']


@pytest.fixture(scope="module")
def vdb():
    vdb = ViccDb(load_cache=CACHE_PRESENT)
    return vdb


@pytest.fixture(scope="module", params=SOURCES)
def sourcedb(vdb, request):
    return ViccDb(vdb.by_source(request.param))


@pytest.fixture(scope="module")
def gfa():
    return GenomicFeature(1, 1, 1, 'GRCh37', 'Feature A', 'FAKE1', alt='G')


@pytest.fixture(scope="module")
def gfb():
    return GenomicFeature(1, 1, 1, 'GRCh37', 'Feature A', 'FAKE1', alt='C')


@pytest.fixture(scope="module")
def gfa2():
    return GenomicFeature(1, 1, 1, 'GRCh37', 'Feature A', 'FAKE1', alt='G', sequence_ontology={'soid': 141})


class TestViccDb(object):

    def test_len(self, vdb):
        assert len(vdb) > 5000

    def test_select(self, vdb):
        civicdb = vdb.by_source('civic')
        assert len(civicdb) == len(vdb.select(lambda x: x['source'] == 'civic'))
        assert len(civicdb) > 1000

    def test_iter(self, vdb):
        count = 0
        for _ in vdb:
            count += 1
        assert len(vdb) == count

    def test_subtraction(self, vdb):
        civicdb = vdb.by_source('civic')
        delta = vdb - civicdb
        assert len(delta) == len(vdb) - len(civicdb)
        assert len(delta) > 5000

    def test_search_features(self, vdb):
        hits = vdb.search_by_feature(chromosome=7, start=140453136, end=140453136,
                                     reference_name='GRCh37', name='V600E', gene_symbol='BRAF')
        assert len(hits) >= 500
        associations = [hit['association'] for hit in hits]
        results = ViccDb(associations)
        assert len(results.sources) == 6
        gf = GenomicFeature(chromosome=7, start=140453136, end=140453136,
                            reference_name='GRCh37', name='V600E', gene_symbol='BRAF')
        hits2 = list(vdb.search_by_features([gf]))
        assert len(hits2) == len(hits)

    def test_multisearch_features(self, vdb):
        TEST_SIZE = 500
        unique_features = set()
        x = [x.features for x in vdb]
        for fset in x:
            unique_features.update(fset)
        unique_features = list(unique_features)
        hits = vdb.search_by_features(unique_features[:TEST_SIZE])
        aset = {hit['association'] for hit in hits}
        assert len(aset) > TEST_SIZE
        qset = {hit['query'] for hit in hits}
        assert len(qset) == TEST_SIZE


class TestGenomicFeatures(object):

    def test_len(self, vdb):
        for association in vdb:
            for feature in association.features:
                try:
                    assert len(feature) >= 1
                except ValueError:
                    raise ValueError("Association {} has feature {} with invalid length".format(association, feature))

    def test_equality(self, gfa, gfb):
        assert gfa == gfb
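    # Note: GenomicFeature equality is positional (chromosome, start, end, reference
    # build), while its hash also folds in the alt allele, so two "equal" features
    # can still hash differently; the next test relies on exactly that behavior.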

    def test_hash(self, gfa, gfb, gfa2):
        assert hash(gfa) != hash(gfb)
        assert hash(gfa) == hash(gfa2)


class TestDisease(object):

    def test_ontology(self):
        graph = Disease.DISEASE_ONTOLOGY
        assert networkx.is_directed_acyclic_graph(graph)

    def test_uniqueness(self):
        s = set()
        disease_a = Disease('DOID:1', 'DOID', 'Disease_1')
        disease_b = Disease('DOID:1', 'DOID', 'Disease 1')
        disease_c = Disease('DOID:2', 'DOID', 'Disease_2')
        s.add(disease_a)
        s.add(disease_b)
        s.add(disease_c)
        assert 2 == len(s)


class TestOncokb(object):

    def test_types(self, vdb):
        okb = vdb.by_source('oncokb')
        for x in okb:
            assert 'clinical' in x['raw']  # Only clinical results


class TestSource(object):

    def test_hash(self, sourcedb):
        c = Counter(map(lambda x: hash(x), sourcedb))  # Implicit test that all elements hash
        assert len(c) == sum(c.values())  # Tests that hashes are unique

    def test_genes(self, sourcedb):
        count = 0
        for x in sourcedb:
            if len(x.genes) == 0:
                count += 1
        assert count < 0.01 * len(sourcedb)  # less than 1% of associations lacking genes

    def test_features(self, sourcedb):
        count = 0
        for x in sourcedb:
            if len(x.features) == 0:
                count += 1
        assert count < 0.02 * len(sourcedb)  # less than 2% of associations lacking features

    def test_diseases(self, sourcedb):
        count = 0
        for x in sourcedb:
            if x.disease is None:
                count += 1
        assert count < 5

    def test_evidence_level(self, sourcedb):
        count = 0
        for x in sourcedb:
            if not x.evidence_level:
                count += 1
        assert count == 0

    # def test_drugs(self, sourcedb):
    #     count = 0
    #     for x in sourcedb:
    #         if len(x.drugs) == 0:
    #             count += 1
    #     assert count == 0
--------------------------------------------------------------------------------
/supporting_scripts/VICCdisease_graphs_by_database.R:
--------------------------------------------------------------------------------
###################################################################################################
##################################### Plot disease enrichment #####################################
###################################################################################################

library(ggplot2)
library(plotrix)
library(viridis)
library(reshape2)
library(multtest)
library(stats)

setwd('~/Google Drive/MGI/VICC/Manuscript/Figures/misc_figures/')

###################################################################################################
#### Read in files
###################################################################################################
## Get disease count lists
data <- read.csv(file="~/Google Drive/MGI/VICC/Manuscript/Figures/misc_figures/Data/disease_counts.csv", header=T, stringsAsFactors = F)

###################################################################################################
#### Summarize top-level data for plotting
###################################################################################################
top_data <- aggregate(. ~ TopNode_disease + TopNode_doid, data=data[,3:ncol(data)], FUN=sum)

## Get a list of all of the database names
group_cols <- colnames(top_data)[!(colnames(top_data) %in% c("TopNode_disease","TopNode_doid"))]

## Get total counts for each cancer across databases
top_data$total <- apply(top_data[,colnames(top_data) %in% group_cols], 1, FUN = sum)
top_data$total_perc <- top_data$total/sum(top_data$total)*100

## Create top-level bins
top_data[(top_data$TopNode_disease %in% c("cancer")),c("TopNode_disease","TopNode_doid")] <- c("other cancers","other cancers")
top_data[(top_data$TopNode_disease %in% c("other")),c("TopNode_disease","TopNode_doid")] <- c("other disease","other disease")

## Figure out the max percent a disease is in any dataset
tmp_top <- top_data
tmp_top[,group_cols] <- apply(tmp_top[,group_cols], 2, function(x) x/sum(x)*100)
tmp_top$max <- apply(tmp_top[,group_cols], 1, FUN = max)
top_data <- merge(top_data, tmp_top[,c("TopNode_disease","TopNode_doid","max")])

# ## Get the top 10 cancers (includes top hit for all 6)
# top <- head(top_data[order(-top_data$total),], n = 5)
## Get diseases for plotting that make up more than 5% of the total
top <- top_data[which(top_data$max > 5),]

## Create a dataframe of only the top diseases + a binned "other cancers" category for all other cancers combined
top_data_scaled <- as.data.frame(top_data[order(-top_data$total),])
top_data_scaled[!(top_data_scaled$TopNode_disease %in% c(top$TopNode_disease, "benign neoplasm", "other disease")),c("TopNode_disease","TopNode_doid")] <- c("other cancers","other cancers")
top_data_scaled <- aggregate(. ~ TopNode_disease + TopNode_doid, data=top_data_scaled, FUN=sum)
top_data_scaled$TopNode_disease <- gsub(" ","\n",top_data_scaled$TopNode_disease)

###################################################################################################
#### Create pie charts of data
###################################################################################################
# pie(top_data_scaled[which(top_data_scaled$cgi > 0),"cgi"], labels=top_data_scaled[which(top_data_scaled$cgi > 0),"TopNode_disease"], main="CGI", col=viridis(length(top_data_scaled[,"cgi"]), option="B"))
# pie3D(top_data_scaled[which(top_data_scaled$cgi > 0),"cgi"], labels=top_data_scaled[which(top_data_scaled$cgi > 0),"TopNode_disease"], explode=0.1, main="CGI")

## Create pie charts for each database
create_a_pie <- function(x,y){
  p <- pie(x[which(x[,y] > 0),y], labels=x[which(x[,y] > 0),"TopNode_disease"], main=y, col=viridis(length(x[,y]), option="B"))
  return(p)
}
# png(file = paste(getwd(),"disease_by_database__piechart_cgi.png", sep = "/"), height=1200, width=1150, res=150)
# print(create_a_pie(top_data_scaled,"cgi"))
# dev.off()
create_a_pie(top_data_scaled,"civic")

###################################################################################################
#### Create bar charts of data
###################################################################################################
## Melt the df
top_data_scaled_long <- melt(top_data_scaled, id.vars = c("TopNode_disease", "TopNode_doid"))

## Refactor to desired order
top_data_scaled_long$TopNode_disease <- factor(top_data_scaled_long$TopNode_disease, levels = c((unique(top_data_scaled[order(top_data_scaled$total,decreasing=F),"TopNode_disease"]))), exclude = NULL)

## Plot the disease by database
png(file = paste(getwd(),"disease_by_database.png", sep = "/"), height=1200, width=1150, res=150)
ggplot(top_data_scaled_long[!(top_data_scaled_long$variable %in% c("total","total_perc","max")),], aes(x=TopNode_disease, y=value, fill = variable)) + geom_col(position = 'stack') + xlab("Disease") + ylab("Evidence") + guides(fill=guide_legend(title="Database")) + scale_fill_viridis(discrete = T, direction = -1)
dev.off()


## Spaces are more desirable for the following plots
top_data_scaled_long$TopNode_disease <- gsub("\n"," ",top_data_scaled_long$TopNode_disease)
## Change the disease order
top_data_scaled_long$TopNode_disease <- factor(top_data_scaled_long$TopNode_disease, levels = as.vector(c("other disease", "benign neoplasm",as.character(unique(top_data_scaled_long[!(top_data_scaled_long$TopNode_disease %in% c("benign neoplasm","other disease")),"TopNode_disease"])))), exclude = NULL)
## Create a custom color palette
plma <- rev(viridis::plasma(n=nrow(top)))
new_palette = c("grey87","grey66", plma)

## Plot the databases by disease proportions
png(file = paste(getwd(),"disease_by_database_proportion.png", sep = "/"), height=1200, width=1150, res=150)
ggplot(top_data_scaled_long[!(top_data_scaled_long$variable %in% c("total","total_perc")),], aes(x=variable, y=value, fill = TopNode_disease)) + geom_col(position = 'fill') + xlab("Database") + ylab("Proportion of Interpretations") + guides(fill=guide_legend(title="Disease")) + scale_fill_manual(values=new_palette) + theme_bw()
dev.off()

pdf(file = paste(getwd(),"disease_by_database_proportion.pdf", sep = "/"), height=9, width=8.625)
ggplot(top_data_scaled_long[!(top_data_scaled_long$variable %in% c("total","total_perc")),], aes(x=variable, y=value, fill = TopNode_disease)) + geom_col(position = 'fill') + xlab("Database") + ylab("Proportion of Interpretations") + guides(fill=guide_legend(title="Disease")) + scale_fill_manual(values=new_palette) + theme_bw()
dev.off()

###################################################################################################
#### Look for disease enrichment per database
###################################################################################################
top_data_scaled$TopNode_disease <- gsub("\n"," ",top_data_scaled$TopNode_disease)

## Get a list of cancer types to test
cancer_to_test <- top_data_scaled$TopNode_disease[!(top_data_scaled$TopNode_disease %in% c("other disease","other cancers"))]

## Restrict to desired columns and eliminate generic other disease or cancer rows
disease_vs_dataset <- top_data_scaled[!(top_data_scaled$TopNode_disease %in% c("other disease","other cancers")),c("TopNode_disease",group_cols)]

## Convert to table
rownames(disease_vs_dataset) <- disease_vs_dataset$TopNode_disease
disease_vs_dataset$TopNode_disease <- NULL
chisquared_all <- chisq.test(disease_vs_dataset)

## Restrict to desired columns
all_disease_vs_dataset <- top_data_scaled[,c("TopNode_disease",group_cols)]

## Create a results table
results = matrix(nrow=length(cancer_to_test), ncol=3)
colnames(results) <- c("disease","pvalue", "evidence_count")

## Create contingency tables
for(c in 1:length(cancer_to_test)){
  ctab <- all_disease_vs_dataset
  ctab[!grepl(cancer_to_test[c], ctab$TopNode_disease),"TopNode_disease"] <- paste("not",cancer_to_test[c])
  ctab <- aggregate(. ~ TopNode_disease, data=ctab, FUN=sum)
  rownames(ctab) <- ctab$TopNode_disease
  ctab$TopNode_disease <- NULL
  chisq_results = chisq.test(ctab)
  results[c,"disease"] <- cancer_to_test[c]
  results[c,"pvalue"] <- chisq_results$p.value
  results[c,"evidence_count"] <- sum(ctab[cancer_to_test[c],])
  #write.table(ctab, file=paste0(cancer_to_test[c],"_contingency_table.tsv"), sep="\t", row.names = F, quote = F)
}

#Correct p-values
pvalues=as.numeric(results[,"pvalue"])
pvalues_adj=mt.rawp2adjp(pvalues, proc=c("Bonferroni","BH"))
pvalues_adj_orig_order=pvalues_adj$adjp[order(pvalues_adj$index),]
results=cbind(results, pvalues_adj_orig_order[,2:3])
results <- as.data.frame(results)


## Create a results table
db_results = matrix(nrow=length(group_cols), ncol=3)
colnames(db_results) <- c("database","pvalue","evidence_count")

## Create contingency tables
for(d in 1:length(group_cols)){
  ctab <- all_disease_vs_dataset
  others <- group_cols[!(group_cols %in% c(group_cols[d]))]
  ctab$others <- rowSums(ctab[,others])
  ctab <- ctab[,c("TopNode_disease",group_cols[d],"others")]
  rownames(ctab) <- ctab$TopNode_disease
  ctab$TopNode_disease <- NULL
  chisq_db_results = chisq.test(ctab)
  db_results[d,"database"] <- group_cols[d]
  db_results[d,"pvalue"] <- chisq_db_results$p.value
  db_results[d,"evidence_count"] <- sum(ctab[,group_cols[d]])
  #write.table(ctab, file=paste0(group_cols[d],"_contingency_table.tsv"), sep="\t", row.names = F, quote = F)
}

#Correct p-values
pvalues=as.numeric(db_results[,"pvalue"])
pvalues_adj=mt.rawp2adjp(pvalues, proc=c("Bonferroni","BH"))
pvalues_adj_orig_order=pvalues_adj$adjp[order(pvalues_adj$index),]
db_results=cbind(db_results, pvalues_adj_orig_order[,2:3])
db_results <- as.data.frame(db_results)


## Create a results table
results = matrix(nrow=length(cancer_to_test)*length(group_cols), ncol=5)
colnames(results) <- c("disease", "database", "pvalue", "interpretation_freq", "interpretation_freq_other_databases")
n=0

## Create contingency tables
for(c in 1:length(cancer_to_test)){
  for(d in 1:length(group_cols)){
    n=n+1
    ctab <- all_disease_vs_dataset
    ctab[!grepl(cancer_to_test[c], ctab$TopNode_disease),"TopNode_disease"] <- paste("not",cancer_to_test[c])
    ctab <- aggregate(. ~ TopNode_disease, data=ctab, FUN=sum)
    others <- group_cols[!(group_cols %in% c(group_cols[d]))]
    ctab$others <- rowSums(ctab[,others])
    ctab <- ctab[,c("TopNode_disease",group_cols[d],"others")]
    rownames(ctab) <- ctab$TopNode_disease
    ctab$TopNode_disease <- NULL
    chisq_results = chisq.test(ctab)
    results[n,"disease"] <- cancer_to_test[c]
    results[n,"database"] <- group_cols[d]
    results[n,"pvalue"] <- chisq_results$p.value
    results[n,"interpretation_freq"] <- ctab[cancer_to_test[c],group_cols[d]]/sum(ctab[,group_cols[d]])
    results[n,"interpretation_freq_other_databases"] <- ctab[cancer_to_test[c],"others"]/sum(ctab[,"others"])
    write.table(ctab, file=paste0("contingency_tables/", group_cols[d], "_", gsub(" ", "_", cancer_to_test[c]), "_contingency_table.tsv"), sep="\t", row.names = F, quote = F)
  }
}

#Correct p-values
pvalues=as.numeric(results[,"pvalue"])
pvalues_adj=mt.rawp2adjp(pvalues, proc=c("Bonferroni","BH"))
pvalues_adj_orig_order=pvalues_adj$adjp[order(pvalues_adj$index),]
results=cbind(results, pvalues_adj_orig_order[,2:3])
results <- as.data.frame(results)
results[,3:ncol(results)] <- apply(results[,3:ncol(results)], 2, FUN=as.numeric)
results$representation <- NA
results[which(as.numeric(results$pvalue) > 0.05),"representation"] <- "not significant"
results[which(as.numeric(results$pvalue) < 0.05 & as.numeric(results$interpretation_freq) > as.numeric(results$interpretation_freq_other_databases)),"representation"] <- "overrepresented"
results[which(as.numeric(results$pvalue) < 0.05 & as.numeric(results$interpretation_freq) < as.numeric(results$interpretation_freq_other_databases)),"representation"] <- "underrepresented"

write.table(results, file="database_vs_disease.tsv", quote=FALSE, sep="\t", row.names=FALSE, col.names=TRUE)



library("gplots")
# 1. Convert the data to a table
rownames(all_disease_vs_dataset) <- all_disease_vs_dataset$TopNode_disease
all_disease_vs_dataset$TopNode_disease <- NULL
dt <- as.table(as.matrix(all_disease_vs_dataset))
# 2. Graph
png(file = paste(getwd(),"disease_by_database_balloonplot.png", sep = "/"), height=1200, width=1550, res=150)
balloonplot(t(dt), main = "Diseases by database", xlab ="", ylab="", label = FALSE, show.margins = FALSE)
dev.off()
png(file = paste(getwd(),"disease_by_database_balloonplot_noother.png", sep = "/"), height=1200, width=1550, res=150)
balloonplot(t(dt[cancer_to_test[!(cancer_to_test %in% "benign neoplasm")],]), main = "Diseases by database", xlab ="", ylab="", label = FALSE, label.size=0.5)
dev.off()
--------------------------------------------------------------------------------
/vicckb/model.py:
--------------------------------------------------------------------------------
from .definitions import *
import json
from collections import defaultdict, Counter
import pickle
import re
import pyupset as pyu
import pandas as pd
from math import ceil
import hashlib
from warnings import warn
import obonet
from operator import itemgetter


class Element:

    def __repr__(self):
        return "{}: {}".format(str(type(self)), str(self))

    def __hash__(self):
        return hash(str(self))

    def __eq__(self, other):
        return str(other) == str(self)

    def __lt__(self, other):
        return str(self) < str(other)

    def __gt__(self, other):
        return str(self) > str(other)

    def __str__(self):
        raise NotImplementedError


class Disease(Element):

    def __init__(self, id, source, term):
        self.id = id
        self.source = source
        self.term = term

    @property
    def name(self):
        return self.term

    def __str__(self):
        if self.id and self.source:
            return f'{self.source}: {self.id}'
        else:
            return f'No reference: {self.term}'

    _DISEASE_ONTOLOGY_URL = 'https://raw.githubusercontent.com/DiseaseOntology/HumanDiseaseOntology/master/src/ontology/HumanDO.obo'
    DISEASE_ONTOLOGY = obonet.read_obo(_DISEASE_ONTOLOGY_URL)


class Drug(Element):

    def __init__(self, id, source, term, **kwargs):
        self.id = id
        self.source = source
        self.term = term

    def __str__(self):
        return str(self.term)


class Gene(Element):

    SYMBOL_TABLE = dict()
    SYMBOL_ALIAS_TABLE = defaultdict(list)
    with open(str(DATA_ROOT / 'non_alt_loci_set.json'), 'r') as f:
        d = json.load(f)
    for doc in d['response']['docs']:
        SYMBOL_TABLE[doc['symbol']] = doc
        for alias in doc.get('alias_symbol', []):
            SYMBOL_ALIAS_TABLE[alias].append(doc['symbol'])
        for prev in doc.get('prev_symbol', []):
            SYMBOL_ALIAS_TABLE[prev].append(doc['symbol'])
    SYMBOL_ALIAS_TABLE = dict(SYMBOL_ALIAS_TABLE)

    def __init__(self, gene_symbol):
        self.gene_symbol = gene_symbol
        try:
            doc = Gene.SYMBOL_TABLE[gene_symbol]
        except KeyError:
            aliases = Gene.SYMBOL_ALIAS_TABLE[gene_symbol]
            # if len(aliases) > 1:
            #     raise KeyError("{} is an ambiguous gene symbol.".format(gene_symbol))
            assert len(aliases) <= 1, 'Ambiguous gene symbol {}'.format(gene_symbol)
            doc = Gene.SYMBOL_TABLE[aliases[0]]
        self.entrez_id = doc['entrez_id']
        self._doc = doc

    @property
    def symbol(self):
        return self.gene_symbol

    def __str__(self):
        return str(self.gene_symbol)

    def __bool__(self):
        return bool(self.entrez_id)

    def __hash__(self):
        return int(self.entrez_id)

    def __eq__(self, other):
        return self.entrez_id == other.entrez_id

    def __lt__(self, other):
        return self.entrez_id < other.entrez_id

    def __gt__(self, other):
        return self.entrez_id > other.entrez_id


class GenomicFeature(Element):

    CHROMOSOMES = [str(x) for x in range(1, 23)] + ['X', 'Y', 'MT']
    REFERENCE_BUILDS = ['GRCh37', 'GRCh38']

    def __init__(self, chromosome, start, end, referenceName='GRCh37', name='', geneSymbol='', sequence_ontology={}, alt=None, ref=None, **kwargs):
        chromosome = str(chromosome)
        if chromosome.lower().startswith('chr'):
            chromosome = chromosome[3:]
        if chromosome == '23':
            chromosome = 'X'
        if chromosome == '24':
            chromosome = 'Y'
        assert chromosome in GenomicFeature.CHROMOSOMES
        self.chromosome = chromosome
        self.start = int(start)
        self.end = int(end)
        assert self.start <= self.end
        self.so = sequence_ontology
        self.alt = alt
        self.ref = ref
        self.name = name
        self.gene_symbol = geneSymbol
        assert referenceName in GenomicFeature.REFERENCE_BUILDS
        self.reference_name = referenceName

    def __str__(self):
        return ':'.join([str(getattr(self, x)) for x in ['reference_name', 'chromosome', 'start', 'end', 'name']])

    def __eq__(self, other):
        return all([
            self.chromosome == other.chromosome,
            self.start == other.start,
            self.end == other.end,
            self.reference_name == other.reference_name
        ])

    def __hash__(self):
        return hash(tuple([str(getattr(self, x)) for x in ['reference_name', 'chromosome', 'start', 'end', 'alt']]))

    def issubfeature(self, other):
        return all([
            self.chromosome == other.chromosome,
            self.start >= other.start,
            self.end <= other.end,
            self.reference_name == other.reference_name
        ])

    def issuperfeature(self, other):
        return all([
            self.chromosome == other.chromosome,
            self.start <= other.start,
            self.end >= other.end,
            self.reference_name == other.reference_name
        ])

    def __lt__(self, other):
        if self.reference_name != other.reference_name:
            return self.reference_name < other.reference_name
        elif self.chromosome != other.chromosome:
            c = GenomicFeature.CHROMOSOMES
            return c.index(self.chromosome) < c.index(other.chromosome)
        elif self.start != other.start:
            return self.start < other.start
        elif self.end != other.end:
            return self.end < other.end
        else:
            return False

    def __gt__(self, other):
        return not self < other and self != other

    def __le__(self, other):
        return not self > other

    def __ge__(self, other):
        return not self < other

    def __contains__(self, item):
        return self.issuperfeature(item)

    def __len__(self):
        return self.end - self.start + 1


class Publication(Element):

    pmid_re = re.compile(r'https?://.*pubmed/(\d+)$')

    def __init__(self, publication_string):
        pmid_match = Publication.pmid_re.match(publication_string)
        self.pmid = None
        self.publication_string = publication_string
        if pmid_match:
            self.pmid = int(pmid_match[1])

    def __str__(self):
        if self.pmid:
            return str(self.pmid)
        else:
            return self.publication_string


class ViccAssociation(dict):

    def __str__(self):
        return str(hash(self))

    def __hash__(self):
        return self._stable_hash()

    def _stable_hash(self):
        raise NotImplementedError

    @property
    def publications(self):
        evidence = self['association']['evidence']
        all_pubs = list()
        for e in evidence:
            all_pubs += [Publication(p) for p in e['info']['publications'] if p]
        return all_pubs

    @property
    def evidence_level(self):
        return self['association']['evidence_label']

    @property
    def genes(self):
        if getattr(self, '_genes', None):
            return self._genes
        out = list()
        for g in self['genes']:
            if not g:
                continue
            try:
                out.append(Gene(g))
            except KeyError:
                continue
            except AssertionError:
                warn('Ambiguous gene symbol {} in assertion {}'.format(g, self))
                continue
        self._genes = out
        return out

    @property
    def source(self):
        return self['source']

    @property
    def features(self):
        if getattr(self, '_features', None):
            return self._features
        out = list()
        for f in self['features']:
            try:
                f2 = GenomicFeature(**f)
            except:
                continue
            out.append(f2)
        self._features = sorted(out)
        return sorted(out)

    @property
    def disease(self):
        try:
            return Disease(**self['association']['phenotype']['type'])
        except KeyError:
            return None

    @property
    def drugs(self):
        out = list()
        for d in self['association'].get('environmentalContexts', []):
            try:
                d2 = Drug(**d)
            except:
                continue
            out.append(d2)
        return out

    def __eq__(self, other):
        return hash(self) == hash(other)

    @property
    def description(self):
        return self['association'].get('description', None)


class RawAssociation(ViccAssociation):

    def _stable_hash(self):
        source = self['source']
        if source == 'civic':
            assert len(self['association']['evidence']) == 1  # we currently import 1 evidence per association.
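            # Each branch below builds a source-specific natural key string, which is
            # then SHA-256 hashed to give a stable, cross-run association identifier.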
            k = 'civic:{}'.format(self['association']['evidence'][0]['evidenceType']['id'])
        elif source == 'molecularmatch':
            k = 'mm:{}'.format(self['raw']['hashKey'])
        elif source == 'brca':
            k = 'brca:{}'.format(self['raw']['id'])
        elif source == 'pmkb':
            t = [self['raw']['variant']['id'], self['raw']['tumor']['id']] + [x['id'] for x in self['raw']['tissues']]
            k = 'pmkb:{}'.format('-'.join([str(x) for x in t]))  # There's no interpretation ID, made compound ID from components
        elif source == 'oncokb':
            try:
                pre = self['raw']['clinical']
                t = [pre['cancerType'], pre['drug'], pre['gene'], pre['variant']['name'], pre['level']]
                k = 'oncokb_clinical:{}'.format('-'.join(t))
            except KeyError:
                pre = self['raw']['biological']
                t = [pre['gene'], pre['variant']['name'], pre['oncogenic'], pre['mutationEffectPmids']]
                k = 'oncokb_biological:{}'.format('-'.join(t))
        elif source == 'jax':
            k = 'jax:{}'.format(self['raw']['id'])
        elif source == 'cgi':
            t = [self['raw']['Drug full name'], self['raw']['Primary Tumor type'],
                 self['raw']['Alteration'], self['raw']['Source']] + self['raw']['individual_mutation']
            k = 'cgi:{}'.format('-'.join(t))
        else:
            raise NotImplementedError("No hash routine defined for source '{}'".format(source))
        b = k.encode()
        m = hashlib.sha256()
        m.update(b)
        return int(m.hexdigest(), 16) % 10**16  # Last 16 digits of sha256 hash are sufficient


class ViccDb:

    DEFAULT_CACHE = DATA_ROOT / 'association_cache.pkl'

    def __init__(self, associations=None, load_cache=False,
                 save_cache=False, cache_path=DEFAULT_CACHE,
                 data_dir=(DATA_ROOT / '0.10')
                 ):
        if load_cache and save_cache:
            raise ValueError('Can only load or save cache, not both.')
        if load_cache:
            with open(str(cache_path), 'rb') as f:
                self.associations = pickle.load(f)
        elif associations is not None:
            self.associations = associations
        else:
            self.load_data(data_dir=data_dir)
        self._index_associations()
        if save_cache:
            self.cache_data(cache_path)

    def load_data(self, data_dir):
        resource_paths = list(data_dir.glob('*.json'))
        if resource_paths:
            self._load_local(resource_paths)
        else:
            self._load_s3()

    def _load_local(self, resource_paths):
        self.associations = list()
        for path in resource_paths:
            source = path.parts[-1].split('.')[0]
            with path.open() as json_data:
                for line in json_data:
                    association = RawAssociation(json.loads(line))  # TODO: Move to ViccAssociation after RawAssociation checks pass
                    association['raw'] = association.pop(source)
                    self.associations.append(association)

    def _load_s3(self):
        raise NotImplementedError

    def _index_associations(self):
        features = []
        associations_by_source = defaultdict(list)
        hashed = defaultdict(list)
        for association in self.associations:
            source = association['source']
            associations_by_source[source].append(association)
            h = hash(association)
            hashed[h].append(association)
            for feature in association.features:
                features.append((feature, h))
        self._features = features
        self._features_sorted = False
        self.associations_by_source = dict(associations_by_source)
        self._hashed = hashed
        self._element_by_source = dict()

    @property
    def features(self):
        if self._features_sorted:
            return self._features
        else:
            self._features = sorted(self._features, key=itemgetter(0))
            self._features_sorted = True
            return self._features

    def select(self, filter_function):
        associations = filter(filter_function, self.associations)
        return ViccDb(list(associations))

    def by_source(self, source):
        return ViccDb(self.associations_by_source[source])

    def report_groups(self, superset=None):
        if superset is None:
            total = len(self)
            for group in sorted(self.associations_by_source):
                count = len(self.associations_by_source[group])
                print("{}: {} ({:.1f}% of total)".format(group, count, count / total * 100))
            print("{} total associations".format(total))
        else:
            for group in sorted(self.associations_by_source):
                count = len(self.associations_by_source[group])
                # intended: below will raise error if key doesn't exist in superset, which should be an actual superset of self.
                superset_count = len(superset.associations_by_source[group])
                print("{}: {} ({:.1f}% of superset)".format(group, count, count / superset_count * 100))
            print("Total: {} ({:.1f}% of superset)".format(len(self.associations),
                                                           len(self.associations) / len(superset.associations) * 100))

    def cache_data(self, cache_path=DEFAULT_CACHE):
        with open(str(cache_path), 'wb') as f:
            pickle.dump(self.associations, f)

    def __len__(self):
        return len(self.associations)

    def __iter__(self):
        return iter(self.associations)

    def __contains__(self, item):
        return hash(item) in self._hashed

    def __getitem__(self, item):
        return self.associations[item]

    def __sub__(self, other):
        for h, associations in other._hashed.items():
            error_msg = "Cannot perform set subtraction, association hash not unique."
            assert len(associations) == 1, error_msg
            assert len(self._hashed.get(h, [])) <= 1, error_msg
            # these assertions assume that hash uniquely identifies an association.
            # Currently not true, but should be with harvester changes.
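        # Membership below goes through __contains__, i.e. the stable per-association hash.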
        return ViccDb([x for x in self if x not in other])

    def plot_element_by_source(self, element, filter_func=lambda x: bool(x), min_bound=1, max_bound=1000000000):
        element_by_source = self.get_element_by_source(element)

        df_dict = dict()
        column_name = ['attribute']
        for source in element_by_source:
            filtered_elements = list(filter(filter_func, element_by_source[source]))
            df_dict[source] = pd.DataFrame(filtered_elements, columns=column_name)
        x = pyu.plot(df_dict, unique_keys=column_name, inters_size_bounds=(min_bound, max_bound))
        x['input_data'] = element_by_source
        return x

    def element_by_source_stats(self, element, filter_func=lambda x: bool(x)):
        element_by_source = self.get_element_by_source(element)
        for source, elements in element_by_source.items():
            element_by_source[source] = set(list(filter(filter_func, elements)))
        ubiquitous_elements = set.intersection(*(element_by_source.values()))
        total_elements = set.union(*(element_by_source.values()))
        count = Counter()
        for source in element_by_source:
            count.update(element_by_source[source])
        majority_size = ceil(len(element_by_source) / 2)
        majority_elements = set([element for element in count if count[element] >= majority_size])
        unique_elements = set([element for element in count if count[element] == 1])
        out = {
            'total': total_elements,
            'ubiquitous': ubiquitous_elements,
            'majority': majority_elements,
            'majority_size': majority_size,
            'unique_elements': unique_elements
        }
        a = len(unique_elements)
        b = len(total_elements)
        print("{} / {} ({:.2%}) of {} are represented in only 1 resource."
              .format(a, b, a / b, element))

        a = len(majority_elements)
        print("{} / {} ({:.2%}) of {} are represented in the majority of ({}) resources."
              .format(a, b, a / b, element, majority_size))

        a = len(ubiquitous_elements)
        print("{} / {} ({:.2%}) of {} are represented across all resources."
              .format(a, b, a / b, element))
        return out

    def get_element_by_source(self, element):
        try:
            e = self._element_by_source[element]
        except KeyError:
            element_by_source = defaultdict(set)
            for association in self:
                association_element = None
                association_element = getattr(association, element)
                if hasattr(association_element, '__iter__') and not isinstance(association_element, str):
                    element_by_source[association.source].update(association_element)
                elif association_element is None:
                    continue
                else:
                    element_by_source[association.source].add(association_element)
            self._element_by_source[element] = dict(element_by_source)
            e = self._element_by_source[element]
        return e

    MATCH_RANKING = ['exact', 'positional', 'focal', 'regional']

    def search_by_feature(self, chromosome=None, start=None, end=None, reference_name=None,
                          name=None, alt=None, gene_symbol=None, genomic_feature=None):
        """
        Returns a list of hits, each corresponding to a single query / association match.
        All features of an association matching a query are stored in the matches attribute.
        The best match between a query and association is stored in the best_match attribute.
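
        Example (illustrative; assumes a populated database, e.g. one built from a
        cached association pickle):

            db = ViccDb(load_cache=True)
            hits = db.search_by_feature(chromosome=7, start=140453136, end=140453136,
                                        reference_name='GRCh37', name='V600E', gene_symbol='BRAF')
            best = hits[0]['best_match']  # 'type' is one of 'exact', 'positional', 'focal', 'regional'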
        """
        if not isinstance(genomic_feature, GenomicFeature):
            query = GenomicFeature(chromosome, start, end, reference_name, name, gene_symbol, alt=alt)
        else:
            query = genomic_feature
        return self.search_by_features([query])

    @staticmethod
    def _get_best_match(matches):
        if len(matches) == 1:
            best_match = matches[0]
        else:
            s1 = sorted(matches, key=lambda x: x['p'], reverse=True)
            s2 = sorted(s1, key=lambda x: ViccDb.MATCH_RANKING.index(x['type']))
            best_match = s2[0]
        return best_match

    @staticmethod
    def _get_match_type(query, feature):
        match = {'feature': feature}
        if query == feature:
            if query.alt and feature.alt and query.alt == feature.alt:
                match['type'] = 'exact'
            else:
                match['type'] = 'positional'
            match['p'] = 1
        else:
            if query.issubfeature(feature):
                p = len(query) / len(feature)
            elif query.issuperfeature(feature):
                p = len(feature) / len(query)
            else:
                # When query and feature are only partially overlapping
                if query < feature:
                    num = query.end - feature.start + 1
                else:
                    num = feature.end - query.start + 1
                den = max(len(feature), len(query))
                p = num / den
                assert p < 1, f'{p} should be less than 1 for {query} on {feature}'
            match['p'] = p
            if p >= 0.1:
                match['type'] = 'focal'
            elif p > 0:
                match['type'] = 'regional'
            else:
                raise ValueError(f'Expected an overlap between {query} and {feature}')
        return match

    def search_by_features(self, genomic_features):
        assert isinstance(genomic_features, list)
        db_features_pointer = 0
        query_features_pointer = 0
        match_start = None
        last_query_features_pointer = -1
        c = GenomicFeature.CHROMOSOMES
        query_features = sorted(genomic_features)
        hit_index = dict()
        while query_features_pointer < len(query_features) and db_features_pointer < len(self.features):
            if last_query_features_pointer != query_features_pointer:
                q = query_features[query_features_pointer]
                if match_start is not None:
                    db_features_pointer = match_start
                    match_start = None
                last_query_features_pointer = query_features_pointer
            d, association_hash = self.features[db_features_pointer]
            if q.reference_name != d.reference_name:
                raise NotImplementedError('All records in query and datastore currently must match same reference')
            if c.index(q.chromosome) < c.index(d.chromosome):
                query_features_pointer += 1
                continue
            if c.index(q.chromosome) > c.index(d.chromosome):
                db_features_pointer += 1
                continue
            if q.start > d.end:
                db_features_pointer += 1
                continue
            if q.end < d.start:
                query_features_pointer += 1
                continue
            m = ViccDb._get_match_type(q, d)
            key = (q, association_hash)
            matches = hit_index.get(key, list())
            matches.append(m)
            hit_index[key] = matches
            if match_start is None:
                match_start = db_features_pointer
            db_features_pointer += 1
        hits = list()
        for key, matches in hit_index.items():
            q, association_hash = key
            best_match = ViccDb._get_best_match(matches)
            a = self.get_association_by_hash(association_hash)
            hit = {
                'query': q,
                'association': a,
                'matches': matches,
                'best_match': best_match
            }
            hits.append(hit)
        return hits

    def get_association_by_hash(self, key):
        a = self._hashed[key]
        assert len(a) == 1
        return a[0]

    @property
    def sources(self):
        return self.associations_by_source.keys()
--------------------------------------------------------------------------------
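
Minimal usage sketch, assuming the harvested source JSON files are present under
vicckb/data/0.10/ (or a prebuilt vicckb/data/association_cache.pkl is available):

    from vicckb.model import ViccDb

    db = ViccDb()                  # reads vicckb/data/0.10/*.json; pass load_cache=True to use the pickle instead
    db.report_groups()             # per-source association counts
    civic = db.by_source('civic')  # restrict to a single knowledgebase
    hits = db.search_by_feature(chromosome=7, start=140453136, end=140453136,
                                reference_name='GRCh37', name='V600E', gene_symbol='BRAF')
    for hit in hits[:3]:
        association = hit['association']
        print(association.source, association.evidence_level, association.disease,
              hit['best_match']['type'])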