├── iBKH-KD-protocol ├── data │ └── test ├── output │ ├── shortest_path_interpretation_Disease_Drug.pdf │ ├── prediction_drug_top100_transE_l2.csv │ └── prediction_drug_top100_ensemble.csv ├── funcs │ ├── knowledge_visualization.py │ └── KG_link_pred.py └── Knowledge_Discovery_Pipeline.ipynb ├── Codes_Term Harmonization ├── README.md ├── Entity_Integration │ ├── entity_side_effect.py │ ├── entity_gene.py │ ├── entity_pathway.py │ ├── entity_symptom.py │ ├── entity_anatomy.py │ └── entity_disease.py └── Relation_Integration │ ├── integrate_drug_related.py │ ├── integrate_disease_related.py │ ├── integrate_drug_disease.py │ ├── integrate_drug_gene.py │ ├── integrate_gene_related.py │ └── integrate_disease_gene.py ├── iBKH_Schema.png ├── Codes_Analysis ├── image │ └── knowledge_discover.png └── README.md ├── iBKH ├── iBKH_2021_04_12 │ ├── Relation │ │ └── README.md │ └── Entity │ │ └── README.md └── iBKH_2021_05_03 │ ├── Relation │ └── README.md │ └── Entity │ └── README.md ├── README.md └── Source Information └── README.md /iBKH-KD-protocol/data/test: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Codes_Term Harmonization/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /iBKH_Schema.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wcm-wanglab/iBKH/HEAD/iBKH_Schema.png -------------------------------------------------------------------------------- /Codes_Analysis/image/knowledge_discover.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wcm-wanglab/iBKH/HEAD/Codes_Analysis/image/knowledge_discover.png -------------------------------------------------------------------------------- /iBKH-KD-protocol/output/shortest_path_interpretation_Disease_Drug.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wcm-wanglab/iBKH/HEAD/iBKH-KD-protocol/output/shortest_path_interpretation_Disease_Drug.pdf -------------------------------------------------------------------------------- /Codes_Term Harmonization/Entity_Integration/entity_side_effect.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | folder = '' 4 | 5 | 6 | def extract_SIDER(): 7 | sider_df = pd.read_table(folder + 'entity/side_effect/meddra_all_se.tsv', header=None) 8 | sider_df = sider_df[sider_df[3] == 'PT'] 9 | sider_df = sider_df[[0, 4, 5]] 10 | sider_df = sider_df.rename(columns={0: 'CID', 4: 'umls_cui', 5: 'name'}) 11 | sider_df = sider_df.reset_index(drop=True) 12 | print(sider_df) 13 | res = sider_df[['umls_cui', 'name']] 14 | res = res.drop_duplicates(subset='umls_cui', keep='first') 15 | res['primary'] = 'UMLS:' + res['umls_cui'].astype(str) 16 | res = res[['primary', 'umls_cui', 'name']] 17 | print(res) 18 | res.to_csv(folder + 'entity/side_effect/side_effect_vocab.csv', index=False) 19 | 20 | 21 | def main(): 22 | extract_SIDER() 23 | 24 | 25 | if __name__ == '__main__': 26 | main() 27 | -------------------------------------------------------------------------------- /iBKH/iBKH_2021_04_12/Relation/README.md: -------------------------------------------------------------------------------- 1 | ## Download iBKH 
2 | To access the relations in iBKH, download them directly via the following link.
3 | 
4 | ```
5 | https://wcm.box.com/s/fzzsx9ldj8a64jsa04hyf8khple7js7n
6 | ```
7 | 
8 | Unzipping the file yields the following .csv files.
9 | ```
10 | ./relation/A_G_res.csv
11 | ./relation/D_D_res.csv
12 | ./relation/D_Di_res.csv
13 | ./relation/D_G_res.csv
14 | ./relation/Di_Di_res.csv
15 | ./relation/Di_G_res.csv
16 | ./relation/Di_S_res.csv
17 | ./relation/DSP_SDSI_res.csv
18 | ./relation/G_G_res.csv
19 | ./relation/SDSI_Ares.csv
20 | ./relation/SDSI_D_res.csv
21 | ./relation/SDSI_Di_res.csv
22 | ./relation/SDSI_S.csv
23 | ./relation/SDSI_TC_res.csv
24 | ```
25 | 
26 | ## iBKH relations
27 | Each row in an iBKH relation table describes the relationship between a pair of entities. We kept all relationship types from the source databases and encode each type as a binary flag: 1 indicates that the relationship exists for the entity pair, and 0 indicates that it does not. For example,
28 | Drug | Disease | Palliates | Treats | Effect | Association | Source
29 | --- | --- | --- | --- | --- | --- | ---
30 | DrugBank:DB00843 | DOID:10652 | 0 | 1 | 0 | 1 | Hetionet; CTD
31 | ... | ... | ... | ... | ... | ... | ...
32 | 
33 | From the above record, we can observe that the entity 'Donepezil' (primary ID DrugBank:DB00843) and the entity 'Alzheimer's Disease' (primary ID DOID:10652) have the relations 'Treats' and 'Association', which come from Hetionet and CTD, respectively.
34 | 
--------------------------------------------------------------------------------
/iBKH/iBKH_2021_05_03/Relation/README.md:
--------------------------------------------------------------------------------
1 | ## Download iBKH relations
2 | To access the relations in iBKH, download them directly via the following link.
3 | 
4 | ```
5 | https://wcm.box.com/s/dcq6lj4vxzs4rnxu6xx60ziwl62qrzyp
6 | ```
7 | 
8 | Unzipping the file yields the following .csv files.
9 | ```
10 | ./relation/A_G_res.csv
11 | ./relation/D_D_res.csv
12 | ./relation/D_Di_res.csv
13 | ./relation/D_G_res.csv
14 | ./relation/D_Pwy_res.csv
15 | ./relation/D_SE_res.csv
16 | ./relation/Di_Di_res.csv
17 | ./relation/Di_G_res.csv
18 | ./relation/Di_Pwy_res.csv
19 | ./relation/Di_Sy_res.csv
20 | ./relation/DSP_SDSI_res.csv
21 | ./relation/G_G_res.csv
22 | ./relation/G_Pwy_res.csv
23 | ./relation/SDSI_A_res.csv
24 | ./relation/SDSI_D_res.csv
25 | ./relation/SDSI_Di_res.csv
26 | ./relation/SDSI_Sy.csv
27 | ./relation/SDSI_TC_res.csv
28 | ```
29 | 
30 | ## iBKH relations
31 | Each row in an iBKH relation table describes the relationship between a pair of entities. We kept all relationship types from the source databases and encode each type as a binary flag: 1 indicates that the relationship exists for the entity pair, and 0 indicates that it does not. In addition, a triplet is assigned an inference score when its only support is an inferred relation from CTD; the score reflects the degree of drug-disease network similarity underlying that CTD inference.
32 | 
33 | | Drug | Disease | Treats | Palliates | Effect | Associate | Inferred_Relation | ... | Source | Inference_Score |
34 | | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
35 | | DrugBank:DB00843 | DOID:10652 | 1 | 0 | 1 | 1 | 0 | ... | CTD;DRKG;Hetionet;KEGG | ... 
| 36 | | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | 37 | 38 | From the above example record, we can observe that the entity 'Donepezil' (primary ID is DrugBank:DB00843) and the entity 'Alzheimer's Disease' (primary ID is DOID:10652) have the relation 'Treats' and 'Association', and the relations come from the Hetionet and CTD curated relation respectively. 39 | 40 | | Drug | Disease | Treats | Palliates | Effect | Associate | Inferred_Relation | ... | Source | Inference_Score | 41 | | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | 42 | | DrugBank:DB06767 | DOID:10283 | 0 | 0 | 0 | 0 | 1 | ... |CTD | 342.19 | 43 | | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | 44 | 45 | From the above example record, we can observe that the entity 'Ammonium chloride' (primary ID is DrugBank: DB06767) and the entity 'Prostate cancer' (primary ID is DOID:10283) have the relation from the CTD inferred relation. The relation is assigned an inference score (342.19). 46 | -------------------------------------------------------------------------------- /iBKH/iBKH_2021_04_12/Entity/README.md: -------------------------------------------------------------------------------- 1 | ## Download iBKH entities 2 | To access the entity vocabulary in the iBKH, you can directly download the iBKH entities by the following link. 3 | ``` 4 | https://wcm.box.com/s/kz7lnowhf2iejjwsopo6cqsdati0yj6i 5 | ``` 6 | 7 | When you unzip the file, you will get the following .csv files. 8 | ``` 9 | ./entity/anatomy_vocab.csv 10 | ./entity/disease_vocab.csv 11 | ./entity/drug_vocab.csv 12 | ./entity/dsp_vocab.csv 13 | ./entity/gene_vocab.csv 14 | ./entity/molecule_vocab.csv 15 | ./entity/sdsi_vocab.csv 16 | ./entity/symptom_vocab.csv 17 | ./entity/tc_vocab.csv 18 | ``` 19 | 20 | ## iBKH entities vocabulary 21 | Each row in the iBKH entity vocabulary describes an entity, and each column in this row records the entity's information in different source databases (such as original ID, name, etc.). For example, 22 | | primary | do_id | do_name | kegg_id | kegg_name | pharmgkb_id | pharmgkb_name | umls_cui | mesh_id | ... | 23 | | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | 24 | | DOID:0001816 | 0001816 | angiosarcoma | H01666 | Angiosarcoma | PA444390 | Hemangiosarcoma | C0018923 | D006394 | ... | 25 | | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | 26 | 27 | The above row comes from the disease vocabulary, which describes a disease entity 'Angiosarcoma'. We can observe that the entity 'Angiosarcoma' has the following information, Disease Ontology ID (DOID:0001816), KEGG ID (H01666), PharmGKB ID (PA444390), the name in PharmGKB ('Hemangiosarcoma'), UMLS CUI (C0018923) and MeSH ID (D006394). 28 | 29 | | primary | symbol | hgnc_id | ncbi_id | pharmgkb_id | 30 | | --- | --- | --- | --- | --- | 31 | | HGNC:5 | A1BG | 5 | 1 | PA24356 | 32 | | ... | ... | ... | ... | ... | 33 | 34 | This example comes from the gene vocabulary, it describes a gene entity 'A1BG'. The corresponding information of the entity 'A1BG' has, HGNC ID (HGNC:5), NCBI ID (NCBI:1), gene symbol (A1BG) and PharmGKB ID (PA24356). 35 | 36 | We assigned the primary ID for each type of entity, for example, we used HGNC ID as the primary ID in gene entity vocabulary. And we used the entity's primary ID to describe the entities in the relationship. For example, there is a relation 'Treats' between entities 'Donepezil' and 'Alzheimer's Disease' in the iBKH. 
And we used the entity Donepezil's primary ID (DrugBank:DB00843) and AD's primary ID (DOID:10652) to describe them respectively. When an entity can't find the corresponding primary ID, we will follow the primary priority order to do the mapping. For example, the NCBI ID is the second primary ID for the Gene entity vocabulary. Currently, the existing entity vocabularies have the following primary ID order: 37 | * Gene: HGNC ID, NCBI ID 38 | * Disease: Disease Ontology ID, KEGG ID, PharmGKB ID, MeSH ID, OMIM ID, iDISK ID 39 | * Drug: DrugBank ID, KEGG ID, PharmGKB ID, MeSH ID, iDISK ID 40 | * Anatomy: Uberon ID, BTO ID, MeSH ID, CL ID 41 | * Molecule: ChEMBL ID, ChEBI ID 42 | * Symptom: MeSH ID, UMLS CUI 43 | * DSI: iDISK 44 | * DSP: iDISK 45 | * TC: UMLS CUI 46 | * Pathway: Reactome ID, KEGG ID 47 | * Side-Effect: UMLS CUI 48 | -------------------------------------------------------------------------------- /iBKH/iBKH_2021_05_03/Entity/README.md: -------------------------------------------------------------------------------- 1 | ## Download iBKH entities 2 | To access the entity vocabulary in the iBKH, you can directly download the iBKH entities using the following link. 3 | ``` 4 | https://wcm.box.com/s/gagu6yj2toyk4kirb6hpsb1qu4dm203p 5 | ``` 6 | 7 | When you unzip the file, you will get the following .csv files. 8 | ``` 9 | ./entity/anatomy_vocab.csv 10 | ./entity/disease_vocab.csv 11 | ./entity/drug_vocab.csv 12 | ./entity/dsp_vocab.csv 13 | ./entity/gene_vocab.csv 14 | ./entity/molecule_vocab.csv 15 | ./entity/pathway_vocab.csv 16 | ./entity/sdsi_vocab.csv 17 | ./entity/side_effect_vocab.csv 18 | ./entity/symptom_vocab.csv 19 | ./entity/tc_vocab.csv 20 | ``` 21 | 22 | ## iBKH entities vocabulary 23 | Each row in the iBKH entity vocabulary describes an entity, and each column in this row records the entity's information in different source databases (such as original ID, name, etc.). For example, 24 | | primary | name | do_id | kegg_id | pharmgkb_id | mesh_id | umls_cui | icd_10 | icd_9 | omim_id | iDISK_id | 25 | | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | 26 | | DOID:10652 | alzheimer's disease | DOID:10652 | H00056 | PA443319 | D000544 | C0002395 | G30 | 331 | ... | ... | 27 | | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | 28 | 29 | The above row comes from the disease vocabulary, which describes a disease entity 'alzheimer's disease'. We can observe that the entity 'alzheimer's disease' has the following information, Disease Ontology ID (DOID:10652), KEGG ID (H00056), PharmGKB ID (PA443319), UMLS CUI (C0002395), MeSH ID (D000544), ICD-10 code (G30) and ICD-9 code (331). 30 | 31 | | primary | symbol | hgnc_id | ncbi_id | pharmgkb_id | 32 | | --- | --- | --- | --- | --- | 33 | | HGNC:5 | A1BG | 5 | 1 | PA24356 | 34 | | ... | ... | ... | ... | ... | 35 | 36 | This example comes from the gene vocabulary, it describes a gene entity 'A1BG'. The corresponding information of the entity 'A1BG' has, HGNC ID (HGNC:5), NCBI ID (NCBI:1), gene symbol (A1BG) and PharmGKB ID (PA24356). 37 | 38 | We assigned the primary ID for each type of entity, for example, we used HGNC ID as the primary ID in gene entity vocabulary. And we used the entity's primary ID to describe the entities in the relationship. For example, there is a relation 'Treats' between entities 'Donepezil' and 'Alzheimer's Disease' in the iBKH. And we used the entity Donepezil's primary ID (DrugBank:DB00843) and AD's primary ID (DOID:10652) to describe them respectively. 
When an entity can't find the corresponding primary ID, we will follow the primary priority order to do the mapping. For example, the NCBI ID is the second primary ID for the Gene entity vocabulary. Currently, the existing entity vocabularies have the following primary ID order: 39 | * Gene: HGNC ID, NCBI ID 40 | * Disease: Disease Ontology ID, KEGG ID, PharmGKB ID, MeSH ID, OMIM ID, iDISK ID 41 | * Drug: DrugBank ID, KEGG ID, PharmGKB ID, MeSH ID, iDISK ID 42 | * Anatomy: Uberon ID, BTO ID, MeSH ID, CL ID 43 | * Molecule: ChEMBL ID, ChEBI ID 44 | * Symptom: MeSH ID, UMLS CUI 45 | * DSI: iDISK 46 | * DSP: iDISK 47 | * TC: UMLS CUI 48 | * Pathway: Reactome ID, KEGG ID 49 | * Side-Effect: UMLS CUI 50 | -------------------------------------------------------------------------------- /Codes_Term Harmonization/Relation_Integration/integrate_drug_related.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | 4 | folder = '' 5 | 6 | 7 | def integrate_DPwy(): 8 | D_Pwy_kegg = pd.read_csv(folder + '/kegg_drug_pathway.csv') 9 | print(D_Pwy_kegg) 10 | pathway_vocab = pd.read_csv(folder + '/pathway_vocab.csv') 11 | drug_vocab = pd.read_csv(folder + '/drug_vocab.csv') 12 | 13 | pathway_primary_dict = pathway_vocab.dropna(subset=['kegg_id']).set_index('kegg_id')['primary'].to_dict() 14 | kegg_drug_vocab = drug_vocab.dropna(subset=['kegg_id']) 15 | kegg_drug_primary_dict = kegg_drug_vocab.set_index('kegg_id')['primary'].to_dict() 16 | 17 | D_Pwy_res = D_Pwy_kegg.replace({'kegg_id': kegg_drug_primary_dict, 'pathway_id': pathway_primary_dict}) 18 | D_Pwy_res = D_Pwy_res.rename(columns={'kegg_id': 'Drug', 'pathway_id': 'Pathway'}) 19 | D_Pwy_res = D_Pwy_res[['Drug', 'Pathway']] 20 | D_Pwy_res['Association'] = [1] * len(D_Pwy_res) 21 | D_Pwy_res['Source'] = ['KEGG'] * len(D_Pwy_res) 22 | print(D_Pwy_res) 23 | D_Pwy_res.to_csv(folder + '/D_Pw_res.csv', index=False) 24 | 25 | 26 | def integrate_DSE(): 27 | sider_df = pd.read_table(folder + '/meddra_all_se.tsv', header=None) 28 | sider_df = sider_df[sider_df[3] == 'PT'] 29 | sider_df = sider_df[[0, 4, 5]] 30 | sider_df = sider_df.rename(columns={0: 'CID', 4: 'umls_cui', 5: 'name'}) 31 | sider_df = sider_df.reset_index(drop=True) 32 | 33 | drug_vocab = pd.read_csv(folder + '/drug_vocab.csv') 34 | side_effect_vocab = pd.read_csv(folder + '/side_effect_vocab.csv') 35 | 36 | cid_primary = drug_vocab.dropna(subset=['CID']).set_index('CID')['primary'].to_dict() 37 | se_primary = side_effect_vocab.set_index(['umls_cui'])['primary'].to_dict() 38 | 39 | D_SE_res = sider_df[['CID', 'umls_cui']] 40 | D_SE_res = D_SE_res.replace({'CID': cid_primary}) 41 | D_SE_res = D_SE_res.replace({'umls_cui': se_primary}) 42 | D_SE_res['Cause'] = [1] * len(D_SE_res) 43 | D_SE_res['Source'] = ['SIDER'] * len(D_SE_res) 44 | D_SE_res = D_SE_res.rename(columns={'CID': 'Drug', 'umls_cui': 'Side_Effect'}) 45 | 46 | print(D_SE_res) 47 | D_SE_res.to_csv(folder + '/D_SE_res.csv', index=False) 48 | 49 | 50 | def integrate_DSDSI(): 51 | sdsi_spd = pd.read_table(folder + '/MRREL.RRF', delimiter='|') 52 | sdsi_spd = sdsi_spd[sdsi_spd['REL'] == 'interacts_with'] 53 | sdsi_spd = sdsi_spd.reset_index(drop=True) 54 | 55 | sdsi_vocab = pd.read_csv(folder + '/sdsi_vocab.csv') 56 | sdsi_primary_dict = sdsi_vocab.set_index('iDISK_id')['primary'].to_dict() 57 | drug_vocab = pd.read_csv(folder + '/drug_vocab.csv') 58 | drug_idisk_primary_dict = drug_vocab.dropna(subset=['iDISK_id']).set_index('iDISK_id')['primary'].to_dict() 59 | 60 | sdsi_spd_res 
= pd.DataFrame(columns=['SDSI', 'Drug', 'interacts_with', 'Source']) 61 | for i in range(len(sdsi_spd)): 62 | sdsi = sdsi_spd.loc[i, 'CUI1'] 63 | drug = sdsi_spd.loc[i, 'CUI2'] 64 | sdsi_primary = sdsi_primary_dict[sdsi] 65 | drug_primary = drug_idisk_primary_dict[drug] 66 | sdsi_spd_res.loc[i] = [sdsi_primary, drug_primary, 1, 'iDISK'] 67 | print(i + 1, '/', len(sdsi_spd), 'Completed (SDSI_SPD)...') 68 | 69 | sdsi_spd_res.to_csv(folder + '/SDSI_D_res.csv', index=False) 70 | 71 | 72 | def main(): 73 | # integrate_DPwy() 74 | # integrate_DSE() 75 | integrate_DSDSI() 76 | 77 | 78 | if __name__ == '__main__': 79 | main() 80 | -------------------------------------------------------------------------------- /Codes_Term Harmonization/Entity_Integration/entity_gene.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | 4 | folder = '' 5 | res_folder = '' 6 | 7 | 8 | def remove_duplicated_ncbi(): 9 | gene_vocab = pd.read_csv(folder + 'entity/gene_vocab.csv') 10 | gene_vocab = gene_vocab[['primary', 'symbol', 'hgnc_id', 'ncbi_id']] 11 | # gene_vocab = gene_vocab.drop_duplicates(subset='ncbi_id', keep='first') 12 | gene_vocab = gene_vocab[(~gene_vocab.duplicated(subset='ncbi_id')) | (gene_vocab['ncbi_id'].isnull())] 13 | print(len(gene_vocab), len(gene_vocab.drop_duplicates(subset='primary', keep='first'))) 14 | hgnc_vocab = gene_vocab.dropna(subset=['hgnc_id']) 15 | print(len(hgnc_vocab), len(hgnc_vocab.drop_duplicates(subset='hgnc_id', keep='first'))) 16 | ncbi_vocab = gene_vocab.dropna(subset=['ncbi_id']) 17 | print(len(ncbi_vocab), len(ncbi_vocab.drop_duplicates(subset='ncbi_id', keep='first'))) 18 | print(gene_vocab) 19 | gene_vocab.to_csv(res_folder + 'gene_vocab.csv', index=False) 20 | 21 | 22 | def add_PharmGKB_gene(): 23 | gene_vocab = pd.read_csv(res_folder + 'gene_vocab.csv') 24 | gene_vocab['pharmgkb_id'] = [''] * len(gene_vocab) 25 | idx = len(gene_vocab) 26 | 27 | hgnc_vocab = gene_vocab.dropna(subset=['hgnc_id']) 28 | hgnc_vocab['hgnc_id'] = hgnc_vocab['hgnc_id'].astype(int).astype(str) 29 | ncbi_vocab = gene_vocab.dropna(subset=['ncbi_id']) 30 | hgnc_list = list(hgnc_vocab['hgnc_id']) 31 | ncbi_list = list(ncbi_vocab['ncbi_id']) 32 | 33 | pharmgkb_gene = pd.read_table(res_folder + 'pharmgkb_gene.tsv') 34 | for i in range(len(pharmgkb_gene)): 35 | p_id = pharmgkb_gene.loc[i, 'PharmGKB Accession Id'] 36 | hgnc_id = pharmgkb_gene.loc[i, 'HGNC ID'] 37 | ncbi_id = pharmgkb_gene.loc[i, 'NCBI Gene ID'] 38 | symbol = pharmgkb_gene.loc[i, 'Symbol'] 39 | 40 | if not pd.isnull(hgnc_id): 41 | hgnc_id = hgnc_id.replace('HGNC:', '') 42 | if hgnc_id in hgnc_list: 43 | gene_vocab.loc[gene_vocab['hgnc_id'] == int(hgnc_id), 'pharmgkb_id'] = p_id 44 | elif not pd.isnull(ncbi_id): 45 | if ncbi_id in ncbi_list: 46 | gene_vocab.loc[gene_vocab['ncbi_id'] == ncbi_id, 'pharmgkb_id'] = p_id 47 | else: 48 | gene_vocab.loc[idx] = ['PharmGKB:' + p_id, symbol, '', '', p_id] 49 | idx += 1 50 | print(i + 1, '/', len(pharmgkb_gene), 'Completed...') 51 | print(gene_vocab) 52 | gene_vocab.to_csv(res_folder + 'gene_vocab_2.csv', index=False) 53 | 54 | 55 | def add_ensembl(): 56 | gene_vocab = pd.read_csv(folder + 'entity/gene_vocab.csv') 57 | 58 | ensembl_df = pd.read_table(res_folder + 'gene2ensembl_May_3') 59 | ncbi_ensembl_dict = ensembl_df.set_index('GeneID')['Ensembl_gene_identifier'].to_dict() 60 | # ncbi_protein_dict = ensembl_df.set_index('GeneID')['Ensembl_protein_identifier'].to_dict() 61 | # print(gene_vocab) 62 | # 
print(ncbi_ensembl_dict[100527964], ncbi_protein_dict[100527964]) 63 | ensembl_list = [] 64 | for i in range(len(gene_vocab)): 65 | ncbi_id = gene_vocab.loc[i, 'ncbi_id'] 66 | ensembl_id = ncbi_ensembl_dict[ncbi_id] if ncbi_id in ncbi_ensembl_dict else '' 67 | ensembl_list.append(ensembl_id) 68 | print(i + 1, '/', len(gene_vocab), 'Completed...') 69 | gene_vocab['ensembl_id'] = ensembl_list 70 | print(gene_vocab) 71 | gene_vocab.to_csv(res_folder + 'gene_vocab_3.csv', index=False) 72 | 73 | 74 | def main(): 75 | # remove_duplicated_ncbi() 76 | # add_PharmGKB_gene() 77 | 78 | add_ensembl() 79 | 80 | 81 | if __name__ == '__main__': 82 | main() 83 | -------------------------------------------------------------------------------- /Codes_Term Harmonization/Relation_Integration/integrate_disease_related.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | 4 | folder = '' 5 | 6 | 7 | def integrate_DiPwy(): 8 | Di_Pwy_kegg = pd.read_csv(folder + 'kegg_disease_pathway.csv') 9 | print(Di_Pwy_kegg) 10 | 11 | disease_vocab = pd.read_csv(folder + 'disease_vocab.csv') 12 | kegg_disease_vocab = disease_vocab.dropna(subset=['kegg_id']) 13 | kegg_disease_primary_dict = kegg_disease_vocab.set_index('kegg_id')['primary'].to_dict() 14 | 15 | pathway_vocab = pd.read_csv(folder + 'pathway_vocab.csv') 16 | pathway_primary_dict = pathway_vocab.dropna(subset=['kegg_id']).set_index('kegg_id')['primary'].to_dict() 17 | 18 | Di_Pwy_res = Di_Pwy_kegg.replace({'kegg_id': kegg_disease_primary_dict, 'pathway_id': pathway_primary_dict}) 19 | Di_Pwy_res = Di_Pwy_res.rename(columns={'kegg_id': 'Disease', 'pathway_id': 'Pathway'}) 20 | Di_Pwy_res = Di_Pwy_res[['Disease', 'Pathway']] 21 | Di_Pwy_res['Association'] = [1] * len(Di_Pwy_res) 22 | Di_Pwy_res['Source'] = ['KEGG'] * len(Di_Pwy_res) 23 | print(Di_Pwy_res) 24 | Di_Pwy_res.to_csv(folder + 'Di_Pw_res.csv', index=False) 25 | 26 | 27 | def integrate_DiSy(): 28 | hetionet_DiSy = pd.read_csv(folder + 'hetionet_DiS.csv') 29 | hetionet_DiSy = hetionet_DiSy.rename(columns={'source': 'Disease', 'target': 'Symptom'}) 30 | hetionet_DiSy = hetionet_DiSy[['Disease', 'Symptom']] 31 | print(hetionet_DiSy) 32 | disease_vocab = pd.read_csv(folder + 'disease_vocab.csv') 33 | do_vocab = disease_vocab.dropna(subset=['do_id']) 34 | do_primary_dict = do_vocab.set_index('do_id')['primary'].to_dict() 35 | 36 | symptom_vocab = pd.read_csv(folder + 'symptom_vocab.csv') 37 | symptom_primary_dict = symptom_vocab.set_index('mesh_id')['primary'].to_dict() 38 | 39 | hetionet_DiSy = hetionet_DiSy.replace({'Disease': do_primary_dict, 'Symptom': symptom_primary_dict}) 40 | DiSy_res = hetionet_DiSy 41 | DiSy_res['Present'] = [1] * len(DiSy_res) 42 | DiSy_res['Source'] = ['Hetionet'] * len(DiSy_res) 43 | DiSy_res.to_csv(folder + 'Di_S_res.csv', index=False) 44 | 45 | 46 | def integrate_DiDSI(): 47 | sdsi_dis = pd.read_table(folder + 'MRREL.RRF', delimiter='|') 48 | sdsi_dis = sdsi_dis[sdsi_dis['REL'] == 'is_effective_for'] 49 | sdsi_dis = sdsi_dis.reset_index(drop=True) 50 | 51 | sdsi_vocab = pd.read_csv(folder + 'sdsi_vocab.csv') 52 | sdsi_primary_dict = sdsi_vocab.set_index('iDISK_id')['primary'].to_dict() 53 | disease_vocab = pd.read_csv(folder + 'disease_vocab.csv') 54 | iDISK_vocab = disease_vocab.dropna(subset=['iDISK_id']) 55 | iDISK_primary_dict = iDISK_vocab.set_index('iDISK_id')['primary'].to_dict() 56 | 57 | sdsi_dis = sdsi_dis.rename(columns={'CUI1': 'DSI', 'CUI2': 'Disease'}) 58 | sdsi_dis = sdsi_dis[['DSI', 'Disease']] 59 | 
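# NOTE: DataFrame.replace with a {column: mapping} dict rewrites the iDISK
# CUIs in each column to iBKH primary IDs; values missing from a mapping are
# left unchanged, so unmapped CUIs pass through and can be audited afterwards.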
sdsi_dis = sdsi_dis.replace({'DSI': sdsi_primary_dict, 'Disease': iDISK_primary_dict})
60 | 
61 | DSIDi_res = sdsi_dis
62 | DSIDi_res['is_effective_for'] = [1] * len(DSIDi_res)
63 | DSIDi_res['Source'] = ['iDISK'] * len(DSIDi_res)
64 | DSIDi_res.to_csv(folder + 'SDSI_Di_res.csv', index=False)
65 | 
66 | 
67 | def integrate_DSISy():
68 | sdsi_ss = pd.read_table(folder + 'MRREL.RRF', delimiter='|')
69 | sdsi_ss = sdsi_ss[sdsi_ss['REL'] == 'has_adverse_reaction']
70 | sdsi_ss = sdsi_ss.reset_index(drop=True)
71 | 
72 | sdsi_vocab = pd.read_csv(folder + 'sdsi_vocab.csv')
73 | sdsi_primary_dict = sdsi_vocab.set_index('iDISK_id')['primary'].to_dict()
74 | 
75 | symptom_vocab = pd.read_csv(folder + 'symptom_vocab.csv')
76 | symptom_primary_dict = symptom_vocab.dropna(subset=['iDISK_id']).set_index('iDISK_id')['primary'].to_dict()
77 | 
78 | sdsi_ss = sdsi_ss.rename(columns={'CUI1': 'DSI', 'CUI2': 'Symptom'})
79 | sdsi_ss = sdsi_ss[['DSI', 'Symptom']]
80 | sdsi_ss = sdsi_ss.replace({'DSI': sdsi_primary_dict, 'Symptom': symptom_primary_dict})
81 | 
82 | DSIDy_res = sdsi_ss
83 | DSIDy_res['has_adverse_reaction'] = [1] * len(DSIDy_res)
84 | DSIDy_res['Source'] = ['iDISK'] * len(DSIDy_res)
85 | DSIDy_res.to_csv(folder + 'SDSI_S_res.csv', index=False)
86 | 
87 | 
88 | def main():
89 | # integrate_DiPwy()
90 | # integrate_DiSy()
91 | # integrate_DiDSI()
92 | integrate_DSISy()
93 | 
94 | 
95 | if __name__ == '__main__':
96 | main()
97 | 
--------------------------------------------------------------------------------
/Codes_Analysis/README.md:
--------------------------------------------------------------------------------
1 | # iBKH Case Study
2 | ## Overview
3 | We enable high-quality knowledge discovery based on iBKH. We developed a knowledge discovery module based on [DGL-KE (Deep Graph Library – Knowledge Embedding)](https://github.com/awslabs/dgl-ke), a Python package for efficient and scalable graph learning. To demonstrate its potential, we conducted two proof-of-concept studies: 1) Case Study I: in-silico hypothesis generation for Alzheimer's disease (AD) drug repurposing, and 2) Case Study II: knowledge-enhanced cohort exploration for older adults with Apolipoprotein E (APOE) ε4 genotype (a significant genetic risk factor of AD).
4 | 
5 | ## Python Dependencies
6 | The code depends mainly on the standard scientific Python stack under Python 3.7.
7 | ```
8 | numpy 1.21.5
9 | pandas 1.3.5
10 | torch 1.2.0 (https://pytorch.org/)
11 | sklearn 0.0
12 | neo4j 5.2.0 (https://pypi.org/project/neo4j/5.2.0/)
13 | matplotlib 3.1.1
14 | statsmodels 0.11.1
15 | ```
16 | 
17 | ## DGL-KE Platform for iBKH Setup
18 | In this work, we used the [Deep Graph Library - Knowledge Embedding (DGL-KE)](https://github.com/awslabs/dgl-ke), a Python-based implementation of advanced KGE algorithms such as TransE, TransR, ComplEx, and DistMult. You may follow the [Installation Guide](https://dglke.dgl.ai/doc/install.html) to complete the DGL-KE installation.
19 | 
20 | ## Case Study - Alzheimer's Disease (AD) drug repurposing
21 | This is the implementation of AD drug repurposing based on iBKH. The task is to discover drugs that potentially link to AD in iBKH. Detailed information and code can be found [here](https://github.com/wcm-wanglab/iBKH/blob/main/Codes/Case_Study-AD_Drug_Repurposing.ipynb). 
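For concreteness, the following minimal sketch shows how a ranked candidate list such as `output/prediction_drug_top100_transE_l2.csv` can be derived once DGL-KE has been trained on iBKH triplets. It mirrors the `transE_l2` scorer defined in `iBKH-KD-protocol/funcs/KG_link_pred.py`; the checkpoint paths, the ID-map file names, and the relation label `Treats_DDi` below are illustrative assumptions, not exact outputs of this repository.

```
import numpy as np
import pandas as pd

GAMMA = 12.0  # margin (gamma) used when training with DGL-KE


def transe_l2_score(head, rel, tail):
    # TransE models a relation as a translation: head + rel ≈ tail, so a larger
    # value of gamma - ||head + rel - tail||_2 means a more plausible triplet.
    return GAMMA - np.linalg.norm(head + rel - tail, ord=2, axis=-1)


# Hypothetical DGL-KE artifacts: embedding matrices plus the entity/relation
# ID maps produced when the iBKH triplets were converted for training.
entity_emb = np.load('ckpts/iBKH_TransE_l2_entity.npy')      # (n_entities, dim)
relation_emb = np.load('ckpts/iBKH_TransE_l2_relation.npy')  # (n_relations, dim)
entities = pd.read_csv('entities.tsv', sep='\t', header=None, names=['eid', 'name'])
relations = pd.read_csv('relations.tsv', sep='\t', header=None, names=['rid', 'name'])

eid = entities.set_index('name')['eid'].to_dict()
rid = relations.set_index('name')['rid'].to_dict()

# Score every drug as the head of (drug, treats, Alzheimer's disease).
ad_vec = entity_emb[eid['DOID:10652']]        # AD's primary ID in iBKH
treats_vec = relation_emb[rid['Treats_DDi']]  # assumed relation label
drugs = entities[entities['name'].str.startswith('DrugBank:')]
scores = transe_l2_score(entity_emb[drugs['eid'].values], treats_vec, ad_vec)

top100 = drugs.assign(score=scores).nlargest(100, 'score')
print(top100[['name', 'score']])
```

Min-max normalizing the raw scores (the `score_norm` column in the prediction files) makes rankings comparable across embedding models before combining them into an ensemble such as `prediction_drug_top100_ensemble.csv`.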
22 | 
23 | 
--------------------------------------------------------------------------------
/Codes_Term Harmonization/Entity_Integration/entity_pathway.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from bs4 import BeautifulSoup
3 | from urllib.request import Request, urlopen
4 | import urllib.error
5 | 
6 | folder = ''
7 | 
8 | 
9 | def extract_Reactome_vocab():
10 | pathway_vocab = pd.read_table(folder + 'ReactomePathways.txt', header=None)
11 | pathway_vocab = pathway_vocab.rename(columns={0: 'Reactome_ID', 1: 'Name', 2: 'Species'})
12 | pathway_vocab['primary'] = 'REACT:' + pathway_vocab['Reactome_ID'].astype(str)  # string prefix, not a one-element list
13 | pathway_res = pathway_vocab[['primary', 'Reactome_ID', 'Name', 'Species']]
14 | pathway_res = pathway_res[pathway_res['Species'] == 'Homo sapiens']
15 | pathway_res = pathway_res[['primary', 'Reactome_ID', 'Name']]
16 | print(pathway_res)
17 | pathway_res.to_csv(folder + 'res/pathway_res.csv', index=False)
18 | 
19 | 
20 | def add_CTD_pathway():
21 | ctd_pw = pd.read_csv('/Users/yuhou/Documents/Knowledge_Graph/CTD/vocabulary/CTD_pathways.csv', header=27)
22 | ctd_pw = ctd_pw.dropna(subset=['PathwayID'])
23 | ctd_pw = ctd_pw.reset_index(drop=True)
24 | pathway_res = pd.read_csv(folder + 'res/pathway_res.csv')
25 | idx = len(pathway_res)
26 | pathway_res['KEGG_ID'] = [''] * idx
27 | react_list = list(pathway_res['Reactome_ID'])
28 | 
29 | for i in range(len(ctd_pw)):
30 | pathway_id = ctd_pw.loc[i, 'PathwayID']
31 | pathway_name = ctd_pw.loc[i, '# PathwayName']
32 | if 'REACT' in pathway_id:
33 | pathway_id = pathway_id.replace('REACT:', '')
34 | if pathway_id not in react_list:
35 | pathway_res.loc[idx] = ['REACT:' + pathway_id, pathway_id, pathway_name, '']
36 | idx += 1
37 | elif 'KEGG' in pathway_id:
38 | pathway_id = pathway_id.replace('KEGG:hsa', '')
39 | if 'M' in pathway_id:
40 | pathway_res.loc[idx] = ['KEGG:' + pathway_id.replace('_', ''), '', pathway_name, 'hsa' + pathway_id]
41 | idx += 1
42 | else:
43 | pathway_res.loc[idx] = ['KEGG:map' + pathway_id, '', pathway_name, 'hsa' + pathway_id]
44 | idx += 1
45 | print(i + 1, '/', len(ctd_pw), 'Completed...')
46 | pathway_res.to_csv(folder + 'res/pathway_res_2.csv', index=False)
47 | 
48 | 
49 | def process_reactome_go():
50 | reactome_vocab = pd.read_csv(folder + 'res/pathway_res.csv')
51 | go_id_list = []
52 | for i in range(len(reactome_vocab)):
53 | reactome_id = reactome_vocab.loc[i, 'Reactome_ID']
54 | url = 'https://reactome.org/content/detail/' + reactome_id
55 | req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
56 | go_id = ''
57 | try:
58 | rep = urlopen(req)
59 | webpage = rep.read()
60 | soup = BeautifulSoup(webpage, 'html.parser')
61 | go_link = soup.findAll("a", {"title": "go to GO"})
62 | if len(go_link) > 0:
63 | go_id = go_link[0].get('href').replace('https://www.ebi.ac.uk/QuickGO/term/', '')
64 | except urllib.error.HTTPError as e:
65 | print(reactome_id, 'HTTPError: {}'.format(e.code))
66 | go_id_list.append(go_id)
67 | print(i + 1, '/', len(reactome_vocab), 'Completed...')
68 | reactome_vocab['go_id'] = go_id_list
69 | reactome_vocab.to_csv(folder + 'res/reactome_pathway.csv', index=False)
70 | 
71 | 
72 | def integrate_reactome_kegg():
73 | pathway_res = pd.read_csv(folder + 'stage_4/entity/pathway/res/pathway_res.csv')
74 | pathway_res['kegg_id'] = [''] * len(pathway_res)
75 | idx = len(pathway_res)
76 | kegg_pathway = pd.read_csv(folder + 'KEGG/kegg_pathway.csv')
77 | print(kegg_pathway)
78 | reactome_golist = 
list(pathway_res.dropna(subset=['go_id'])['go_id']) 79 | 80 | for i in range(len(kegg_pathway)): 81 | kegg_id = kegg_pathway.loc[i, 'kegg_id'] 82 | pathway_name = kegg_pathway.loc[i, 'name'] 83 | go_id = kegg_pathway.loc[i, 'go_id'] 84 | if go_id in reactome_golist: 85 | pathway_res.loc[pathway_res['go_id'] == go_id, 'kegg_id'] = kegg_id 86 | else: 87 | pathway_res.loc[idx] = ['KEGG:' + kegg_id, '', pathway_name, go_id, kegg_id] 88 | idx += 1 89 | print(i + 1, '/', len(kegg_pathway), 'Completed...') 90 | print(pathway_res) 91 | pathway_res.to_csv(folder + 'stage_4/entity/pathway/res/pathway_res_2.csv', index=False) 92 | with open(folder + 'stage_4/entity/pathway/res/integrate_note.txt', 'w') as f: 93 | f.write('pathway_res_2.csv: Reactome; KEGG') 94 | f.close() 95 | 96 | 97 | def main(): 98 | # extract_Reactome_vocab() 99 | # add_CTD_pathway() 100 | # process_reactome_go() 101 | 102 | integrate_reactome_kegg() 103 | 104 | 105 | if __name__ == '__main__': 106 | main() 107 | -------------------------------------------------------------------------------- /Codes_Term Harmonization/Entity_Integration/entity_symptom.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import requests 3 | from lxml.html import fromstring 4 | 5 | folder = '' 6 | iDISK_folder = '' 7 | 8 | 9 | def get_UMLS_tgt(apikey): 10 | uri = "https://utslogin.nlm.nih.gov" 11 | auth_endpoint = "/cas/v1/api-key" 12 | params = {'apikey': apikey} 13 | h = {"Content-type": "application/x-www-form-urlencoded", "Accept": "text/plain", "User-Agent": "python"} 14 | r = requests.post(uri + auth_endpoint, data=params, headers=h) 15 | response = fromstring(r.text) 16 | tgt = response.xpath('//form/@action')[0] 17 | return tgt 18 | 19 | 20 | def get_UMLS_ts(tgt): 21 | service = "http://umlsks.nlm.nih.gov" 22 | params = {'service': service} 23 | h = {"Content-type": "application/x-www-form-urlencoded", "Accept": "text/plain", "User-Agent": "python"} 24 | r = requests.post(tgt, data=params, headers=h) 25 | st = r.text 26 | return st 27 | 28 | 29 | def mesh2umls(tgt, mesh_id): 30 | st = get_UMLS_ts(tgt) 31 | mesh_url = 'https://uts-ws.nlm.nih.gov/rest/content/current/source/MSH/' + mesh_id + '/atoms?ttys=MH,NM&ticket=' + st 32 | mesh_resp = requests.get(mesh_url) 33 | umls_cui = '' 34 | if 'error' not in mesh_resp.json(): 35 | mesh_content = mesh_resp.json()['result'][0] 36 | umls_cui = mesh_content['concept'].replace('https://uts-ws.nlm.nih.gov/rest/content/2020AB/CUI/', '') 37 | 38 | return umls_cui 39 | 40 | 41 | def UMLS2MeSH(tgt, umls_cui): 42 | st = get_UMLS_ts(tgt) 43 | mesh_url = 'https://uts-ws.nlm.nih.gov/rest/content/current/CUI/' + umls_cui + '/atoms?sabs=MSH&ttys=MH,NM,PT&ticket=' + st 44 | mesh_resp = requests.get(mesh_url) 45 | mesh_id = '' 46 | if 'error' not in mesh_resp.json(): 47 | mesh_content = mesh_resp.json()['result'] 48 | mesh_id = mesh_content[0]['code'].replace( 49 | 'https://uts-ws.nlm.nih.gov/rest/content/2020AB/source/MSH/', '') 50 | return mesh_id 51 | 52 | 53 | def enrich_Hetionet(): 54 | hetionet_df = pd.read_table('/Users/yuhou/Documents/Knowledge_Graph/hetionet/hetionet-v1.0-nodes.tsv') 55 | hetionet_symptom = hetionet_df[hetionet_df['kind'] == 'Symptom'] 56 | hetionet_symptom = hetionet_symptom.reset_index(drop=True) 57 | print(hetionet_symptom) 58 | res = pd.DataFrame(columns=['primary', 'name', 'mesh_id', 'umls_cui']) 59 | idx = 0 60 | apikey = '9a095f1e-f79f-4958-bfdd-2bcba5f134d6' 61 | tgt = get_UMLS_tgt(apikey) 62 | for i in 
range(len(hetionet_symptom)): 63 | mesh_id = hetionet_symptom.loc[i, 'id'].replace('Symptom::', '') 64 | name = hetionet_symptom.loc[i, 'name'] 65 | umls_cui = mesh2umls(tgt, mesh_id) 66 | res.loc[idx] = ['MESH:' + mesh_id, name, mesh_id, umls_cui] 67 | idx += 1 68 | print(i + 1, '/', len(hetionet_symptom), 'Completed...') 69 | res.to_csv(folder + 'symptom_vocab_refined.csv', index=False) 70 | 71 | 72 | def integrate_iDISK(): 73 | iDISK_SS = pd.read_csv(iDISK_folder + 'entity/SS_enriched.csv') 74 | 75 | symptom_vocab = pd.read_csv(folder + 'symptom_vocab_refined.csv') 76 | symptom_vocab['iDISK_id'] = [''] * len(symptom_vocab) 77 | idx = len(symptom_vocab) 78 | mesh_vocab_list = list(symptom_vocab.dropna(subset=['mesh_id'])['mesh_id']) 79 | umls_vocab_list = list(symptom_vocab.dropna(subset=['umls_cui'])['umls_cui']) 80 | 81 | apikey = '9a095f1e-f79f-4958-bfdd-2bcba5f134d6' 82 | tgt = get_UMLS_tgt(apikey) 83 | 84 | for i in range(len(iDISK_SS)): 85 | cui = iDISK_SS.loc[i, 'CUI'] 86 | name = iDISK_SS.loc[i, 'name'] 87 | umls_cui = iDISK_SS.loc[i, 'UMLS'] 88 | mesh_id = UMLS2MeSH(tgt, umls_cui) 89 | 90 | if mesh_id in mesh_vocab_list: 91 | symptom_vocab.loc[symptom_vocab['mesh_id'] == mesh_id, 'iDISK_id'] = cui 92 | elif umls_cui in umls_vocab_list: 93 | symptom_vocab.loc[symptom_vocab['umls_cui'] == umls_cui, 'iDISK_id'] = cui 94 | else: 95 | if mesh_id != '': 96 | symptom_vocab.loc[idx] = ['MESH:' + mesh_id, name, mesh_id, umls_cui, cui] 97 | idx += 1 98 | else: 99 | symptom_vocab.loc[idx] = ['UMLS:' + umls_cui, name, mesh_id, umls_cui, cui] 100 | idx += 1 101 | print(i + 1, '/', len(iDISK_SS), 'Completed...') 102 | symptom_vocab.to_csv(folder + 'symptom_vocab_refined_2.csv', index=False) 103 | 104 | 105 | def main(): 106 | # enrich_Hetionet() 107 | integrate_iDISK() 108 | 109 | # symptom_vocab = pd.read_csv(folder + 'symptom_vocab_refined.csv') 110 | # print(len(symptom_vocab), len(symptom_vocab.drop_duplicates(subset='primary', keep='first'))) 111 | # mesh_vocab = symptom_vocab.dropna(subset=['mesh_id']) 112 | # print(len(mesh_vocab), len(mesh_vocab.drop_duplicates(subset='mesh_id', keep='first'))) 113 | # umls_vocab = symptom_vocab.dropna(subset=['umls_cui']) 114 | # print(len(umls_vocab), len(umls_vocab.drop_duplicates(subset='umls_cui', keep='first'))) 115 | 116 | 117 | if __name__ == '__main__': 118 | main() 119 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # integrative Biomedical Knowledge Hub (iBKH) 2 | iBKH integrates data from 17 publicly available biomedical databases. The current version of iBKH contains a total of 2,384,501 entities of 11 types. Specifically, the iBKH includes 23,003 anatomy entities, 19,236 disease entities, 37,997 drug entities, 88,376 gene entities, 2,065,015 molecule entities, 1,361 symptom entities, 4,101 DSI entities, 137,568 DSP entities, 605 TC entities, 2,988 pathway entities and 4,251 side-effect entities. For the relationships in the iBKH, there are 86 relation types within 18 kinds of entity pairs, including Anatomy-Gene, Drug-Disease, Drug-Drug, Drug-Gene, Disease-Disease, Disease-Gene, Disease-Symptom, Gene-Gene, DSI-Disease, DSI-Symptom, DSI-Drug, DSI-Anatomy, DSI-DSP, DSI-TC, Disease-Pathway, Drug-Pathway, Gene-Pathway and Drug-Side Effect. In total, iBKH contains 48,194,646 relations. 
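Each relation file can be explored directly with pandas before any graph tooling is involved. The snippet below is a small, hedged example: the local path is a placeholder, and the column layout follows the drug-disease example documented in `iBKH/iBKH_2021_05_03/Relation/README.md`.

```
import pandas as pd

# Placeholder path: one of the relation tables unzipped from the Box links
# in the Relation READMEs (here, the drug-disease table).
d_di = pd.read_csv('relation/D_Di_res.csv')

# Relation-type columns hold 0/1 flags, so column sums count how many
# drug-disease pairs carry each relation type (a pair may carry several).
flag_cols = d_di.columns.difference(['Drug', 'Disease', 'Source', 'Inference_Score'])
print(d_di[flag_cols].sum().sort_values(ascending=False))
```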
3 | 
4 | 
5 | 
6 | ## Materials and Methods
7 | Our ultimate goal was to build a biomedical knowledge graph by comprehensively incorporating as much biomedical knowledge as possible. To this end, we collected and integrated 17 publicly available data sources to curate a single comprehensive resource. Details of the data resources used are listed in this [table](https://github.com/wcm-wanglab/iBKH/blob/main/Source%20Information/README.md).
8 | 
9 | ## Statistics of iBKH
10 | | Entity Type | Number | Included Identifiers |
11 | | ---------------|:---------:|:--------------------:|
12 | | Anatomy | 23,003 | Uberon ID, BTO ID, MeSH ID, Cell Ontology ID |
13 | | Disease | 19,236 | Disease Ontology ID, PharmGKB ID, MeSH ID, OMIM ID |
14 | | Drug | 37,997 | DrugBank ID, PharmGKB ID, MeSH ID |
15 | | Gene | 88,376 | HGNC ID, NCBI ID, PharmGKB ID |
16 | | Molecule | 2,065,015 | CHEMBL ID, CHEBI ID |
17 | | Symptom | 1,361 | MeSH ID |
18 | | Dietary Supplement Ingredient | 4,101 | iDISK ID |
19 | | Dietary Supplement Product | 137,568 | iDISK ID |
20 | | Therapeutic Class | 605 | iDISK ID, UMLS CUI |
21 | | Pathway | 2,988 | Reactome ID, Gene Ontology ID |
22 | | Side-Effect | 4,251 | UMLS CUI |
23 | | **Total Entities** | **2,384,501** | - |
24 | 
25 | | Relation Type | Number |
26 | | ----------------|:----------:|
27 | | Anatomy-Gene | 12,171,021 |
28 | | Drug-Disease | 2,717,947 |
29 | | Drug-Drug | 2,684,682 |
30 | | Drug-Gene | 1,303,747 |
31 | | Disease-Disease | 11,072 |
32 | | Disease-Gene | 27,538,774 |
33 | | Disease-Symptom | 3,357 |
34 | | Gene-Gene | 735,156 |
35 | | DSI-Symptom | 2,093 |
36 | | DSI-Disease | 5,134 |
37 | | DSI-Drug | 3,057 |
38 | | DSI-Anatomy | 4,334 |
39 | | DSP-DSI | 689,297 |
40 | | DSI-TC | 5,430 |
41 | | Disease-Pathway | 1,941 |
42 | | Drug-Pathway | 3,231 |
43 | | Gene-Pathway | 152,243 |
44 | | Drug-Side Effect| 163,206 |
45 | | **Total Relations** | **48,194,646** |
46 | 
47 | ## Neo4j Deployment
48 | We deployed iBKH on AWS using Neo4j (https://neo4j.com), a robust graph database platform. Neo4j takes the curated entity and relation CSV files as input and automatically creates a KG instance, so iBKH can be updated efficiently and flexibly. Follow the instructions [here](https://docs.google.com/document/d/1cLDPLp_nVCJ5xrDlJ-B-Q3wf24tb-Dyq55nAXxaNgTM/edit?usp=sharing) to deploy iBKH on your AWS server.
49 | 
50 | ## iBKH-based knowledge discovery
51 | We developed a knowledge discovery pipeline in iBKH. We utilized [Deep Graph Library - Knowledge Graph Embedding (DGL-KE)](https://github.com/awslabs/dgl-ke) to learn embeddings of iBKH, from which novel biomedical knowledge can be derived. We applied the pipeline in two case studies: Alzheimer's disease drug repurposing hypothesis generation and a knowledge-enhanced EHR patient cohort study. All code for these analyses can be found [here](https://github.com/wcm-wanglab/iBKH/tree/main/Codes). Please refer to the [readme for case studies](https://github.com/wcm-wanglab/iBKH/blob/main/Codes/README.md) for more details.
52 | 
53 | ## Licence
54 | iBKH is licensed under [Apache-2.0](https://www.apache.org/licenses/LICENSE-2.0) and [CC BY-NC-SA 4.0](https://creativecommons.org/licenses/by-nc-sa/4.0/). iBKH integrates data from many resources, and users should consider the license of each of them (see the details in the [table](https://github.com/wcm-wanglab/iBKH/blob/main/Source%20Information/README.md)). 
For sources with defined licenses, we apply the license attribute on a per-node and per-edge basis. However, some sources do not provide any licenses, so for those, we have requested permission. 55 | 56 | ## Cite 57 | ``` 58 | @article {Su2021.03.12.21253461, 59 | title = {Biomedical Discovery through the integrative Biomedical Knowledge Hub (iBKH)}, 60 | author = {Chang Su, Yu Hou, Suraj Rajendran, Jacqueline R. M. A. Maasch, Zehra Abedi, Haotan Zhang, Zilong Bai, 61 | Anthony Cuturrufo, Winston Guo, Fayzan F. Chaudhry, Gregory Ghahramani, Jian Tang, Feixiong Cheng, 62 | Yue Li, Rui Zhang, Jiang Bian, Fei Wang}, 63 | year = {2022}, 64 | doi = {10.1101/2021.03.12.21253461}, 65 | publisher = {Cold Spring Harbor Laboratory Press}, 66 | URL = {https://www.medrxiv.org/content/10.1101/2021.03.12.21253461v4}, 67 | journal = {medRxiv} 68 | } 69 | 70 | ``` 71 | 72 | -------------------------------------------------------------------------------- /iBKH-KD-protocol/funcs/knowledge_visualization.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Wed Jul 26 17:48:29 2023 5 | 6 | @author: changsu 7 | """ 8 | 9 | 10 | import networkx as nx 11 | from neo4j import GraphDatabase 12 | import matplotlib.pyplot as plt 13 | import os 14 | 15 | 16 | 17 | 18 | 19 | def subgraph_visualization(target_type, target_list, predicted_type, predicted_list, 20 | excluded_r_type = [], 21 | neo4j_url = "neo4j://54.210.251.104:7687", 22 | username = "neo4j", password = "password", 23 | alpha = 1, k=0.3, nsize=200, target_size_ratio=2.5, 24 | with_node_label=True, node_label_size = 10, 25 | with_edge_label=True, edge_label_size = 7, 26 | figsize=(14, 10), 27 | save=True, save_path='output', save_name=None): 28 | 29 | # Connect to the Neo4j database 30 | driver = GraphDatabase.driver(neo4j_url, 31 | auth=(username, password), 32 | encrypted=False) 33 | neo4j_res_list = [] 34 | 35 | # Build Cypher statement 36 | for target in target_list: 37 | for predict in predicted_list: 38 | cypher = "MATCH (e1:" + target_type + " {Name: \"" + target + "\"})" 39 | cypher += ", (e2:" + predicted_type + " {Name: \"" + predict + "\"})" 40 | cypher += ", p = allShortestPaths((e1)-[*..5]-(e2)) RETURN p LIMIT 30" 41 | 42 | # Run the Neo4j query and retrieve the results 43 | session = driver.session() 44 | neo4j_res = session.run(cypher) 45 | neo4j_res_list.append(neo4j_res) 46 | 47 | # Create a NetworX Graph object 48 | g = nx.MultiGraph() 49 | 50 | # Define node groups and their corresponding colors 51 | group_colors = { 52 | "Disease": "#E0C3FC", 53 | "Drug": "#83B5D1", 54 | "Gene": "#F28482", 55 | "Symptom": "#7B967A", 56 | "Side-effect": "#9DA1DD", 57 | "Pathway": "#94D2BD" 58 | } 59 | 60 | 61 | node_id_map = {} 62 | id_node_map = {} 63 | node_color = {} 64 | 65 | edge_label_map = {} 66 | # Iterate over the Neo4j query result and add nodes to the network 67 | idx = 0 68 | for neo4j_res in neo4j_res_list: 69 | for record in neo4j_res: 70 | path = record["p"] 71 | 72 | # adding node 73 | for node in path.nodes: 74 | node_type_list = list(node.labels) 75 | node_type = node_type_list[0] if node_type_list[0] != 'Node' else node_type_list[1] 76 | node_name = node['Name'] 77 | 78 | if node_name not in node_id_map: 79 | node_id_map[node_name] = idx 80 | id_node_map[idx] = node_name 81 | 82 | g.add_node(node_name) 83 | node_color[node_name] = group_colors[node_type] 84 | 85 | idx += 1 86 | 87 | # adding edges 88 | for relation in 
path.relationships: 89 | start_node_type = list(relation.start_node.labels)[0] 90 | end_node_type = list(relation.end_node.labels)[0] 91 | 92 | r_type = relation.type 93 | 94 | if r_type in excluded_r_type: 95 | continue 96 | 97 | start = relation.start_node['Name'] 98 | end = relation.end_node["Name"] 99 | 100 | edge_label_map[(start, end)] = r_type 101 | 102 | g.add_edge(start, end, label=r_type) 103 | 104 | 105 | color_map = [] 106 | size_map = [] 107 | for n in g.nodes(): 108 | color_map.append(node_color[n]) 109 | if (n in target_list) or (n in predicted_list): 110 | size_map.append(nsize * target_size_ratio) 111 | else: 112 | size_map.append(nsize) 113 | 114 | 115 | plt.figure(figsize=figsize) 116 | 117 | positions = nx.spring_layout(g, k=k) 118 | 119 | 120 | for node in target_list: 121 | positions[node][0] -= alpha # adjust value as needed 122 | 123 | for node in predicted_list: 124 | positions[node][0] += alpha # adjust value as needed 125 | 126 | nx.draw(g, with_labels = with_node_label, 127 | node_color=color_map, node_size=size_map, 128 | edge_color='#E0E0E0', pos=positions, 129 | font_size=node_label_size) 130 | 131 | nx.draw_networkx_edge_labels( 132 | g, positions, 133 | edge_labels=edge_label_map, 134 | font_color='blue', 135 | font_size = edge_label_size 136 | ) 137 | 138 | if save == True: 139 | 140 | if not os.path.exists(save_path): 141 | os.makedirs(save_path) 142 | 143 | if save_name == None: 144 | save_name ='shortest_path_interpretation_%s_%s.pdf' % (target_type, predicted_type) 145 | plt.savefig(save_path + '/' + save_name) 146 | 147 | 148 | plt.show() 149 | 150 | return g 151 | 152 | 153 | #excluded_r_type = [ 154 | # 'Inferred_Relation_DDi', 'Semantic_Relation_DDi' 155 | # 'Semantic_Relation_DG', '19 Semantic_Relation_DiG', 156 | # 'Semantic_Relation_GG', 'Inferred_Relation_DiG'] 157 | 158 | -------------------------------------------------------------------------------- /Source Information/README.md: -------------------------------------------------------------------------------- 1 | **Source information** 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 |
| Source | Entity Type | # Entities | Relation Type | # Relations | URL | License |
| --- | --- | --- | --- | --- | --- | --- |
| Bgee | Anatomy, Gene | 60,072 | Anatomy-Express Present-Gene, Anatomy-Express Absent-Gene | 11,731,369 | https://bgee.org/ | https://creativecommons.org/publicdomain/zero/1.0/ |
| Brenda Tissue Ontology | Tissue | 6,478 | - | - | https://www.brenda-enzymes.org/index.php | https://creativecommons.org/licenses/by/4.0/ |
| Cell Ontology | Cell | 2,200 | - | - | http://obofoundry.org/ontology/cl.html | https://creativecommons.org/licenses/by/4.0/ |
| Comparative Toxicogenomics Database | Disease, Gene, Chemical, Pathway | 73,922 | Chemical-Gene, Chemical-Disease, Chemical-Pathway, Gene-Disease, Gene-Pathway, Disease-Pathway | 38,344,568 | http://ctdbase.org/ | Confirmed via email |
| ChEMBL | Molecular | 1,940,733 | - | - | https://www.ebi.ac.uk/chembl/ | https://creativecommons.org/licenses/by-sa/3.0/ |
| ChEBI | Molecular | 155,342 | - | - | https://www.ebi.ac.uk/chebi/init.do | https://creativecommons.org/licenses/by/4.0/ |
| Drug Repurposing Knowledge Graph | Anatomy, Atc, Biological process, Cellular component, Compound, Disease, Gene, Molecular function, Pathway, Pharmacologic class, Side effect, Symptom, Tax | 97,238 | Gene-Gene, Compound-Gene, Disease-Gene, Atc-Compound, Compound-Compound, Compound-Disease, Gene-Tax, Biological process-Gene, Disease-Symptom, Anatomy-Disease, Disease-Disease, Anatomy-Gene, Gene-Molecular function, Compound-Pharmacologic class, Cellular component-Gene, Gene-Pathway, Compound-Side effect | 5,874,261 | https://github.com/gnn4dr/DRKG | https://www.apache.org/licenses/LICENSE-2.0 |
| Disease Ontology | Disease | 10,648 | - | - | https://disease-ontology.org/ | https://creativecommons.org/publicdomain/zero/1.0/ |
| DrugBank | Drug | 15,128 | Drug-Target, Drug-Enzyme, Drug-Carrier, Drug-Transporter | 28,014 | https://go.drugbank.com/ | http://creativecommons.org/licenses/by-nc/4.0/ |
| Hetionet | Anatomy, Biological process, Cellular component, Compound, Disease, Gene, Molecular function, Pathway, Pharmacologic class, Side effect, Symptom | 47,031 | Anatomy–downregulates–Gene, Anatomy–expresses–Gene, Anatomy–upregulates–Gene, Compound–binds–Gene, Compound–causes–Side Effect, Compound–downregulates–Gene, Compound–palliates–Disease, Compound–resembles–Compound, Compound–treats–Disease, Compound–upregulates–Gene, Disease–associates–Gene, Disease–downregulates–Gene, Disease–localizes–Anatomy, Disease–presents–Symptom, Disease–resembles–Disease, Disease–upregulates–Gene, Gene–covaries–Gene, Gene–interacts–Gene, Gene–participates–Biological Process, Gene–participates–Cellular Component, Gene–participates–Molecular Function, Gene–participates–Pathway, Gene→regulates→Gene, Pharmacologic Class–includes–Compound | 2,250,197 | https://github.com/hetio/hetionet | https://creativecommons.org/publicdomain/zero/1.0/ |
| HUGO Gene Nomenclature Committee | Gene | 41,439 | - | - | https://www.genenames.org/ | No restriction (https://www.genenames.org/about/) |
| iDISK | Dietary Supplement Ingredient, Dietary Supplement Product, Disease, Drug, Anatomy, Symptom, Therapeutic Class | 144,536 | DSI-Anatomy, DSI-Symptom, DSI-Disease, DSI-Drug, DSI-DSP, DSI-TC | 705,075 | https://conservancy.umn.edu/handle/11299/204783 | https://creativecommons.org/licenses/by-sa/3.0/us/ |
| PharmGKB | Genes, Variant, Drug/Chemical, Phenotype | 43,112 | Disease-Gene, Drug/Chemical-Gene, Gene-Gene, Gene-Variant, Disease-Variant, Drug/Chemical-Variant | 61,616 | https://www.pharmgkb.org/ | https://creativecommons.org/licenses/by-sa/4.0/ |
| Reactome | Genes, Pathway | 13,589 | Gene-Pathway | 118,480 | https://reactome.org/ | https://creativecommons.org/licenses/by/4.0/ |
| SIDER | Drug, Side effect | 5,681 | Drug-Side effect | 163,206 | http://sideeffects.embl.de/ | https://creativecommons.org/licenses/by-nc-sa/4.0/ |
| TISSUE | Tissue, Gene | 26,260 | Tissue-Express-Gene | 6,788,697 | https://tissues.jensenlab.org/ | https://creativecommons.org/licenses/by/4.0/ |
| Uberon | Anatomy | 14,944 | - | - | https://www.ebi.ac.uk/ols/ontologies/uberon | http://creativecommons.org/licenses/by/3.0/ |
175 | -------------------------------------------------------------------------------- /iBKH-KD-protocol/output/prediction_drug_top100_transE_l2.csv: -------------------------------------------------------------------------------- 1 | primary,name,drugbank_id,kegg_id,pharmgkb_id,umls_cui,mesh_id,iDISK_id,CID,id,score,score_norm 2 | DrugBank:DB01080,Vigabatrin,DB01080,D00535,PA10231,C0048044,D020888,,CID100005665,359,-0.11448057,1.0 3 | DrugBank:DB01104,Sertraline,DB01104,D02360,PA451333,C0074393,D020280,DC0480486,CID100005203,521,-0.11556599,0.9998298 4 | DrugBank:DB06155,Rimonabant,DB06155,D05731,PA152407999,C1142933,D000077285,,CID100104849,13872,-0.12436323,0.9984506 5 | DrugBank:DB00907,Cocaine,DB00907,D00110,PA449072,C0009170,D003042,DC0479713,CID100002826,8911,-0.1275249,0.9979549 6 | DrugBank:DB00472,Fluoxetine,DB00472,D00823,PA449673,C0016365,D005473,DC0480534,CID100003386,590,-0.1280578,0.99787134 7 | DrugBank:DB01041,Thalidomide,DB01041,D00754,PA451644,C0039736,D013792,,CID100005426,238,-0.13207258,0.9972419 8 | DrugBank:DB01577,Metamfetamine,DB01577,D08187,PA450403,C0025611,D008694,DC0480968,CID100001206,382,-0.13832352,0.9962619 9 | DrugBank:DB00470,Dronabinol,DB00470,D00306,PA449421,C0039663,D013759,,,1970,-0.13955317,0.9960691 10 | DrugBank:DB00898,Ethanol,DB00898,D00068,PA448073,C0001962,D000431,DC0478554,,1259,-0.14065692,0.99589604 11 | DrugBank:DB00313,Valproic acid,DB00313,D00399,PA451846,C0042291,D014635,DC0479769,,225,-0.1472781,0.99485797 12 | DrugBank:DB01149,Nefazodone,DB01149,D08257,PA450603,C0068485,C051752,DC0481580,CID100004449,2235,-0.15075247,0.99431324 13 | DrugBank:DB00715,Paroxetine,DB00715,D02362,PA450801,C0070122,D017374,DC0480716,CID100004691,476,-0.15228291,0.9940733 14 | DrugBank:DB01065,Melatonin,DB01065,D08170,PA164752558,C0025219,D008550,DC0492506,CID100000896,1380,-0.1569333,0.9933442 15 | DrugBank:DB00752,Tranylcypromine,DB00752,D08625,PA451741,C0040778,D014191,DC0479660,CID100005530,2101,-0.15885594,0.99304277 16 | DrugBank:DB01454,Midomafetamine,DB01454,D11172,PA131887008,C0115471,D018817,,,12548,-0.1606406,0.9927629 17 | DrugBank:DB13323,Trichloroethylene,DB13323,,PA166115521,C0040905,D014241,,,14834,-0.16149126,0.9926296 18 | DrugBank:DB00328,Indomethacin,DB00328,D00141,PA449982,C0021246,D007213,DC0479186,CID100003715,170,-0.16267565,0.9924439 19 | DrugBank:DB00344,Protriptyline,DB00344,D08447,PA451168,C0033743,D011530,,CID100004976,1894,-0.1631321,0.99237233 20 | DrugBank:DB00184,Nicotine,DB00184,D03365,PA450626,C0028040,D009538,DC0479431,CID100000942,428,-0.16453218,0.9921528 21 | DrugBank:DB02852,Domoic Acid,DB02852,,,C0058678,C012301,,,9792,-0.16459528,0.9921429 22 | DrugBank:DB01156,Bupropion,DB01156,D07591,PA448687,C0085208,D016642,DC0481575,CID100000444,300,-0.1654988,0.9920013 23 | DrugBank:DB00182,Amphetamine,DB00182,D07445,PA448408,C0002658,D000661,,CID100003007,497,-0.17122842,0.991103 24 | DrugBank:DB00363,Clozapine,DB00363,D00283,PA449061,C0009079,D003024,DC0479429,CID100002818,481,-0.1722086,0.9909493 25 | DrugBank:DB00734,Risperidone,DB00734,D00426,PA451257,C0073393,D018967,DC0480519,CID100005073,491,-0.174525,0.9905861 26 | DrugBank:DB00679,Thioridazine,DB00679,D00373,PA451666,C0039943,D013881,,CID100005452,496,-0.17616771,0.99032855 27 | DrugBank:DB04819,Methapyrilene,DB04819,,,C0025625,D008701,,,11847,-0.17761308,0.990102 28 | DrugBank:DB00477,Chlorpromazine,DB00477,D00270,PA448964,C0008286,D002746,,CID100002726,482,-0.17900142,0.9898843 29 | 
DrugBank:DB01224,Quetiapine,DB01224,D08456,PA451201,C0123091,,,CID100005002,471,-0.17924665,0.9898459 30 | DrugBank:DB00564,Carbamazepine,DB00564,D00252,PA448785,C0006949,D002220,DC0478574,CID100002554,447,-0.18068857,0.9896198 31 | DrugBank:DB01174,Phenobarbital,DB01174,D00506,PA450911,C0031412,D010634,DC0479421,CID100004763,373,-0.18244445,0.9893445 32 | DrugBank:DB00502,Haloperidol,DB00502,D00136,PA449841,C0018546,D006220,DC0479623,CID100003559,507,-0.18472835,0.98898643 33 | DrugBank:DB00554,Piroxicam,DB00554,D00127,PA450985,C0031990,D010894,,CID123690938,479,-0.1871792,0.98860216 34 | DrugBank:DB00740,Riluzole,DB00740,D00775,PA451251,C0073379,D019782,DC0479400,CID100005070,404,-0.18738426,0.98857003 35 | DrugBank:DB00831,Trifluoperazine,DB00831,D08636,PA451771,C0040979,D014268,,CID100005566,519,-0.18778177,0.9885077 36 | DrugBank:DB01202,Levetiracetam,DB01202,D00709,PA450206,C0377265,D000077287,,CID100059708,120,-0.18812412,0.98845404 37 | DrugBank:DB00924,Cyclobenzaprine,DB00924,D07758,PA449160,C0056732,C004704,,CID100002895,1170,-0.18916538,0.9882908 38 | DrugBank:DB00482,Celecoxib,DB00482,D00567,PA448871,C0538927,D000068579,,CID100002662,504,-0.19024923,0.98812085 39 | DrugBank:DB00397,Phenylpropanolamine,DB00397,D01224,PA164748965,C0031495,D010665,DC0479424,CID100004786,390,-0.1904266,0.988093 40 | DrugBank:DB04827,Urethane,DB04827,,,C0041964,D014520,,,14986,-0.19091032,0.9880172 41 | DrugBank:DB00321,Amitriptyline,DB00321,D07448,PA448385,C0002600,D000639,,CID100002160,385,-0.1919516,0.98785394 42 | DrugBank:DB00252,Phenytoin,DB00252,D00512,PA450947,C0031507,D010672,DC0479283,CID100001775,454,-0.19265485,0.9877437 43 | DrugBank:DB00370,Mirtazapine,DB00370,D00563,PA450522,C0049506,D000078785,,CID100004205,1410,-0.19276918,0.98772573 44 | DrugBank:DB00605,Sulindac,DB00605,D00120,PA451565,C0038792,D013467,,CID100005352,1532,-0.19316767,0.98766327 45 | DrugBank:DB00334,Olanzapine,DB00334,D00454,PA450688,C0171023,D000077152,,CID100004585,474,-0.19318072,0.98766124 46 | DrugBank:DB00315,Zolmitriptan,DB00315,D00415,PA451975,C0528166,C089750,,CID100005731,480,-0.19324104,0.98765177 47 | DrugBank:DB00997,Doxorubicin,DB00997,D03899,PA449412,C0013089,D004317,DC0479994,CID100001690,0,-0.19488305,0.98739433 48 | DrugBank:DB00906,Tiagabine,DB00906,D02097,PA451682,C0068897,D000078308,DC0479416,CID100005466,1552,-0.19495434,0.9873832 49 | DrugBank:DB01623,Thiothixene,DB01623,D00374,PA451669,C0039955,D013888,,CID100005454,522,-0.19659656,0.9871257 50 | DrugBank:DB00822,Disulfiram,DB00822,D00131,PA449376,C0012772,D004221,DC0479284,CID100003117,453,-0.19794677,0.98691404 51 | DrugBank:DB00747,Scopolamine,DB00747,D00138,PA451308,C0036442,D012601,DC0478717,CID100005184,1516,-0.19825011,0.9868665 52 | DrugBank:DB01216,Finasteride,DB01216,D00321,PA449627,C0060389,D018120,DC0481582,CID100003350,1274,-0.20089561,0.9864517 53 | DrugBank:DB00934,Maprotiline,DB00934,D02566,PA450322,C0024778,D008376,,CID100004011,1368,-0.20286947,0.9861422 54 | DrugBank:DB03575,Phencyclidine,DB03575,,PA128406980,C0031381,D010622,,CID100006468,13152,-0.20314965,0.9860983 55 | DrugBank:DB01234,Dexamethasone,DB01234,D00292,PA449247,C0011777,D003907,DC0480274,CID100002367,62,-0.20375183,0.9860039 56 | DrugBank:DB00188,Bortezomib,DB00188,D03150,PA10252,C1176309,D000069286,DC0480574,CID100093860,326,-0.20386507,0.9859861 57 | DrugBank:DB09167,Dosulepin,DB09167,D07872,,C0013065,D004308,,CID100003155,9806,-0.2062184,0.98561716 58 | 
DrugBank:DB00865,Benzphetamine,DB00865,D07514,PA448586,C0005096,D001589,,CID100002341,339,-0.20694748,0.98550284 59 | DrugBank:DB01236,Sevoflurane,DB01236,D00547,PA451341,C0074414,D000077149,DC0478680,CID100005206,14145,-0.20871544,0.9852257 60 | DrugBank:DB01151,Desipramine,DB01151,D07791,PA449233,C0011685,D003891,DC0481239,CID100002995,1203,-0.20973898,0.9850652 61 | DrugBank:DB01171,Moclobemide,DB01171,D02561,PA452615,C0066673,D020912,,CID100004235,1415,-0.20980196,0.9850553 62 | DrugBank:DB00601,Linezolid,DB00601,D00947,PA450233,C0663241,D000069349,DC0489100,CID100003929,1354,-0.21180709,0.984741 63 | DrugBank:DB00273,Topiramate,DB00273,D00537,PA451728,C0076829,D000077236,DC0479772,CID100005514,219,-0.21294571,0.98456246 64 | DrugBank:DB01176,Cyclizine,DB01176,D03621,PA164742937,C0010547,D003501,,CID100006726,2245,-0.21753024,0.9838437 65 | DrugBank:DB01198,Zopiclone,DB01198,D01372,PA10236,C0078847,C515050,,CID100005735,754,-0.21950008,0.9835348 66 | DrugBank:DB00829,Diazepam,DB00829,D00293,PA449283,C0012010,D003975,,CID100003016,452,-0.21965425,0.9835107 67 | DrugBank:DB00425,Zolpidem,DB00425,D00706,PA451976,C0078839,D000077334,DC0481605,CID100005732,922,-0.22110155,0.98328376 68 | DrugBank:DB00295,Morphine,DB00295,D08233,PA450550,C0026549,D009020,DC0490234,CID100004253,1865,-0.22138812,0.9832388 69 | DrugBank:DB01238,Aripiprazole,DB01238,D01164,PA10026,C0299792,D000068180,DC0491787,CID100060795,500,-0.22148094,0.9832243 70 | DrugBank:DB00540,Nortriptyline,DB00540,D08288,PA450657,C0028420,D009661,,CID100004543,1440,-0.22218998,0.9831131 71 | DrugBank:DB00316,Acetaminophen,DB00316,D00217,PA448015,C0000970,D000082,DC0479594,CID100001983,980,-0.2224822,0.9830673 72 | DrugBank:DB00215,Citalopram,DB00215,D07704,PA449015,C0008845,D015283,DC0481664,CID100002771,83,-0.22288562,0.98300403 73 | DrugBank:DB00850,Perphenazine,DB00850,D00503,PA450882,C0031184,D010546,,CID100004748,589,-0.22457759,0.9827388 74 | DrugBank:DB00776,Oxcarbazepine,DB00776,D00533,PA450732,C0069751,D000078330,,CID100034312,282,-0.22752447,0.98227674 75 | DrugBank:DB00091,Cyclosporine,DB00091,D00184,PA449167,C0010592,D016572,DC0479202,,97,-0.22848886,0.9821256 76 | DrugBank:DB00515,Cisplatin,DB00515,D00275,PA449014,C0008838,D002945,DC0479316,CID100002767,137,-0.22941272,0.98198074 77 | DrugBank:DB00937,Diethylpropion,DB00937,D07444,PA164778098,C0012201,D004053,,CID100007029,129,-0.23071453,0.9817766 78 | DrugBank:DB00843,Donepezil,DB00843,D00670,PA449394,C0527316,D000077265,,CID100003152,379,-0.2313868,0.9816712 79 | DrugBank:DB01050,Ibuprofen,DB01050,D00126,PA449957,C0020740,D007052,DC0480526,CID100003672,654,-0.23151053,0.98165184 80 | DrugBank:DB00332,Ipratropium,DB00332,D02212,PA450082,C0027235,D009241,,CID100003746,450,-0.23167636,0.9816258 81 | DrugBank:DB01544,Flunitrazepam,DB01544,D01230,PA164781320,C0016296,D005445,,CID100003380,3571,-0.23227203,0.98153245 82 | DrugBank:DB04017,Clorgiline,DB04017,D03248,,C0009035,D003010,,,8867,-0.23293683,0.9814282 83 | DrugBank:DB00819,Acetazolamide,DB00819,D00218,PA448018,C0000981,D000086,DC0481912,CID100001986,188,-0.23434326,0.98120767 84 | DrugBank:DB01242,Clomipramine,DB01242,D00811,PA449048,C0009010,D002997,DC0482226,CID100002801,503,-0.23548973,0.98102796 85 | DrugBank:DB00575,Clonidine,DB00575,D00281,PA449051,C0009014,D003000,DC0481087,CID100002803,304,-0.23552413,0.98102254 86 | DrugBank:DB06148,Mianserin,DB06148,D08216,PA134687937,C0025912,D008803,,CID100004184,12005,-0.23575918,0.9809857 87 | 
DrugBank:DB01105,Sibutramine,DB01105,D08513,PA451344,C0074493,C058254,,CID100005210,323,-0.23596597,0.9809533 88 | DrugBank:DB00555,Lamotrigine,DB00555,D00354,PA450164,C0064636,D000077213,,CID100003878,11,-0.23617615,0.9809203 89 | DrugBank:DB00652,Pentazocine,DB00652,D00498,PA164744326,C0030873,D010423,DC0478463,CID100004736,13056,-0.23656079,0.98086005 90 | DrugBank:DB01168,Procarbazine,DB01168,D08423,PA451112,C0033223,D011344,,CID100004915,246,-0.23699106,0.9807926 91 | DrugBank:DB01618,Molindone,DB01618,D08226,PA164746756,C0026388,D008972,,CID100023897,1416,-0.23750307,0.9807123 92 | DrugBank:DB00234,Reboxetine,DB00234,D02573,PA144614921,C0168388,D000077593,,CID100065856,13800,-0.23847479,0.98055995 93 | DrugBank:DB12278,Propiverine,DB12278,D08441,,C0138666,C015586,,,1500,-0.23917194,0.98045063 94 | DrugBank:DB12093,Tetrahydropalmatine,DB12093,,,C0076278,C014215,,,14608,-0.23968408,0.98037034 95 | DrugBank:DB00899,Remifentanil,DB00899,D08473,PA451232,C0246631,D000077208,,CID100060814,13810,-0.24021445,0.9802872 96 | DrugBank:DB00969,Alosetron,DB00969,D07129,PA164745502,C0291772,C090840,,CID100002099,1001,-0.24023908,0.9802833 97 | DrugBank:DB01142,Doxepin,DB01142,D07875,PA449409,C0013085,D004316,,CID100003158,1238,-0.24116267,0.98013854 98 | DrugBank:DB00593,Ethosuximide,DB00593,D00539,PA449533,C0015043,D005013,DC0479436,CID100003291,1261,-0.2412753,0.9801209 99 | DrugBank:DB00371,Meprobamate,DB00371,D00376,PA450377,C0025386,D008620,,CID100004064,1914,-0.24243839,0.9799385 100 | DrugBank:DB00497,Oxycodone,DB00497,D05312,PA450741,C0030049,D010098,DC0481597,CID100004635,1977,-0.24295442,0.9798576 101 | DrugBank:DB00949,Felbamate,DB00949,D00536,PA449590,C0060135,D000078328,DC0479407,CID100003331,224,-0.24399206,0.97969496 102 | -------------------------------------------------------------------------------- /iBKH-KD-protocol/funcs/KG_link_pred.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Tue Jul 4 10:06:14 2023 5 | 6 | @author: changsu 7 | """ 8 | 9 | import torch as th 10 | import torch.nn.functional as fn 11 | 12 | import numpy as np 13 | import pandas as pd 14 | 15 | from sklearn.preprocessing import MinMaxScaler 16 | 17 | import os 18 | 19 | 20 | """ 21 | The following codes define functions 22 | """ 23 | # link prediction based on embeddings derived using TransE 24 | def transE_l2(head, rel, tail, gamma=12.0): 25 | # Paper link: https://papers.nips.cc/paper/5071-translating-embeddings-for-modeling-multi-relational-data 26 | score = head + rel - tail 27 | 28 | return gamma - th.norm(score, p=2, dim=-1) 29 | 30 | 31 | # link prediction based on embeddings derived using TransR 32 | def transR(head, rel, tail, proj, rel_idx, gamma=12.0): 33 | # Paper link: https://www.aaai.org/ocs/index.php/AAAI/AAAI15/paper/download/9571/9523 34 | proj = proj.reshape(-1, head.shape[1], rel.shape[0])[rel_idx] 35 | head_r = th.einsum('ab,bc->ac', head, proj) 36 | tail_r = th.einsum('b,bc->c', th.tensor(tail), proj) 37 | score = head_r + rel - tail_r 38 | 39 | return gamma - th.norm(score, p=1, dim=-1) 40 | 41 | 42 | # link prediction based on embeddings derived using DistMult 43 | def DistMult(head, rel, tail): 44 | # Paper link: https://arxiv.org/abs/1412.6575 45 | score = head * rel * tail 46 | 47 | return th.sum(score, dim=-1) 48 | 49 | 50 | 51 | # link prediction based on embeddings derived using complEx 52 | def complEx(head, rel, tail, gamma=12.0): 53 | # Paper link: 
https://arxiv.org/abs/1606.06357 54 | real_head, img_head = th.chunk(head, 2, dim=-1) 55 | real_tail, img_tail = th.chunk(th.tensor(tail), 2, dim=-1) 56 | real_rel, img_rel = th.chunk(rel, 2, dim=-1) 57 | 58 | score = real_head * real_tail * real_rel \ 59 | + img_head * img_tail * real_rel \ 60 | + real_head * img_tail * img_rel \ 61 | - img_head * real_tail * img_rel 62 | 63 | return th.sum(score, -1) 64 | 65 | 66 | def generate_hypothesis(target_entity, candidate_entity_type, relation_type, 67 | embedding_folder='data/embeddings', method='transE_l2', 68 | kg_folder = 'data/iBKH', triplet_folder = 'data/triplets', 69 | without_any_rel=False, topK=100, 70 | save_path='output', save=True): 71 | 72 | # load entity vocab 73 | entities = {} 74 | for e in ['anatomy', 'disease', 'drug', 'dsp', 'gene', 75 | 'molecule', 'pathway', 'sdsi', 'side_effect', 76 | 'symptom', 'tc']: 77 | e_df = pd.read_csv(kg_folder + '/entity/' + e + '_vocab.csv', header=0, low_memory=False) 78 | if e == 'gene': 79 | e_df = e_df.rename(columns={'symbol':'name'}) 80 | if e == 'molecule': 81 | e_df = e_df.rename(columns={'chembl_id':'name'}) 82 | 83 | entities[e] = e_df 84 | 85 | # get target entity vocab 86 | target_entity_vocab = pd.DataFrame() 87 | for e in entities: 88 | e_df = entities[e][['primary', 'name']] 89 | target_entity_vocab = pd.concat([target_entity_vocab, e_df[e_df['name'].isin(target_entity)]]) 90 | 91 | 92 | # load embeddings 93 | entity_emb = np.load(embedding_folder + '/' + method + '/iBKH_' + method + '_entity.npy') 94 | rel_emb = np.load(embedding_folder + '/' + method + '/iBKH_' + method + '_relation.npy') 95 | if method == 'transR': 96 | proj_np = np.load(embedding_folder + '/' + method + '/iBKH_TransRprojection.npy') 97 | proj_emb = th.tensor(proj_np) 98 | 99 | # load entity and relation embedding map 100 | entity_emb_map = pd.read_csv(embedding_folder + '/' + method + '/entities.tsv', sep='\t', header=None, low_memory=False) 101 | entity_emb_map.columns = ['id', 'primary'] 102 | rel_emb_map = pd.read_csv(embedding_folder + '/' + method + '/relations.tsv', sep='\t', header=None, low_memory=False) 103 | rel_emb_map.columns = ['rid', 'relation'] 104 | 105 | target_entity_vocab = pd.merge(target_entity_vocab, entity_emb_map, on='primary', how='left') 106 | 107 | 108 | target_entity_ids = [] 109 | target_entity_names = [] 110 | target_entity_primaries = [] 111 | for idx, row in target_entity_vocab.iterrows(): 112 | target_entity_ids.append(row['id']) 113 | target_entity_names.append(row['name']) 114 | target_entity_primaries.append(row['primary']) 115 | 116 | 117 | # get candidate entity embeddings 118 | candidate_entities = pd.merge(entities[candidate_entity_type], entity_emb_map, on='primary', how='inner') 119 | candidate_entity_ids = th.tensor(candidate_entities.id.tolist()).long() 120 | candidate_embs = th.tensor(entity_emb[candidate_entity_ids]) 121 | 122 | 123 | # get target relation embeddings 124 | target_relations = rel_emb_map[rel_emb_map['relation'].isin(relation_type)] 125 | target_relation_ids = th.tensor(target_relations.rid.tolist()).long() 126 | target_relation_embs = [th.tensor(rel_emb[rid]) for rid in target_relation_ids] 127 | 128 | 129 | 130 | 131 | # rank candidate entities 132 | scores_per_target_ent = [] 133 | candidate_ids = [] 134 | for rid in range(len(target_relation_embs)): 135 | rel_emb=target_relation_embs[rid] 136 | for target_id in target_entity_ids: 137 | target_emb = entity_emb[target_id] 138 | 139 | if method == 'transE_l2': 140 | score = 
fn.logsigmoid(transE_l2(candidate_embs, rel_emb, target_emb)) 141 | elif method == 'transR': 142 | score = fn.logsigmoid(transR(candidate_embs, rel_emb, target_emb, proj_emb, target_relation_ids[rid])) 143 | elif method == 'complEx': 144 | score = fn.logsigmoid(complEx(candidate_embs, rel_emb, target_emb)) 145 | elif method == 'DistMult': 146 | score = fn.logsigmoid(DistMult(candidate_embs, rel_emb, target_emb)) 147 | else: 148 | raise ValueError("Unknown embedding method '%s'. Please check the name of the knowledge graph embedding method." % method) 149 | 150 | scores_per_target_ent.append(score) 151 | candidate_ids.append(candidate_entity_ids) 152 | scores = th.cat(scores_per_target_ent) 153 | candidate_ids = th.cat(candidate_ids) 154 | 155 | idx = th.flip(th.argsort(scores), dims=[0]) 156 | scores = scores[idx].numpy() 157 | candidate_ids = candidate_ids[idx].numpy() 158 | 159 | 160 | # de-duplication 161 | _, unique_indices = np.unique(candidate_ids, return_index=True) 162 | # sorting 163 | ranked_unique_indices = np.sort(unique_indices) 164 | proposed_candidate_ids = candidate_ids[ranked_unique_indices] 165 | proposed_scores = scores[ranked_unique_indices] 166 | proposed_scores_norm = MinMaxScaler().fit_transform(proposed_scores.reshape(-1, 1)) 167 | 168 | 169 | proposed_df = pd.DataFrame() 170 | proposed_df['id'] = proposed_candidate_ids 171 | proposed_df['score'] = proposed_scores 172 | proposed_df['score_norm'] = proposed_scores_norm 173 | 174 | # proposed_df = pd.merge(proposed_df, candidate_entities, on='id', how='left') 175 | proposed_df = pd.merge(candidate_entities, proposed_df, on='id', how='right') 176 | 177 | 178 | ### remove candidate entities that are already linked to the target entity 179 | rel_meta_type = relation_type[0].split('_')[-1] # e.g., Treats_DDi => DDi 180 | # load triplet file 181 | triplet_df = pd.read_csv(triplet_folder + '/' + rel_meta_type + '_triplet.csv', header=0, low_memory=False) 182 | if without_any_rel == False: 183 | triplet_df = triplet_df[triplet_df['Relation'].isin(relation_type)] 184 | # only keep triplets that contain the target entity 185 | triplet_df = triplet_df[(triplet_df['Head'].isin(target_entity_primaries)) | (triplet_df['Tail'].isin(target_entity_primaries))] 186 | # candidate entities with a known relation to the target entity 187 | candidates_known = triplet_df['Head'].tolist() + triplet_df['Tail'].tolist() 188 | candidates_known = list(set(candidates_known) - set(target_entity_primaries)) 189 | 190 | # in results, filter out candidate entities with a known relation to the target entity 191 | proposed_df = proposed_df[~proposed_df['primary'].isin(candidates_known)] 192 | proposed_df = proposed_df[~proposed_df['name'].isin(target_entity_names)] 193 | 194 | proposed_df = proposed_df.reset_index(drop=True) 195 | 196 | if topK != None: 197 | proposed_df = proposed_df[: topK] 198 | 199 | if save == True: 200 | if not os.path.exists(save_path): 201 | os.makedirs(save_path) 202 | proposed_df.to_csv(save_path + '/prediction_%s_top%s_%s.csv' % (candidate_entity_type, topK, method), index=False) 203 | 204 | return proposed_df 205 | 206 | 207 | 208 | def generate_hypothesis_ensemble_model(target_entity, candidate_entity_type, relation_type, 209 | embedding_folder='data/embeddings', 210 | kg_folder = 'data/iBKH', triplet_folder = 'data/triplets', 211 | without_any_rel=False, topK=100, 212 | save_path='output', save=True): 213 | 214 | transE_res = generate_hypothesis(target_entity=target_entity, candidate_entity_type=candidate_entity_type, 215 | relation_type=relation_type, 
embedding_folder=embedding_folder, method='transE_l2', 216 | kg_folder = kg_folder, triplet_folder = triplet_folder, 217 | without_any_rel=without_any_rel, topK=None, save=False) 218 | transE_res['transE_vote'] = len(transE_res) - transE_res.index.values 219 | 220 | transR_res = generate_hypothesis(target_entity=target_entity, candidate_entity_type=candidate_entity_type, 221 | relation_type=relation_type, embedding_folder=embedding_folder, method='transR', 222 | kg_folder = kg_folder, triplet_folder = triplet_folder, 223 | without_any_rel=without_any_rel, topK=None, save=False) 224 | transR_res['transR_vote'] = len(transR_res) - transR_res.index.values 225 | 226 | complEx_res = generate_hypothesis(target_entity=target_entity, candidate_entity_type=candidate_entity_type, 227 | relation_type=relation_type, embedding_folder=embedding_folder, method='complEx', 228 | kg_folder = kg_folder, triplet_folder = triplet_folder, 229 | without_any_rel=without_any_rel, topK=None, save=False) 230 | complEx_res['complEx_vote'] = len(complEx_res) - complEx_res.index.values 231 | 232 | DistMult_res = generate_hypothesis(target_entity=target_entity, candidate_entity_type=candidate_entity_type, 233 | relation_type=relation_type, embedding_folder=embedding_folder, method='DistMult', 234 | kg_folder = kg_folder, triplet_folder = triplet_folder, 235 | without_any_rel=without_any_rel, topK=None, save=False) 236 | DistMult_res['DistMult_vote'] = len(DistMult_res) - DistMult_res.index.values 237 | 238 | 239 | combined_res = pd.merge(transE_res, transR_res[['primary', 'transR_vote']], on='primary', how='left') 240 | combined_res = pd.merge(combined_res, complEx_res[['primary', 'complEx_vote']], on='primary', how='left') 241 | combined_res = pd.merge(combined_res, DistMult_res[['primary', 'DistMult_vote']], on='primary', how='left') 242 | 243 | combined_res['vote_score'] = combined_res['transE_vote'] + combined_res['transR_vote'] + combined_res['complEx_vote'] + combined_res['DistMult_vote'] 244 | combined_res['vote_score_normed'] = MinMaxScaler().fit_transform(combined_res['vote_score'].values.reshape(-1, 1)) 245 | 246 | combined_res = combined_res.sort_values(by='vote_score_normed', ascending=False) 247 | 248 | combined_res = combined_res.reset_index(drop=True) 249 | 250 | if topK != None: 251 | combined_res = combined_res[: topK] 252 | 253 | if save == True: 254 | if not os.path.exists(save_path): 255 | os.makedirs(save_path) 256 | combined_res.to_csv(save_path + '/prediction_%s_top%s_ensemble.csv' % (candidate_entity_type, topK), index=False) 257 | 258 | return combined_res 259 | -------------------------------------------------------------------------------- /iBKH-KD-protocol/output/prediction_drug_top100_ensemble.csv: -------------------------------------------------------------------------------- 1 | primary,name,drugbank_id,kegg_id,pharmgkb_id,umls_cui,mesh_id,iDISK_id,CID,id,score,score_norm,transE_vote,transR_vote,complEx_vote,DistMult_vote,vote_score,vote_score_normed 2 | DrugBank:DB00143,Glutathione,DB00143,D00014,PA449780,C0017817,D005978,,,3092,-1.3192606,0.81111217,24263,26927,26138,24317,101645,1.0 3 | DrugBank:DB04815,Clioquinol,DB04815,D03538,PA449039,C0021978,D007464,,,1603,-0.9699813,0.86587286,24994,26876,24297,25295,101462,0.9980499344650107 4 | MeSH:D013256,Steroids,,,,C0038317,D013256,,,14378,-1.3089085,0.8127352,24275,25877,24796,26138,101086,0.994043242436836 5 | 
DrugBank:DB01956,Taurine,DB01956,D00047,PA451590,C0039350,D013654,,,3451,-1.1711981,0.8343257,24518,26020,25620,24751,100909,0.9921571134767644 6 | DrugBank:DB00431,Lindane,DB00431,D00360,PA164754914,C0005038,D001556,,CID100000727,1308,-0.6793266,0.9114423,25776,26705,25516,22897,100894,0.9919972720394703 7 | DrugBank:DB00132,alpha-Linolenic acid,DB00132,,PA449384,C0051405,D017962,,,6868,-1.6717641,0.7558459,23680,26141,25097,25957,100875,0.9917948062188975 8 | DrugBank:DB00179,Masoprocol,DB00179,D04862,PA164746493,C0733397,D009637,,,11706,-1.6596137,0.75775087,23697,26983,24552,25515,100747,0.9904308259539871 9 | DrugBank:DB00336,Nitrofural,DB00336,D00862,PA164754877,C0028157,D009583,,,12524,-1.3049827,0.8133507,24280,25621,25051,25766,100718,0.9901217991752183 10 | DrugBank:DB04115,Berberine,DB04115,D00092,PA165860812,C0005117,D001599,,,7823,-1.3932794,0.7995073,24156,25945,25326,25269,100696,0.9898873650671867 11 | DrugBank:DB09086,Eugenol,DB09086,D04117,,C0015153,D005054,,,10173,-1.7526864,0.7431588,23552,26093,24917,26097,100659,0.989493089521861 12 | DrugBank:DB12290,Puerarin,DB12290,,,C0072591,C033607,,,13634,-1.7054859,0.750559,23620,26102,24885,25981,100588,0.9887365067186684 13 | MeSH:D008094,Lithium,,,,C0023870,D008094,DC0478809,CID100011125,11539,-0.5504055,0.9316548,26175,26771,25921,21578,100445,0.9872126850164638 14 | DrugBank:DB11118,Ammonia,DB11118,,PA166131585,C0002607,D000641,,CID100000222,7068,-1.2208875,0.8265353,24419,26092,24718,25118,100347,0.9861683876261416 15 | DrugBank:DB02587,Colforsin,DB02587,D03584,PA146096022,C0917964,D005576,,,8994,-1.310175,0.8125366,24272,26546,25592,23826,100236,0.9849855609901645 16 | DrugBank:DB11735,Galactose,DB11735,D04291,PA449725,C0016945,D005690,,,10530,-1.7624575,0.74162686,23538,26153,23936,26573,100200,0.9846019415406584 17 | PharmGKB:PA10832,corticosteroids,,,PA10832,C0001617,D000305,DC0478594,,6611,-1.385069,0.8007946,24169,25704,24801,25497,100171,0.9842929147618895 18 | DrugBank:DB04422,Homocysteine,DB04422,,,C0019878,D006710,,,10902,-1.2244079,0.82598335,24414,26287,24528,24890,100119,0.9837387977792698 19 | DrugBank:DB09153,Sodium chloride,DB09153,D02056,PA451382,C0037494,D012965,,,1522,-1.5519575,0.7746295,23890,26184,25322,24645,100041,0.9829076223053399 20 | DrugBank:DB11672,Curcumin,DB11672,,PA151958596,C0010467,D003474,,,9140,-1.250675,0.82186514,24354,27049,26388,22138,99929,0.9817141395735431 21 | MeSH:D002331,Carnitine,,,,C0007258,D002331,,CID100000085,8437,-1.4813418,0.78570074,24020,26586,26241,22912,99759,0.9799026032842089 22 | DrugBank:DB03057,Malonaldehyde,DB03057,,,C0024643,D008315,,,11669,-2.0908349,0.69014317,22823,26283,24464,26126,99696,0.9792312692475731 23 | DrugBank:DB00746,Deferoxamine,DB00746,D03670,PA164746490,C0011145,D003676,,CID100002973,1199,-0.75536406,0.899521,25558,26389,26096,21631,99674,0.9789968351395417 24 | DrugBank:DB00171,ATP,DB00171,D08646,PA164743471,C0001480,D000255,,,6591,-1.2064426,0.82879996,24448,27085,26364,21719,99616,0.978378781582004 25 | KEGG:D03878,Dizocilpine maleate,,D03878,,C0079246,D016291,,,9775,-1.1505466,0.8375635,24561,25198,23376,26451,99586,0.9780590987074156 26 | DrugBank:DB00119,Pyruvic acid,DB00119,,PA164778686,C0072802,D019289,,,13706,-1.4370607,0.7926432,24081,25924,25727,23755,99487,0.9770041452212739 27 | DrugBank:DB06757,Manganese,DB06757,D04854,,C0024706,D008345,,,11680,-0.8935449,0.87785673,25210,26223,25240,22783,99456,0.9766738062508658 28 | DrugBank:DB06750,Ginsenoside 
Rg1,DB06750,,,C0074018,C035054,,,10610,-1.7730337,0.73996866,23517,25615,23643,26622,99397,0.9760450965975087 29 | DrugBank:DB04786,Suramin,DB04786,,PA10292,C0038880,D013498,,,14468,-1.4168519,0.7958116,24112,26679,25487,22965,99243,0.9744040578412883 30 | PharmGKB:PA452347,glucocorticoids,,,PA452347,C0017710,D005938,DC0481533,,10630,-1.746833,0.7440765,23562,26047,23637,25951,99197,0.9739138774335858 31 | PharmGKB:PA452233,antipsychotics,,,PA452233,C0040615,D014150,DC0479620,,7308,-1.3567141,0.8052401,24200,24500,24754,25552,99006,0.9718785631320397 32 | PharmGKB:PA449509,estrogens,,,PA449509,C0014939,D004967,DC0478555,CID100000699,10066,-0.89519787,0.8775976,25203,26280,26417,21027,98927,0.9710367315622903 33 | DrugBank:DB02772,Sucrose,DB02772,D00025,PA451525,C0038636,D013395,,,14419,-2.229847,0.66834855,22388,26152,25481,24881,98902,0.9707703291668 34 | DrugBank:DB00151,Cysteine,DB00151,D00026,PA449173,C0010654,D003545,,,3013,-2.1263072,0.68458176,22721,26370,26384,23412,98887,0.9706104877295056 35 | DrugBank:DB02315,Cyclic GMP,DB02315,,,C0018338,D006152,,,9181,-2.0006838,0.7042773,23043,25451,24994,25348,98836,0.9700670268427054 36 | MeSH:D013481,Superoxides,,,,C0038836,D013481,,,14466,-1.8277962,0.7313829,23420,26103,23652,25648,98823,0.9699284975970504 37 | DrugBank:DB01782,Pyrazolanthrone,DB01782,,,C0968382,C432165,,,13664,-2.033749,0.6990932,22961,26094,23559,26205,98819,0.9698858732137721 38 | MeSH:D014222,Triamcinolone Acetonide,,,,C0040866,D014222,,,14804,-1.8468038,0.72840285,23382,24951,23478,26851,98662,0.9682128661700926 39 | DrugBank:DB13063,Parthenolide,DB13063,,,C0070126,C002669,,,12987,-2.1425145,0.68204075,22673,24848,24131,26989,98641,0.9679890881578808 40 | MeSH:D014801,Vitamin A,,,,C0042839,D014801,DC0489740,CID100001071,15107,-1.5368314,0.77700096,23926,25370,24244,25096,98636,0.9679358076787827 41 | DrugBank:DB02266,Flufenamic acid,DB02266,D01581,PA166049190,C0016282,D005439,,,10357,-1.5224365,0.77925783,23954,25890,24214,24573,98631,0.9678825271996847 42 | DrugBank:DB15584,Luteolin,DB15584,,,C0065264,D047311,,,11604,-2.0775828,0.6922208,22852,25538,23548,26682,98620,0.9677653101456689 43 | DrugBank:DB00148,Creatine,DB00148,,PA164778930,C0010286,D003401,,,9102,-1.55466,0.77420574,23882,26593,25414,22629,98518,0.9666783883720682 44 | DrugBank:DB01914,D-glucose,DB01914,,PA449773,C0017725,D005947,,CID100000206,1296,-0.87310547,0.88106126,25256,26377,25496,21377,98506,0.9665505152222329 45 | DrugBank:DB01025,Amlexanox,DB01025,D01828,PA164745310,C0103049,C045742,,CID100002161,7058,-1.4479347,0.7909384,24063,24373,25396,24628,98460,0.9660603348145307 46 | DrugBank:DB04468,Afimoxifene,DB04468,D06551,,C2347999,C016601,,,6665,-1.1752086,0.8336969,24509,26481,25058,22395,98443,0.9658791811855973 47 | DrugBank:DB13721,Cypermethrin,DB13721,D07763,,C0056849,C017160,,,9242,-1.3620205,0.8044082,24194,25968,24217,24033,98412,0.9655488422151892 48 | MeSH:D004205,Cromolyn Sodium,,,,C0012694,D004205,,,9116,-1.991005,0.7057947,23070,24919,23665,26743,98397,0.9653890007778951 49 | DrugBank:DB14104,Linoleic acid,DB14104,,,C0023749,D019787,,,11504,-2.3294046,0.65273976,22032,25650,24142,26572,98396,0.9653783446820754 50 | DrugBank:DB05382,Iodine,DB05382,D00108,PA450049,C0021968,D007455,,,1644,-1.8361273,0.73007673,23401,25177,24176,25603,98357,0.9649627569451105 51 | MeSH:D009573,Nitrites,,,,C0028137,D009573,,CID100000946,12516,-1.677129,0.7550048,23673,23802,25641,25199,98315,0.9645152009206868 52 | 
PharmGKB:PA166123346,bilirubin,,,PA166123346,C0005437,D001663,,,7909,-1.9876075,0.7063274,23080,26255,23677,25292,98304,0.964397983866671 53 | DrugBank:DB04224,Oleic Acid,DB04224,D02315,,C0028928,D019301,,,12768,-1.6043189,0.7664201,23798,25321,23913,25254,98286,0.9642061741419179 54 | DrugBank:DB07352,Apigenin,DB07352,,,C0912024,D047310,,,7340,-1.84235,0.7291011,23387,25168,23479,26203,98237,0.9636840254467569 55 | DrugBank:DB08818,Hyaluronic acid,DB08818,D08043,PA165958431,C0020196,D006820,,,10921,-1.7635304,0.74145865,23534,26041,24275,24376,98226,0.9635668083927411 56 | DrugBank:DB00772,Malathion,DB00772,D00534,PA164748092,C0024547,D008294,,CID100004004,2111,-1.1328325,0.84034073,24602,25733,25305,22550,98190,0.963183188943235 57 | DrugBank:DB02010,Staurosporine,DB02010,,PA165109623,C0075193,D019311,,,14370,-1.932675,0.71493983,23203,24550,24279,26119,98151,0.96276760120627 58 | DrugBank:DB02375,Myricetin,DB02375,,,C0067067,C040015,,,12180,-2.1731112,0.6772437,22568,25747,23222,26587,98124,0.9624798866191404 59 | DrugBank:DB03310,Glutathione disulfide,DB03310,D00031,,C0061516,D019803,,,10651,-1.9640263,0.7100245,23134,25693,23026,26248,98101,0.9622347964152893 60 | DrugBank:DB11656,Rebamipide,DB11656,D01121,,C0069562,C052785,,,3058,-1.6594839,0.75777125,23698,25614,22950,25783,98045,0.961638055049391 61 | DrugBank:DB00756,Hexachlorophene,DB00756,D00859,PA449871,C0019435,D006582,,CID100003598,2105,-0.64097184,0.9174557,25876,26128,24219,21807,98030,0.9614782136120967 62 | PharmGKB:PA133822447,catecholamines,,,PA133822447,C0007412,D002395,,,8460,-1.395436,0.7991692,24150,26108,22783,24966,98007,0.9612331234082456 63 | DrugBank:DB14154,Gold,DB14154,,,C0018026,D006046,,CID100022318,10687,-0.7806944,0.89554965,25480,24740,25761,21959,97940,0.9605191649883316 64 | DrugBank:DB03904,Urea,DB03904,D00023,PA451831,C0041942,D014508,,,3161,-1.6132491,0.76502,23784,26524,24742,22845,97895,0.9600396406764491 65 | DrugBank:DB02679,Cyanamide,DB02679,D00123,,C0010502,D003484,,,3781,-1.4897939,0.7843756,24014,24719,23454,25639,97826,0.9593043700648958 66 | MeSH:C013592,mangiferin,,,,C0065654,C013592,,,11693,-1.8169029,0.73309076,23441,24587,23350,26426,97804,0.9590699359568642 67 | DrugBank:DB13242,Bucladesine,DB13242,D07546,,C0012054,D003994,,,8127,-1.5925876,0.7682594,23823,25973,24687,23282,97765,0.9586543482198993 68 | KEGG:D00767,Vecuronium bromide,,D00767,,C0042435,D014673,,,15056,-1.2183957,0.82692593,24423,26837,23105,23396,97761,0.9586117238366207 69 | DrugBank:DB12695,Phenethyl Isothiocyanate,DB12695,,,C0070558,C058305,,,13157,-1.5559322,0.7740063,23880,26328,25704,21837,97749,0.9584838506867854 70 | DrugBank:DB11846,Creatinine,DB11846,D03600,,C0010294,D003404,,,9105,-2.3239262,0.65359867,22048,26272,24596,24797,97713,0.9581002312372793 71 | DrugBank:DB11831,Dinitrochlorobenzene,DB11831,,,C0012460,D004137,,,9694,-1.8853838,0.7223542,23299,25764,24966,23679,97708,0.9580469507581812 72 | DrugBank:DB00643,Mebendazole,DB00643,D00368,PA164776669,C0025023,D008463,,CID100004030,806,-0.5924298,0.9250662,26028,26789,25705,19183,97705,0.9580149824707225 73 | DrugBank:DB14574,Cobalt,DB14574,,,C0009148,D003035,,,8897,-2.1436973,0.6818553,22669,25897,23404,25728,97698,0.9579403897999852 74 | DrugBank:DB00403,Ceruletide,DB00403,D03442,PA164774919,C0006639,D002108,,,8552,-2.0493033,0.69665456,22922,24122,24357,26293,97694,0.9578977654167067 75 | DrugBank:DB02530,gamma-Aminobutyric acid,DB02530,D00058,,C0016904,D005680,,,10543,-1.0121074,0.85926825,24880,25520,26253,20938,97591,0.9568001875472865 76 | 
DrugBank:DB00848,Levamisole,DB00848,D08114,PA450205,C0023556,D007978,,,11459,-1.6001302,0.76707685,23804,25536,23189,25038,97567,0.9565444412476158 77 | DrugBank:DB11263,Polydatin,DB11263,,,C0071538,C058229,,,13400,-2.0231712,0.70075166,22990,24229,23849,26434,97502,0.9558517950193408 78 | DrugBank:DB16101,Baicalein,DB16101,,,C0052927,C006680,,,7655,-2.1985002,0.6732632,22496,24826,23079,27029,97430,0.9550845561203287 79 | DrugBank:DB03796,Palmitic Acid,DB03796,D05341,,C0030234,D019308,,,2343,-1.2327998,0.8246677,24398,24630,24269,24116,97413,0.9549034024913952 80 | DrugBank:DB04173,Fructose,DB04173,D00114,PA449716,C0016745,D005632,,,1630,-1.612917,0.7650721,23785,24169,23703,25721,97378,0.9545304391377087 81 | KEGG:D04970,Methacholine chloride,,D04970,PA450398,C0079829,D016210,,,11828,-2.16005,0.6792915,22626,25365,23356,26025,97372,0.9544665025627912 82 | DrugBank:DB11588,Carbon monoxide,DB11588,D09706,,C0007018,D002248,,,8398,-2.3546972,0.64877427,21926,25392,23663,26378,97359,0.9543279733171361 83 | MeSH:C093642,SB 203580,,,,C0297666,C093642,,,14045,-2.1055696,0.6878331,22781,25073,23313,26060,97227,0.952921368668947 84 | DrugBank:DB12029,Chlorogenic Acid,DB12029,,,C0008240,D002726,,,8647,-1.9954457,0.7050985,23055,23907,24553,25711,97226,0.9529107125731275 85 | MeSH:D008628,Mercury,,,,C0025424,D008628,,,11803,-1.2403855,0.82347834,24372,25102,24252,23499,97225,0.9529000564773078 86 | KEGG:D03625,Cycloheximide,,D03625,,C0010572,D003513,,,9191,-1.7577664,0.7423623,23545,24809,23931,24935,97220,0.9528467759982098 87 | DrugBank:DB13366,Hydrochloric acid,DB13366,D02057,,C0020259,D006851,,,10932,-2.2469037,0.6656744,22345,24687,23626,26419,97077,0.9513229542960051 88 | DrugBank:DB09526,Hydroquinone,DB09526,D00073,PA449924,C0020306,C031927,,,1313,-1.8660864,0.7253797,23344,25658,24823,23246,97071,0.9512590177210873 89 | DrugBank:DB00050,Cetrorelix,DB00050,D07665,PA164764506,C0209366,C062876,,CID125074886,8562,-1.4797181,0.7859553,24026,26225,25323,21477,97051,0.951045895804695 90 | DrugBank:DB07795,Fisetin,DB07795,,,C0060397,C017875,,,10318,-1.9922534,0.70559895,23064,24741,22719,26486,97010,0.950608995876091 91 | PharmGKB:PA451776,triglycerides,,,PA451776,C0041004,D014280,,,14863,-2.4083483,0.64036274,21715,25268,23187,26838,97008,0.9505876836844518 92 | MeSH:D007306,Insecticides,,,,C0021576,D007306,,,11144,-2.16775,0.67808425,22600,24143,23269,26994,97006,0.9505663714928125 93 | DrugBank:DB09028,Cytisine,DB09028,D07770,PA166153416,C0056913,C004712,,,9268,-1.7558718,0.74265933,23548,24573,24143,24727,96991,0.9504065300555182 94 | DrugBank:DB01917,Putrescine,DB01917,,,C0034170,D011700,,,13645,-2.062014,0.69466174,22894,25899,23990,24184,96967,0.9501507837558476 95 | DrugBank:DB11134,Cupric oxide,DB11134,,,C0056598,C030973,,,9136,-1.2453867,0.82269424,24365,25525,24645,22400,96935,0.94980978868962 96 | DrugBank:DB04942,Tamibarotene,DB04942,D01418,PA164743464,C1567753,C061133,,,3643,-1.1598458,0.8361055,24542,25712,24806,21867,96927,0.9497245399230629 97 | KEGG:D02413,Butylated hydroxytoluene,,D02413,PA448704,C0006507,D002084,,,8190,-1.800713,0.7356291,23459,24490,23174,25798,96921,0.9496606033481454 98 | DrugBank:DB04652,Corticosterone,DB04652,,,C0010124,D003345,,,9060,-1.5524349,0.7745546,23888,25748,25584,21681,96901,0.9494474814317531 99 | DrugBank:DB08842,Acetylcarnitine,DB08842,,,C0001040,D000108,,,6491,-1.8005266,0.7356583,23460,24755,23700,24979,96894,0.9493728887610158 100 | 
DrugBank:DB02342,2-Methoxyestradiol,DB02342,,PA13496724,C0046319,D000077584,,,5432,-1.9019606,0.7197553,23266,25054,24082,24462,96864,0.9490532058864274 101 | PharmGKB:PA128406983,pesticides,,,PA128406983,C0031253,D010575,,,13118,-1.7440054,0.7445198,23567,25509,23656,24113,96845,0.9488507400658548 102 | -------------------------------------------------------------------------------- /Codes_Term Harmonization/Relation_Integration/integrate_drug_disease.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | folder = '' 5 | CTD_folder = '../CTD/' 6 | 7 | 8 | pd.set_option('display.max_columns', None) 9 | 10 | 11 | def integrate_Hetionet_KEGG(): 12 | hetionet_DDi = pd.read_csv(folder + 'hetionet_DDi.csv') 13 | hetionet_DDi = hetionet_DDi.rename(columns={'source': 'Drug', 'target': 'Disease'}) 14 | hetionet_DDi['Drug'] = hetionet_DDi['Drug'].str.replace('Compound::', '') 15 | hetionet_DDi['Disease'] = hetionet_DDi['Disease'].str.replace('Disease::', '') 16 | 17 | drug_vocab = pd.read_csv(folder + 'drug_vocab.csv') 18 | db_vocab = drug_vocab.dropna(subset=['drugbank_id']) 19 | db_primary_dict = db_vocab.set_index('drugbank_id')['primary'].to_dict() 20 | kegg_drug_vocab = drug_vocab.dropna(subset=['kegg_id']) 21 | kegg_drug_primary_dict = kegg_drug_vocab.set_index('kegg_id')['primary'].to_dict() 22 | 23 | disease_vocab = pd.read_csv(folder + 'disease_vocab.csv') 24 | do_vocab = disease_vocab.dropna(subset=['do_id']) 25 | do_primary_dict = do_vocab.set_index('do_id')['primary'].to_dict() 26 | kegg_disease_vocab = disease_vocab.dropna(subset=['kegg_id']) 27 | kegg_disease_primary_dict = kegg_disease_vocab.set_index('kegg_id')['primary'].to_dict() 28 | 29 | hetionet_ddi_ctd = hetionet_DDi[hetionet_DDi['metaedge'] == 'CtD'] 30 | hetionet_ddi_ctd = hetionet_ddi_ctd.replace({'Drug': db_primary_dict, 'Disease': do_primary_dict}) 31 | hetionet_ddi_ctd = hetionet_ddi_ctd[['Drug', 'Disease']] 32 | hetionet_ddi_ctd['Treats_Hetionet'] = [1] * len(hetionet_ddi_ctd) 33 | hetionet_ddi_ctd['Palliates_Hetionet'] = [0] * len(hetionet_ddi_ctd) 34 | 35 | hetionet_ddi_cpd = hetionet_DDi[hetionet_DDi['metaedge'] == 'CpD'] 36 | hetionet_ddi_cpd = hetionet_ddi_cpd.replace({'Drug': db_primary_dict, 'Disease': do_primary_dict}) 37 | hetionet_ddi_cpd = hetionet_ddi_cpd[['Drug', 'Disease']] 38 | hetionet_ddi_cpd['Treats_Hetionet'] = [0] * len(hetionet_ddi_cpd) 39 | hetionet_ddi_cpd['Palliates_Hetionet'] = [1] * len(hetionet_ddi_cpd) 40 | 41 | DDi_res = pd.concat((hetionet_ddi_ctd, hetionet_ddi_cpd)) 42 | DDi_res.loc[DDi_res.duplicated(subset=['Drug', 'Disease'], keep=False), 'Palliates_Hetionet'] = 1 43 | DDi_res = DDi_res.drop_duplicates(subset=['Drug', 'Disease'], keep='first') 44 | 45 | DDi_res['Source'] = ['Hetionet'] * len(DDi_res) 46 | print(DDi_res) 47 | DDi_res['Effect_KEGG'] = [0] * len(DDi_res) 48 | kegg_df = pd.read_csv(folder + 'kegg_drug_disease.csv') 49 | kegg_df = kegg_df.rename(columns={'drug': 'Drug', 'disease': 'Disease'}) 50 | kegg_df = kegg_df.replace({'Drug': kegg_drug_primary_dict, 'Disease': kegg_disease_primary_dict}) 51 | kegg_df['Treats_Hetionet'] = [0] * len(kegg_df) 52 | kegg_df['Palliates_Hetionet'] = [0] * len(kegg_df) 53 | kegg_df['Source'] = ['KEGG'] * len(kegg_df) 54 | kegg_df['Effect_KEGG'] = [1] * len(kegg_df) 55 | 56 | DDi_res = pd.concat((DDi_res, kegg_df)) 57 | DDi_res.loc[DDi_res.duplicated(subset=['Drug', 'Disease'], keep=False), 'Effect_KEGG'] = 1 58 | DDi_res['Source'] = 
np.where(DDi_res.duplicated(subset=['Drug', 'Disease'], keep=False), 59 | DDi_res['Source'].astype(str) + ';KEGG', DDi_res['Source'].astype(str) + '') 60 | DDi_res = DDi_res.drop_duplicates(subset=['Drug', 'Disease'], keep='first') 61 | DDi_res_col = list(DDi_res.columns) 62 | DDi_res_col_new = DDi_res_col[:-2] + DDi_res_col[-1:] + DDi_res_col[-2:-1] 63 | DDi_res = DDi_res[DDi_res_col_new] 64 | DDi_res['Source'] = DDi_res['Source'].apply(lambda x: ';'.join(sorted(set(x.split(';'))))) 65 | print(DDi_res) 66 | DDi_res.to_csv(folder + 'DDi_res.csv', index=False) 67 | with open(folder + 'integration_notes.txt', 'w') as f: 68 | f.write('DDi_res: Hetionet (Treats and Palliates) and KEGG (Effect).\n') 69 | f.close() 70 | 71 | 72 | def extract_PharmGKB_DDi(): 73 | pharmgkb_rel = pd.read_table(folder + 'pharmgkb_rel.tsv') 74 | pharmgkb_rel = pharmgkb_rel[pharmgkb_rel['Association'] == 'associated'] 75 | pharmgkb_rel = pharmgkb_rel.reset_index(drop=True) 76 | res = pd.DataFrame(columns=['Drug', 'Disease']) 77 | idx = 0 78 | for i in range(len(pharmgkb_rel)): 79 | p1_id = pharmgkb_rel.loc[i, 'Entity1_id'] 80 | p1_type = pharmgkb_rel.loc[i, 'Entity1_type'] 81 | p2_id = pharmgkb_rel.loc[i, 'Entity2_id'] 82 | p2_type = pharmgkb_rel.loc[i, 'Entity2_type'] 83 | if p1_type == 'Chemical' and p2_type == 'Disease': 84 | drug = p1_id 85 | disease = p2_id 86 | elif p2_type == 'Chemical' and p1_type == 'Disease': 87 | drug = p2_id 88 | disease = p1_id 89 | else: 90 | continue 91 | res.loc[idx] = [drug, disease] 92 | idx += 1 93 | res.to_csv(folder + 'pharmgkb_drug_disease.csv', index=False) 94 | 95 | 96 | def integrate_CTD_DDi_curated(): 97 | chem_disease = pd.read_csv(CTD_folder + 'CTD_chemicals_diseases.csv', header=27) 98 | chem_disease = chem_disease.dropna(subset=['ChemicalID', 'DiseaseID']) 99 | chem_disease = chem_disease.drop_duplicates(subset=['ChemicalID', 'DiseaseID']) 100 | chem_disease = chem_disease.reset_index(drop=True) 101 | chem_disease = chem_disease.rename(columns={'ChemicalID': 'Drug', 'DiseaseID': 'Disease'}) 102 | chem_disease_curated = chem_disease[pd.isnull(chem_disease['InferenceScore'])] 103 | 104 | chem_disease_curated = chem_disease_curated[['Drug', 'Disease']] 105 | chem_disease_curated = chem_disease_curated.reset_index(drop=True) 106 | 107 | drug_vocab = pd.read_csv(folder + 'drug_vocab.csv') 108 | mesh_drug_vocab = drug_vocab.dropna(subset=['mesh_id']) 109 | mesh_drug_primary_dict = mesh_drug_vocab.set_index('mesh_id')['primary'].to_dict() 110 | 111 | disease_vocab = pd.read_csv(folder + 'disease_vocab.csv') 112 | mesh_disease_vocab = disease_vocab.dropna(subset=['mesh_id']) 113 | mesh_disease_primary_dict = mesh_disease_vocab.set_index('mesh_id')['primary'].to_dict() 114 | omim_vocab = disease_vocab.dropna(subset=['omim_id']) 115 | omim_vocab['omim_id'] = omim_vocab['omim_id'].astype(int).astype(str) 116 | omim_primary_dict = omim_vocab.set_index('omim_id')['primary'].to_dict() 117 | 118 | DDi_res = pd.read_csv(folder + 'DDi_res.csv') 119 | DDi_res_col = list(DDi_res.columns)[2:] 120 | DDi_res['Associate_CTD'] = [0] * len(DDi_res) 121 | 122 | for i in range(len(chem_disease_curated)): 123 | drug_id = chem_disease_curated.loc[i, 'Drug'] 124 | disease_id = chem_disease_curated.loc[i, 'Disease'] 125 | 126 | chem_disease_curated.loc[i, 'Drug'] = mesh_drug_primary_dict[drug_id] 127 | if 'MESH' in disease_id: 128 | disease_id = disease_id.replace('MESH:', '') 129 | chem_disease_curated.loc[i, 'Disease'] = mesh_disease_primary_dict[disease_id] 130 | else: 131 | disease_id = 
disease_id.replace('OMIM:', '') 132 | chem_disease_curated.loc[i, 'Disease'] = omim_primary_dict[disease_id] 133 | print(i + 1, '/', len(chem_disease_curated), 'Completed...') 134 | print(chem_disease_curated) 135 | 136 | for col in DDi_res_col[:-1]: 137 | chem_disease_curated[col] = [0] * len(chem_disease_curated) 138 | chem_disease_curated['Source'] = ['CTD'] * len(chem_disease_curated) 139 | chem_disease_curated['Associate_CTD'] = [1] * len(chem_disease_curated) 140 | DDi_res = pd.concat((DDi_res, chem_disease_curated)) 141 | DDi_res.loc[DDi_res.duplicated(subset=['Drug', 'Disease'], keep=False), 'Associate_CTD'] = 1 142 | DDi_res['Source'] = np.where(DDi_res.duplicated(subset=['Drug', 'Disease'], keep=False), 143 | DDi_res['Source'].astype(str) + ';CTD', DDi_res['Source'].astype(str) + '') 144 | DDi_res = DDi_res.drop_duplicates(subset=['Drug', 'Disease'], keep='first') 145 | DDi_res_col = list(DDi_res.columns) 146 | DDi_res_col_new = DDi_res_col[:-2] + DDi_res_col[-1:] + DDi_res_col[-2:-1] 147 | DDi_res = DDi_res[DDi_res_col_new] 148 | DDi_res['Source'] = DDi_res['Source'].apply(lambda x: ';'.join(sorted(set(x.split(';'))))) 149 | DDi_res.to_csv(folder + 'DDi_res_2.csv', index=False) 150 | with open(folder + 'integration_notes.txt', 'a') as f: 151 | f.write('DDi_res_2: Hetionet, KEGG and CTD (Associate).\n') 152 | f.close() 153 | 154 | 155 | def integrate_CTD_DDi_inferred(): 156 | DDi_res = pd.read_csv(folder + 'DDi_res_2.csv') 157 | DDi_res_col = list(DDi_res.columns)[2:] 158 | DDi_res['Inferred_Relation'] = [0] * len(DDi_res) 159 | DDi_res['Inference_Score'] = [''] * len(DDi_res) 160 | 161 | chem_disease_inferred = pd.read_csv(folder + 'CTD_chem_disease_inferred.csv') 162 | 163 | for col in DDi_res_col[:-1]: 164 | chem_disease_inferred[col] = [0] * len(chem_disease_inferred) 165 | chem_disease_inferred['Source'] = ['CTD'] * len(chem_disease_inferred) 166 | chem_disease_inferred['Inferred_Relation'] = [1] * len(chem_disease_inferred) 167 | temp_col = list(chem_disease_inferred.columns) 168 | chem_disease_inferred_col = temp_col[:2] + temp_col[3:] + temp_col[2:3] 169 | chem_disease_inferred = chem_disease_inferred[chem_disease_inferred_col] 170 | print(list(chem_disease_inferred.columns)) 171 | DDi_res = pd.concat((DDi_res, chem_disease_inferred)) 172 | DDi_res.loc[DDi_res.duplicated(subset=['Drug', 'Disease'], keep=False), 'Inferred_Relation'] = 1 173 | DDi_res['Source'] = np.where(DDi_res.duplicated(subset=['Drug', 'Disease'], keep=False), 174 | DDi_res['Source'].astype(str) + ';CTD', DDi_res['Source'].astype(str) + '') 175 | DDi_res = DDi_res.drop_duplicates(subset=['Drug', 'Disease'], keep='first') 176 | DDi_res_col = list(DDi_res.columns) 177 | DDi_res_col_new = DDi_res_col[:-3] + DDi_res_col[-2:-1] + DDi_res_col[-3:-2] + DDi_res_col[-1:] 178 | DDi_res = DDi_res[DDi_res_col_new] 179 | DDi_res['Source'] = DDi_res['Source'].apply(lambda x: ';'.join(sorted(set(x.split(';'))))) 180 | DDi_res.to_csv(folder + 'DDi_res_3.csv', index=False) 181 | with open(folder + 'integration_notes.txt', 'a') as f: 182 | f.write('DDi_res_3: Hetionet, KEGG and CTD (Inferred_Relation).\n') 183 | f.close() 184 | 185 | 186 | def integrate_DRKG_DDi(): 187 | DDi_res = pd.read_csv(folder + 'DDi_res_3.csv') 188 | DDi_res_col = list(DDi_res.columns)[2:] 189 | 190 | drkg_DDi = pd.read_csv('drkg_DDi.csv') 191 | # drkg_DDi = pd.read_csv('drkg_DDi.csv') 192 | drkg_DDi = drkg_DDi.rename(columns={'entity_1': 'Drug', 'entity_2': 'Disease'}) 193 | drkg_DDi['Drug'] = drkg_DDi['Drug'].str.replace('Compound::', '') 194 | 
drkg_DDi['Disease'] = drkg_DDi['Disease'].str.replace('Disease::', '') 195 | ddi_relation_list = list(drkg_DDi.drop_duplicates(subset='relation', keep='first')['relation']) 196 | # ddi_source_list = list(drkg_DDi.drop_duplicates(subset='source', keep='first')['source']) 197 | # print(ddi_relation_list) 198 | # print(ddi_source_list) 199 | # print(drkg_DDi.drop_duplicates(subset='relation', keep='first')) 200 | 201 | drug_vocab = pd.read_csv(folder + 'drug_vocab.csv') 202 | db_vocab = drug_vocab.dropna(subset=['drugbank_id']) 203 | db_primary_dict = db_vocab.set_index('drugbank_id')['primary'].to_dict() 204 | mesh_drug_vocab = drug_vocab.dropna(subset=['mesh_id']) 205 | mesh_drug_primary_dict = mesh_drug_vocab.set_index('mesh_id')['primary'].to_dict() 206 | 207 | disease_vocab = pd.read_csv(folder + 'disease_vocab.csv') 208 | mesh_disease_vocab = disease_vocab.dropna(subset=['mesh_id']) 209 | mesh_disease_primary_dict = mesh_disease_vocab.set_index('mesh_id')['primary'].to_dict() 210 | omim_vocab = disease_vocab.dropna(subset=['omim_id']) 211 | omim_vocab['omim_id'] = omim_vocab['omim_id'].astype(int).astype(str) 212 | omim_primary_dict = omim_vocab.set_index('omim_id')['primary'].to_dict() 213 | 214 | for drkg_rel in ddi_relation_list: 215 | print(drkg_rel) 216 | DDi_res[drkg_rel] = [0] * len(DDi_res) 217 | drkg_DDi_temp = drkg_DDi[drkg_DDi['relation'] == drkg_rel] 218 | drkg_DDi_temp = drkg_DDi_temp[['Drug', 'Disease']] 219 | drkg_DDi_temp = drkg_DDi_temp.reset_index(drop=True) 220 | drkg_DDi_temp_primary = pd.DataFrame(columns=['Drug', 'Disease']) 221 | idx = 0 222 | for i in range(len(drkg_DDi_temp)): 223 | drug_id = drkg_DDi_temp.loc[i, 'Drug'] 224 | disease_id = drkg_DDi_temp.loc[i, 'Disease'] 225 | 226 | if 'DB' in drug_id: 227 | drug_id_primary = db_primary_dict[drug_id] 228 | elif 'MESH' in drug_id: 229 | drug_id = drug_id.replace('MESH:', '') 230 | if drug_id in mesh_drug_primary_dict: 231 | drug_id_primary = mesh_drug_primary_dict[drug_id] 232 | else: 233 | continue 234 | else: 235 | continue 236 | 237 | if 'MESH' in disease_id: 238 | disease_id = disease_id.replace('MESH:', '') 239 | disease_id_primary = mesh_disease_primary_dict[disease_id] 240 | else: 241 | disease_id = disease_id.replace('OMIM:', '') 242 | disease_id_primary = omim_primary_dict[disease_id] 243 | 244 | drkg_DDi_temp_primary.loc[idx] = [drug_id_primary, disease_id_primary] 245 | idx += 1 246 | for col in DDi_res_col[:-2]: 247 | drkg_DDi_temp_primary[col] = [0] * len(drkg_DDi_temp_primary) 248 | drkg_DDi_temp_primary['Source'] = ['DRKG'] * len(drkg_DDi_temp_primary) 249 | drkg_DDi_temp_primary['Inference_Score'] = [''] * len(drkg_DDi_temp_primary) 250 | drkg_DDi_temp_primary[drkg_rel] = [1] * len(drkg_DDi_temp_primary) 251 | DDi_res = pd.concat((DDi_res, drkg_DDi_temp_primary)) 252 | DDi_res.loc[DDi_res.duplicated(subset=['Drug', 'Disease'], keep=False), drkg_rel] = 1 253 | DDi_res['Source'] = np.where(DDi_res.duplicated(subset=['Drug', 'Disease'], keep=False), 254 | DDi_res['Source'].astype(str) + ';DRKG', DDi_res['Source'].astype(str) + '') 255 | DDi_res = DDi_res.drop_duplicates(subset=['Drug', 'Disease'], keep='first') 256 | DDi_res_col = list(DDi_res.columns) 257 | DDi_res_col_new = DDi_res_col[:-3] + DDi_res_col[-1:] + DDi_res_col[-3:-1] 258 | DDi_res = DDi_res[DDi_res_col_new] 259 | DDi_res_col = DDi_res_col_new[2:] 260 | DDi_res['Source'] = DDi_res['Source'].apply(lambda x: ';'.join(sorted(set(x.split(';'))))) 261 | 262 | DDi_res = DDi_res.rename(columns={'Compound treats the disease': 'Treats_DRKG'}) 
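    # Editorial note (added): the block above follows the integration idiom used
    # throughout these scripts -- concatenate the new source's rows, flag
    # (Drug, Disease) pairs that now occur twice, append ';DRKG' to their Source
    # string, keep the first row of each duplicated pair, and finally normalize
    # Source via ';'.join(sorted(set(x.split(';')))), so that, for example,
    # 'Hetionet;CTD;CTD' becomes 'CTD;Hetionet'.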
263 | DDi_res.to_csv(folder + 'DDi_res_4.csv', index=False) 264 | with open(folder + 'integration_notes.txt', 'a') as f: 265 | f.write('DDi_res_4: Hetionet, KEGG, CTD and DRKG (Treats and Semantic Relations).\n') 266 | f.close() 267 | 268 | 269 | def main(): 270 | # integrate_Hetionet_KEGG() 271 | # extract_PharmGKB_DDi() 272 | integrate_CTD_DDi_curated() 273 | integrate_CTD_DDi_inferred() 274 | integrate_DRKG_DDi() 275 | 276 | 277 | if __name__ == '__main__': 278 | main() 279 | -------------------------------------------------------------------------------- /Codes_Term Harmonization/Relation_Integration/integrate_drug_gene.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | folder = '' 5 | CTD_folder = '../CTD/' 6 | 7 | # pd.set_option('display.max_columns', None) 8 | # pd.set_option('display.max_rows', None) 9 | 10 | 11 | def integrate_DrugBank_KEGG(): 12 | DG_res = pd.read_csv(folder + '/DGres_DrugBank_regulate.csv') 13 | DG_res_col = list(DG_res.columns) 14 | DG_res_col = [col_name.replace('_DrugBank', '') for col_name in DG_res_col] 15 | DG_res.columns = DG_res_col 16 | rel_list = list(DG_res_col)[2:] 17 | DG_res['Associate_KEGG'] = [0] * len(DG_res) 18 | DG_res['Source'] = ['DrugBank'] * len(DG_res) 19 | # print(DG_res) 20 | kegg_res = pd.read_csv(folder + '/kegg_drug_gene.csv') 21 | 22 | drug_vocab = pd.read_csv(folder + '/drug_vocab.csv') 23 | kegg_vocab = drug_vocab.dropna(subset=['kegg_id']) 24 | kegg_primary_dict = kegg_vocab.set_index('kegg_id')['primary'].to_dict() 25 | 26 | gene_vocab = pd.read_csv(folder + '/gene_vocab_2.csv') 27 | ncbi_vocab = gene_vocab.dropna(subset=['ncbi_id']) 28 | ncbi_primary_dict = ncbi_vocab.set_index('ncbi_id')['primary'].to_dict() 29 | 30 | kegg_res = kegg_res.replace({'drug': kegg_primary_dict, 'gene': ncbi_primary_dict}) 31 | kegg_res = kegg_res.rename(columns={'drug': 'Drug', 'gene': 'Gene'}) 32 | kegg_res = kegg_res[['Drug', 'Gene']] 33 | for col in rel_list: 34 | kegg_res[col] = [0] * len(kegg_res) 35 | kegg_res['Associate_KEGG'] = [1] * len(kegg_res) 36 | kegg_res['Source'] = ['KEGG'] * len(kegg_res) 37 | # print(kegg_res) 38 | DG_res = pd.concat((DG_res, kegg_res)) 39 | DG_res.loc[DG_res.duplicated(subset=['Drug', 'Gene'], keep=False), 'Associate_KEGG'] = 1 40 | DG_res.loc[DG_res.duplicated(subset=['Drug', 'Gene'], keep=False), 'Source'] = 'DrugBank;KEGG' 41 | DG_res = DG_res.drop_duplicates(subset=['Drug', 'Gene'], keep='first') 42 | DG_res.to_csv(folder + '/DG_res.csv', index=False) 43 | with open(folder + '/integration_notes.txt', 'w') as f: 44 | f.write('DG_res: DrugBank (Target, Transporter, Enzyme, Carrier, Downregulates and Upregulates) and KEGG (Associate).\n') 45 | f.close() 46 | 47 | 48 | def extract_PharmGKB_DG(): 49 | pharmgkb_rel = pd.read_table(folder + 'pharmgkb_rel.tsv') 50 | pharmgkb_rel = pharmgkb_rel[pharmgkb_rel['Association'] == 'associated'] 51 | pharmgkb_rel = pharmgkb_rel.reset_index(drop=True) 52 | res = pd.DataFrame(columns=['Drug', 'Gene']) 53 | idx = 0 54 | for i in range(len(pharmgkb_rel)): 55 | p1_id = pharmgkb_rel.loc[i, 'Entity1_id'] 56 | p1_type = pharmgkb_rel.loc[i, 'Entity1_type'] 57 | p2_id = pharmgkb_rel.loc[i, 'Entity2_id'] 58 | p2_type = pharmgkb_rel.loc[i, 'Entity2_type'] 59 | if p1_type == 'Chemical' and p2_type == 'Gene': 60 | drug = p1_id 61 | gene = p2_id 62 | elif p2_type == 'Chemical' and p1_type == 'Gene': 63 | drug = p2_id 64 | gene = p1_id 65 | else: 66 | continue 67 | res.loc[idx] = [drug, gene] 68 | idx 
+= 1 69 | res.to_csv(folder + '/pharmgkb_drug_gene.csv', index=False) 70 | 71 | 72 | def integrate_PharmGKB(): 73 | DG_res = pd.read_csv(folder + '/DG_res.csv') 74 | DG_res_col = list(DG_res.columns)[2:] 75 | DG_res['Associate_PharmGKB'] = [0] * len(DG_res) 76 | # print(DG_res) 77 | pharmgkb_res = pd.read_csv(folder + '/pharmgkb_drug_gene.csv') 78 | 79 | drug_vocab = pd.read_csv(folder + '/drug_vocab.csv') 80 | pharmgkb_drug_vocab = drug_vocab.dropna(subset=['pharmgkb_id']) 81 | pharmgkb_drug_primary_dict = pharmgkb_drug_vocab.set_index('pharmgkb_id')['primary'].to_dict() 82 | 83 | gene_vocab = pd.read_csv(folder + '/gene_vocab_2.csv') 84 | pharmgkb_gene_vocab = gene_vocab.dropna(subset=['pharmgkb_id']) 85 | pharmgkb_gene_primary_dict = pharmgkb_gene_vocab.set_index('pharmgkb_id')['primary'].to_dict() 86 | 87 | pharmgkb_res = pharmgkb_res.replace({'Drug': pharmgkb_drug_primary_dict, 'Gene': pharmgkb_gene_primary_dict}) 88 | for col in DG_res_col[:-1]: 89 | pharmgkb_res[col] = [0] * len(pharmgkb_res) 90 | pharmgkb_res['Source'] = ['PharmGKB'] * len(pharmgkb_res) 91 | pharmgkb_res['Associate_PharmGKB'] = [1] * len(pharmgkb_res) 92 | # print(pharmgkb_res) 93 | DG_res = pd.concat((DG_res, pharmgkb_res)) 94 | DG_res.loc[DG_res.duplicated(subset=['Drug', 'Gene'], keep=False), 'Associate_PharmGKB'] = 1 95 | DG_res['Source'] = np.where(DG_res.duplicated(subset=['Drug', 'Gene'], keep=False), DG_res['Source'].astype(str) + ';PharmGKB', DG_res['Source'].astype(str) + '') 96 | DG_res = DG_res.drop_duplicates(subset=['Drug', 'Gene'], keep='first') 97 | DG_res_col = list(DG_res.columns) 98 | DG_res_col_new = DG_res_col[:-2] + DG_res_col[-1:] + DG_res_col[-2:-1] 99 | DG_res = DG_res[DG_res_col_new] 100 | DG_res['Source'] = DG_res['Source'].apply(lambda x: ';'.join(sorted(set(x.split(';'))))) 101 | DG_res.to_csv(folder + '/DG_res_2.csv', index=False) 102 | with open(folder + '/integration_notes.txt', 'a') as f: 103 | f.write('DG_res_2: DrugBank, KEGG and PharmGKB (Associate).\n') 104 | f.close() 105 | 106 | 107 | def integrate_Hetionet(): 108 | DG_res = pd.read_csv(folder + '/DG_res_2.csv') 109 | DG_res_col = list(DG_res.columns)[2:] 110 | DG_res['Binds_Hetionet'] = [0] * len(DG_res) 111 | 112 | hetionet_DG = pd.read_csv(folder + '/hetionet_DG.csv') 113 | hetionet_DG = hetionet_DG.rename(columns={'source': 'Drug', 'target': 'Gene'}) 114 | hetionet_DG['Drug'] = hetionet_DG['Drug'].str.replace('Compound::', '') 115 | hetionet_DG['Gene'] = hetionet_DG['Gene'].str.replace('Gene::', '') 116 | 117 | drug_vocab = pd.read_csv(folder + '/drug_vocab.csv') 118 | db_vocab = drug_vocab.dropna(subset=['drugbank_id']) 119 | db_primary_dict = db_vocab.set_index('drugbank_id')['primary'].to_dict() 120 | 121 | gene_vocab = pd.read_csv(folder + '/gene_vocab_2.csv') 122 | ncbi_vocab = gene_vocab.dropna(subset=['ncbi_id']) 123 | ncbi_vocab['ncbi_id'] = ncbi_vocab['ncbi_id'].astype(int).astype(str) 124 | ncbi_primary_dict = ncbi_vocab.set_index('ncbi_id')['primary'].to_dict() 125 | 126 | # integrate binds 127 | hetionet_binds = hetionet_DG[hetionet_DG['metaedge'] == 'CbG'] 128 | hetionet_binds = hetionet_binds.replace({'Drug': db_primary_dict, 'Gene': ncbi_primary_dict}) 129 | hetionet_binds = hetionet_binds[['Drug', 'Gene']] 130 | for col in DG_res_col[:-1]: 131 | hetionet_binds[col] = [0] * len(hetionet_binds) 132 | hetionet_binds['Source'] = ['Hetionet'] * len(hetionet_binds) 133 | hetionet_binds['Binds_Hetionet'] = [1] * len(hetionet_binds) 134 | DG_res = pd.concat((DG_res, hetionet_binds)) 135 | 
DG_res.loc[DG_res.duplicated(subset=['Drug', 'Gene'], keep=False), 'Binds_Hetionet'] = 1 136 | DG_res['Source'] = np.where(DG_res.duplicated(subset=['Drug', 'Gene'], keep=False), DG_res['Source'].astype(str) + ';Hetionet', DG_res['Source'].astype(str) + '') 137 | DG_res = DG_res.drop_duplicates(subset=['Drug', 'Gene'], keep='first') 138 | DG_res_col = list(DG_res.columns) 139 | DG_res_col_new = DG_res_col[:-2] + DG_res_col[-1:] + DG_res_col[-2:-1] 140 | DG_res = DG_res[DG_res_col_new] 141 | DG_res['Source'] = DG_res['Source'].apply(lambda x: ';'.join(sorted(set(x.split(';'))))) 142 | print(DG_res) 143 | 144 | # integrate Downregulates 145 | DG_res_col = list(DG_res.columns)[2:] 146 | DG_res['Downregulates_Hetionet'] = [0] * len(DG_res) 147 | hetionet_downregulates = hetionet_DG[hetionet_DG['metaedge'] == 'CdG'] 148 | hetionet_downregulates = hetionet_downregulates.replace({'Drug': db_primary_dict, 'Gene': ncbi_primary_dict}) 149 | hetionet_downregulates = hetionet_downregulates[['Drug', 'Gene']] 150 | for col in DG_res_col[:-1]: 151 | hetionet_downregulates[col] = [0] * len(hetionet_downregulates) 152 | hetionet_downregulates['Source'] = ['Hetionet'] * len(hetionet_downregulates) 153 | hetionet_downregulates['Downregulates_Hetionet'] = [1] * len(hetionet_downregulates) 154 | DG_res = pd.concat((DG_res, hetionet_downregulates)) 155 | DG_res.loc[DG_res.duplicated(subset=['Drug', 'Gene'], keep=False), 'Downregulates_Hetionet'] = 1 156 | DG_res['Source'] = np.where(DG_res.duplicated(subset=['Drug', 'Gene'], keep=False), DG_res['Source'].astype(str) + ';Hetionet', DG_res['Source'].astype(str) + '') 157 | DG_res = DG_res.drop_duplicates(subset=['Drug', 'Gene'], keep='first') 158 | DG_res_col = list(DG_res.columns) 159 | DG_res_col_new = DG_res_col[:-2] + DG_res_col[-1:] + DG_res_col[-2:-1] 160 | DG_res = DG_res[DG_res_col_new] 161 | DG_res['Source'] = DG_res['Source'].apply(lambda x: ';'.join(sorted(set(x.split(';'))))) 162 | print(DG_res) 163 | 164 | # integrate Upregulates 165 | DG_res_col = list(DG_res.columns)[2:] 166 | DG_res['Upregulates_Hetionet'] = [0] * len(DG_res) 167 | hetionet_upregulates = hetionet_DG[hetionet_DG['metaedge'] == 'CuG'] 168 | hetionet_upregulates = hetionet_upregulates.replace({'Drug': db_primary_dict, 'Gene': ncbi_primary_dict}) 169 | hetionet_upregulates = hetionet_upregulates[['Drug', 'Gene']] 170 | for col in DG_res_col[:-1]: 171 | hetionet_upregulates[col] = [0] * len(hetionet_upregulates) 172 | hetionet_upregulates['Source'] = ['Hetionet'] * len(hetionet_upregulates) 173 | hetionet_upregulates['Upregulates_Hetionet'] = [1] * len(hetionet_upregulates) 174 | DG_res = pd.concat((DG_res, hetionet_upregulates)) 175 | DG_res.loc[DG_res.duplicated(subset=['Drug', 'Gene'], keep=False), 'Upregulates_Hetionet'] = 1 176 | DG_res['Source'] = np.where(DG_res.duplicated(subset=['Drug', 'Gene'], keep=False), 177 | DG_res['Source'].astype(str) + ';Hetionet', DG_res['Source'].astype(str) + '') 178 | DG_res = DG_res.drop_duplicates(subset=['Drug', 'Gene'], keep='first') 179 | DG_res_col = list(DG_res.columns) 180 | DG_res_col_new = DG_res_col[:-2] + DG_res_col[-1:] + DG_res_col[-2:-1] 181 | DG_res = DG_res[DG_res_col_new] 182 | DG_res['Source'] = DG_res['Source'].apply(lambda x: ';'.join(sorted(set(x.split(';'))))) 183 | print(DG_res) 184 | DG_res.to_csv(folder + '/DG_res_3.csv', index=False) 185 | with open(folder + '/integration_notes.txt', 'a') as f: 186 | f.write('DG_res_3: DrugBank, KEGG, PharmGKB and Hetionet (Binds, Downregulates and Upregulates).\n') 187 | f.close() 188 | 
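# Editorial note (added): the three Hetionet merges above (CbG -> Binds_Hetionet,
# CdG -> Downregulates_Hetionet, CuG -> Upregulates_Hetionet) repeat the same
# pattern. A hypothetical refactoring sketch (not called anywhere in this
# pipeline) could loop over the metaedge/column pairs instead, assuming
# hetionet_DG, db_primary_dict and ncbi_primary_dict are prepared as in
# integrate_Hetionet():
#
#     for metaedge, col in [('CbG', 'Binds_Hetionet'),
#                           ('CdG', 'Downregulates_Hetionet'),
#                           ('CuG', 'Upregulates_Hetionet')]:
#         DG_res_col = list(DG_res.columns)[2:]
#         DG_res[col] = [0] * len(DG_res)
#         sub = hetionet_DG[hetionet_DG['metaedge'] == metaedge]
#         sub = sub.replace({'Drug': db_primary_dict, 'Gene': ncbi_primary_dict})[['Drug', 'Gene']]
#         for c in DG_res_col[:-1]:
#             sub[c] = [0] * len(sub)
#         sub['Source'] = ['Hetionet'] * len(sub)
#         sub[col] = [1] * len(sub)
#         DG_res = pd.concat((DG_res, sub))
#         # ...then flag duplicated pairs, merge the Source strings, and
#         # drop_duplicates exactly as in the unrolled code above.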
189 | 190 | def integrate_CTD_DG(): 191 | DG_res = pd.read_csv(folder + '/DG_res_3.csv') 192 | DG_res_col = list(DG_res.columns)[2:] 193 | DG_res['Interaction_CTD'] = [0] * len(DG_res) 194 | 195 | chem_gene = pd.read_csv(CTD_folder + 'CTD_chem_gene_ixns.csv', header=27) 196 | chem_gene = chem_gene[['ChemicalID', 'GeneID']].dropna() 197 | chem_gene = chem_gene.drop_duplicates(subset=['ChemicalID', 'GeneID']) 198 | chem_gene = chem_gene.reset_index(drop=True) 199 | chem_gene = chem_gene.rename(columns={'ChemicalID': 'Drug', 'GeneID': 'Gene'}) 200 | 201 | drug_vocab = pd.read_csv(folder + '/drug_vocab.csv') 202 | mesh_vocab = drug_vocab.dropna(subset=['mesh_id']) 203 | mesh_primary_dict = mesh_vocab.set_index('mesh_id')['primary'].to_dict() 204 | 205 | gene_vocab = pd.read_csv(folder + '/gene_vocab_2.csv') 206 | ncbi_vocab = gene_vocab.dropna(subset=['ncbi_id']) 207 | ncbi_primary_dict = ncbi_vocab.set_index('ncbi_id')['primary'].to_dict() 208 | 209 | # chem_gene = chem_gene.replace({'Drug': mesh_primary_dict, 'Gene': ncbi_primary_dict}) 210 | 211 | for i in range(len(chem_gene)): 212 | drug_id = chem_gene.loc[i, 'Drug'] 213 | gene_id = chem_gene.loc[i, 'Gene'] 214 | 215 | chem_gene.loc[i, 'Drug'] = mesh_primary_dict[drug_id] 216 | chem_gene.loc[i, 'Gene'] = ncbi_primary_dict[gene_id] 217 | print(i + 1, '/', len(chem_gene), 'Completed...') 218 | print(chem_gene) 219 | 220 | for col in DG_res_col[:-1]: 221 | chem_gene[col] = [0] * len(chem_gene) 222 | chem_gene['Source'] = ['CTD'] * len(chem_gene) 223 | chem_gene['Interaction_CTD'] = [1] * len(chem_gene) 224 | DG_res = pd.concat((DG_res, chem_gene)) 225 | DG_res.loc[DG_res.duplicated(subset=['Drug', 'Gene'], keep=False), 'Interaction_CTD'] = 1 226 | DG_res['Source'] = np.where(DG_res.duplicated(subset=['Drug', 'Gene'], keep=False), 227 | DG_res['Source'].astype(str) + ';CTD', DG_res['Source'].astype(str) + '') 228 | DG_res = DG_res.drop_duplicates(subset=['Drug', 'Gene'], keep='first') 229 | DG_res_col = list(DG_res.columns) 230 | DG_res_col_new = DG_res_col[:-2] + DG_res_col[-1:] + DG_res_col[-2:-1] 231 | DG_res = DG_res[DG_res_col_new] 232 | DG_res['Source'] = DG_res['Source'].apply(lambda x: ';'.join(sorted(set(x.split(';'))))) 233 | DG_res.to_csv(folder + '/DG_res_4.csv', index=False) 234 | with open(folder + '/integration_notes.txt', 'a') as f: 235 | f.write('DG_res_4: DrugBank, KEGG, PharmGKB, Hetionet and CTD (Interaction).\n') 236 | f.close() 237 | 238 | 239 | def integrate_DRKG_DG(): 240 | DG_res = pd.read_csv(folder + '/DG_res_4.csv') 241 | DG_res_col = list(DG_res.columns)[2:] 242 | 243 | drkg_DG = pd.read_csv('drug/drkg_DG.csv') 244 | drkg_DG = drkg_DG.rename(columns={'entity_1': 'Drug', 'entity_2': 'Gene'}) 245 | drkg_DG['Drug'] = drkg_DG['Drug'].str.replace('Compound::', '') 246 | drkg_DG['Gene'] = drkg_DG['Gene'].str.replace('Gene::', '') 247 | dg_relation_list = list(drkg_DG.drop_duplicates(subset='relation', keep='first')['relation']) 248 | # dg_source_list = list(drkg_DG.drop_duplicates(subset='source', keep='first')['source']) 249 | # print(dg_relation_list) 250 | # print(dg_source_list) 251 | # print(drkg_DG.drop_duplicates(subset='relation', keep='first')) 252 | 253 | drug_vocab = pd.read_csv(folder + '/drug_vocab.csv') 254 | db_vocab = drug_vocab.dropna(subset=['drugbank_id']) 255 | db_primary_dict = db_vocab.set_index('drugbank_id')['primary'].to_dict() 256 | 257 | gene_vocab = pd.read_csv(folder + '/gene_vocab_2.csv') 258 | ncbi_vocab = gene_vocab.dropna(subset=['ncbi_id']) 259 | ncbi_vocab['ncbi_id'] = 
ncbi_vocab['ncbi_id'].astype(int).astype(str) 260 | ncbi_primary_dict = ncbi_vocab.set_index('ncbi_id')['primary'].to_dict() 261 | 262 | for drkg_rel in dg_relation_list: 263 | print(drkg_rel) 264 | DG_res[drkg_rel] = [0] * len(DG_res) 265 | drkg_DG_temp = drkg_DG[drkg_DG['relation'] == drkg_rel] 266 | drkg_DG_temp = drkg_DG_temp.replace({'Drug': db_primary_dict, 'Gene': ncbi_primary_dict}) 267 | drkg_DG_temp = drkg_DG_temp[['Drug', 'Gene']] 268 | for col in DG_res_col[:-1]: 269 | drkg_DG_temp[col] = [0] * len(drkg_DG_temp) 270 | drkg_DG_temp['Source'] = ['DRKG'] * len(drkg_DG_temp) 271 | drkg_DG_temp[drkg_rel] = [1] * len(drkg_DG_temp) 272 | DG_res = pd.concat((DG_res, drkg_DG_temp)) 273 | DG_res.loc[DG_res.duplicated(subset=['Drug', 'Gene'], keep=False), drkg_rel] = 1 274 | DG_res['Source'] = np.where(DG_res.duplicated(subset=['Drug', 'Gene'], keep=False), 275 | DG_res['Source'].astype(str) + ';DRKG', DG_res['Source'].astype(str) + '') 276 | DG_res = DG_res.drop_duplicates(subset=['Drug', 'Gene'], keep='first') 277 | DG_res_col = list(DG_res.columns) 278 | DG_res_col_new = DG_res_col[:-2] + DG_res_col[-1:] + DG_res_col[-2:-1] 279 | DG_res = DG_res[DG_res_col_new] 280 | DG_res_col = DG_res_col_new[2:] 281 | DG_res['Source'] = DG_res['Source'].apply(lambda x: ';'.join(sorted(set(x.split(';'))))) 282 | 283 | DG_res = DG_res.rename(columns={'association': 'Associate_DRKG', 'direct interation': 'Interaction_DRKG'})  # note: these keys must match the relation labels exactly as spelled in drkg_DG.csv (including 'direct interation') 284 | DG_res.to_csv(folder + '/DG_res_5.csv', index=False) 285 | with open(folder + '/integration_notes.txt', 'a') as f: 286 | f.write('DG_res_5: DrugBank, KEGG, PharmGKB, Hetionet, CTD and DRKG (Semantic Relations, Interaction and Associate).\n') 287 | f.close() 288 | 289 | 290 | def main(): 291 | # integrate_DrugBank_KEGG() 292 | # # extract_PharmGKB_DG() 293 | # integrate_PharmGKB() 294 | integrate_Hetionet() 295 | integrate_CTD_DG() 296 | integrate_DRKG_DG() 297 | 298 | 299 | if __name__ == '__main__': 300 | main() 301 | -------------------------------------------------------------------------------- /Codes_Term Harmonization/Relation_Integration/integrate_gene_related.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | pd.set_option('display.max_columns', None) 5 | 6 | folder = '' 7 | 8 | 9 | def integrate_Hetionet_GG(): 10 | hetionet_GG = pd.read_csv(folder + '/hetionet_GG.csv') 11 | hetionet_GG = hetionet_GG.rename(columns={'source': 'Gene_1', 'target': 'Gene_2'}) 12 | hetionet_GG['Gene_1'] = hetionet_GG['Gene_1'].str.replace('Gene::', '') 13 | hetionet_GG['Gene_2'] = hetionet_GG['Gene_2'].str.replace('Gene::', '') 14 | 15 | gene_vocab = pd.read_csv(folder + '/gene_vocab_2.csv') 16 | ncbi_vocab = gene_vocab.dropna(subset=['ncbi_id']) 17 | ncbi_vocab['ncbi_id'] = ncbi_vocab['ncbi_id'].astype(int).astype(str) 18 | ncbi_primary_dict = ncbi_vocab.set_index('ncbi_id')['primary'].to_dict() 19 | 20 | hetionet_GcG = hetionet_GG[hetionet_GG['metaedge'] == 'GcG'] 21 | print(hetionet_GcG) 22 | hetionet_GcG = hetionet_GcG.replace({'Gene_1': ncbi_primary_dict, 'Gene_2': ncbi_primary_dict}) 23 | hetionet_GcG = hetionet_GcG[['Gene_1', 'Gene_2']] 24 | print(hetionet_GcG) 25 | hetionet_GcG['Covaries'] = [1] * len(hetionet_GcG) 26 | hetionet_GcG['Interacts'] = [0] * len(hetionet_GcG) 27 | 28 | hetionet_GiG = hetionet_GG[hetionet_GG['metaedge'] == 'GiG'] 29 | hetionet_GiG = hetionet_GiG.replace({'Gene_1': ncbi_primary_dict, 'Gene_2': ncbi_primary_dict}) 30 | hetionet_GiG = hetionet_GiG[['Gene_1', 
'Gene_2']] 31 | hetionet_GiG['Covaries'] = [0] * len(hetionet_GiG) 32 | hetionet_GiG['Interacts'] = [1] * len(hetionet_GiG) 33 | 34 | GG_res = pd.concat((hetionet_GcG, hetionet_GiG)) 35 | GG_res.loc[GG_res.duplicated(subset=['Gene_1', 'Gene_2'], keep=False), 'Interacts'] = 1 36 | GG_res = GG_res.drop_duplicates(subset=['Gene_1', 'Gene_2'], keep='first') 37 | 38 | GG_res['Regulates'] = [0] * len(GG_res) 39 | 40 | hetionet_GrG = hetionet_GG[hetionet_GG['metaedge'] == 'Gr>G'] 41 | hetionet_GrG = hetionet_GrG.replace({'Gene_1': ncbi_primary_dict, 'Gene_2': ncbi_primary_dict}) 42 | hetionet_GrG = hetionet_GrG[['Gene_1', 'Gene_2']] 43 | hetionet_GrG['Covaries'] = [0] * len(hetionet_GrG) 44 | hetionet_GrG['Interacts'] = [0] * len(hetionet_GrG) 45 | hetionet_GrG['Regulates'] = [1] * len(hetionet_GrG) 46 | 47 | GG_res = pd.concat((GG_res, hetionet_GrG)) 48 | GG_res.loc[GG_res.duplicated(subset=['Gene_1', 'Gene_2'], keep=False), 'Regulates'] = 1 49 | GG_res = GG_res.drop_duplicates(subset=['Gene_1', 'Gene_2'], keep='first') 50 | 51 | GG_res['Source'] = ['Hetionet'] * len(GG_res) 52 | print(GG_res) 53 | GG_res.to_csv(folder + '/GG_res.csv', index=False) 54 | with open(folder + '/integration_notes.txt', 'w') as f: 55 | f.write('GG_res: Hetionet (Covaries, Interacts and Regulates).\n') 56 | f.close() 57 | 58 | 59 | def extract_PharmGKB_GG(): 60 | pharmgkb_rel = pd.read_table(folder + 'pharmgkb_rel.tsv') 61 | pharmgkb_rel = pharmgkb_rel[pharmgkb_rel['Association'] == 'associated'] 62 | pharmgkb_rel = pharmgkb_rel.reset_index(drop=True) 63 | res = pd.DataFrame(columns=['Gene_1', 'Gene_2']) 64 | idx = 0 65 | for i in range(len(pharmgkb_rel)): 66 | p1_id = pharmgkb_rel.loc[i, 'Entity1_id'] 67 | p1_type = pharmgkb_rel.loc[i, 'Entity1_type'] 68 | p2_id = pharmgkb_rel.loc[i, 'Entity2_id'] 69 | p2_type = pharmgkb_rel.loc[i, 'Entity2_type'] 70 | if p1_type == 'Gene' and p2_type == 'Gene': 71 | gene_1 = p1_id 72 | gene_2 = p2_id 73 | else: 74 | continue 75 | res.loc[idx] = [gene_1, gene_2] 76 | idx += 1 77 | res.to_csv(folder + '/pharmgkb_gene_gene.csv', index=False) 78 | 79 | 80 | def integrate_PharmGKB_GG(): 81 | GG_res = pd.read_csv(folder + '/GG_res.csv') 82 | GG_res_cols = list(GG_res.columns)[2:] 83 | GG_res['Associate'] = [0] * len(GG_res) 84 | 85 | pharmgkb_res = pd.read_csv(folder + '/pharmgkb_gene_gene.csv') 86 | 87 | gene_vocab = pd.read_csv(folder + '/gene_vocab_2.csv') 88 | pharmgkb_gene_vocab = gene_vocab.dropna(subset=['pharmgkb_id']) 89 | pharmgkb_gene_primary_dict = pharmgkb_gene_vocab.set_index('pharmgkb_id')['primary'].to_dict() 90 | 91 | pharmgkb_res = pharmgkb_res.replace({'Gene_1': pharmgkb_gene_primary_dict, 'Gene_2': pharmgkb_gene_primary_dict}) 92 | for col in GG_res_cols[:-1]: 93 | pharmgkb_res[col] = [0] * len(pharmgkb_res) 94 | pharmgkb_res['Source'] = ['PharmGKB'] * len(pharmgkb_res) 95 | pharmgkb_res['Associate'] = [1] * len(pharmgkb_res) 96 | GG_res = pd.concat((GG_res, pharmgkb_res)) 97 | GG_res.loc[GG_res.duplicated(subset=['Gene_1', 'Gene_2'], keep=False), 'Associate'] = 1 98 | GG_res['Source'] = np.where(GG_res.duplicated(subset=['Gene_1', 'Gene_2'], keep=False), 99 | GG_res['Source'].astype(str) + ';PharmGKB', GG_res['Source'].astype(str) + '') 100 | GG_res = GG_res.drop_duplicates(subset=['Gene_1', 'Gene_2'], keep='first') 101 | GG_res_col = list(GG_res.columns) 102 | GG_res_col_new = GG_res_col[:-2] + GG_res_col[-1:] + GG_res_col[-2:-1] 103 | GG_res = GG_res[GG_res_col_new] 104 | GG_res['Source'] = GG_res['Source'].apply(lambda x: ';'.join(sorted(set(x.split(';'))))) 105 | 
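    # A worked example of the normalization above (illustrative; the actual labels depend
    # on the upstream data): 'PharmGKB;PharmGKB' -> 'PharmGKB', and multi-source rows are
    # sorted into a stable form such as 'Hetionet;PharmGKB'.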
GG_res.to_csv(folder + '/GG_res_2.csv', index=False) 106 | with open(folder + '/integration_notes.txt', 'a') as f: 107 | f.write('GG_res_2: Hetionet and PharmGKB (Associate).\n') 108 | f.close() 109 | 110 | 111 | def integrate_DRKG_GG(): 112 | drkg_GG = pd.read_csv(folder + '/drkg_GG.csv') 113 | drkg_GG = drkg_GG[(drkg_GG['source'] == 'GNBR') | (drkg_GG['source'] == 'IntAct')] 114 | drkg_GG = drkg_GG[['entity_1', 'relation', 'entity_2']] 115 | drkg_GG = drkg_GG[~((drkg_GG['entity_1'] == 'Gene::') | (drkg_GG['entity_2'] == 'Gene::'))] 116 | drkg_GG = drkg_GG.drop_duplicates(subset=['entity_1', 'entity_2']) 117 | drkg_GG = drkg_GG.reset_index(drop=True) 118 | drkg_GG = drkg_GG.rename(columns={'entity_1': 'Gene_1', 'entity_2': 'Gene_2'}) 119 | drkg_GG['Gene_1'] = drkg_GG['Gene_1'].str.replace('Gene::', '') 120 | drkg_GG['Gene_2'] = drkg_GG['Gene_2'].str.replace('Gene::', '') 121 | gg_relation_list = list(drkg_GG.drop_duplicates(subset='relation', keep='first')['relation']) 122 | # gg_source_list = list(drkg_GG.drop_duplicates(subset='source', keep='first')['source']) 123 | # print(gg_relation_list) 124 | # print(gg_source_list) 125 | # print(drkg_GG.drop_duplicates(subset='relation', keep='first')) 126 | 127 | gene_vocab = pd.read_csv(folder + '/gene_vocab_2.csv') 128 | ncbi_vocab = gene_vocab.dropna(subset=['ncbi_id']) 129 | ncbi_vocab['ncbi_id'] = ncbi_vocab['ncbi_id'].astype(int).astype(str) 130 | ncbi_primary_dict = ncbi_vocab.set_index('ncbi_id')['primary'].to_dict() 131 | 132 | GG_res = pd.read_csv(folder + '/GG_res_2.csv') 133 | GG_res_cols = list(GG_res.columns)[2:] 134 | 135 | for drkg_rel in gg_relation_list: 136 | print(drkg_rel) 137 | GG_res[drkg_rel] = [0] * len(GG_res) 138 | drkg_GG_temp = drkg_GG[drkg_GG['relation'] == drkg_rel] 139 | drkg_GG_temp = drkg_GG_temp.replace({'Gene_1': ncbi_primary_dict, 'Gene_2': ncbi_primary_dict}) 140 | drkg_GG_temp = drkg_GG_temp[['Gene_1', 'Gene_2']] 141 | for col in GG_res_cols[:-1]: 142 | drkg_GG_temp[col] = [0] * len(drkg_GG_temp) 143 | drkg_GG_temp['Source'] = ['DRKG'] * len(drkg_GG_temp) 144 | drkg_GG_temp[drkg_rel] = [1] * len(drkg_GG_temp) 145 | GG_res = pd.concat((GG_res, drkg_GG_temp)) 146 | GG_res.loc[GG_res.duplicated(subset=['Gene_1', 'Gene_2'], keep=False), drkg_rel] = 1 147 | GG_res['Source'] = np.where(GG_res.duplicated(subset=['Gene_1', 'Gene_2'], keep=False), 148 | GG_res['Source'].astype(str) + ';DRKG', GG_res['Source'].astype(str) + '') 149 | GG_res = GG_res.drop_duplicates(subset=['Gene_1', 'Gene_2'], keep='first') 150 | GG_res_col = list(GG_res.columns) 151 | GG_res_col_new = GG_res_col[:-2] + GG_res_col[-1:] + GG_res_col[-2:-1] 152 | GG_res = GG_res[GG_res_col_new] 153 | GG_res_cols = GG_res_col_new[2:] 154 | GG_res['Source'] = GG_res['Source'].apply(lambda x: ';'.join(sorted(set(x.split(';'))))) 155 | GG_res.to_csv(folder + '/GG_res_3.csv', index=False) 156 | with open(folder + '/integration_notes.txt', 'a') as f: 157 | f.write('GG_res_3: Hetionet, PharmGKB and DRKG.\n') 158 | f.close() 159 | 160 | 161 | def integrate_GA_Bgee_present(): 162 | Bgee_present = pd.read_csv('/processed_Bgee_present.csv') 163 | 164 | ncbi_df = pd.read_table('/gene2ensembl') 165 | ensembl_ncbi_dict = ncbi_df.set_index('Ensembl_gene_identifier')['GeneID'].to_dict() 166 | 167 | gene_vocab = pd.read_csv('/gene_vocab_2.csv') 168 | ncbi_vocab = gene_vocab.dropna(subset=['ncbi_id']) 169 | ncbi_primary_dict = ncbi_vocab.set_index('ncbi_id')['primary'].to_dict() 170 | 171 | anatomy_vocab = pd.read_csv('/anatomy_res_3.csv') 172 | uberon_vocab = 
anatomy_vocab.dropna(subset=['uberon_id']) 173 | uberon_primary_dict = uberon_vocab.set_index('uberon_id')['primary'].to_dict() 174 | 175 | gene_list = [] 176 | anatomy_list = [] 177 | for i in range(len(Bgee_present)): 178 | gene_id = Bgee_present.loc[i, 'Gene ID'] 179 | anatomy_id = Bgee_present.loc[i, 'Anatomical entity ID'] 180 | 181 | if gene_id in ensembl_ncbi_dict: 182 | ncbi_id = ensembl_ncbi_dict[gene_id] 183 | gene_primary = ncbi_primary_dict[ncbi_id] 184 | else: 185 | continue 186 | 187 | anatomy_primary = uberon_primary_dict[anatomy_id] 188 | gene_list.append(gene_primary) 189 | anatomy_list.append(anatomy_primary) 190 | print(i + 1, '/', len(Bgee_present), 'Completed (Bgee present)...') 191 | GA_res = pd.DataFrame({'Gene': gene_list, 'Anatomy': anatomy_list, 'Present': [1] * len(gene_list), 'Source': ['Reactome'] * len(gene_list)}) 192 | GA_res.to_csv('/GA_res.csv', index=False) 193 | 194 | 195 | def integrate_GA_Bgee_absent(): 196 | GA_res = pd.read_csv('/GA_res.csv') 197 | GA_res = GA_res.rename(columns={'Present': 'Express'}) 198 | GA_res['Absent'] = [0] * len(GA_res) 199 | print(list(GA_res.columns)) 200 | Bgee_absent = pd.read_csv('/processed_Bgee_absent.csv') 201 | 202 | ncbi_df = pd.read_table('/gene2ensembl') 203 | ensembl_ncbi_dict = ncbi_df.set_index('Ensembl_gene_identifier')['GeneID'].to_dict() 204 | 205 | gene_vocab = pd.read_csv('/gene_vocab_2.csv') 206 | ncbi_vocab = gene_vocab.dropna(subset=['ncbi_id']) 207 | ncbi_primary_dict = ncbi_vocab.set_index('ncbi_id')['primary'].to_dict() 208 | 209 | anatomy_vocab = pd.read_csv('res/entity/anatomy_res_3.csv') 210 | uberon_vocab = anatomy_vocab.dropna(subset=['uberon_id']) 211 | uberon_primary_dict = uberon_vocab.set_index('uberon_id')['primary'].to_dict() 212 | 213 | gene_list = [] 214 | anatomy_list = [] 215 | for i in range(len(Bgee_absent)): 216 | gene_id = Bgee_absent.loc[i, 'Gene ID'] 217 | anatomy_id = Bgee_absent.loc[i, 'Anatomical entity ID'] 218 | 219 | if gene_id in ensembl_ncbi_dict: 220 | ncbi_id = ensembl_ncbi_dict[gene_id] 221 | gene_primary = ncbi_primary_dict[ncbi_id] 222 | else: 223 | continue 224 | 225 | anatomy_primary = uberon_primary_dict[anatomy_id] 226 | gene_list.append(gene_primary) 227 | anatomy_list.append(anatomy_primary) 228 | print(i + 1, '/', len(Bgee_absent), 'Completed (Bgee absent)...') 229 | Bgee_absent = pd.DataFrame({'Gene': gene_list, 'Anatomy': anatomy_list, 'Express': [0] * len(gene_list), 230 | 'Source': ['Reactome'] * len(gene_list), 'Absent': [1] * len(gene_list)}) 231 | print(Bgee_absent) 232 | GA_res = pd.concat((GA_res, Bgee_absent)) 233 | GA_res.loc[GA_res.duplicated(subset=['Gene', 'Anatomy'], keep=False), 'Absent'] = 1 234 | GA_res = GA_res.drop_duplicates(subset=['Gene', 'Anatomy'], keep='first') 235 | GA_res = GA_res[['Gene', 'Anatomy', 'Express', 'Absent', 'Source']] 236 | GA_res.to_csv('/GA_res_2.csv', index=False) 237 | 238 | 239 | def integrate_GA_TISSUE(): 240 | GA_res = pd.read_csv('/GA_res_2.csv') 241 | GA_res['Express_TISSUE'] = [0] * len(GA_res) 242 | print(list(GA_res.columns)) 243 | tissue_df = pd.read_csv('/processed_TISSUE.csv') 244 | 245 | gene_vocab = pd.read_csv('/gene_vocab_2.csv') 246 | ncbi_vocab = gene_vocab.dropna(subset=['ncbi_id']) 247 | ncbi_vocab['ncbi_id'] = ncbi_vocab['ncbi_id'].astype(int).astype(str) 248 | ncbi_primary_dict = ncbi_vocab.set_index('ncbi_id')['primary'].to_dict() 249 | 250 | anatomy_vocab = pd.read_csv('/anatomy_res_3.csv') 251 | bto_vocab = anatomy_vocab.dropna(subset=['bto_id']) 252 | bto_primary_dict = 
bto_vocab.set_index('bto_id')['primary'].to_dict() 253 | 254 | gene_list = [] 255 | anatomy_list = [] 256 | for i in range(len(tissue_df)): 257 | gene_id = tissue_df.loc[i, 'gene_id'].replace('NCBI:', '') 258 | anatomy_id = tissue_df.loc[i, 'tissue_id'] 259 | 260 | gene_primary = ncbi_primary_dict[gene_id] if gene_id in ncbi_primary_dict else gene_id 261 | gene_list.append(gene_primary) 262 | anatomy_list.append(bto_primary_dict[anatomy_id]) 263 | 264 | print(i + 1, '/', len(tissue_df), 'Completed (TISSUE)...') 265 | tissue_res = pd.DataFrame({'Gene': gene_list, 'Anatomy': anatomy_list, 'Express': [0] * len(gene_list), 266 | 'Absent': [0] * len(gene_list), 'Source': ['TISSUE'] * len(gene_list), 267 | 'Express_TISSUE': [1] * len(gene_list)}) 268 | print(tissue_res) 269 | GA_res = pd.concat((GA_res, tissue_res)) 270 | GA_res.loc[GA_res.duplicated(subset=['Gene', 'Anatomy'], keep=False), 'Express_TISSUE'] = 1 271 | GA_res['Source'] = np.where(GA_res.duplicated(subset=['Gene', 'Anatomy'], keep=False), 272 | GA_res['Source'].astype(str) + ';TISSUE', GA_res['Source'].astype(str) + '') 273 | GA_res = GA_res.drop_duplicates(subset=['Gene', 'Anatomy'], keep='first') 274 | GA_res['Source'] = GA_res['Source'].apply(lambda x: ';'.join(sorted(set(x.split(';'))))) 275 | GA_res['Express'] = GA_res['Express'] + GA_res['Express_TISSUE'] 276 | GA_res.loc[GA_res['Express'] != 0, 'Express'] = 1 277 | GA_res_col = list(GA_res.columns) 278 | GA_res_col.remove('Express_TISSUE') 279 | GA_res = GA_res[GA_res_col] 280 | GA_res.to_csv('/GA_res_3.csv', index=False) 281 | 282 | 283 | def integrate_GPwy_Reactome(): 284 | gpwy_Reactome = pd.read_table(folder + 'NCBI2Reactome_All_Levels.txt', header=None) 285 | homo_Reactome = gpwy_Reactome[gpwy_Reactome[5] == 'Homo sapiens'] 286 | homo_Reactome = homo_Reactome[homo_Reactome[0].astype(str).str.isdigit()] 287 | homo_Reactome[0] = homo_Reactome[0].astype(int).astype(str) 288 | homo_Reactome = homo_Reactome.drop_duplicates(subset=[0, 1], keep='first') 289 | homo_Reactome = homo_Reactome.reset_index(drop=True) 290 | homo_Reactome = homo_Reactome[[0, 1]] 291 | homo_Reactome = homo_Reactome.rename(columns={0: 'Gene', 1: 'Pathway'}) 292 | print(homo_Reactome) 293 | gene_vocab = pd.read_csv(folder + 'gene_vocab.csv') 294 | ncbi_vocab = gene_vocab.dropna(subset=['ncbi_id']) 295 | ncbi_vocab['ncbi_id'] = ncbi_vocab['ncbi_id'].astype(int).astype(str) 296 | ncbi_primary_dict = ncbi_vocab.set_index('ncbi_id')['primary'].to_dict() 297 | 298 | pwy_vocab = pd.read_csv(folder + 'pathway_vocab.csv') 299 | reactome_vocab = pwy_vocab.dropna(subset=['reactome_id']) 300 | reactome_primary_dict = reactome_vocab.set_index('reactome_id')['primary'].to_dict() 301 | 302 | homo_Reactome = homo_Reactome.replace({'Gene': ncbi_primary_dict, 'Pathway': reactome_primary_dict}) 303 | print(homo_Reactome) 304 | homo_Reactome['Reaction'] = [1] * len(homo_Reactome) 305 | homo_Reactome['Source'] = ['Reactome'] * len(homo_Reactome) 306 | homo_Reactome.to_csv(folder + 'GPwy_res.csv', index=False) 307 | 308 | 309 | def integrate_GPwy_KEGG(): 310 | GPwy_res = pd.read_csv(folder + 'GPwy_res.csv') 311 | GPwy_res['Associate'] = [0] * len(GPwy_res) 312 | 313 | kegg_GPwy = pd.read_csv(folder + '/kegg_gene_pathway.csv') 314 | kegg_GPwy = kegg_GPwy.rename(columns={'pathway_id': 'Pathway', 'ncbi_id': 'Gene'}) 315 | kegg_GPwy = kegg_GPwy[['Gene', 'Pathway']] 316 | print(kegg_GPwy) 317 | gene_vocab = pd.read_csv(folder + 'gene_vocab.csv') 318 | ncbi_vocab = gene_vocab.dropna(subset=['ncbi_id']) 319 | ncbi_primary_dict = 
ncbi_vocab.set_index('ncbi_id')['primary'].to_dict() 320 | 321 | pwy_vocab = pd.read_csv(folder + 'pathway_vocab.csv') 322 | kegg_vocab = pwy_vocab.dropna(subset=['kegg_id']) 323 | kegg_primary_dict = kegg_vocab.set_index('kegg_id')['primary'].to_dict() 324 | 325 | kegg_GPwy = kegg_GPwy.replace({'Gene': ncbi_primary_dict, 'Pathway': kegg_primary_dict}) 326 | print(kegg_GPwy) 327 | kegg_GPwy['Reaction'] = [0] * len(kegg_GPwy) 328 | kegg_GPwy['Source'] = ['KEGG'] * len(kegg_GPwy) 329 | kegg_GPwy['Associate'] = [1] * len(kegg_GPwy) 330 | 331 | GPwy_res = pd.concat((GPwy_res, kegg_GPwy)) 332 | GPwy_res.loc[GPwy_res.duplicated(subset=['Gene', 'Pathway'], keep=False), 'Associate'] = 1 333 | GPwy_res['Source'] = np.where(GPwy_res.duplicated(subset=['Gene', 'Pathway'], keep=False), 334 | GPwy_res['Source'].astype(str) + ';KEGG', GPwy_res['Source'].astype(str) + '') 335 | GPwy_res = GPwy_res.drop_duplicates(subset=['Gene', 'Pathway'], keep='first') 336 | GPwy_res['Source'] = GPwy_res['Source'].apply(lambda x: ';'.join(sorted(set(x.split(';'))))) 337 | GPwy_res_cols = list(GPwy_res.columns) 338 | GPwy_res_cols_new = GPwy_res_cols[:-2] + GPwy_res_cols[-1:] + GPwy_res_cols[-2:-1] 339 | GPwy_res = GPwy_res[GPwy_res_cols_new] 340 | GPwy_res.to_csv(folder + 'GPwy_res_2.csv', index=False) 341 | 342 | 343 | def modify_res(): 344 | AG_res = pd.read_csv(folder + 'A_G_res.csv') 345 | AG_res['Source'] = AG_res['Source'].str.replace('Reactome', 'Bgee')  # Bgee-derived A_G rows were tagged 'Reactome' upstream (see integrate_GA_Bgee_*); relabel them here 346 | print(AG_res) 347 | AG_res.to_csv(folder + 'A_G_res.csv', index=False) 348 | 349 | 350 | def main(): 351 | # integrate_Hetionet_GG() 352 | # extract_PharmGKB_GG() 353 | # integrate_PharmGKB_GG() 354 | # integrate_DRKG_GG() 355 | # integrate_GA_Bgee_present() 356 | # integrate_GA_Bgee_absent() 357 | # integrate_GA_TISSUE() 358 | # integrate_GPwy_Reactome() 359 | # integrate_GPwy_KEGG() 360 | 361 | modify_res() 362 | 363 | 364 | if __name__ == '__main__': 365 | main() 366 | -------------------------------------------------------------------------------- /Codes_Term Harmonization/Relation_Integration/integrate_disease_gene.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | pd.set_option('display.max_columns', None) 5 | 6 | folder = '' 7 | CTD_folder = '../CTD/' 8 | 9 | 10 | def integrate_Hetionet(): 11 | hetionet_DiG = pd.read_csv(folder + 'hetionet_DiG.csv') 12 | hetionet_DiG = hetionet_DiG.rename(columns={'source': 'Disease', 'target': 'Gene'}) 13 | hetionet_DiG['Disease'] = hetionet_DiG['Disease'].str.replace('Disease::', '') 14 | hetionet_DiG['Gene'] = hetionet_DiG['Gene'].str.replace('Gene::', '') 15 | 16 | disease_vocab = pd.read_csv(folder + 'disease_vocab.csv') 17 | do_vocab = disease_vocab.dropna(subset=['do_id']) 18 | do_primary_dict = do_vocab.set_index('do_id')['primary'].to_dict() 19 | 20 | gene_vocab = pd.read_csv(folder + 'gene_vocab_2.csv') 21 | ncbi_vocab = gene_vocab.dropna(subset=['ncbi_id']) 22 | ncbi_vocab['ncbi_id'] = ncbi_vocab['ncbi_id'].astype(int).astype(str) 23 | ncbi_primary_dict = ncbi_vocab.set_index('ncbi_id')['primary'].to_dict() 24 | 25 | hetionet_dig_dag = hetionet_DiG[hetionet_DiG['metaedge'] == 'DaG'] 26 | hetionet_dig_dag = hetionet_dig_dag.replace({'Disease': do_primary_dict, 'Gene': ncbi_primary_dict}) 27 | hetionet_dig_dag = hetionet_dig_dag[['Disease', 'Gene']] 28 | hetionet_dig_dag['Associate_Hetionet'] = [1] * len(hetionet_dig_dag) 29 | hetionet_dig_dag['Downregulates_Hetionet'] = [0] * len(hetionet_dig_dag) 30 | 31 | 
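    # Each Hetionet metaedge (DaG above, DdG and DuG below) gets its own binary relation
    # column; pairs that carry more than one metaedge are merged after each concat via the
    # duplicated()/drop_duplicates() pattern.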
hetionet_dig_ddg = hetionet_DiG[hetionet_DiG['metaedge'] == 'DdG'] 32 | hetionet_dig_ddg = hetionet_dig_ddg.replace({'Disease': do_primary_dict, 'Gene': ncbi_primary_dict}) 33 | hetionet_dig_ddg = hetionet_dig_ddg[['Disease', 'Gene']] 34 | hetionet_dig_ddg['Associate_Hetionet'] = [0] * len(hetionet_dig_ddg) 35 | hetionet_dig_ddg['Downregulates_Hetionet'] = [1] * len(hetionet_dig_ddg) 36 | 37 | DiG_res = pd.concat((hetionet_dig_dag, hetionet_dig_ddg)) 38 | DiG_res.loc[DiG_res.duplicated(subset=['Disease', 'Gene'], keep=False), 'Downregulates_Hetionet'] = 1 39 | DiG_res = DiG_res.drop_duplicates(subset=['Disease', 'Gene'], keep='first') 40 | 41 | DiG_res['Upregulates_Hetionet'] = [0] * len(DiG_res) 42 | 43 | hetionet_dig_dug = hetionet_DiG[hetionet_DiG['metaedge'] == 'DuG'] 44 | hetionet_dig_dug = hetionet_dig_dug.replace({'Disease': do_primary_dict, 'Gene': ncbi_primary_dict}) 45 | hetionet_dig_dug = hetionet_dig_dug[['Disease', 'Gene']] 46 | hetionet_dig_dug['Associate_Hetionet'] = [0] * len(hetionet_dig_dug) 47 | hetionet_dig_dug['Downregulates_Hetionet'] = [0] * len(hetionet_dig_dug) 48 | hetionet_dig_dug['Upregulates_Hetionet'] = [1] * len(hetionet_dig_dug) 49 | 50 | DiG_res = pd.concat((DiG_res, hetionet_dig_dug)) 51 | DiG_res.loc[DiG_res.duplicated(subset=['Disease', 'Gene'], keep=False), 'Upregulates_Hetionet'] = 1 52 | DiG_res = DiG_res.drop_duplicates(subset=['Disease', 'Gene'], keep='first') 53 | 54 | DiG_res['Source'] = ['Hetionet'] * len(DiG_res) 55 | print(DiG_res) 56 | DiG_res.to_csv(folder + 'DiG_res.csv', index=False) 57 | with open(folder + 'integration_notes.txt', 'w') as f: 58 | f.write('DiG_res: Hetionet (Associate, Downregulates and Upregulates).\n') 59 | f.close() 60 | 61 | 62 | def integrate_KEGG(): 63 | DiG_res = pd.read_csv(folder + 'DiG_res.csv') 64 | DiG_res_cols = list(DiG_res.columns)[2:] 65 | DiG_res['Associate_KEGG'] = [0] * len(DiG_res) 66 | 67 | disease_vocab = pd.read_csv(folder + 'disease_vocab.csv') 68 | kegg_disease_vocab = disease_vocab.dropna(subset=['kegg_id']) 69 | kegg_disease_primary_dict = kegg_disease_vocab.set_index('kegg_id')['primary'].to_dict() 70 | 71 | gene_vocab = pd.read_csv(folder + 'gene_vocab_2.csv') 72 | ncbi_vocab = gene_vocab.dropna(subset=['ncbi_id']) 73 | ncbi_primary_dict = ncbi_vocab.set_index('ncbi_id')['primary'].to_dict() 74 | 75 | kegg_df = pd.read_csv(folder + 'kegg_disease_gene.csv') 76 | kegg_df = kegg_df.rename(columns={'disease': 'Disease', 'gene': 'Gene'}) 77 | kegg_df = kegg_df.replace({'Disease': kegg_disease_primary_dict, 'Gene': ncbi_primary_dict}) 78 | 79 | for col in DiG_res_cols[:-1]: 80 | kegg_df[col] = [0] * len(kegg_df) 81 | kegg_df['Source'] = ['KEGG'] * len(kegg_df) 82 | kegg_df['Associate_KEGG'] = [1] * len(kegg_df) 83 | DiG_res = pd.concat((DiG_res, kegg_df)) 84 | DiG_res.loc[DiG_res.duplicated(subset=['Disease', 'Gene'], keep=False), 'Associate_KEGG'] = 1 85 | DiG_res['Source'] = np.where(DiG_res.duplicated(subset=['Disease', 'Gene'], keep=False), 86 | DiG_res['Source'].astype(str) + ';KEGG', DiG_res['Source'].astype(str) + '') 87 | DiG_res = DiG_res.drop_duplicates(subset=['Disease', 'Gene'], keep='first') 88 | DiG_res_col = list(DiG_res.columns) 89 | DiG_res_col_new = DiG_res_col[:-2] + DiG_res_col[-1:] + DiG_res_col[-2:-1] 90 | DiG_res = DiG_res[DiG_res_col_new] 91 | print(DiG_res) 92 | DiG_res.to_csv(folder + 'DiG_res_2.csv', index=False) 93 | with open(folder + 'integration_notes.txt', 'a') as f: 94 | f.write('DiG_res_2: Hetionet and KEGG (Associate).\n') 95 | f.close() 96 | 97 | 98 | def 
extract_PharmGKB_DiG(): 99 | pharmgkb_rel = pd.read_table(folder + 'pharmgkb_rel.tsv') 100 | pharmgkb_rel = pharmgkb_rel[pharmgkb_rel['Association'] == 'associated'] 101 | pharmgkb_rel = pharmgkb_rel.reset_index(drop=True) 102 | res = pd.DataFrame(columns=['Disease', 'Gene']) 103 | idx = 0 104 | for i in range(len(pharmgkb_rel)): 105 | p1_id = pharmgkb_rel.loc[i, 'Entity1_id'] 106 | p1_type = pharmgkb_rel.loc[i, 'Entity1_type'] 107 | p2_id = pharmgkb_rel.loc[i, 'Entity2_id'] 108 | p2_type = pharmgkb_rel.loc[i, 'Entity2_type'] 109 | if p1_type == 'Disease' and p2_type == 'Gene': 110 | disease = p1_id 111 | gene = p2_id 112 | elif p2_type == 'Disease' and p1_type == 'Gene': 113 | disease = p2_id 114 | gene = p1_id 115 | else: 116 | continue 117 | res.loc[idx] = [disease, gene] 118 | idx += 1 119 | res.to_csv(folder + 'pharmgkb_disease_gene.csv', index=False) 120 | 121 | 122 | def integrate_PharmGKB(): 123 | DiG_res = pd.read_csv(folder + 'DiG_res_2.csv') 124 | DiG_res_cols = list(DiG_res.columns)[2:] 125 | DiG_res['Associate_PharmGKB'] = [0] * len(DiG_res) 126 | 127 | pharmgkb_res = pd.read_csv(folder + 'pharmgkb_disease_gene.csv') 128 | 129 | gene_vocab = pd.read_csv(folder + 'gene_vocab_2.csv') 130 | pharmgkb_gene_vocab = gene_vocab.dropna(subset=['pharmgkb_id']) 131 | pharmgkb_gene_primary_dict = pharmgkb_gene_vocab.set_index('pharmgkb_id')['primary'].to_dict() 132 | 133 | disease_vocab = pd.read_csv(folder + 'disease_vocab.csv') 134 | pharmgkb_disease_vocab = disease_vocab.dropna(subset=['pharmgkb_id']) 135 | pharmgkb_disease_primary_dict = pharmgkb_disease_vocab.set_index('pharmgkb_id')['primary'].to_dict() 136 | 137 | pharmgkb_res = pharmgkb_res.replace({'Disease': pharmgkb_disease_primary_dict, 'Gene': pharmgkb_gene_primary_dict}) 138 | for col in DiG_res_cols[:-1]: 139 | pharmgkb_res[col] = [0] * len(pharmgkb_res) 140 | pharmgkb_res['Source'] = ['PharmGKB'] * len(pharmgkb_res) 141 | pharmgkb_res['Associate_PharmGKB'] = [1] * len(pharmgkb_res) 142 | DiG_res = pd.concat((DiG_res, pharmgkb_res)) 143 | DiG_res.loc[DiG_res.duplicated(subset=['Disease', 'Gene'], keep=False), 'Associate_PharmGKB'] = 1 144 | DiG_res['Source'] = np.where(DiG_res.duplicated(subset=['Disease', 'Gene'], keep=False), 145 | DiG_res['Source'].astype(str) + ';PharmGKB', DiG_res['Source'].astype(str) + '') 146 | DiG_res = DiG_res.drop_duplicates(subset=['Disease', 'Gene'], keep='first') 147 | DiG_res_col = list(DiG_res.columns) 148 | DiG_res_col_new = DiG_res_col[:-2] + DiG_res_col[-1:] + DiG_res_col[-2:-1] 149 | DiG_res = DiG_res[DiG_res_col_new] 150 | DiG_res['Source'] = DiG_res['Source'].apply(lambda x: ';'.join(sorted(set(x.split(';'))))) 151 | DiG_res.to_csv(folder + 'DiG_res_3.csv', index=False) 152 | with open(folder + 'integration_notes.txt', 'a') as f: 153 | f.write('DiG_res_3: Hetionet, KEGG and PharmGKB (Associate).\n') 154 | f.close() 155 | 156 | 157 | def integrate_CTD_DiG_curated(): 158 | disease_gene = pd.read_csv(CTD_folder + 'CTD_genes_diseases.csv', header=27) 159 | disease_gene = disease_gene.dropna(subset=['GeneID', 'DiseaseID']) 160 | disease_gene = disease_gene.drop_duplicates(subset=['GeneID', 'DiseaseID']) 161 | disease_gene = disease_gene.reset_index(drop=True) 162 | disease_gene = disease_gene.rename(columns={'DiseaseID': 'Disease', 'GeneID': 'Gene'}) 163 | disease_gene_curated = disease_gene[pd.isnull(disease_gene['InferenceScore'])] 164 | 165 | disease_gene_curated = disease_gene_curated[['Disease', 'Gene']] 166 | disease_gene_curated = disease_gene_curated.reset_index(drop=True) 167 | 168 
| disease_vocab = pd.read_csv(folder + 'disease_vocab.csv') 169 | mesh_disease_vocab = disease_vocab.dropna(subset=['mesh_id']) 170 | mesh_disease_primary_dict = mesh_disease_vocab.set_index('mesh_id')['primary'].to_dict() 171 | omim_vocab = disease_vocab.dropna(subset=['omim_id']) 172 | omim_vocab['omim_id'] = omim_vocab['omim_id'].astype(int).astype(str) 173 | omim_primary_dict = omim_vocab.set_index('omim_id')['primary'].to_dict() 174 | 175 | gene_vocab = pd.read_csv(folder + 'gene_vocab_2.csv') 176 | ncbi_vocab = gene_vocab.dropna(subset=['ncbi_id']) 177 | ncbi_primary_dict = ncbi_vocab.set_index('ncbi_id')['primary'].to_dict() 178 | 179 | DiG_res = pd.read_csv(folder + 'DiG_res_3.csv') 180 | DiG_res_col = list(DiG_res.columns)[2:] 181 | DiG_res['Associate_CTD'] = [0] * len(DiG_res) 182 | print(disease_gene_curated) 183 | disease_list = [] 184 | gene_list = [] 185 | for i in range(len(disease_gene_curated)): 186 | disease_id = disease_gene_curated.loc[i, 'Disease'] 187 | gene_id = disease_gene_curated.loc[i, 'Gene'] 188 | 189 | gene_list.append(ncbi_primary_dict[gene_id]) 190 | if 'MESH' in disease_id: 191 | disease_id = disease_id.replace('MESH:', '') 192 | disease_list.append(mesh_disease_primary_dict[disease_id]) 193 | else: 194 | disease_id = disease_id.replace('OMIM:', '') 195 | disease_list.append(omim_primary_dict[disease_id]) 196 | print(i + 1, '/', len(disease_gene_curated), 'Completed...') 197 | disease_gene_curated = pd.DataFrame({'Disease': disease_list, 'Gene': gene_list}) 198 | print(disease_gene_curated) 199 | 200 | for col in DiG_res_col[:-1]: 201 | disease_gene_curated[col] = [0] * len(disease_gene_curated) 202 | disease_gene_curated['Source'] = ['CTD'] * len(disease_gene_curated) 203 | disease_gene_curated['Associate_CTD'] = [1] * len(disease_gene_curated) 204 | DiG_res = pd.concat((DiG_res, disease_gene_curated)) 205 | DiG_res.loc[DiG_res.duplicated(subset=['Disease', 'Gene'], keep=False), 'Associate_CTD'] = 1 206 | DiG_res['Source'] = np.where(DiG_res.duplicated(subset=['Disease', 'Gene'], keep=False), 207 | DiG_res['Source'].astype(str) + ';CTD', DiG_res['Source'].astype(str) + '') 208 | DiG_res = DiG_res.drop_duplicates(subset=['Disease', 'Gene'], keep='first') 209 | DiG_res_col = list(DiG_res.columns) 210 | DiG_res_col_new = DiG_res_col[:-2] + DiG_res_col[-1:] + DiG_res_col[-2:-1] 211 | DiG_res = DiG_res[DiG_res_col_new] 212 | DiG_res['Source'] = DiG_res['Source'].apply(lambda x: ';'.join(sorted(set(x.split(';'))))) 213 | DiG_res.to_csv(folder + 'DiG_res_4.csv', index=False) 214 | with open(folder + 'integration_notes.txt', 'a') as f: 215 | f.write('DiG_res_4: Hetionet, KEGG, PharmGKB and CTD_curated (Associate).\n') 216 | f.close() 217 | 218 | 219 | def integrate_CTD_DiG_inferred(): 220 | DiG_res = pd.read_csv(folder + 'DiG_res_4.csv') 221 | DiG_res_col = list(DiG_res.columns)[2:] 222 | DiG_res['Inferred_Relation'] = [0] * len(DiG_res) 223 | DiG_res['Inference_Score'] = [''] * len(DiG_res) 224 | 225 | disease_gene_inferred = pd.read_csv(folder + 'CTD_disease_gene_inferred.csv') 226 | 227 | for col in DiG_res_col[:-1]: 228 | disease_gene_inferred[col] = [0] * len(disease_gene_inferred) 229 | disease_gene_inferred['Source'] = ['CTD'] * len(disease_gene_inferred) 230 | disease_gene_inferred['Inferred_Relation'] = [1] * len(disease_gene_inferred) 231 | temp_col = list(disease_gene_inferred.columns) 232 | disease_gene_inferred_col = temp_col[:2] + temp_col[3:] + temp_col[2:3] 233 | disease_gene_inferred = disease_gene_inferred[disease_gene_inferred_col] 234 | 
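    # The slice above moves the inference score (read in as the third column of the CTD
    # export) to the last position, matching DiG_res's trailing Source / Inferred_Relation /
    # Inference_Score columns; pd.concat aligns on column names, so this only affects the
    # saved column order.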
print(list(disease_gene_inferred.columns)) 235 | DiG_res = pd.concat((DiG_res, disease_gene_inferred)) 236 | DiG_res.loc[DiG_res.duplicated(subset=['Disease', 'Gene'], keep=False), 'Inferred_Relation'] = 1 237 | DiG_res['Source'] = np.where(DiG_res.duplicated(subset=['Disease', 'Gene'], keep=False), 238 | DiG_res['Source'].astype(str) + ';CTD', DiG_res['Source'].astype(str) + '') 239 | DiG_res = DiG_res.drop_duplicates(subset=['Disease', 'Gene'], keep='first') 240 | DiG_res_col = list(DiG_res.columns) 241 | DiG_res_col_new = DiG_res_col[:-3] + DiG_res_col[-2:-1] + DiG_res_col[-3:-2] + DiG_res_col[-1:] 242 | DiG_res = DiG_res[DiG_res_col_new] 243 | DiG_res['Source'] = DiG_res['Source'].apply(lambda x: ';'.join(sorted(set(x.split(';'))))) 244 | DiG_res.to_csv(folder + 'DiG_res_5.csv', index=False) 245 | with open(folder + 'integration_notes.txt', 'a') as f: 246 | f.write('DiG_res_5: Hetionet, KEGG, PharmGKB, CTD_curated and CTD (Inferred_Relation).\n') 247 | f.close() 248 | 249 | 250 | def integrate_DRKG_DiG(): 251 | DiG_res = pd.read_csv(folder + 'DiG_res_5.csv') 252 | DiG_res_col = list(DiG_res.columns)[2:] 253 | 254 | drkg_DiG = pd.read_csv('drkg_DiG.csv') 255 | # drkg_DDi = pd.read_csv('/drkg_DDi.csv') 256 | drkg_DiG = drkg_DiG.rename(columns={'entity_1': 'Disease', 'entity_2': 'Gene'}) 257 | drkg_DiG['Disease'] = drkg_DiG['Disease'].str.replace('Disease::', '') 258 | drkg_DiG['Gene'] = drkg_DiG['Gene'].str.replace('Gene::', '') 259 | dig_relation_list = list(drkg_DiG.drop_duplicates(subset='relation', keep='first')['relation']) 260 | # dig_source_list = list(drkg_DiG.drop_duplicates(subset='source', keep='first')['source']) 261 | # print(dig_relation_list) 262 | # print(dig_source_list) 263 | # print(drkg_DiG.drop_duplicates(subset='relation', keep='first')) 264 | 265 | disease_vocab = pd.read_csv(folder + 'disease_vocab.csv') 266 | mesh_disease_vocab = disease_vocab.dropna(subset=['mesh_id']) 267 | mesh_disease_primary_dict = mesh_disease_vocab.set_index('mesh_id')['primary'].to_dict() 268 | omim_vocab = disease_vocab.dropna(subset=['omim_id']) 269 | omim_vocab['omim_id'] = omim_vocab['omim_id'].astype(int).astype(str) 270 | omim_primary_dict = omim_vocab.set_index('omim_id')['primary'].to_dict() 271 | 272 | gene_vocab = pd.read_csv(folder + 'gene_vocab_2.csv') 273 | ncbi_vocab = gene_vocab.dropna(subset=['ncbi_id']) 274 | ncbi_vocab['ncbi_id'] = ncbi_vocab['ncbi_id'].astype(int).astype(str) 275 | ncbi_primary_dict = ncbi_vocab.set_index('ncbi_id')['primary'].to_dict() 276 | 277 | for drkg_rel in dig_relation_list: 278 | print(drkg_rel) 279 | DiG_res[drkg_rel] = [0] * len(DiG_res) 280 | drkg_DiG_temp = drkg_DiG[drkg_DiG['relation'] == drkg_rel] 281 | drkg_DiG_temp = drkg_DiG_temp[['Disease', 'Gene']] 282 | drkg_DiG_temp = drkg_DiG_temp.reset_index(drop=True) 283 | 284 | disease_list = [] 285 | gene_list = [] 286 | for i in range(len(drkg_DiG_temp)): 287 | disease_id = drkg_DiG_temp.loc[i, 'Disease'] 288 | gene_id = drkg_DiG_temp.loc[i, 'Gene'] 289 | 290 | if gene_id in ncbi_primary_dict: 291 | gene_list.append(ncbi_primary_dict[gene_id]) 292 | else: 293 | continue 294 | 295 | if 'MESH' in disease_id: 296 | disease_id = disease_id.replace('MESH:', '') 297 | disease_list.append(mesh_disease_primary_dict[disease_id]) 298 | else: 299 | disease_id = disease_id.replace('OMIM:', '') 300 | disease_list.append(omim_primary_dict[disease_id]) 301 | 302 | print(i + 1, '/', len(drkg_DiG_temp), 'Completed...') 303 | 304 | drkg_DiG_temp_primary = pd.DataFrame({'Disease': disease_list, 'Gene': 
gene_list}) 305 | 306 | for col in DiG_res_col[:-2]: 307 | drkg_DiG_temp_primary[col] = [0] * len(drkg_DiG_temp_primary) 308 | drkg_DiG_temp_primary['Source'] = ['DRKG'] * len(drkg_DiG_temp_primary) 309 | drkg_DiG_temp_primary['Inference_Score'] = [''] * len(drkg_DiG_temp_primary) 310 | drkg_DiG_temp_primary[drkg_rel] = [1] * len(drkg_DiG_temp_primary) 311 | DiG_res = pd.concat((DiG_res, drkg_DiG_temp_primary)) 312 | DiG_res.loc[DiG_res.duplicated(subset=['Disease', 'Gene'], keep=False), drkg_rel] = 1 313 | DiG_res['Source'] = np.where(DiG_res.duplicated(subset=['Disease', 'Gene'], keep=False), 314 | DiG_res['Source'].astype(str) + ';DRKG', DiG_res['Source'].astype(str) + '') 315 | DiG_res = DiG_res.drop_duplicates(subset=['Disease', 'Gene'], keep='first') 316 | DiG_res_col = list(DiG_res.columns) 317 | DiG_res_col_new = DiG_res_col[:-3] + DiG_res_col[-1:] + DiG_res_col[-3:-1] 318 | DiG_res = DiG_res[DiG_res_col_new] 319 | DiG_res_col = DiG_res_col_new[2:] 320 | DiG_res['Source'] = DiG_res['Source'].apply(lambda x: ';'.join(sorted(set(x.split(';'))))) 321 | 322 | DiG_res.to_csv(folder + 'DiG_res_6.csv', index=False) 323 | with open(folder + 'integration_notes.txt', 'a') as f: 324 | f.write('DiG_res_6: Hetionet, KEGG, PharmGKB, CTD and DRKG (Semantic Relations).\n') 325 | f.close() 326 | 327 | 328 | def main(): 329 | integrate_Hetionet() 330 | integrate_KEGG() 331 | # extract_PharmGKB_DiG() 332 | integrate_PharmGKB() 333 | integrate_CTD_DiG_curated() 334 | integrate_CTD_DiG_inferred() 335 | integrate_DRKG_DiG() 336 | 337 | # DiG = pd.read_csv(res_folder + 'relation/Di_G_res_6.csv') 338 | # print(len(DiG), len(DiG.drop_duplicates(subset=['Disease', 'Gene'], keep='first'))) 339 | # DiG_raw = pd.read_csv(res_folder + 'relation/Di_G_res.csv') 340 | # print(len(DiG_raw), len(DiG_raw.drop_duplicates(subset=['Disease', 'Gene'], keep='first'))) 341 | 342 | 343 | if __name__ == '__main__': 344 | main() 345 | -------------------------------------------------------------------------------- /iBKH-KD-protocol/Knowledge_Discovery_Pipeline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# iBKH-based Knowledge Discovery Pipeline" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "This is the implementation of the Knowledge Discovery pipeline in our iBKH portal at http://ibkh.ai/.\n", 15 | "\n", 16 | "Given a target entity of interest, the task is to discover the Top-N entities from different entity types (currently supporting gene, drug, symptom, and pathway entities) that potentially link to the target entity. \n", 17 | "\n", 18 | "\n", 19 | "Generally, the pipeline contains 3 steps: \n", 20 | "1. Data preparation (triplets generation); \n", 21 | "\n", 22 | "2. Knowledge graph embedding learning; \n", 23 | "\n", 24 | "3. Knowledge discovery based on link prediction – predicting drug entities that potentially link to the target disease. 
" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "### Step 1 – Data preparation (triplets generation)" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": {}, 37 | "source": [ 38 | "###### Collecting iBKH knowledge graph source data\n", 39 | "\n", 40 | "Download the latest version of iBKH knowledge graph data (entities and relations) at: https://github.com/wcm-wanglab/iBKH/tree/main/iBKH\n", 41 | "\n", 42 | "\n", 43 | "Please make sure putting the downloaded files following the structure below.\n", 44 | "\n", 45 | "```\n", 46 | ".\n", 47 | "├── Case Study-AD Drug Repurposing.ipynb\n", 48 | "├── Data\n", 49 | "│ ├── iBKH \n", 50 | "│ │ ├── Entity\n", 51 | "│ │ │ ├── anatomy_vocab.csv\n", 52 | "│ │ │ ├── disease_vocab.csv\n", 53 | "│ │ │ ├── drug_vocab.csv\n", 54 | "│ │ │ ├── dsp_vocab.csv\n", 55 | "│ │ │ ├── gene_vocab.csv\n", 56 | "│ │ │ ├── molecule_vocab.csv\n", 57 | "│ │ │ ├── pathway_vocab.csv\n", 58 | "│ │ │ ├── sdsi_vocab.csv\n", 59 | "│ │ │ ├── side_effect_vocab.csv\n", 60 | "│ │ │ ├── symptom_vocab.csv\n", 61 | "│ │ │ ├── tc_vocab.csv\n", 62 | "│ │ │ ├── ...\n", 63 | "│ │ │ │ \n", 64 | "│ │ ├── Relation\n", 65 | "│ │ │ ├── A_G_res.csv\n", 66 | "│ │ │ ├── D_D_res.csv\n", 67 | "│ │ │ ├── D_Di_res.csv\n", 68 | "│ │ │ ├── D_G_res.csv\n", 69 | "│ │ │ ├── D_Pwy_res.csv\n", 70 | "│ │ │ ├── D_SE_res.csv\n", 71 | "│ │ │ ├── Di_Di_res.csv\n", 72 | "│ │ │ ├── Di_G_res.csv\n", 73 | "│ │ │ ├── Di_Pwy_res.csv\n", 74 | "│ │ │ ├── Di_Sy_res.csv\n", 75 | "│ │ │ ├── DSP_SDSI_res.csv\n", 76 | "│ │ │ ├── G_G_res.csv\n", 77 | "│ │ │ ├── G_Pwy_res.csv\n", 78 | "│ │ │ ├── SDSI_A_res.csv\n", 79 | "│ │ │ ├── SDSI_D_res.csv\n", 80 | "│ │ │ ├── SDSI_Di_res.csv\n", 81 | "│ │ │ ├── SDSI_Sy.csv\n", 82 | "│ │ │ ├── SDSI_TC_res.csv\n", 83 | "│ │ │ ├── ...\n", 84 | "│ │ │ └── \n", 85 | "│ │ └── \n", 86 | "│ └── ...\n", 87 | "└── ...\n", 88 | "```" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "# import required packages\n", 98 | "\n", 99 | "import warnings\n", 100 | "warnings.filterwarnings(\"ignore\")\n", 101 | "\n", 102 | "import pandas as pd\n", 103 | "import numpy as np\n", 104 | "import pickle\n", 105 | "\n", 106 | "import torch as th\n", 107 | "import torch.nn.functional as fn\n", 108 | "\n", 109 | "from sklearn.preprocessing import MinMaxScaler\n", 110 | "\n", 111 | "import os\n", 112 | "\n", 113 | "import sys\n", 114 | "sys.path.append('.') # Use only with Jupyter Notebook\n", 115 | "\n", 116 | "import funcs.KG_processing as KG_processing" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": {}, 122 | "source": [ 123 | "### Step 1: Generate Triplet Set from iBKH " 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "metadata": {}, 129 | "source": [ 130 | "A triplet, i.e., (h, r, t), is the basic unit for a knowledge graph. We generate triplet set from iBKH, which will be used for knowledge graph embedding learning." 
131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "metadata": {}, 137 | "outputs": [], 138 | "source": [ 139 | "kg_folder = 'data/iBKH/' # The folder is used to store the iBKH-KG data\n", 140 | "triplet_path = 'data/triplets/' # The folder is used to store processed results\n", 141 | "if not os.path.exists(triplet_path):\n", 142 | " os.makedirs(triplet_path) \n", 143 | "output_path = 'data/dataset/' # Output folder\n", 144 | "if not os.path.exists(output_path):\n", 145 | " os.makedirs(output_path)" 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": {}, 151 | "source": [ 152 | "Generating triplets for different entity type pairs." 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [ 161 | "KG_processing.DDi_triplets(kg_folder, triplet_path)\n", 162 | "KG_processing.DG_triplets(kg_folder, triplet_path)\n", 163 | "KG_processing.DPwy_triplets(kg_folder, triplet_path)\n", 164 | "KG_processing.DSE_triplets(kg_folder, triplet_path)\n", 165 | "KG_processing.DiDi_triplets(kg_folder, triplet_path)\n", 166 | "KG_processing.DiG_triplets(kg_folder, triplet_path)\n", 167 | "KG_processing.DiPwy_triplets(kg_folder, triplet_path)\n", 168 | "KG_processing.DiSy_triplets(kg_folder, triplet_path)\n", 169 | "KG_processing.GG_triplets(kg_folder, triplet_path)\n", 170 | "KG_processing.GPwy_triplets(kg_folder, triplet_path)\n", 171 | "KG_processing.DD_triplets(kg_folder, triplet_path)" 172 | ] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "metadata": {}, 177 | "source": [ 178 | "Combine all the triplet sets extracted from the relation results among the entities, then convert the combined triplet set from .csv format to .tsv format based on the DGL-KE input requirements." 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": null, 184 | "metadata": {}, 185 | "outputs": [], 186 | "source": [ 187 | "# Specify the triplet types you want to use.\n", 188 | "included_pair_type = ['DDi', 'DiG', 'DG', 'GG', 'DD', 'DiDi',\n", 189 | " 'GPwy', 'DiPwy', 'DPwy', 'DiSy', 'DSE']\n", 190 | "\n", 191 | "# Running the script below will return a CSV file, which combines all triplets extracted from the above functions.\n", 192 | "KG_processing.generate_triplet_set(triplet_path=triplet_path) " 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": null, 198 | "metadata": {}, 199 | "outputs": [], 200 | "source": [ 201 | "# Split the data into training, validation, and testing sets.\n", 202 | "# And convert the data to TSV files following DGL-KE requirements.\n", 203 | "KG_processing.generate_DGL_data_set(triplet_path=triplet_path, \n", 204 | " output_path=output_path, \n", 205 | " train_val_test_ratio=[.9, .05, .05])" 206 | ] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "metadata": {}, 211 | "source": [ 212 | "### Step 2: Knowledge graph embedding" 213 | ] 214 | }, 215 | { 216 | "cell_type": "markdown", 217 | "metadata": {}, 218 | "source": [ 219 | "We invoke the command line toolkit provided by DGL-KE to learn the embedding of entities and relations in iBKH. Here, we use four different models to learn the entity and edge representations of iBKH, namely TransE, TransR, DistMult, and ComplEx. 
To use other KGE models or AWS instances, please refer to DGL-KE’s documentation.\n" 220 | ] 221 | }, 222 | { 223 | "cell_type": "markdown", 224 | "metadata": {}, 225 | "source": [ 226 | "Open a command line (Windows and UNIX) or terminal (macOS) and change the directory as " 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": null, 232 | "metadata": {}, 233 | "outputs": [], 234 | "source": [ 235 | "cd [your file path]/iBKH-KD-protocol" 236 | ] 237 | }, 238 | { 239 | "cell_type": "markdown", 240 | "metadata": {}, 241 | "source": [ 242 | "Train and evaluate the knowledge graph embedding model by running the command below." 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": null, 248 | "metadata": {}, 249 | "outputs": [], 250 | "source": [ 251 | "DGLBACKEND=pytorch \\\n", 252 | "dglke_train --dataset iBKH --data_path ./data/dataset \\\n", 253 | " --data_files training_triplets.tsv \\\n", 254 | " validation_triplets.tsv \\\n", 255 | " testing_triplets.tsv \\\n", 256 | " --format raw_udd_hrt --model_name [model name] \\\n", 257 | " --batch_size [batch size] --hidden_dim [hidden dim] \\\n", 258 | " --neg_sample_size [neg sample size] --gamma [gamma] \\\n", 259 | " --lr [learning rate] --max_step [max step] \\\n", 260 | " --log_interval [log interval] \\\n", 261 | " --batch_size_eval [batch size eval] \\\n", 262 | " -adv --regularization_coef [regularization coef] \\\n", 263 | " --num_thread [num thread] --num_proc [num proc] \\\n", 264 | " --neg_sample_size_eval [neg sample size eval] \\\n", 265 | " --save_path ./data/embeddings --test" 266 | ] 267 | }, 268 | { 269 | "cell_type": "markdown", 270 | "metadata": {}, 271 | "source": [ 272 | "Running the above command will train the specified knowledge graph embedding model on the training set and evaluate its performance on the link prediction task in the testing set. This will result in multiple metrics, including: Hit@k (the average number of times the positive triplet is among the k highest ranked triplets); Mean Rank (MR, the average rank of the positive triplets); and Mean Reciprocal Rank (MRR, the average reciprocal rank of the positive instances). Higher values of Hit@k and MRR and a lower value of MR indicate better performance.\n", 273 | "\n", 274 | "\n", 275 | "Of note, the user can use the above command to find optimal hyperparameters for the model. 
For simplicity, the user can also use our suggested hyperparameters below.\n", 276 | "\n", 277 | "```\n", 278 | "Arguments \t TransE\t TransR\t ComplEx\t DistMult\n", 279 | "--model_name\t TransE_l2\t TransR\t ComplEx\t DistMult\n", 280 | "--batch_size\t 1024\t 1024\t 1024\t 1024\n", 281 | "--batch_size_eval\t 1000\t 1000\t 1000\t 1000\n", 282 | "--neg_sample_size\t 256\t 256\t 256\t 256\n", 283 | "--neg_sample_size_eval\t1000\t 1000\t 1000\t 1000\n", 284 | "--hidden_dim\t 400\t 200\t 200\t 400\n", 285 | "--gamma\t 12.0\t 12.0\t 12.0\t 12.0\n", 286 | "--lr\t 0.1\t 0.005\t 0.005\t 0.005\n", 287 | "--max_step\t 10000\t 10000\t 10000\t 10000\n", 288 | "--log_interval \t100\t 100\t 100\t 100\n", 289 | "--regularization_coef\t1.00E-09\t 1.00E-07\t 1.00E-07\t 1.00E-07\n", 290 | "\n", 291 | "```" 292 | ] 293 | }, 294 | { 295 | "cell_type": "markdown", 296 | "metadata": {}, 297 | "source": [ 298 | "After determining hyperparameters that can lead to desirable performance, we then re-train the model using the whole dataset by running" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": null, 304 | "metadata": {}, 305 | "outputs": [], 306 | "source": [ 307 | "DGLBACKEND=pytorch \\\n", 308 | "dglke_train --dataset iBKH --data_path ./data/dataset \\\n", 309 | " --data_files whole_triplets.tsv \\\n", 310 | " --format raw_udd_hrt --model_name [model name] \\\n", 311 | " --batch_size [batch size] --hidden_dim [hidden dim] \\\n", 312 | " --neg_sample_size [neg sample size] --gamma [gamma] \\\n", 313 | " --lr [learning rate] --max_step [max step] \\\n", 314 | " --log_interval [log interval] \\\n", 315 | " -adv --regularization_coef [regularization coef] \\\n", 316 | " --num_thread [num thread] --num_proc [num proc] \\\n", 317 | " --save_path ./data/embeddings" 318 | ] 319 | }, 320 | { 321 | "cell_type": "markdown", 322 | "metadata": {}, 323 | "source": [ 324 | "This will generate two output files for each model: “iBKH_[model name]\\_entity.npy”, containing the low-dimensional embeddings of entities in iBKH, and “iBKH_[model name]\\_relation.npy”, containing the low-dimensional embeddings of relations in iBKH. These embeddings can be used in downstream knowledge discovery tasks." 325 | ] 326 | }, 327 | { 328 | "cell_type": "markdown", 329 | "metadata": {}, 330 | "source": [ 331 | "### Step 3: Knowledge Discovery Based on iBKH - Hypothesis Generation" 332 | ] 333 | }, 334 | { 335 | "cell_type": "markdown", 336 | "metadata": {}, 337 | "source": [ 338 | "This step conducts knowledge discovery based on iBKH. \n", 339 | "\n", 340 | "We showcase an example -- drug repurposing hypothesis generation for Parkinson's disease." 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": null, 346 | "metadata": {}, 347 | "outputs": [], 348 | "source": [ 349 | "from funcs.KG_link_pred import generate_hypothesis,\\\n", 350 | " generate_hypothesis_ensemble_model" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": null, 356 | "metadata": {}, 357 | "outputs": [], 358 | "source": [ 359 | "PD = [\"parkinson's disease\", \"late onset parkinson's disease\"]" 360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": null, 365 | "metadata": {}, 366 | "outputs": [], 367 | "source": [ 368 | "r_type = [\"Treats_DDi\", \"Palliates_DDi\"]" 369 | ] 370 | }, 371 | { 372 | "cell_type": "markdown", 373 | "metadata": {}, 374 | "source": [ 375 | "###### Drug repurposing hypothesis generation based on graph embedding using the TransE model." 
376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": null, 381 | "metadata": {}, 382 | "outputs": [], 383 | "source": [ 384 | "proposed_df = generate_hypothesis(target_entity=PD, candidate_entity_type='drug',\n", 385 | " relation_type=r_type, embedding_folder='data/embeddings',\n", 386 | " method='transE_l2', kg_folder = 'data/iBKH', \n", 387 | " triplet_folder = 'data/triplets', topK=100, \n", 388 | " save_path='output', save=True,\n", 389 | " without_any_rel=False)" 390 | ] 391 | }, 392 | { 393 | "cell_type": "markdown", 394 | "metadata": {}, 395 | "source": [ 396 | "This will result in an output CSV file stored in the \"output\" folder." 397 | ] 398 | }, 399 | { 400 | "cell_type": "code", 401 | "execution_count": null, 402 | "metadata": {}, 403 | "outputs": [], 404 | "source": [ 405 | "# print the predicted drugs.\n", 406 | "\n", 407 | "proposed_df" 408 | ] 409 | }, 410 | { 411 | "cell_type": "markdown", 412 | "metadata": {}, 413 | "source": [ 414 | "We provide an ensemble model that integrates TransE, TransR, ComplEx, and DistMult to generate hypotheses." 415 | ] 416 | }, 417 | { 418 | "cell_type": "code", 419 | "execution_count": null, 420 | "metadata": {}, 421 | "outputs": [], 422 | "source": [ 423 | "ensemble_proposed_df = generate_hypothesis_ensemble_model(target_entity=PD, candidate_entity_type='drug',\n", 424 | " relation_type=r_type, \n", 425 | " embedding_folder='data/embeddings',\n", 426 | " kg_folder = 'data/iBKH', \n", 427 | " triplet_folder = 'data/triplets',\n", 428 | " topK=100, save_path='output', save=True, \n", 429 | " without_any_rel=False)" 430 | ] 431 | }, 432 | { 433 | "cell_type": "code", 434 | "execution_count": null, 435 | "metadata": {}, 436 | "outputs": [], 437 | "source": [ 438 | "# print the predicted drugs using ensemble method\n", 439 | "ensemble_proposed_df" 440 | ] 441 | }, 442 | { 443 | "cell_type": "markdown", 444 | "metadata": {}, 445 | "source": [ 446 | "###### Interpreting prediction results in the knowledge graph." 447 | ] 448 | }, 449 | { 450 | "cell_type": "markdown", 451 | "metadata": {}, 452 | "source": [ 453 | "Finally, we interpret predicted repurposing drug candidates using the knowledge graph. We can extract intermediate entities that form the shortest paths linking the target entity (i.e., Parkinson's disease) and the predicted drug candidates." 454 | ] 455 | }, 456 | { 457 | "cell_type": "markdown", 458 | "metadata": {}, 459 | "source": [ 460 | "1. To achieve this goal, we first deploy the iBKH knowledge graph using Neo4j on an AWS server. Please refer to the following instructions to set up the knowledge graph: https://docs.google.com/document/d/1cLDPLp_nVCJ5xrDlJ-B-Q3wf24tb-Dyq55nAXxaNgTM/edit" 461 | ] 462 | }, 463 | { 464 | "cell_type": "markdown", 465 | "metadata": {}, 466 | "source": [ 467 | "2. Interpreting repurposing drug candidates." 
468 | ] 469 | }, 470 | { 471 | "cell_type": "code", 472 | "execution_count": null, 473 | "metadata": {}, 474 | "outputs": [], 475 | "source": [ 476 | "import funcs.knowledge_visualization as knowledge_visualization" 477 | ] 478 | }, 479 | { 480 | "cell_type": "code", 481 | "execution_count": null, 482 | "metadata": {}, 483 | "outputs": [], 484 | "source": [ 485 | "# List of predicted repurposing drug candidates to interpret\n", 486 | "\n", 487 | "drug_list = ['Glutathione', 'Clioquinol', 'Steroids', 'Taurine']" 488 | ] 489 | }, 490 | { 491 | "cell_type": "code", 492 | "execution_count": null, 493 | "metadata": {}, 494 | "outputs": [], 495 | "source": [ 496 | "knowledge_visualization.subgraph_visualization(target_type='Disease', target_list=PD,\n", 497 | " predicted_type='Drug', predicted_list=drug_list, \n", 498 | " neo4j_url = \"neo4j://54.210.251.104:7687\", \n", 499 | " username = \"neo4j\", password = \"password\",\n", 500 | " alpha=1.5, k=0.8, figsize=(15, 10), save=True)" 501 | ] 502 | } 503 | ], 504 | "metadata": { 505 | "kernelspec": { 506 | "display_name": "Python 3", 507 | "language": "python", 508 | "name": "python3" 509 | }, 510 | "language_info": { 511 | "codemirror_mode": { 512 | "name": "ipython", 513 | "version": 3 514 | }, 515 | "file_extension": ".py", 516 | "mimetype": "text/x-python", 517 | "name": "python", 518 | "nbconvert_exporter": "python", 519 | "pygments_lexer": "ipython3", 520 | "version": "3.7.3" 521 | } 522 | }, 523 | "nbformat": 4, 524 | "nbformat_minor": 5 525 | } 526 | -------------------------------------------------------------------------------- /Codes_Term Harmonization/Entity_Integration/entity_anatomy.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import string 4 | import requests 5 | from lxml.html import fromstring 6 | 7 | pd.set_option('display.max_columns', None) 8 | pd.set_option('display.max_rows', None) 9 | 10 | 11 | term_type_list = ['AC', 'BD', 'BN', 'BPCK', 'BR', 'CC', 'CDC', 'CDO', 'CD', 'CMN', 'CN', 'CPR', 'CP', 'CR', 'CSY', 'CV', 12 | 'CX', 'DC10', 'DC9', 'DE', 'DFG', 'DF', 'DI', 'DP', 'FI', 'FN', 'GLP', 'GN', 'GO', 'GPCK', 'HTJKN1', 13 | 'HTJKN', 'HTN', 'HT', 'ID', 'IN', 'IVC', 'IV', 'LA', 'LC', 'LG', 'LN', 'LPDN', 'LPN', 'LVDN', 'MD', 14 | 'MH', 'MIN', 'MS', 'MTH_CN', 'MTH_FN', 'MTH_LN', 'MTH_OAP', 'MTH_OPN', 'MTH_OP', 'MTH_PTGB', 15 | 'MTH_PTN', 'MTH_PT', 'MTH_RXN_BD', 'MTH_RXN_CDC', 'MTH_RXN_CD', 'MTH_RXN_DP', 'MTH_SI', 'MTH_SMQ', 16 | 'MV', 'NM', 'OC', 'OPN', 'OP', 'OR', 'OSN', 'PCE', 'PC', 'PEP', 'PHENO', 'PIN', 'PN', 'POS', 'PR', 17 | 'PSC', 'PSN', 'PTAV', 'PTCS', 'PTGB', 'PTJKN1', 'PTJKN', 'PTN', 'PT', 'PX', 'RPT', 'RXN_IN', 'RXN_PT', 18 | 'SBDC', 'SBDF', 'SBDG', 'SBD', 'SCDC', 'SCDF', 'SCDG', 'SCD', 'SCN', 'SD', 'SI', 'SMQ', 'SP', 'ST', 19 | 'SU', 'TA', 'TG', 'TQ', 'UCN', 'USN', 'VPT', 'VS', 'XD'] 20 | 21 | 22 | def get_UMLS_tgt(apikey): 23 | uri = "https://utslogin.nlm.nih.gov" 24 | auth_endpoint = "/cas/v1/api-key" 25 | params = {'apikey': apikey} 26 | h = {"Content-type": "application/x-www-form-urlencoded", "Accept": "text/plain", "User-Agent": "python"} 27 | r = requests.post(uri + auth_endpoint, data=params, headers=h) 28 | response = fromstring(r.text) 29 | tgt = response.xpath('//form/@action')[0] 30 | return tgt 31 | 32 | 33 | def get_UMLS_ts(tgt): 34 | service = "http://umlsks.nlm.nih.gov" 35 | params = {'service': service} 36 | h = {"Content-type": "application/x-www-form-urlencoded", "Accept": "text/plain", "User-Agent": 
"python"} 37 | r = requests.post(tgt, data=params, headers=h) 38 | st = r.text 39 | return st 40 | 41 | 42 | def get_UMLS_name(tgt, umls_cui): 43 | st = get_UMLS_ts(tgt) 44 | url = 'https://uts-ws.nlm.nih.gov/rest/content/current/CUI/' + umls_cui + '?ticket=' + st 45 | resp = requests.get(url) 46 | name = '' 47 | if 'error' not in resp.json(): 48 | content = resp.json()['result'] 49 | name = content['name'] 50 | 51 | return name 52 | 53 | 54 | def access_UMLS_CUI(tgt, id_type, entity_id): 55 | st = get_UMLS_ts(tgt) 56 | umls_url = 'https://uts-ws.nlm.nih.gov/rest/content/current/source/' + id_type + '/' + entity_id + \ 57 | '/atoms?ttys=MH,NM,PT&ticket=' + st 58 | resp = requests.get(umls_url) 59 | umls_cui = '' 60 | if 'error' not in resp.json(): 61 | content = resp.json()['result'][0] 62 | umls_cui = content['concept'].replace('https://uts-ws.nlm.nih.gov/rest/content/2020AB/CUI/', '') 63 | # print(umls_cui) 64 | return umls_cui 65 | 66 | 67 | def access_UMLS_CUI_name(tgt, name): 68 | name = name.lower() 69 | name = name.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation))) 70 | name_set = set(filter(None, name.split(' '))) 71 | if 'and' in name_set: 72 | name_set.remove('and') 73 | st = get_UMLS_ts(tgt) 74 | db_url = 'https://uts-ws.nlm.nih.gov/rest/search/current?string=' + name + '&ticket=' + st 75 | db_resp = requests.get(db_url) 76 | db_content_list = db_resp.json()['result']['results'] 77 | res_umls = '' 78 | exact_match = False 79 | for db_content in db_content_list: 80 | umls_cui = db_content['ui'] 81 | umls_name = db_content['name'].lower() 82 | umls_name = umls_name.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation))) 83 | umls_name_set = set(filter(None, umls_name.split(' '))) 84 | if umls_name_set == name_set: 85 | res_umls = umls_cui 86 | exact_match = True 87 | if res_umls == '': 88 | res_umls = db_content_list[0]['ui'] 89 | res_umls = res_umls if res_umls != 'NONE' else '' 90 | # print(res_umls, res_umls_name, exact_match) 91 | if not exact_match: 92 | st = get_UMLS_ts(tgt) 93 | url = 'https://uts-ws.nlm.nih.gov/rest/content/current/CUI/' + res_umls + '/atoms?ticket=' + st 94 | resp = requests.get(url) 95 | if 'error' not in resp.json(): 96 | pageCount = int(resp.json()['pageCount']) 97 | for page in range(1, pageCount + 1): 98 | st = get_UMLS_ts(tgt) 99 | page_url = 'https://uts-ws.nlm.nih.gov/rest/content/current/CUI/' + res_umls + '/atoms?pageNumber=' + str( 100 | page) + '&ticket=' + st 101 | page_resp = requests.get(page_url) 102 | content = page_resp.json()['result'] 103 | for res in content: 104 | if res['termType'] in term_type_list: 105 | disease_name = res['name'].lower().replace('to ', '').translate( 106 | str.maketrans(string.punctuation, ' ' * len(string.punctuation))) 107 | disease_name_set = set(filter(None, disease_name.split(' '))) 108 | if 'and' in disease_name_set: 109 | disease_name_set.remove('and') 110 | exact_match = name_set == disease_name_set 111 | if exact_match: 112 | break 113 | if exact_match: 114 | break 115 | # print(res_umls, res_umls_name, exact_match) 116 | return res_umls if exact_match else '' 117 | 118 | 119 | def UMLS2MeSH(tgt, umls_cui): 120 | st = get_UMLS_ts(tgt) 121 | mesh_url = 'https://uts-ws.nlm.nih.gov/rest/content/current/CUI/' + umls_cui + '/atoms?sabs=MSH&ttys=MH,NM,PT&ticket=' + st 122 | mesh_resp = requests.get(mesh_url) 123 | mesh_id = '' 124 | if 'error' not in mesh_resp.json(): 125 | mesh_content = mesh_resp.json()['result'] 126 | mesh_id = mesh_content[0]['code'].replace( 
127 | 'https://uts-ws.nlm.nih.gov/rest/content/2020AB/source/MSH/', '') 128 | return mesh_id 129 | 130 | 131 | def refine_res_2(): 132 | anatomy_res = pd.read_csv('anatomy_res_2.csv') 133 | anatomy_res = anatomy_res[['primary', 'name', 'uberon_id', 'bto_id', 'mesh_id', 'umls_cui']] 134 | 135 | anatomy_res['mesh_id'] = anatomy_res['mesh_id'].str.replace('MESH:', '') 136 | anatomy_res['umls_cui'] = anatomy_res['umls_cui'].str.replace('UMLS:', '') 137 | 138 | mesh_anatomy = pd.read_csv('anatomy_mesh.csv') 139 | mesh_anatomy['mesh_id'] = mesh_anatomy['mesh_id'].str.replace('MESH:', '') 140 | mesh_name_dict = mesh_anatomy.set_index('mesh_id')['mesh_term'].to_dict() 141 | 142 | bto = pd.read_csv('bto.csv') 143 | bto_name_dict = bto.set_index('bto_id')['name'].to_dict() 144 | 145 | apikey = '9a095f1e-f79f-4958-bfdd-2bcba5f134d6' 146 | tgt = get_UMLS_tgt(apikey) 147 | 148 | for i in range(len(anatomy_res)): 149 | mesh_id = anatomy_res.loc[i, 'mesh_id'] 150 | umls_cui = anatomy_res.loc[i, 'umls_cui'] 151 | bto_id = anatomy_res.loc[i, 'bto_id'] 152 | 153 | if not pd.isnull(bto_id): 154 | temp_df = anatomy_res[anatomy_res['bto_id'] == bto_id] 155 | if len(temp_df) > 1: 156 | bto_name = bto_name_dict[bto_id] 157 | temp_bto_name = bto_name.lower() 158 | temp_bto_name = temp_bto_name.translate( 159 | str.maketrans(string.punctuation, ' ' * len(string.punctuation))) 160 | bto_name_set = set(filter(None, temp_bto_name.split(' '))) 161 | for j in range(len(temp_df)): 162 | name = temp_df.iloc[j, 1] 163 | temp_name = name.lower() 164 | temp_name = temp_name.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation))) 165 | name_set = set(filter(None, temp_name.split(' '))) 166 | if name_set != bto_name_set: 167 | anatomy_res.loc[anatomy_res['name'] == name, 'bto_id'] = np.nan 168 | temp_2 = anatomy_res[anatomy_res['bto_id'] == bto_id] 169 | if len(temp_2) == 0: 170 | anatomy_res.loc[anatomy_res['name'] == temp_df.iloc[0, 1], 'bto_id'] = bto_id 171 | temp_df_2 = anatomy_res[anatomy_res['bto_id'] == bto_id] 172 | if len(temp_df_2) > 1: 173 | for j in range(1, len(temp_df_2)): 174 | temp_primary = temp_df_2.iloc[j, 0] 175 | anatomy_res.loc[anatomy_res['primary'] == temp_primary, 'bto_id'] = np.nan 176 | 177 | if not pd.isnull(mesh_id): 178 | temp_df = anatomy_res[anatomy_res['mesh_id'] == mesh_id] 179 | if len(temp_df) > 1: 180 | mesh_term = mesh_name_dict[mesh_id] if mesh_id in mesh_name_dict else '' 181 | temp_mesh_term = mesh_term.lower() 182 | temp_mesh_term = temp_mesh_term.translate( 183 | str.maketrans(string.punctuation, ' ' * len(string.punctuation))) 184 | mesh_term_set = set(filter(None, temp_mesh_term.split(' '))) 185 | for j in range(len(temp_df)): 186 | name = temp_df.iloc[j, 1] 187 | temp_name = name.lower() 188 | temp_name = temp_name.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation))) 189 | name_set = set(filter(None, temp_name.split(' '))) 190 | if name_set != mesh_term_set: 191 | anatomy_res.loc[anatomy_res['name'] == name, 'mesh_id'] = np.nan 192 | temp_2 = anatomy_res[anatomy_res['mesh_id'] == mesh_id] 193 | if len(temp_2) == 0: 194 | anatomy_res.loc[anatomy_res['name'] == temp_df.iloc[0, 1], 'mesh_id'] = mesh_id 195 | temp_df_2 = anatomy_res[anatomy_res['mesh_id'] == mesh_id] 196 | if len(temp_df_2) > 1: 197 | for j in range(1, len(temp_df_2)): 198 | temp_primary = temp_df_2.iloc[j, 0] 199 | anatomy_res.loc[anatomy_res['primary'] == temp_primary, 'mesh_id'] = np.nan 200 | 201 | if not pd.isnull(umls_cui): 202 | temp_df = 
anatomy_res[anatomy_res['umls_cui'] == umls_cui]
203 |             if len(temp_df) > 1:
204 |                 umls_name = get_UMLS_name(tgt, umls_cui)
205 |                 temp_umls_name = umls_name.lower()
206 |                 temp_umls_name = temp_umls_name.translate(
207 |                     str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
208 |                 umls_name_set = set(filter(None, temp_umls_name.split(' ')))
209 |                 for j in range(len(temp_df)):
210 |                     name = temp_df.iloc[j, 1]
211 |                     temp_name = name.lower()
212 |                     temp_name = temp_name.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
213 |                     name_set = set(filter(None, temp_name.split(' ')))
214 |                     if name_set != umls_name_set:
215 |                         anatomy_res.loc[anatomy_res['name'] == name, 'umls_cui'] = np.nan
216 |                 temp_2 = anatomy_res[anatomy_res['umls_cui'] == umls_cui]
217 |                 if len(temp_2) == 0:
218 |                     anatomy_res.loc[anatomy_res['name'] == temp_df.iloc[0, 1], 'umls_cui'] = umls_cui
219 |                 temp_df_2 = anatomy_res[anatomy_res['umls_cui'] == umls_cui]
220 |                 if len(temp_df_2) > 1:
221 |                     for j in range(1, len(temp_df_2)):
222 |                         temp_primary = temp_df_2.iloc[j, 0]
223 |                         anatomy_res.loc[anatomy_res['primary'] == temp_primary, 'umls_cui'] = np.nan
224 |         print(i + 1, '/', len(anatomy_res), 'Completed...')
225 |     anatomy_res.to_csv('anatomy_res_2_refined.csv', index=False)
226 | 
227 | 
228 | def enrich_CL():  # fill missing UMLS CUIs and MeSH IDs for Cell Ontology terms
229 |     cl_df = pd.read_csv('cl.csv')
230 | 
231 |     apikey = '9a095f1e-f79f-4958-bfdd-2bcba5f134d6'
232 |     tgt = get_UMLS_tgt(apikey)
233 | 
234 |     for i in range(len(cl_df)):
235 |         fma_id = cl_df.loc[i, 'fma']
236 |         mesh_id = cl_df.loc[i, 'mesh_id']
237 |         umls_cui = cl_df.loc[i, 'umls_cui']
238 |         name = cl_df.loc[i, 'name']
239 | 
240 |         if pd.isnull(umls_cui):
241 |             if not pd.isnull(fma_id):
242 |                 temp_umls = access_UMLS_CUI(tgt, 'FMA', fma_id)
243 |             else:
244 |                 temp_umls = access_UMLS_CUI_name(tgt, name)
245 |             cl_df.loc[i, 'umls_cui'] = temp_umls
246 | 
247 |         if not pd.isnull(umls_cui) and pd.isnull(mesh_id):
248 |             temp_mesh = UMLS2MeSH(tgt, umls_cui)
249 |             cl_df.loc[i, 'mesh_id'] = temp_mesh
250 |         print(i + 1, '/', len(cl_df), 'Completed...')
251 |     cl_df.to_csv('cl_enriched.csv', index=False)
252 | 
253 | 
254 | def refine_CL():  # keep one row per BTO/MeSH/UMLS ID, preferring exact name matches
255 |     cl_df = pd.read_csv('cl_enriched.csv')
256 |     cl_df = cl_df[['cl_id', 'name', 'mesh_id', 'umls_cui', 'fma', 'bto_id']]
257 | 
258 |     mesh_anatomy = pd.read_csv('anatomy_mesh.csv')
259 |     mesh_anatomy['mesh_id'] = mesh_anatomy['mesh_id'].str.replace('MESH:', '')
260 |     mesh_name_dict = mesh_anatomy.set_index('mesh_id')['mesh_term'].to_dict()
261 | 
262 |     bto = pd.read_csv('bto.csv')
263 |     bto_name_dict = bto.set_index('bto_id')['name'].to_dict()
264 | 
265 |     apikey = '9a095f1e-f79f-4958-bfdd-2bcba5f134d6'
266 |     tgt = get_UMLS_tgt(apikey)
267 | 
268 |     for i in range(len(cl_df)):
269 |         mesh_id = cl_df.loc[i, 'mesh_id']
270 |         umls_cui = cl_df.loc[i, 'umls_cui']
271 |         bto_id = cl_df.loc[i, 'bto_id']
272 | 
273 |         if not pd.isnull(bto_id):
274 |             temp_df = cl_df[cl_df['bto_id'] == bto_id]
275 |             if len(temp_df) > 1:
276 |                 bto_name = bto_name_dict[bto_id]
277 |                 temp_bto_name = bto_name.lower()
278 |                 temp_bto_name = temp_bto_name.translate(
279 |                     str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
280 |                 bto_name_set = set(filter(None, temp_bto_name.split(' ')))
281 |                 for j in range(len(temp_df)):
282 |                     name = temp_df.iloc[j, 1]
283 |                     temp_name = name.lower()
284 |                     temp_name = temp_name.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
285 |                     name_set = set(filter(None, temp_name.split(' ')))
286 |                     if name_set != bto_name_set:
287 | 
cl_df.loc[cl_df['name'] == name, 'bto_id'] = np.nan 288 | temp_2 = cl_df[cl_df['bto_id'] == bto_id] 289 | if len(temp_2) == 0: 290 | cl_df.loc[cl_df['name'] == temp_df.iloc[0, 1], 'bto_id'] = bto_id 291 | temp_df_2 = cl_df[cl_df['bto_id'] == bto_id] 292 | if len(temp_df_2) > 1: 293 | for j in range(1, len(temp_df_2)): 294 | temp_primary = temp_df_2.iloc[j, 0] 295 | cl_df.loc[cl_df['cl_id'] == temp_primary, 'bto_id'] = np.nan 296 | 297 | if not pd.isnull(mesh_id): 298 | temp_df = cl_df[cl_df['mesh_id'] == mesh_id] 299 | if len(temp_df) > 1: 300 | mesh_term = mesh_name_dict[mesh_id] if mesh_id in mesh_name_dict else '' 301 | temp_mesh_term = mesh_term.lower() 302 | temp_mesh_term = temp_mesh_term.translate( 303 | str.maketrans(string.punctuation, ' ' * len(string.punctuation))) 304 | mesh_term_set = set(filter(None, temp_mesh_term.split(' '))) 305 | for j in range(len(temp_df)): 306 | name = temp_df.iloc[j, 1] 307 | temp_name = name.lower() 308 | temp_name = temp_name.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation))) 309 | name_set = set(filter(None, temp_name.split(' '))) 310 | if name_set != mesh_term_set: 311 | cl_df.loc[cl_df['name'] == name, 'mesh_id'] = np.nan 312 | temp_2 = cl_df[cl_df['mesh_id'] == mesh_id] 313 | if len(temp_2) == 0: 314 | cl_df.loc[cl_df['name'] == temp_df.iloc[0, 1], 'mesh_id'] = mesh_id 315 | temp_df_2 = cl_df[cl_df['mesh_id'] == mesh_id] 316 | if len(temp_df_2) > 1: 317 | for j in range(1, len(temp_df_2)): 318 | temp_primary = temp_df_2.iloc[j, 0] 319 | cl_df.loc[cl_df['cl_id'] == temp_primary, 'mesh_id'] = np.nan 320 | 321 | if not pd.isnull(umls_cui): 322 | temp_df = cl_df[cl_df['umls_cui'] == umls_cui] 323 | if len(temp_df) > 1: 324 | umls_name = get_UMLS_name(tgt, umls_cui) 325 | temp_umls_name = umls_name.lower() 326 | temp_umls_name = temp_umls_name.translate( 327 | str.maketrans(string.punctuation, ' ' * len(string.punctuation))) 328 | umls_name_set = set(filter(None, temp_umls_name.split(' '))) 329 | for j in range(len(temp_df)): 330 | name = temp_df.iloc[j, 1] 331 | temp_name = name.lower() 332 | temp_name = temp_name.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation))) 333 | name_set = set(filter(None, temp_name.split(' '))) 334 | if name_set != umls_name_set: 335 | cl_df.loc[cl_df['name'] == name, 'umls_cui'] = np.nan 336 | temp_2 = cl_df[cl_df['umls_cui'] == umls_cui] 337 | if len(temp_2) == 0: 338 | cl_df.loc[cl_df['name'] == temp_df.iloc[0, 1], 'umls_cui'] = umls_cui 339 | temp_df_2 = cl_df[cl_df['umls_cui'] == umls_cui] 340 | if len(temp_df_2) > 1: 341 | for j in range(1, len(temp_df_2)): 342 | temp_primary = temp_df_2.iloc[j, 0] 343 | cl_df.loc[cl_df['cl_id'] == temp_primary, 'umls_cui'] = np.nan 344 | print(i + 1, '/', len(cl_df), 'Completed...') 345 | cl_df.to_csv('cl_refined.csv', index=False) 346 | 347 | 348 | def integrate_CL(): 349 | anatomy_res = pd.read_csv('anatomy_res_2_refined.csv') 350 | anatomy_res['cl_id'] = [''] * len(anatomy_res) 351 | idx = len(anatomy_res) 352 | 353 | anatomy_res['mesh_id'] = anatomy_res['mesh_id'].str.replace('MESH:', '') 354 | anatomy_res['umls_cui'] = anatomy_res['umls_cui'].str.replace('UMLS:', '') 355 | 356 | bto_list_res = list(anatomy_res.dropna(subset=['bto_id'])['bto_id']) 357 | mesh_list_res = list(anatomy_res.dropna(subset=['mesh_id'])['mesh_id']) 358 | umls_list_res = list(anatomy_res.dropna(subset=['umls_cui'])['umls_cui']) 359 | 360 | cl_res = pd.read_csv('cl_refined.csv') 361 | for i in range(len(cl_res)): 362 | cl_id = cl_res.loc[i, 'cl_id'] 363 | 
cl_name = cl_res.loc[i, 'name']
364 |         mesh_id = cl_res.loc[i, 'mesh_id']
365 |         umls_cui = cl_res.loc[i, 'umls_cui']
366 |         bto_id = cl_res.loc[i, 'bto_id']
367 | 
368 |         if bto_id in bto_list_res:
369 |             anatomy_res.loc[anatomy_res['bto_id'] == bto_id, 'cl_id'] = cl_id
370 |         elif mesh_id in mesh_list_res:
371 |             anatomy_res.loc[anatomy_res['mesh_id'] == mesh_id, 'cl_id'] = cl_id
372 |         elif umls_cui in umls_list_res:
373 |             anatomy_res.loc[anatomy_res['umls_cui'] == umls_cui, 'cl_id'] = cl_id
374 |         else:
375 |             anatomy_res.loc[idx] = [cl_id, cl_name, '', '', mesh_id, umls_cui, cl_id]
376 |             idx += 1
377 |         print(i + 1, '/', len(cl_res), 'Completed...')
378 |     anatomy_res.to_csv('anatomy_res_3.csv', index=False)
379 | 
380 | 
381 | def main():
382 |     refine_res_2()
383 |     # enrich_CL()
384 |     # refine_CL()
385 |     integrate_CL()
386 | 
387 |     an_vocab = pd.read_csv('anatomy_res_3.csv')
388 |     print(len(an_vocab), len(an_vocab.drop_duplicates(subset='primary', keep='first')))
389 |     mesh_vocab = an_vocab.dropna(subset=['mesh_id'])
390 |     print(len(mesh_vocab), len(mesh_vocab.drop_duplicates(subset='mesh_id', keep='first')))
391 |     bto_vocab = an_vocab.dropna(subset=['bto_id'])
392 |     print(len(bto_vocab), len(bto_vocab.drop_duplicates(subset='bto_id', keep='first')))
393 |     cl_vocab = an_vocab.dropna(subset=['cl_id'])
394 |     print(len(cl_vocab), len(cl_vocab.drop_duplicates(subset='cl_id', keep='first')))
395 |     umls_vocab = an_vocab.dropna(subset=['umls_cui'])
396 |     print(len(umls_vocab), len(umls_vocab.drop_duplicates(subset='umls_cui', keep='first')))
397 | 
398 |     # apikey = '9a095f1e-f79f-4958-bfdd-2bcba5f134d6'
399 |     # tgt = get_UMLS_tgt(apikey)
400 |     # # umls_cui = access_UMLS_CUI(tgt, 'FMA', '68646')
401 |     # umls_cui = access_UMLS_CUI_name(tgt, 'cell')
402 |     # print(umls_cui)
403 | 
404 | 
405 | if __name__ == '__main__':
406 |     main()
407 | 
--------------------------------------------------------------------------------
/Codes_Term Harmonization/Entity_Integration/entity_disease.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | import requests
4 | from lxml.html import fromstring
5 | import string
6 | 
7 | folder = ''
8 | term_type_list = ['AC', 'BD', 'BN', 'BPCK', 'BR', 'CC', 'CDC', 'CDO', 'CD', 'CMN', 'CN', 'CPR', 'CP', 'CR', 'CSY', 'CV',
9 |                   'CX', 'DC10', 'DC9', 'DE', 'DFG', 'DF', 'DI', 'DP', 'FI', 'FN', 'GLP', 'GN', 'GO', 'GPCK', 'HTJKN1',
10 |                   'HTJKN', 'HTN', 'HT', 'ID', 'IN', 'IVC', 'IV', 'LA', 'LC', 'LG', 'LN', 'LPDN', 'LPN', 'LVDN', 'MD',
11 |                   'MH', 'MIN', 'MS', 'MTH_CN', 'MTH_FN', 'MTH_LN', 'MTH_OAP', 'MTH_OPN', 'MTH_OP', 'MTH_PTGB',
12 |                   'MTH_PTN', 'MTH_PT', 'MTH_RXN_BD', 'MTH_RXN_CDC', 'MTH_RXN_CD', 'MTH_RXN_DP', 'MTH_SI', 'MTH_SMQ',
13 |                   'MV', 'NM', 'OC', 'OPN', 'OP', 'OR', 'OSN', 'PCE', 'PC', 'PEP', 'PHENO', 'PIN', 'PN', 'POS', 'PR',
14 |                   'PSC', 'PSN', 'PTAV', 'PTCS', 'PTGB', 'PTJKN1', 'PTJKN', 'PTN', 'PT', 'PX', 'RPT', 'RXN_IN', 'RXN_PT',
15 |                   'SBDC', 'SBDF', 'SBDG', 'SBD', 'SCDC', 'SCDF', 'SCDG', 'SCD', 'SCN', 'SD', 'SI', 'SMQ', 'SP', 'ST',
16 |                   'SU', 'TA', 'TG', 'TQ', 'UCN', 'USN', 'VPT', 'VS', 'XD']
17 | 
18 | def get_UMLS_tgt(apikey):
19 |     uri = "https://utslogin.nlm.nih.gov"
20 |     auth_endpoint = "/cas/v1/api-key"
21 |     params = {'apikey': apikey}
22 |     h = {"Content-type": "application/x-www-form-urlencoded", "Accept": "text/plain", "User-Agent": "python"}
23 |     r = requests.post(uri + auth_endpoint, data=params, headers=h)
24 |     response = fromstring(r.text)
25 |     tgt = response.xpath('//form/@action')[0]
26 |     return 
tgt 27 | 28 | 29 | def get_UMLS_ts(tgt): 30 | service = "http://umlsks.nlm.nih.gov" 31 | params = {'service': service} 32 | h = {"Content-type": "application/x-www-form-urlencoded", "Accept": "text/plain", "User-Agent": "python"} 33 | r = requests.post(tgt, data=params, headers=h) 34 | st = r.text 35 | return st 36 | 37 | 38 | def access_UMLS_CUI(tgt, id_type, entity_id): 39 | st = get_UMLS_ts(tgt) 40 | umls_url = 'https://uts-ws.nlm.nih.gov/rest/content/current/source/' + id_type + '/' + entity_id + \ 41 | '/atoms?ttys=MH,NM,PT&ticket=' + st 42 | resp = requests.get(umls_url) 43 | umls_cui = '' 44 | if 'error' not in resp.json(): 45 | content = resp.json()['result'][0] 46 | umls_cui = content['concept'].replace('https://uts-ws.nlm.nih.gov/rest/content/2020AB/CUI/', '') 47 | # print(umls_cui) 48 | return umls_cui 49 | 50 | 51 | def UMLS2MeSH(tgt, umls_cui): 52 | st = get_UMLS_ts(tgt) 53 | mesh_url = 'https://uts-ws.nlm.nih.gov/rest/content/current/CUI/' + umls_cui + '/atoms?sabs=MSH&ttys=MH,NM,PT&ticket=' + st 54 | mesh_resp = requests.get(mesh_url) 55 | mesh_id = '' 56 | if 'error' not in mesh_resp.json(): 57 | mesh_content = mesh_resp.json()['result'] 58 | mesh_id = mesh_content[0]['code'].replace( 59 | 'https://uts-ws.nlm.nih.gov/rest/content/2020AB/source/MSH/', '') 60 | return mesh_id 61 | 62 | 63 | def get_UMLS_name(tgt, umls_cui): 64 | st = get_UMLS_ts(tgt) 65 | url = 'https://uts-ws.nlm.nih.gov/rest/content/current/CUI/' + umls_cui + '?ticket=' + st 66 | resp = requests.get(url) 67 | name = '' 68 | if 'error' not in resp.json(): 69 | content = resp.json()['result'] 70 | name = content['name'] 71 | 72 | return name 73 | 74 | 75 | def access_UMLS_CUI_name(tgt, name): 76 | name = name.lower() 77 | name = name.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation))) 78 | name_set = set(filter(None, name.split(' '))) 79 | if 'and' in name_set: 80 | name_set.remove('and') 81 | st = get_UMLS_ts(tgt) 82 | db_url = 'https://uts-ws.nlm.nih.gov/rest/search/current?string=' + name + '&ticket=' + st 83 | db_resp = requests.get(db_url) 84 | db_content_list = db_resp.json()['result']['results'] 85 | res_umls = '' 86 | exact_match = False 87 | for db_content in db_content_list: 88 | umls_cui = db_content['ui'] 89 | umls_name = db_content['name'].lower() 90 | umls_name = umls_name.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation))) 91 | umls_name_set = set(filter(None, umls_name.split(' '))) 92 | if umls_name_set == name_set: 93 | res_umls = umls_cui 94 | exact_match = True 95 | if res_umls == '': 96 | res_umls = db_content_list[0]['ui'] 97 | res_umls = res_umls if res_umls != 'NONE' else '' 98 | # print(res_umls, res_umls_name, exact_match) 99 | if not exact_match: 100 | st = get_UMLS_ts(tgt) 101 | url = 'https://uts-ws.nlm.nih.gov/rest/content/current/CUI/' + res_umls + '/atoms?ticket=' + st 102 | resp = requests.get(url) 103 | if 'error' not in resp.json(): 104 | pageCount = int(resp.json()['pageCount']) 105 | for page in range(1, pageCount + 1): 106 | st = get_UMLS_ts(tgt) 107 | page_url = 'https://uts-ws.nlm.nih.gov/rest/content/current/CUI/' + res_umls + '/atoms?pageNumber=' + str( 108 | page) + '&ticket=' + st 109 | page_resp = requests.get(page_url) 110 | content = page_resp.json()['result'] 111 | for res in content: 112 | if res['termType'] in term_type_list: 113 | disease_name = res['name'].lower().replace('to ', '').translate( 114 | str.maketrans(string.punctuation, ' ' * len(string.punctuation))) 115 | disease_name_set = set(filter(None, 
disease_name.split(' '))) 116 | if 'and' in disease_name_set: 117 | disease_name_set.remove('and') 118 | exact_match = name_set == disease_name_set 119 | if exact_match: 120 | break 121 | if exact_match: 122 | break 123 | # print(res_umls, res_umls_name, exact_match) 124 | return res_umls if exact_match else '' 125 | 126 | 127 | def enrich_DO(): 128 | do_df = pd.read_csv(folder + 'do.csv') 129 | apikey = '9a095f1e-f79f-4958-bfdd-2bcba5f134d6' 130 | tgt = get_UMLS_tgt(apikey) 131 | 132 | for i in range(len(do_df)): 133 | umls_cui = do_df.loc[i, 'umls_cui'] 134 | mesh_id = do_df.loc[i, 'mesh_id'] 135 | if pd.isnull(umls_cui): 136 | icd_10 = do_df.loc[i, 'icd_10'] 137 | icd_10 = str(icd_10) if not pd.isnull(icd_10) else '' 138 | icd_9 = do_df.loc[i, 'icd_9'] 139 | icd_9 = str(icd_9) if not pd.isnull(icd_9) else '' 140 | snomedct_id = do_df.loc[i, 'snomedct_id'] 141 | snomedct_id = str(snomedct_id) if not pd.isnull(snomedct_id) else '' 142 | name = do_df.loc[i, 'disease_name'] 143 | umls_cui = access_UMLS_CUI(tgt, 'ICD10CM', icd_10) 144 | if umls_cui == '': 145 | umls_cui = access_UMLS_CUI(tgt, 'ICD9CM', icd_9) 146 | if umls_cui == '': 147 | umls_cui = access_UMLS_CUI(tgt, 'SNOMEDCT_US', snomedct_id) 148 | if umls_cui == '': 149 | umls_cui = access_UMLS_CUI_name(tgt, name) 150 | do_df.loc[i, 'umls_cui'] = umls_cui 151 | if not pd.isnull(umls_cui) and pd.isnull(mesh_id): 152 | mesh_id = UMLS2MeSH(tgt, umls_cui) 153 | do_df.loc[i, 'mesh_id'] = mesh_id 154 | print(i + 1, '/', len(do_df), 'Completed...') 155 | # print(do_df[['doid', 'umls_cui', 'mesh_id']]) 156 | do_df.to_csv(folder + 'do_enriched.csv', index=False) 157 | 158 | 159 | def refine_DO(): 160 | do_df = pd.read_csv(folder + 'do_enriched.csv') 161 | apikey = '9a095f1e-f79f-4958-bfdd-2bcba5f134d6' 162 | tgt = get_UMLS_tgt(apikey) 163 | 164 | mesh_disease = pd.read_csv(folder + 'mesh_disease.csv') 165 | mesh_disease['mesh_id'] = mesh_disease['mesh_id'].str.replace('MESH:', '') 166 | mesh_name_dict = mesh_disease.set_index('mesh_id')['mesh_term'].to_dict() 167 | 168 | for i in range(len(do_df)): 169 | mesh_id = do_df.loc[i, 'mesh_id'] 170 | umls_cui = do_df.loc[i, 'umls_cui'] 171 | if not pd.isnull(mesh_id): 172 | temp_df = do_df[do_df['mesh_id'] == mesh_id] 173 | if len(temp_df) > 1: 174 | mesh_term = mesh_name_dict[mesh_id] if mesh_id in mesh_name_dict else '' 175 | temp_mesh_term = mesh_term.lower() 176 | temp_mesh_term = temp_mesh_term.translate( 177 | str.maketrans(string.punctuation, ' ' * len(string.punctuation))) 178 | mesh_term_set = set(filter(None, temp_mesh_term.split(' '))) 179 | for j in range(len(temp_df)): 180 | name = temp_df.iloc[j, 1] 181 | temp_name = name.lower() 182 | temp_name = temp_name.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation))) 183 | name_set = set(filter(None, temp_name.split(' '))) 184 | if name_set != mesh_term_set: 185 | do_df.loc[do_df['disease_name'] == name, 'mesh_id'] = np.nan 186 | temp_2 = do_df[do_df['mesh_id'] == mesh_id] 187 | if len(temp_2) == 0: 188 | do_df.loc[do_df['disease_name'] == temp_df.iloc[0, 1], 'mesh_id'] = mesh_id 189 | temp_df_2 = do_df[do_df['mesh_id'] == mesh_id] 190 | if len(temp_df_2) > 1: 191 | for j in range(1, len(temp_df_2)): 192 | temp_primary = temp_df_2.iloc[j, 0] 193 | do_df.loc[do_df['doid'] == temp_primary, 'mesh_id'] = np.nan 194 | 195 | if not pd.isnull(umls_cui): 196 | temp_df = do_df[do_df['umls_cui'] == umls_cui] 197 | if len(temp_df) > 1: 198 | umls_name = get_UMLS_name(tgt, umls_cui) 199 | temp_umls_name = umls_name.lower() 200 | 
temp_umls_name = temp_umls_name.translate( 201 | str.maketrans(string.punctuation, ' ' * len(string.punctuation))) 202 | umls_name_set = set(filter(None, temp_umls_name.split(' '))) 203 | for j in range(len(temp_df)): 204 | name = temp_df.iloc[j, 1] 205 | temp_name = name.lower() 206 | temp_name = temp_name.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation))) 207 | name_set = set(filter(None, temp_name.split(' '))) 208 | if name_set != umls_name_set: 209 | do_df.loc[do_df['disease_name'] == name, 'umls_cui'] = np.nan 210 | temp_2 = do_df[do_df['umls_cui'] == umls_cui] 211 | if len(temp_2) == 0: 212 | do_df.loc[do_df['disease_name'] == temp_df.iloc[0, 1], 'umls_cui'] = umls_cui 213 | temp_df_2 = do_df[do_df['umls_cui'] == umls_cui] 214 | if len(temp_df_2) > 1: 215 | for j in range(1, len(temp_df_2)): 216 | temp_primary = temp_df_2.iloc[j, 0] 217 | do_df.loc[do_df['doid'] == temp_primary, 'umls_cui'] = np.nan 218 | print(i + 1, '/', len(do_df), 'Completed...') 219 | do_df.to_csv(folder + 'do_enriched_refined.csv', index=False) 220 | 221 | 222 | def enrich_KEGG(): 223 | kegg_df = pd.read_csv(folder + 'kegg_disease.csv') 224 | kegg_df = kegg_df.dropna(subset=['name']) 225 | kegg_df = kegg_df.reset_index(drop=True) 226 | apikey = '9a095f1e-f79f-4958-bfdd-2bcba5f134d6' 227 | tgt = get_UMLS_tgt(apikey) 228 | 229 | umls_list = [] 230 | for i in range(len(kegg_df)): 231 | names = kegg_df.loc[i, 'name'] 232 | mesh_id = kegg_df.loc[i, 'mesh_id'] 233 | icd_10 = kegg_df.loc[i, 'icd_10'] 234 | icd_10 = icd_10.split(' ')[0] if not pd.isnull(icd_10) else '' 235 | name = names.split('; ')[0] 236 | name = name[:name.find(' (')] 237 | umls_cui = '' 238 | if not pd.isnull(mesh_id): 239 | umls_cui = access_UMLS_CUI(tgt, 'MSH', mesh_id) 240 | if umls_cui == '': 241 | umls_cui = access_UMLS_CUI(tgt, 'ICD10CM', icd_10) 242 | if umls_cui == '': 243 | umls_cui = access_UMLS_CUI_name(tgt, name) 244 | umls_list.append(umls_cui) 245 | if umls_cui != '' and pd.isnull(mesh_id): 246 | mesh_id = UMLS2MeSH(tgt, umls_cui) 247 | kegg_df.loc[i, 'mesh_id'] = mesh_id 248 | print(i + 1, '/', len(kegg_df), 'Completed...') 249 | kegg_df['umls_cui'] = umls_list 250 | print(kegg_df[['kegg_id', 'mesh_id', 'umls_cui']]) 251 | kegg_df.to_csv(folder + 'kegg_disease_enriched.csv', index=False) 252 | 253 | 254 | def refine_KEGG(): 255 | kegg_df = pd.read_csv(folder + 'kegg_disease_enriched.csv') 256 | mesh_disease = pd.read_csv(folder + 'mesh_disease.csv') 257 | mesh_disease['mesh_id'] = mesh_disease['mesh_id'].str.replace('MESH:', '') 258 | mesh_name_dict = mesh_disease.set_index('mesh_id')['mesh_term'].to_dict() 259 | 260 | apikey = '9a095f1e-f79f-4958-bfdd-2bcba5f134d6' 261 | tgt = get_UMLS_tgt(apikey) 262 | 263 | for i in range(len(kegg_df)): 264 | mesh_id = kegg_df.loc[i, 'mesh_id'] 265 | umls_cui = kegg_df.loc[i, 'umls_cui'] 266 | 267 | if not pd.isnull(mesh_id): 268 | temp_df = kegg_df[kegg_df['mesh_id'] == mesh_id] 269 | if len(temp_df) > 1: 270 | mesh_term = mesh_name_dict[mesh_id] if mesh_id in mesh_name_dict else '' 271 | temp_mesh_term = mesh_term.lower() 272 | temp_mesh_term = temp_mesh_term.translate( 273 | str.maketrans(string.punctuation, ' ' * len(string.punctuation))) 274 | mesh_term_set = set(filter(None, temp_mesh_term.split(' '))) 275 | for j in range(len(temp_df)): 276 | name = temp_df.iloc[j, 1] 277 | temp_name = name.lower() 278 | temp_name = temp_name.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation))) 279 | name_set = set(filter(None, temp_name.split(' '))) 280 | if 
name_set != mesh_term_set: 281 | kegg_df.loc[kegg_df['name'] == name, 'mesh_id'] = np.nan 282 | temp_2 = kegg_df[kegg_df['mesh_id'] == mesh_id] 283 | if len(temp_2) == 0: 284 | kegg_df.loc[kegg_df['name'] == temp_df.iloc[0, 1], 'mesh_id'] = mesh_id 285 | temp_df_2 = kegg_df[kegg_df['mesh_id'] == mesh_id] 286 | if len(temp_df_2) > 1: 287 | for j in range(1, len(temp_df_2)): 288 | temp_primary = temp_df_2.iloc[j, 0] 289 | kegg_df.loc[kegg_df['kegg_id'] == temp_primary, 'mesh_id'] = np.nan 290 | 291 | if not pd.isnull(umls_cui): 292 | temp_df = kegg_df[kegg_df['umls_cui'] == umls_cui] 293 | if len(temp_df) > 1: 294 | umls_name = get_UMLS_name(tgt, umls_cui) 295 | temp_umls_name = umls_name.lower() 296 | temp_umls_name = temp_umls_name.translate( 297 | str.maketrans(string.punctuation, ' ' * len(string.punctuation))) 298 | umls_name_set = set(filter(None, temp_umls_name.split(' '))) 299 | for j in range(len(temp_df)): 300 | name = temp_df.iloc[j, 1] 301 | temp_name = name.lower() 302 | temp_name = temp_name.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation))) 303 | name_set = set(filter(None, temp_name.split(' '))) 304 | if name_set != umls_name_set: 305 | kegg_df.loc[kegg_df['name'] == name, 'umls_cui'] = np.nan 306 | temp_2 = kegg_df[kegg_df['umls_cui'] == umls_cui] 307 | if len(temp_2) == 0: 308 | kegg_df.loc[kegg_df['name'] == temp_df.iloc[0, 1], 'umls_cui'] = umls_cui 309 | temp_df_2 = kegg_df[kegg_df['umls_cui'] == umls_cui] 310 | if len(temp_df_2) > 1: 311 | for j in range(1, len(temp_df_2)): 312 | temp_primary = temp_df_2.iloc[j, 0] 313 | kegg_df.loc[kegg_df['kegg_id'] == temp_primary, 'umls_cui'] = np.nan 314 | print(i + 1, '/', len(kegg_df), 'Completed...') 315 | kegg_df.to_csv(folder + 'kegg_disease_enriched_refined.csv', index=False) 316 | 317 | 318 | def enrich_PharmGKB(): 319 | pharmgkb_df = pd.read_csv(folder + 'pharmgkb_disease_res.csv') 320 | apikey = '9a095f1e-f79f-4958-bfdd-2bcba5f134d6' 321 | tgt = get_UMLS_tgt(apikey) 322 | 323 | for i in range(len(pharmgkb_df)): 324 | mesh_id = pharmgkb_df.loc[i, 'mesh_id'] 325 | umls_cui = pharmgkb_df.loc[i, 'umls_cui'] 326 | 327 | if pd.isnull(umls_cui): 328 | snomedct_id = pharmgkb_df.loc[i, 'snomedct_id'] 329 | snomedct_id = str(snomedct_id) if not pd.isnull(snomedct_id) else '' 330 | name = pharmgkb_df.loc[i, 'name'] 331 | umls_cui = access_UMLS_CUI(tgt, 'SNOMEDCT_US', snomedct_id) 332 | if umls_cui == '': 333 | umls_cui = access_UMLS_CUI_name(tgt, name) 334 | pharmgkb_df.loc[i, 'umls_cui'] = umls_cui 335 | if not pd.isnull(umls_cui) and pd.isnull(mesh_id): 336 | mesh_id = UMLS2MeSH(tgt, umls_cui) 337 | pharmgkb_df.loc[i, 'mesh_id'] = mesh_id 338 | print(i + 1, '/', len(pharmgkb_df), 'Completed...') 339 | 340 | pharmgkb_df.to_csv(folder + 'pharmgkb_disease_enriched.csv', index=False) 341 | 342 | 343 | def refine_PharmGKB(): 344 | pharmgkb_df = pd.read_csv(folder + 'pharmgkb_disease_enriched.csv') 345 | apikey = '9a095f1e-f79f-4958-bfdd-2bcba5f134d6' 346 | tgt = get_UMLS_tgt(apikey) 347 | 348 | mesh_disease = pd.read_csv(folder + 'mesh_disease.csv') 349 | mesh_disease['mesh_id'] = mesh_disease['mesh_id'].str.replace('MESH:', '') 350 | mesh_name_dict = mesh_disease.set_index('mesh_id')['mesh_term'].to_dict() 351 | 352 | for i in range(len(pharmgkb_df)): 353 | mesh_id = pharmgkb_df.loc[i, 'mesh_id'] 354 | umls_cui = pharmgkb_df.loc[i, 'umls_cui'] 355 | if not pd.isnull(mesh_id): 356 | temp_df = pharmgkb_df[pharmgkb_df['mesh_id'] == mesh_id] 357 | if len(temp_df) > 1: 358 | mesh_term = mesh_name_dict[mesh_id] if 
mesh_id in mesh_name_dict else '' 359 | temp_mesh_term = mesh_term.lower() 360 | temp_mesh_term = temp_mesh_term.translate( 361 | str.maketrans(string.punctuation, ' ' * len(string.punctuation))) 362 | mesh_term_set = set(filter(None, temp_mesh_term.split(' '))) 363 | for j in range(len(temp_df)): 364 | name = temp_df.iloc[j, 1] 365 | temp_name = name.lower() 366 | temp_name = temp_name.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation))) 367 | name_set = set(filter(None, temp_name.split(' '))) 368 | if name_set != mesh_term_set: 369 | pharmgkb_df.loc[pharmgkb_df['name'] == name, 'mesh_id'] = np.nan 370 | temp_2 = pharmgkb_df[pharmgkb_df['mesh_id'] == mesh_id] 371 | if len(temp_2) == 0: 372 | pharmgkb_df.loc[pharmgkb_df['name'] == temp_df.iloc[0, 1], 'mesh_id'] = mesh_id 373 | temp_df_2 = pharmgkb_df[pharmgkb_df['mesh_id'] == mesh_id] 374 | if len(temp_df_2) > 1: 375 | for j in range(1, len(temp_df_2)): 376 | temp_primary = temp_df_2.iloc[j, 0] 377 | pharmgkb_df.loc[pharmgkb_df['pharmgkb_id'] == temp_primary, 'mesh_id'] = np.nan 378 | if not pd.isnull(umls_cui): 379 | temp_df = pharmgkb_df[pharmgkb_df['umls_cui'] == umls_cui] 380 | if len(temp_df) > 1: 381 | umls_name = get_UMLS_name(tgt, umls_cui) 382 | temp_umls_name = umls_name.lower() 383 | temp_umls_name = temp_umls_name.translate( 384 | str.maketrans(string.punctuation, ' ' * len(string.punctuation))) 385 | umls_name_set = set(filter(None, temp_umls_name.split(' '))) 386 | for j in range(len(temp_df)): 387 | name = temp_df.iloc[j, 1] 388 | temp_name = name.lower() 389 | temp_name = temp_name.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation))) 390 | name_set = set(filter(None, temp_name.split(' '))) 391 | if name_set != umls_name_set: 392 | pharmgkb_df.loc[pharmgkb_df['name'] == name, 'umls_cui'] = np.nan 393 | temp_2 = pharmgkb_df[pharmgkb_df['umls_cui'] == umls_cui] 394 | if len(temp_2) == 0: 395 | pharmgkb_df.loc[pharmgkb_df['name'] == temp_df.iloc[0, 1], 'umls_cui'] = umls_cui 396 | temp_df_2 = pharmgkb_df[pharmgkb_df['umls_cui'] == umls_cui] 397 | if len(temp_df_2) > 1: 398 | for j in range(1, len(temp_df_2)): 399 | temp_primary = temp_df_2.iloc[j, 0] 400 | pharmgkb_df.loc[pharmgkb_df['pharmgkb_id'] == temp_primary, 'umls_cui'] = np.nan 401 | print(i + 1, '/', len(pharmgkb_df), 'Completed...') 402 | pharmgkb_df.to_csv(folder + 'pharmgkb_disease_enriched_refined.csv', index=False) 403 | 404 | 405 | def refine_CTD_disease(): 406 | CTD_disease = pd.read_csv(folder + 'CTD_disease_enriched.csv') 407 | apikey = '9a095f1e-f79f-4958-bfdd-2bcba5f134d6' 408 | tgt = get_UMLS_tgt(apikey) 409 | 410 | for i in range(len(CTD_disease)): 411 | disease_id = CTD_disease.loc[i, 'disease_id'] 412 | mesh_id = CTD_disease.loc[i, 'mesh_id'] 413 | umls_cui = CTD_disease.loc[i, 'umls_cui'] 414 | if 'OMIM' in disease_id: 415 | if not pd.isnull(mesh_id): 416 | temp_df = CTD_disease[CTD_disease['mesh_id'] == mesh_id] 417 | if len(temp_df) > 1: 418 | for j in range(len(temp_df)): 419 | temp_primary = temp_df.iloc[j, 0] 420 | if 'OMIM' in temp_primary: 421 | CTD_disease.loc[CTD_disease['disease_id'] == temp_primary, 'mesh_id'] = np.nan 422 | if not pd.isnull(umls_cui): 423 | temp_df = CTD_disease[CTD_disease['umls_cui'] == umls_cui] 424 | if len(temp_df) > 1: 425 | for j in range(len(temp_df)): 426 | temp_primary = temp_df.iloc[j, 0] 427 | if 'OMIM' in temp_primary: 428 | CTD_disease.loc[CTD_disease['disease_id'] == temp_primary, 'umls_cui'] = np.nan 429 | 430 | CTD_disease.to_csv(folder + 
'CTD_disease_enriched_refined.csv', index=False) 431 | 432 | 433 | def enrich_DRKG_DDi(): 434 | drkg_DDi = pd.read_csv('/Users/yuhou/Documents/Knowledge_Graph/knowledge_bases_integration/stage_2/drkg_DDi.csv') 435 | drkg_DDi_disease = drkg_DDi.drop_duplicates(subset='entity_2', keep='first') 436 | drkg_DDi_disease['entity_2'] = drkg_DDi_disease['entity_2'].str.replace('Disease::', '') 437 | drkg_DDi_disease = drkg_DDi_disease.reset_index(drop=True)[['entity_2']] 438 | print(drkg_DDi_disease) 439 | 440 | 441 | def main(): 442 | # enrich_DO() 443 | # enrich_KEGG() 444 | # enrich_PharmGKB() 445 | enrich_DRKG_DDi() 446 | 447 | # refine_DO() 448 | # refine_KEGG() 449 | # refine_PharmGKB() 450 | # refine_CTD_disease() 451 | 452 | # apikey = '9a095f1e-f79f-4958-bfdd-2bcba5f134d6' 453 | # tgt = get_UMLS_tgt(apikey) 454 | # umls_cui = access_UMLS_CUI(tgt, 'ICD10CM', 'C83.8') 455 | # # name = 'Water' 456 | # # umls_cui = access_UMLS_CUI_name(tgt, name) 457 | # print(umls_cui) 458 | # umls_name = get_UMLS_name(tgt, 'C0265318') 459 | # print(umls_name) 460 | 461 | 462 | if __name__ == '__main__': 463 | main() 464 | --------------------------------------------------------------------------------