├── iBKH-KD-protocol
│   ├── data
│   │   └── test
│   ├── output
│   │   ├── shortest_path_interpretation_Disease_Drug.pdf
│   │   ├── prediction_drug_top100_transE_l2.csv
│   │   └── prediction_drug_top100_ensemble.csv
│   ├── funcs
│   │   ├── knowledge_visualization.py
│   │   └── KG_link_pred.py
│   └── Knowledge_Discovery_Pipeline.ipynb
├── Codes_Term Harmonization
│   ├── README.md
│   ├── Entity_Integration
│   │   ├── entity_side_effect.py
│   │   ├── entity_gene.py
│   │   ├── entity_pathway.py
│   │   ├── entity_symptom.py
│   │   ├── entity_anatomy.py
│   │   └── entity_disease.py
│   └── Relation_Integration
│       ├── integrate_drug_related.py
│       ├── integrate_disease_related.py
│       ├── integrate_drug_disease.py
│       ├── integrate_drug_gene.py
│       ├── integrate_gene_related.py
│       └── integrate_disease_gene.py
├── iBKH_Schema.png
├── Codes_Analysis
│   ├── image
│   │   └── knowledge_discover.png
│   └── README.md
├── iBKH
│   ├── iBKH_2021_04_12
│   │   ├── Relation
│   │   │   └── README.md
│   │   └── Entity
│   │       └── README.md
│   └── iBKH_2021_05_03
│       ├── Relation
│       │   └── README.md
│       └── Entity
│           └── README.md
├── README.md
└── Source Information
    └── README.md
/iBKH-KD-protocol/data/test:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/Codes_Term Harmonization/README.md:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/iBKH_Schema.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wcm-wanglab/iBKH/HEAD/iBKH_Schema.png
--------------------------------------------------------------------------------
/Codes_Analysis/image/knowledge_discover.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wcm-wanglab/iBKH/HEAD/Codes_Analysis/image/knowledge_discover.png
--------------------------------------------------------------------------------
/iBKH-KD-protocol/output/shortest_path_interpretation_Disease_Drug.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wcm-wanglab/iBKH/HEAD/iBKH-KD-protocol/output/shortest_path_interpretation_Disease_Drug.pdf
--------------------------------------------------------------------------------
/Codes_Term Harmonization/Entity_Integration/entity_side_effect.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 |
3 | folder = ''
4 |
5 |
6 | def extract_SIDER():
7 | sider_df = pd.read_table(folder + 'entity/side_effect/meddra_all_se.tsv', header=None)  # SIDER side-effect file, no header row
8 | sider_df = sider_df[sider_df[3] == 'PT']  # keep MedDRA preferred terms (PT) only
9 | sider_df = sider_df[[0, 4, 5]]
10 | sider_df = sider_df.rename(columns={0: 'CID', 4: 'umls_cui', 5: 'name'})
11 | sider_df = sider_df.reset_index(drop=True)
12 | print(sider_df)
13 | res = sider_df[['umls_cui', 'name']]
14 | res = res.drop_duplicates(subset='umls_cui', keep='first')
15 | res['primary'] = 'UMLS:' + res['umls_cui'].astype(str)
16 | res = res[['primary', 'umls_cui', 'name']]
17 | print(res)
18 | res.to_csv(folder + 'entity/side_effect/side_effect_vocab.csv', index=False)
19 |
20 |
21 | def main():
22 | extract_SIDER()
23 |
24 |
25 | if __name__ == '__main__':
26 | main()
27 |
--------------------------------------------------------------------------------
/iBKH/iBKH_2021_04_12/Relation/README.md:
--------------------------------------------------------------------------------
1 | ## Download iBKH relations
2 | To access the relations in iBKH, you can directly download them via the following link.
3 |
4 | ```
5 | https://wcm.box.com/s/fzzsx9ldj8a64jsa04hyf8khple7js7n
6 | ```
7 |
8 | When you unzip the file, you will get the following .csv files.
9 | ```
10 | ./relation/A_G_res.csv
11 | ./relation/D_D_res.csv
12 | ./relation/D_Di_res.csv
13 | ./relation/D_G_res.csv
14 | ./relation/Di_Di_res.csv
15 | ./relation/Di_G_res.csv
16 | ./relation/Di_S_res.csv
17 | ./relation/DSP_SDSI_res.csv
18 | ./relation/G_G_res.csv
19 | ./relation/SDSI_Ares.csv
20 | ./relation/SDSI_D_res.csv
21 | ./relation/SDSI_Di_res.csv
22 | ./relation/SDSI_S.csv
23 | ./relation/SDSI_TC_res.csv
24 | ```
25 |
26 | ## iBKH relations
27 | Each row in an iBKH relations table describes the relationship between a pair of entities. We kept all relationship types from the source databases, and we use a binary flag to express whether each relationship exists: 1 indicates that the relationship exists between the entity pair, and 0 indicates that it does not. For example,
28 | Drug | Disease | Palliates | Treats | Effect | Association | Source
29 | --- | --- | --- | --- |--- |--- |---
30 | DrugBank:DB00843 | DOID:10652 | 0 | 1 | 0 | 1 | Hetionet; CTD
31 | ... | ... | ... | ... | ... | ... | ...
32 |
33 | From the above record, we can observe that the entity 'Donepezil' (primary ID DrugBank:DB00843) and the entity 'Alzheimer's Disease' (primary ID DOID:10652) have the relations 'Treats' and 'Association', which come from Hetionet and CTD, respectively.
34 |
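35 | As a minimal sketch (assuming pandas and the unzipped `./relation/D_Di_res.csv`; column names follow the example table above), drug-disease pairs with a curated 'Treats' relation can be filtered like this:
36 | ```
37 | import pandas as pd
38 | 
39 | # Load the drug-disease relations table from the unzipped ./relation folder
40 | d_di = pd.read_csv('./relation/D_Di_res.csv')
41 | 
42 | # Keep pairs whose 'Treats' flag is 1, i.e., the relation exists
43 | treats = d_di[d_di['Treats'] == 1][['Drug', 'Disease', 'Source']]
44 | print(treats.head())
45 | ```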
--------------------------------------------------------------------------------
/iBKH/iBKH_2021_05_03/Relation/README.md:
--------------------------------------------------------------------------------
1 | ## Download iBKH relations
2 | To access the relations in iBKH, you can directly download them via the following link.
3 |
4 | ```
5 | https://wcm.box.com/s/dcq6lj4vxzs4rnxu6xx60ziwl62qrzyp
6 | ```
7 |
8 | When you unzip the file, you will get the following .csv files.
9 | ```
10 | ./relation/A_G_res.csv
11 | ./relation/D_D_res.csv
12 | ./relation/D_Di_res.csv
13 | ./relation/D_G_res.csv
14 | ./relation/D_Pwy_res.csv
15 | ./relation/D_SE_res.csv
16 | ./relation/Di_Di_res.csv
17 | ./relation/Di_G_res.csv
18 | ./relation/Di_Pwy_res.csv
19 | ./relation/Di_Sy_res.csv
20 | ./relation/DSP_SDSI_res.csv
21 | ./relation/G_G_res.csv
22 | ./relation/G_Pwy_res.csv
23 | ./relation/SDSI_A_res.csv
24 | ./relation/SDSI_D_res.csv
25 | ./relation/SDSI_Di_res.csv
26 | ./relation/SDSI_Sy.csv
27 | ./relation/SDSI_TC_res.csv
28 | ```
29 |
30 | ## iBKH relations
31 | Each row in an iBKH relations table describes the relationship between a pair of entities. We kept all relationship types from the source databases, and we use a binary flag to express whether each relationship exists: 1 indicates that the relationship exists between the entity pair, and 0 indicates that it does not. In addition, a triplet that comes solely from a CTD-inferred relation is assigned an inference score, which reflects the degree of similarity of the drug-disease network underlying the CTD-inferred relationship.
32 |
33 | | Drug | Disease | Treats | Palliates | Effect | Associate | Inferred_Relation | ... | Source | Inference_Score |
34 | | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
35 | | DrugBank:DB00843 | DOID:10652 | 1 | 0 | 1 | 1 | 0 | ... | CTD;DRKG;Hetionet;KEGG | ... |
36 | | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
37 |
38 | From the above example record, we can observe that the entity 'Donepezil' (primary ID DrugBank:DB00843) and the entity 'Alzheimer's Disease' (primary ID DOID:10652) have, among others, the relations 'Treats' and 'Associate', which come from Hetionet and the CTD curated relations, respectively.
39 |
40 | | Drug | Disease | Treats | Palliates | Effect | Associate | Inferred_Relation | ... | Source | Inference_Score |
41 | | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
42 | | DrugBank:DB06767 | DOID:10283 | 0 | 0 | 0 | 0 | 1 | ... |CTD | 342.19 |
43 | | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
44 |
45 | From the above example record, we can observe that the entity 'Ammonium chloride' (primary ID DrugBank:DB06767) and the entity 'Prostate cancer' (primary ID DOID:10283) are connected only by a CTD-inferred relation, so the triplet is assigned an inference score (342.19).
46 |
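47 | As a minimal sketch (assuming pandas and the unzipped `./relation/D_Di_res.csv`; column names follow the example tables above), CTD-inferred-only triplets can be ranked by their inference score:
48 | ```
49 | import pandas as pd
50 | 
51 | # Load the drug-disease relations table from the unzipped ./relation folder
52 | d_di = pd.read_csv('./relation/D_Di_res.csv')
53 | 
54 | # Triplets that exist only as CTD-inferred relations carry an inference score
55 | inferred = d_di[d_di['Inferred_Relation'] == 1]
56 | print(inferred[['Drug', 'Disease', 'Inference_Score']]
57 |       .sort_values('Inference_Score', ascending=False).head())
58 | ```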
--------------------------------------------------------------------------------
/iBKH/iBKH_2021_04_12/Entity/README.md:
--------------------------------------------------------------------------------
1 | ## Download iBKH entities
2 | To access the entity vocabulary in iBKH, you can directly download the iBKH entities using the following link.
3 | ```
4 | https://wcm.box.com/s/kz7lnowhf2iejjwsopo6cqsdati0yj6i
5 | ```
6 |
7 | When you unzip the file, you will get the following .csv files.
8 | ```
9 | ./entity/anatomy_vocab.csv
10 | ./entity/disease_vocab.csv
11 | ./entity/drug_vocab.csv
12 | ./entity/dsp_vocab.csv
13 | ./entity/gene_vocab.csv
14 | ./entity/molecule_vocab.csv
15 | ./entity/sdsi_vocab.csv
16 | ./entity/symptom_vocab.csv
17 | ./entity/tc_vocab.csv
18 | ```
19 |
20 | ## iBKH entities vocabulary
21 | Each row in the iBKH entity vocabulary describes an entity, and each column in this row records the entity's information in different source databases (such as original ID, name, etc.). For example,
22 | | primary | do_id | do_name | kegg_id | kegg_name | pharmgkb_id | pharmgkb_name | umls_cui | mesh_id | ... |
23 | | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
24 | | DOID:0001816 | 0001816 | angiosarcoma | H01666 | Angiosarcoma | PA444390 | Hemangiosarcoma | C0018923 | D006394 | ... |
25 | | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
26 |
27 | The above row comes from the disease vocabulary and describes the disease entity 'Angiosarcoma'. We can observe that the entity 'Angiosarcoma' has the following information: Disease Ontology ID (DOID:0001816), KEGG ID (H01666), PharmGKB ID (PA444390), the name in PharmGKB ('Hemangiosarcoma'), UMLS CUI (C0018923) and MeSH ID (D006394).
28 |
29 | | primary | symbol | hgnc_id | ncbi_id | pharmgkb_id |
30 | | --- | --- | --- | --- | --- |
31 | | HGNC:5 | A1BG | 5 | 1 | PA24356 |
32 | | ... | ... | ... | ... | ... |
33 |
34 | This example comes from the gene vocabulary and describes the gene entity 'A1BG'. The entity 'A1BG' has the following information: HGNC ID (HGNC:5), NCBI ID (NCBI:1), gene symbol (A1BG) and PharmGKB ID (PA24356).
35 |
36 | We assigned a primary ID to each entity; for example, we used the HGNC ID as the primary ID in the gene entity vocabulary. We then use each entity's primary ID to refer to it in the relations tables. For example, the relation 'Treats' between 'Donepezil' and 'Alzheimer's Disease' in iBKH is described using Donepezil's primary ID (DrugBank:DB00843) and AD's primary ID (DOID:10652). When an entity lacks the first-choice primary ID, we follow a priority order to do the mapping; for example, the NCBI ID is the second-choice primary ID in the gene entity vocabulary. Currently, the entity vocabularies use the following primary ID priority order (a short lookup sketch follows the list):
37 | * Gene: HGNC ID, NCBI ID
38 | * Disease: Disease Ontology ID, KEGG ID, PharmGKB ID, MeSH ID, OMIM ID, iDISK ID
39 | * Drug: DrugBank ID, KEGG ID, PharmGKB ID, MeSH ID, iDISK ID
40 | * Anatomy: Uberon ID, BTO ID, MeSH ID, CL ID
41 | * Molecule: ChEMBL ID, ChEBI ID
42 | * Symptom: MeSH ID, UMLS CUI
43 | * DSI: iDISK
44 | * DSP: iDISK
45 | * TC: UMLS CUI
46 | * Pathway: Reactome ID, KEGG ID
47 | * Side-Effect: UMLS CUI
48 |
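49 | As a minimal sketch (assuming pandas and the unzipped `./entity/disease_vocab.csv`; columns follow the example table above), an entity's cross-references can be looked up by its primary ID:
50 | ```
51 | import pandas as pd
52 | 
53 | disease_vocab = pd.read_csv('./entity/disease_vocab.csv')
54 | 
55 | # Retrieve the 'Angiosarcoma' record via its primary ID and show its source IDs
56 | row = disease_vocab[disease_vocab['primary'] == 'DOID:0001816']
57 | print(row[['primary', 'do_id', 'kegg_id', 'pharmgkb_id', 'umls_cui', 'mesh_id']])
58 | ```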
--------------------------------------------------------------------------------
/iBKH/iBKH_2021_05_03/Entity/README.md:
--------------------------------------------------------------------------------
1 | ## Download iBKH entities
2 | To access the entity vocabulary in the iBKH, you can directly download the iBKH entities using the following link.
3 | ```
4 | https://wcm.box.com/s/gagu6yj2toyk4kirb6hpsb1qu4dm203p
5 | ```
6 |
7 | When you unzip the file, you will get the following .csv files.
8 | ```
9 | ./entity/anatomy_vocab.csv
10 | ./entity/disease_vocab.csv
11 | ./entity/drug_vocab.csv
12 | ./entity/dsp_vocab.csv
13 | ./entity/gene_vocab.csv
14 | ./entity/molecule_vocab.csv
15 | ./entity/pathway_vocab.csv
16 | ./entity/sdsi_vocab.csv
17 | ./entity/side_effect_vocab.csv
18 | ./entity/symptom_vocab.csv
19 | ./entity/tc_vocab.csv
20 | ```
21 |
22 | ## iBKH entities vocabulary
23 | Each row in the iBKH entity vocabulary describes an entity, and each column in this row records the entity's information in different source databases (such as original ID, name, etc.). For example,
24 | | primary | name | do_id | kegg_id | pharmgkb_id | mesh_id | umls_cui | icd_10 | icd_9 | omim_id | iDISK_id |
25 | | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
26 | | DOID:10652 | alzheimer's disease | DOID:10652 | H00056 | PA443319 | D000544 | C0002395 | G30 | 331 | ... | ... |
27 | | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
28 |
29 | The above row comes from the disease vocabulary and describes the disease entity 'alzheimer's disease'. We can observe that the entity has the following information: Disease Ontology ID (DOID:10652), KEGG ID (H00056), PharmGKB ID (PA443319), UMLS CUI (C0002395), MeSH ID (D000544), ICD-10 code (G30) and ICD-9 code (331).
30 |
31 | | primary | symbol | hgnc_id | ncbi_id | pharmgkb_id |
32 | | --- | --- | --- | --- | --- |
33 | | HGNC:5 | A1BG | 5 | 1 | PA24356 |
34 | | ... | ... | ... | ... | ... |
35 |
36 | This example comes from the gene vocabulary and describes the gene entity 'A1BG'. The entity 'A1BG' has the following information: HGNC ID (HGNC:5), NCBI ID (NCBI:1), gene symbol (A1BG) and PharmGKB ID (PA24356).
37 |
38 | We assigned a primary ID to each entity; for example, we used the HGNC ID as the primary ID in the gene entity vocabulary. We then use each entity's primary ID to refer to it in the relations tables. For example, the relation 'Treats' between 'Donepezil' and 'Alzheimer's Disease' in iBKH is described using Donepezil's primary ID (DrugBank:DB00843) and AD's primary ID (DOID:10652). When an entity lacks the first-choice primary ID, we follow a priority order to do the mapping; for example, the NCBI ID is the second-choice primary ID in the gene entity vocabulary. Currently, the entity vocabularies use the following primary ID priority order (a short sketch follows the list):
39 | * Gene: HGNC ID, NCBI ID
40 | * Disease: Disease Ontology ID, KEGG ID, PharmGKB ID, MeSH ID, OMIM ID, iDISK ID
41 | * Drug: DrugBank ID, KEGG ID, PharmGKB ID, MeSH ID, iDISK ID
42 | * Anatomy: Uberon ID, BTO ID, MeSH ID, CL ID
43 | * Molecule: ChEMBL ID, ChEBI ID
44 | * Symptom: MeSH ID, UMLS CUI
45 | * DSI: iDISK
46 | * DSP: iDISK
47 | * TC: UMLS CUI
48 | * Pathway: Reactome ID, KEGG ID
49 | * Side-Effect: UMLS CUI
50 |
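51 | As a minimal sketch (assuming pandas and the unzipped `./entity/gene_vocab.csv`; the helper below is illustrative, not part of the build pipeline), the gene priority order can be applied like this:
52 | ```
53 | import pandas as pd
54 | 
55 | def assign_primary(row):
56 |     # Gene priority order: HGNC ID first, then NCBI ID
57 |     if pd.notnull(row['hgnc_id']):
58 |         return 'HGNC:' + str(int(row['hgnc_id']))
59 |     if pd.notnull(row['ncbi_id']):
60 |         return 'NCBI:' + str(int(row['ncbi_id']))
61 |     return row['primary']  # fall back to the stored primary ID
62 | 
63 | gene_vocab = pd.read_csv('./entity/gene_vocab.csv')
64 | print(gene_vocab.apply(assign_primary, axis=1).head())
65 | ```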
--------------------------------------------------------------------------------
/Codes_Term Harmonization/Relation_Integration/integrate_drug_related.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 |
3 |
4 | folder = ''
5 |
6 |
7 | def integrate_DPwy():
8 | D_Pwy_kegg = pd.read_csv(folder + '/kegg_drug_pathway.csv')
9 | print(D_Pwy_kegg)
10 | pathway_vocab = pd.read_csv(folder + '/pathway_vocab.csv')
11 | drug_vocab = pd.read_csv(folder + '/drug_vocab.csv')
12 |
13 | pathway_primary_dict = pathway_vocab.dropna(subset=['kegg_id']).set_index('kegg_id')['primary'].to_dict()
14 | kegg_drug_vocab = drug_vocab.dropna(subset=['kegg_id'])
15 | kegg_drug_primary_dict = kegg_drug_vocab.set_index('kegg_id')['primary'].to_dict()
16 |
17 | D_Pwy_res = D_Pwy_kegg.replace({'kegg_id': kegg_drug_primary_dict, 'pathway_id': pathway_primary_dict})
18 | D_Pwy_res = D_Pwy_res.rename(columns={'kegg_id': 'Drug', 'pathway_id': 'Pathway'})
19 | D_Pwy_res = D_Pwy_res[['Drug', 'Pathway']]
20 | D_Pwy_res['Association'] = [1] * len(D_Pwy_res)
21 | D_Pwy_res['Source'] = ['KEGG'] * len(D_Pwy_res)
22 | print(D_Pwy_res)
23 | D_Pwy_res.to_csv(folder + '/D_Pw_res.csv', index=False)
24 |
25 |
26 | def integrate_DSE():
27 | sider_df = pd.read_table(folder + '/meddra_all_se.tsv', header=None)
28 | sider_df = sider_df[sider_df[3] == 'PT']  # keep MedDRA preferred terms (PT) only
29 | sider_df = sider_df[[0, 4, 5]]
30 | sider_df = sider_df.rename(columns={0: 'CID', 4: 'umls_cui', 5: 'name'})
31 | sider_df = sider_df.reset_index(drop=True)
32 |
33 | drug_vocab = pd.read_csv(folder + '/drug_vocab.csv')
34 | side_effect_vocab = pd.read_csv(folder + '/side_effect_vocab.csv')
35 |
36 | cid_primary = drug_vocab.dropna(subset=['CID']).set_index('CID')['primary'].to_dict()
37 | se_primary = side_effect_vocab.set_index(['umls_cui'])['primary'].to_dict()
38 |
39 | D_SE_res = sider_df[['CID', 'umls_cui']]
40 | D_SE_res = D_SE_res.replace({'CID': cid_primary})
41 | D_SE_res = D_SE_res.replace({'umls_cui': se_primary})
42 | D_SE_res['Cause'] = [1] * len(D_SE_res)
43 | D_SE_res['Source'] = ['SIDER'] * len(D_SE_res)
44 | D_SE_res = D_SE_res.rename(columns={'CID': 'Drug', 'umls_cui': 'Side_Effect'})
45 |
46 | print(D_SE_res)
47 | D_SE_res.to_csv(folder + '/D_SE_res.csv', index=False)
48 |
49 |
50 | def integrate_DSDSI():
51 | sdsi_spd = pd.read_table(folder + '/MRREL.RRF', delimiter='|')
52 | sdsi_spd = sdsi_spd[sdsi_spd['REL'] == 'interacts_with']
53 | sdsi_spd = sdsi_spd.reset_index(drop=True)
54 |
55 | sdsi_vocab = pd.read_csv(folder + '/sdsi_vocab.csv')
56 | sdsi_primary_dict = sdsi_vocab.set_index('iDISK_id')['primary'].to_dict()
57 | drug_vocab = pd.read_csv(folder + '/drug_vocab.csv')
58 | drug_idisk_primary_dict = drug_vocab.dropna(subset=['iDISK_id']).set_index('iDISK_id')['primary'].to_dict()
59 |
60 | sdsi_spd_res = pd.DataFrame(columns=['SDSI', 'Drug', 'interacts_with', 'Source'])
61 | for i in range(len(sdsi_spd)):
62 | sdsi = sdsi_spd.loc[i, 'CUI1']
63 | drug = sdsi_spd.loc[i, 'CUI2']
64 | sdsi_primary = sdsi_primary_dict[sdsi]
65 | drug_primary = drug_idisk_primary_dict[drug]
66 | sdsi_spd_res.loc[i] = [sdsi_primary, drug_primary, 1, 'iDISK']
67 | print(i + 1, '/', len(sdsi_spd), 'Completed (SDSI_SPD)...')
68 |
69 | sdsi_spd_res.to_csv(folder + '/SDSI_D_res.csv', index=False)
70 |
71 |
72 | def main():
73 | # integrate_DPwy()
74 | # integrate_DSE()
75 | integrate_DSDSI()
76 |
77 |
78 | if __name__ == '__main__':
79 | main()
80 |
--------------------------------------------------------------------------------
/Codes_Term Harmonization/Entity_Integration/entity_gene.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 |
3 |
4 | folder = ''
5 | res_folder = ''
6 |
7 |
8 | def remove_duplicated_ncbi():
9 | gene_vocab = pd.read_csv(folder + 'entity/gene_vocab.csv')
10 | gene_vocab = gene_vocab[['primary', 'symbol', 'hgnc_id', 'ncbi_id']]
11 | # gene_vocab = gene_vocab.drop_duplicates(subset='ncbi_id', keep='first')
12 | gene_vocab = gene_vocab[(~gene_vocab.duplicated(subset='ncbi_id')) | (gene_vocab['ncbi_id'].isnull())]  # drop duplicate ncbi_id rows but keep rows with missing ncbi_id
13 | print(len(gene_vocab), len(gene_vocab.drop_duplicates(subset='primary', keep='first')))
14 | hgnc_vocab = gene_vocab.dropna(subset=['hgnc_id'])
15 | print(len(hgnc_vocab), len(hgnc_vocab.drop_duplicates(subset='hgnc_id', keep='first')))
16 | ncbi_vocab = gene_vocab.dropna(subset=['ncbi_id'])
17 | print(len(ncbi_vocab), len(ncbi_vocab.drop_duplicates(subset='ncbi_id', keep='first')))
18 | print(gene_vocab)
19 | gene_vocab.to_csv(res_folder + 'gene_vocab.csv', index=False)
20 |
21 |
22 | def add_PharmGKB_gene():
23 | gene_vocab = pd.read_csv(res_folder + 'gene_vocab.csv')
24 | gene_vocab['pharmgkb_id'] = [''] * len(gene_vocab)
25 | idx = len(gene_vocab)
26 |
27 | hgnc_vocab = gene_vocab.dropna(subset=['hgnc_id'])
28 | hgnc_vocab['hgnc_id'] = hgnc_vocab['hgnc_id'].astype(int).astype(str)
29 | ncbi_vocab = gene_vocab.dropna(subset=['ncbi_id'])
30 | hgnc_list = list(hgnc_vocab['hgnc_id'])
31 | ncbi_list = list(ncbi_vocab['ncbi_id'])
32 |
33 | pharmgkb_gene = pd.read_table(res_folder + 'pharmgkb_gene.tsv')
34 | for i in range(len(pharmgkb_gene)):
35 | p_id = pharmgkb_gene.loc[i, 'PharmGKB Accession Id']
36 | hgnc_id = pharmgkb_gene.loc[i, 'HGNC ID']
37 | ncbi_id = pharmgkb_gene.loc[i, 'NCBI Gene ID']
38 | symbol = pharmgkb_gene.loc[i, 'Symbol']
39 |
40 | if not pd.isnull(hgnc_id):
41 | hgnc_id = hgnc_id.replace('HGNC:', '')
42 | if hgnc_id in hgnc_list:
43 | gene_vocab.loc[gene_vocab['hgnc_id'] == int(hgnc_id), 'pharmgkb_id'] = p_id
44 | elif not pd.isnull(ncbi_id):
45 | if ncbi_id in ncbi_list:
46 | gene_vocab.loc[gene_vocab['ncbi_id'] == ncbi_id, 'pharmgkb_id'] = p_id
47 | else:
48 | gene_vocab.loc[idx] = ['PharmGKB:' + p_id, symbol, '', '', p_id]
49 | idx += 1
50 | print(i + 1, '/', len(pharmgkb_gene), 'Completed...')
51 | print(gene_vocab)
52 | gene_vocab.to_csv(res_folder + 'gene_vocab_2.csv', index=False)
53 |
54 |
55 | def add_ensembl():
56 | gene_vocab = pd.read_csv(folder + 'entity/gene_vocab.csv')
57 |
58 | ensembl_df = pd.read_table(res_folder + 'gene2ensembl_May_3')
59 | ncbi_ensembl_dict = ensembl_df.set_index('GeneID')['Ensembl_gene_identifier'].to_dict()
60 | # ncbi_protein_dict = ensembl_df.set_index('GeneID')['Ensembl_protein_identifier'].to_dict()
61 | # print(gene_vocab)
62 | # print(ncbi_ensembl_dict[100527964], ncbi_protein_dict[100527964])
63 | ensembl_list = []
64 | for i in range(len(gene_vocab)):
65 | ncbi_id = gene_vocab.loc[i, 'ncbi_id']
66 | ensembl_id = ncbi_ensembl_dict[ncbi_id] if ncbi_id in ncbi_ensembl_dict else ''
67 | ensembl_list.append(ensembl_id)
68 | print(i + 1, '/', len(gene_vocab), 'Completed...')
69 | gene_vocab['ensembl_id'] = ensembl_list
70 | print(gene_vocab)
71 | gene_vocab.to_csv(res_folder + 'gene_vocab_3.csv', index=False)
72 |
73 |
74 | def main():
75 | # remove_duplicated_ncbi()
76 | # add_PharmGKB_gene()
77 |
78 | add_ensembl()
79 |
80 |
81 | if __name__ == '__main__':
82 | main()
83 |
--------------------------------------------------------------------------------
/Codes_Term Harmonization/Relation_Integration/integrate_disease_related.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 |
3 |
4 | folder = ''
5 |
6 |
7 | def integrate_DiPwy():
8 | Di_Pwy_kegg = pd.read_csv(folder + 'kegg_disease_pathway.csv')
9 | print(Di_Pwy_kegg)
10 |
11 | disease_vocab = pd.read_csv(folder + 'disease_vocab.csv')
12 | kegg_disease_vocab = disease_vocab.dropna(subset=['kegg_id'])
13 | kegg_disease_primary_dict = kegg_disease_vocab.set_index('kegg_id')['primary'].to_dict()
14 |
15 | pathway_vocab = pd.read_csv(folder + 'pathway_vocab.csv')
16 | pathway_primary_dict = pathway_vocab.dropna(subset=['kegg_id']).set_index('kegg_id')['primary'].to_dict()
17 |
18 | Di_Pwy_res = Di_Pwy_kegg.replace({'kegg_id': kegg_disease_primary_dict, 'pathway_id': pathway_primary_dict})
19 | Di_Pwy_res = Di_Pwy_res.rename(columns={'kegg_id': 'Disease', 'pathway_id': 'Pathway'})
20 | Di_Pwy_res = Di_Pwy_res[['Disease', 'Pathway']]
21 | Di_Pwy_res['Association'] = [1] * len(Di_Pwy_res)
22 | Di_Pwy_res['Source'] = ['KEGG'] * len(Di_Pwy_res)
23 | print(Di_Pwy_res)
24 | Di_Pwy_res.to_csv(folder + 'Di_Pw_res.csv', index=False)
25 |
26 |
27 | def integrate_DiSy():
28 | hetionet_DiSy = pd.read_csv(folder + 'hetionet_DiS.csv')
29 | hetionet_DiSy = hetionet_DiSy.rename(columns={'source': 'Disease', 'target': 'Symptom'})
30 | hetionet_DiSy = hetionet_DiSy[['Disease', 'Symptom']]
31 | print(hetionet_DiSy)
32 | disease_vocab = pd.read_csv(folder + 'disease_vocab.csv')
33 | do_vocab = disease_vocab.dropna(subset=['do_id'])
34 | do_primary_dict = do_vocab.set_index('do_id')['primary'].to_dict()
35 |
36 | symptom_vocab = pd.read_csv(folder + 'symptom_vocab.csv')
37 | symptom_primary_dict = symptom_vocab.set_index('mesh_id')['primary'].to_dict()
38 |
39 | hetionet_DiSy = hetionet_DiSy.replace({'Disease': do_primary_dict, 'Symptom': symptom_primary_dict})
40 | DiSy_res = hetionet_DiSy
41 | DiSy_res['Present'] = [1] * len(DiSy_res)
42 | DiSy_res['Source'] = ['Hetionet'] * len(DiSy_res)
43 | DiSy_res.to_csv(folder + 'Di_S_res.csv', index=False)
44 |
45 |
46 | def integrate_DiDSI():
47 | sdsi_dis = pd.read_table(folder + 'MRREL.RRF', delimiter='|')
48 | sdsi_dis = sdsi_dis[sdsi_dis['REL'] == 'is_effective_for']
49 | sdsi_dis = sdsi_dis.reset_index(drop=True)
50 |
51 | sdsi_vocab = pd.read_csv(folder + 'sdsi_vocab.csv')
52 | sdsi_primary_dict = sdsi_vocab.set_index('iDISK_id')['primary'].to_dict()
53 | disease_vocab = pd.read_csv(folder + 'disease_vocab.csv')
54 | iDISK_vocab = disease_vocab.dropna(subset=['iDISK_id'])
55 | iDISK_primary_dict = iDISK_vocab.set_index('iDISK_id')['primary'].to_dict()
56 |
57 | sdsi_dis = sdsi_dis.rename(columns={'CUI1': 'DSI', 'CUI2': 'Disease'})
58 | sdsi_dis = sdsi_dis[['DSI', 'Disease']]
59 | sdsi_dis = sdsi_dis.replace({'DSI': sdsi_primary_dict, 'Disease': iDISK_primary_dict})
60 |
61 | DSIDi_res = sdsi_dis
62 | DSIDi_res['is_effective_for'] = [1] * len(DSIDi_res)
63 | DSIDi_res['Source'] = ['iDISK'] * len(DSIDi_res)
64 | DSIDi_res.to_csv(folder + 'SDSI_Di_res.csv', index=False)
65 |
66 |
67 | def integrate_DSISy():
68 | sdsi_ss = pd.read_table(folder + 'MRREL.RRF', delimiter='|')
69 | sdsi_ss = sdsi_ss[sdsi_ss['REL'] == 'has_adverse_reaction']
70 | sdsi_ss = sdsi_ss.reset_index(drop=True)
71 |
72 | sdsi_vocab = pd.read_csv(folder + 'sdsi_vocab.csv')
73 | sdsi_primary_dict = sdsi_vocab.set_index('iDISK_id')['primary'].to_dict()
74 |
75 | symptom_vocab = pd.read_csv(folder + 'symptom_vocab.csv')
76 | symptom_primary_dict = symptom_vocab.dropna(subset=['iDISK_id']).set_index('iDISK_id')['primary'].to_dict()
77 |
78 | sdsi_ss = sdsi_ss.rename(columns={'CUI1': 'DSI', 'CUI2': 'Symptom'})
79 | sdsi_ss = sdsi_ss[['DSI', 'Symptom']]
80 | sdsi_ss = sdsi_ss.replace({'DSI': sdsi_primary_dict, 'Symptom': symptom_primary_dict})
81 |
82 | DSIDy_res = sdsi_ss
83 | DSIDy_res['has_adverse_reaction'] = [1] * len(DSIDy_res)
84 | DSIDy_res['Source'] = ['iDISK'] * len(DSIDy_res)
85 | DSIDy_res.to_csv(folder + 'SDSI_S_res.csv', index=False)
86 |
87 |
88 | def main():
89 | # integrate_DiPwy()
90 | # integrate_DiSy()
91 | # integrate_DiDSI()
92 | integrate_DSISy()
93 |
94 |
95 | if __name__ == '__main__':
96 | main()
97 |
--------------------------------------------------------------------------------
/Codes_Analysis/README.md:
--------------------------------------------------------------------------------
1 | # iBKH Case Study
2 | ## Overview
3 | We enable high-quality knowledge discovery based on iBKH. We developed a knowledge discovery module based on [DGL-KE (Deep Graph Library – Knowledge Embedding)](https://github.com/awslabs/dgl-ke), a Python package for efficient and scalable graph learning. To demonstrate its potential, we conducted two proof-of-concept studies: 1) Case Study I: in-silico hypothesis generation for Alzheimer’s disease (AD) drug repurposing, and 2) Case Study II: knowledge-enhanced cohort exploration for older adults with the Apolipoprotein E (APOE) ε4 genotype (a significant genetic risk factor for AD).
4 |
5 | ## Python Dependencies
6 | The code mainly depends on the scientific Python stack (Python 3.7):
7 | ```
8 | numpy 1.21.5
9 | pandas 1.3.5
10 | torch 1.2.0 (https://pytorch.org/)
11 | sklearn 0.0
12 | neo4j 5.2.0 (https://pypi.org/project/neo4j/5.2.0/)
13 | matplotlib 3.1.1
14 | statsmodels 0.11.1
15 | ```
16 |
17 | ## DGL-KE Platform for iBKH Setup
18 | In this work, we used the [Deep Graph Library - Knowledge Embedding (DGL-KE)](https://github.com/awslabs/dgl-ke), a Python-based implementation of advanced KGE algorithms such as TransE, TransR, ComplEx, and DistMult. You may follow the [Installation Guide](https://dglke.dgl.ai/doc/install.html) to complete the DGL-KE installation.
19 |
20 | ## Case Study - Alzheimer's Disease (AD) drug repurposing
21 | This is the implementation of AD drug repurposing based on iBKH. The task is to discover drugs that are potentially linked to AD in iBKH. Detailed information and code can be found [here](https://github.com/wcm-wanglab/iBKH/blob/main/Codes/Case_Study-AD_Drug_Repurposing.ipynb).
22 |
23 |
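24 | As background for the KGE models named above, here is a minimal sketch (using `torch` from the dependency list; the tensors and margin are illustrative, not trained iBKH embeddings) of the TransE_l2 scoring idea, where a triplet (h, r, t) scores higher when h + r ≈ t:
25 | ```
26 | import torch
27 | 
28 | # Illustrative 4-dimensional embeddings; real models use hundreds of dimensions
29 | h = torch.tensor([0.1, 0.3, -0.2, 0.5])  # head entity, e.g. a drug
30 | r = torch.tensor([0.2, -0.1, 0.4, 0.0])  # relation, e.g. 'Treats'
31 | t = torch.tensor([0.3, 0.2, 0.2, 0.5])   # tail entity, e.g. a disease
32 | 
33 | gamma = 12.0  # margin hyperparameter
34 | score = gamma - torch.norm(h + r - t, p=2)  # higher = more plausible triplet
35 | print(score.item())
36 | ```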
--------------------------------------------------------------------------------
/Codes_Term Harmonization/Entity_Integration/entity_pathway.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from bs4 import BeautifulSoup
3 | from urllib.request import Request, urlopen
4 | import urllib.error
5 |
6 | folder = ''
7 |
8 |
9 | def extract_Reactome_vocab():
10 | pathway_vocab = pd.read_table(folder + 'ReactomePathways.txt', header=None)
11 | pathway_vocab = pathway_vocab.rename(columns={0: 'Reactome_ID', 1: 'Name', 2: 'Species'})
12 | pathway_vocab['primary'] = 'REACT:' + pathway_vocab['Reactome_ID'].astype(str)
13 | pathway_res = pathway_vocab[['primary', 'Reactome_ID', 'Name', 'Species']]
14 | pathway_res = pathway_res[pathway_res['Species'] == 'Homo sapiens']
15 | pathway_res = pathway_res[['primary', 'Reactome_ID', 'Name']]
16 | print(pathway_res)
17 | pathway_res.to_csv(folder + 'res/pathway_res.csv', index=False)
18 |
19 |
20 | def add_CTD_pathway():
21 | ctd_pw = pd.read_csv('/Users/yuhou/Documents/Knowledge_Graph/CTD/vocabulary/CTD_pathways.csv', header=27)
22 | ctd_pw = ctd_pw.dropna(subset=['PathwayID'])
23 | ctd_pw = ctd_pw.reset_index(drop=True)
24 | pathway_res = pd.read_csv(folder + 'res/pathway_res.csv')
25 | idx = len(pathway_res)
26 | pathway_res['KEGG_ID'] = [''] * idx
27 | react_list = list(pathway_res['Reactome_ID'])
28 |
29 | for i in range(len(ctd_pw)):
30 | pathway_id = ctd_pw.loc[i, 'PathwayID']
31 | pathway_name = ctd_pw.loc[i, '# PathwayName']
32 | if 'REACT' in pathway_id:
33 | pathway_id = pathway_id.replace('REACT:', '')
34 | if pathway_id not in react_list:
35 | pathway_res.loc[idx] = ['REACT:' + pathway_id, pathway_id, pathway_name, '']
36 | idx += 1
37 | elif 'KEGG' in pathway_id:
38 | pathway_id = pathway_id.replace('KEGG:hsa', '')
39 | if 'M' in pathway_id:
40 | pathway_res.loc[idx] = ['KEGG:' + pathway_id.replace('_', ''), '', pathway_name, 'hsa' + pathway_id]
41 | idx += 1
42 | else:
43 | pathway_res.loc[idx] = ['KEGG:map' + pathway_id, '', pathway_name, 'hsa' + pathway_id]
44 | idx += 1
45 | print(i + 1, '/', len(ctd_pw), 'Completed...')
46 | pathway_res.to_csv(folder + 'res/pathway_res_2.csv', index=False)
47 |
48 |
49 | def process_reactome_go():
50 | reactome_vocab = pd.read_csv(folder + 'res/pathway_res.csv')
51 | go_id_list = []
52 | for i in range(len(reactome_vocab)):
53 | reactome_id = reactome_vocab.loc[i, 'Reactome_ID']
54 | url = 'https://reactome.org/content/detail/' + reactome_id
55 | req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
56 | go_id = ''
57 | try:
58 | rep = urlopen(req)
59 | webpage = rep.read()
60 | soup = BeautifulSoup(webpage, 'html.parser')
61 | go_link = soup.findAll("a", {"title": "go to GO"})
62 | if len(go_link) > 0:
63 | go_id = go_link[0].get('href').replace('https://www.ebi.ac.uk/QuickGO/term/', '')
64 | except urllib.error.HTTPError as e:
65 | print(reactome_id, 'HTTPError: {}'.format(e.code))
66 | go_id_list.append(go_id)
67 | print(i + 1, '/', len(reactome_vocab), 'Completed...')
68 | reactome_vocab['go_id'] = go_id_list
69 | reactome_vocab.to_csv(folder + 'res/reactome_pathway.csv', index=False)
70 |
71 |
72 | def integrate_reactome_kegg():
73 | pathway_res = pd.read_csv(folder + 'stage_4/entity/pathway/res/pathway_res.csv')
74 | pathway_res['kegg_id'] = [''] * len(pathway_res)
75 | idx = len(pathway_res)
76 | kegg_pathway = pd.read_csv(folder + 'KEGG/kegg_pathway.csv')
77 | print(kegg_pathway)
78 | reactome_golist = list(pathway_res.dropna(subset=['go_id'])['go_id'])
79 |
80 | for i in range(len(kegg_pathway)):
81 | kegg_id = kegg_pathway.loc[i, 'kegg_id']
82 | pathway_name = kegg_pathway.loc[i, 'name']
83 | go_id = kegg_pathway.loc[i, 'go_id']
84 | if go_id in reactome_golist:
85 | pathway_res.loc[pathway_res['go_id'] == go_id, 'kegg_id'] = kegg_id
86 | else:
87 | pathway_res.loc[idx] = ['KEGG:' + kegg_id, '', pathway_name, go_id, kegg_id]
88 | idx += 1
89 | print(i + 1, '/', len(kegg_pathway), 'Completed...')
90 | print(pathway_res)
91 | pathway_res.to_csv(folder + 'stage_4/entity/pathway/res/pathway_res_2.csv', index=False)
92 | with open(folder + 'stage_4/entity/pathway/res/integrate_note.txt', 'w') as f:
93 | f.write('pathway_res_2.csv: Reactome; KEGG')
94 | f.close()
95 |
96 |
97 | def main():
98 | # extract_Reactome_vocab()
99 | # add_CTD_pathway()
100 | # process_reactome_go()
101 |
102 | integrate_reactome_kegg()
103 |
104 |
105 | if __name__ == '__main__':
106 | main()
107 |
--------------------------------------------------------------------------------
/Codes_Term Harmonization/Entity_Integration/entity_symptom.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import requests
3 | from lxml.html import fromstring
4 |
5 | folder = ''
6 | iDISK_folder = ''
7 |
8 |
9 | def get_UMLS_tgt(apikey):
10 | uri = "https://utslogin.nlm.nih.gov"
11 | auth_endpoint = "/cas/v1/api-key"
12 | params = {'apikey': apikey}
13 | h = {"Content-type": "application/x-www-form-urlencoded", "Accept": "text/plain", "User-Agent": "python"}
14 | r = requests.post(uri + auth_endpoint, data=params, headers=h)
15 | response = fromstring(r.text)
16 | tgt = response.xpath('//form/@action')[0]
17 | return tgt
18 |
19 |
20 | def get_UMLS_ts(tgt):
21 | service = "http://umlsks.nlm.nih.gov"
22 | params = {'service': service}
23 | h = {"Content-type": "application/x-www-form-urlencoded", "Accept": "text/plain", "User-Agent": "python"}
24 | r = requests.post(tgt, data=params, headers=h)
25 | st = r.text
26 | return st
27 |
28 |
29 | def mesh2umls(tgt, mesh_id):
30 | st = get_UMLS_ts(tgt)
31 | mesh_url = 'https://uts-ws.nlm.nih.gov/rest/content/current/source/MSH/' + mesh_id + '/atoms?ttys=MH,NM&ticket=' + st
32 | mesh_resp = requests.get(mesh_url)
33 | umls_cui = ''
34 | if 'error' not in mesh_resp.json():
35 | mesh_content = mesh_resp.json()['result'][0]
36 | umls_cui = mesh_content['concept'].replace('https://uts-ws.nlm.nih.gov/rest/content/2020AB/CUI/', '')
37 |
38 | return umls_cui
39 |
40 |
41 | def UMLS2MeSH(tgt, umls_cui):
42 | st = get_UMLS_ts(tgt)
43 | mesh_url = 'https://uts-ws.nlm.nih.gov/rest/content/current/CUI/' + umls_cui + '/atoms?sabs=MSH&ttys=MH,NM,PT&ticket=' + st
44 | mesh_resp = requests.get(mesh_url)
45 | mesh_id = ''
46 | if 'error' not in mesh_resp.json():
47 | mesh_content = mesh_resp.json()['result']
48 | mesh_id = mesh_content[0]['code'].replace(
49 | 'https://uts-ws.nlm.nih.gov/rest/content/2020AB/source/MSH/', '')
50 | return mesh_id
51 |
52 |
53 | def enrich_Hetionet():
54 | hetionet_df = pd.read_table('/Users/yuhou/Documents/Knowledge_Graph/hetionet/hetionet-v1.0-nodes.tsv')
55 | hetionet_symptom = hetionet_df[hetionet_df['kind'] == 'Symptom']
56 | hetionet_symptom = hetionet_symptom.reset_index(drop=True)
57 | print(hetionet_symptom)
58 | res = pd.DataFrame(columns=['primary', 'name', 'mesh_id', 'umls_cui'])
59 | idx = 0
60 | apikey = '9a095f1e-f79f-4958-bfdd-2bcba5f134d6'
61 | tgt = get_UMLS_tgt(apikey)
62 | for i in range(len(hetionet_symptom)):
63 | mesh_id = hetionet_symptom.loc[i, 'id'].replace('Symptom::', '')
64 | name = hetionet_symptom.loc[i, 'name']
65 | umls_cui = mesh2umls(tgt, mesh_id)
66 | res.loc[idx] = ['MESH:' + mesh_id, name, mesh_id, umls_cui]
67 | idx += 1
68 | print(i + 1, '/', len(hetionet_symptom), 'Completed...')
69 | res.to_csv(folder + 'symptom_vocab_refined.csv', index=False)
70 |
71 |
72 | def integrate_iDISK():
73 | iDISK_SS = pd.read_csv(iDISK_folder + 'entity/SS_enriched.csv')
74 |
75 | symptom_vocab = pd.read_csv(folder + 'symptom_vocab_refined.csv')
76 | symptom_vocab['iDISK_id'] = [''] * len(symptom_vocab)
77 | idx = len(symptom_vocab)
78 | mesh_vocab_list = list(symptom_vocab.dropna(subset=['mesh_id'])['mesh_id'])
79 | umls_vocab_list = list(symptom_vocab.dropna(subset=['umls_cui'])['umls_cui'])
80 |
81 | apikey = '9a095f1e-f79f-4958-bfdd-2bcba5f134d6'
82 | tgt = get_UMLS_tgt(apikey)
83 |
84 | for i in range(len(iDISK_SS)):
85 | cui = iDISK_SS.loc[i, 'CUI']
86 | name = iDISK_SS.loc[i, 'name']
87 | umls_cui = iDISK_SS.loc[i, 'UMLS']
88 | mesh_id = UMLS2MeSH(tgt, umls_cui)
89 |
90 | if mesh_id in mesh_vocab_list:
91 | symptom_vocab.loc[symptom_vocab['mesh_id'] == mesh_id, 'iDISK_id'] = cui
92 | elif umls_cui in umls_vocab_list:
93 | symptom_vocab.loc[symptom_vocab['umls_cui'] == umls_cui, 'iDISK_id'] = cui
94 | else:
95 | if mesh_id != '':
96 | symptom_vocab.loc[idx] = ['MESH:' + mesh_id, name, mesh_id, umls_cui, cui]
97 | idx += 1
98 | else:
99 | symptom_vocab.loc[idx] = ['UMLS:' + umls_cui, name, mesh_id, umls_cui, cui]
100 | idx += 1
101 | print(i + 1, '/', len(iDISK_SS), 'Completed...')
102 | symptom_vocab.to_csv(folder + 'symptom_vocab_refined_2.csv', index=False)
103 |
104 |
105 | def main():
106 | # enrich_Hetionet()
107 | integrate_iDISK()
108 |
109 | # symptom_vocab = pd.read_csv(folder + 'symptom_vocab_refined.csv')
110 | # print(len(symptom_vocab), len(symptom_vocab.drop_duplicates(subset='primary', keep='first')))
111 | # mesh_vocab = symptom_vocab.dropna(subset=['mesh_id'])
112 | # print(len(mesh_vocab), len(mesh_vocab.drop_duplicates(subset='mesh_id', keep='first')))
113 | # umls_vocab = symptom_vocab.dropna(subset=['umls_cui'])
114 | # print(len(umls_vocab), len(umls_vocab.drop_duplicates(subset='umls_cui', keep='first')))
115 |
116 |
117 | if __name__ == '__main__':
118 | main()
119 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # integrative Biomedical Knowledge Hub (iBKH)
2 | iBKH integrates data from 17 publicly available biomedical databases. The current version of iBKH contains a total of 2,384,501 entities of 11 types. Specifically, the iBKH includes 23,003 anatomy entities, 19,236 disease entities, 37,997 drug entities, 88,376 gene entities, 2,065,015 molecule entities, 1,361 symptom entities, 4,101 DSI entities, 137,568 DSP entities, 605 TC entities, 2,988 pathway entities and 4,251 side-effect entities. For the relationships in the iBKH, there are 86 relation types within 18 kinds of entity pairs, including Anatomy-Gene, Drug-Disease, Drug-Drug, Drug-Gene, Disease-Disease, Disease-Gene, Disease-Symptom, Gene-Gene, DSI-Disease, DSI-Symptom, DSI-Drug, DSI-Anatomy, DSI-DSP, DSI-TC, Disease-Pathway, Drug-Pathway, Gene-Pathway and Drug-Side Effect. In total, iBKH contains 48,194,646 relations.
3 |
4 |
5 |
6 | ## Materials and Methods
7 | Our ultimate goal was to build a biomedical knowledge graph that incorporates biomedical knowledge as comprehensively as possible. To this end, we collected and integrated 17 publicly available data sources to curate a comprehensive knowledge hub. Details of the data resources used are listed in this [Table](https://github.com/wcm-wanglab/iBKH/blob/main/Source%20Information/README.md).
8 |
9 | ## Statistics of iBKH
10 | | Entity Type | Number | Included Identifiers |
11 | | ---------------|:---------:|:--------------------:|
12 | | Anatomy | 23,003 | Uberon ID, BTO ID, MeSH ID, Cell Ontology ID |
13 | | Disease | 19,236 | Disease Ontology ID, PharmGKB ID, MeSH ID, OMIM ID |
14 | | Drug | 37,997 | DrugBank ID, PharmGKB ID, MeSH ID |
15 | | Gene | 88,376 | HGNC ID, NCBI ID, PharmGKB ID |
16 | | Molecule | 2,065,015 | CHEMBL ID, CHEBI ID |
17 | | Symptom | 1,361 | MeSH ID |
18 | | Dietary Supplement Ingredient | 4,101 | iDISK ID |
19 | | Dietary Supplement Product | 137,568 | iDISK ID |
20 | | Therapeutic Class | 605 | iDISK ID, UMLS CUI |
21 | | Pathway | 2,988 | Reactome ID, Gene Ontology ID |
22 | | Side-Effect | 4,251 | UMLS CUI |
23 | | **Total Entities** | **2,384,501** | - |
24 |
25 | | Relation Type | Number |
26 | | ----------------|:----------:|
27 | | Anatomy-Gene | 12,171,021 |
28 | | Drug-Disease | 2,717,947 |
29 | | Drug-Drug | 2,684,682 |
30 | | Drug-Gene | 1,303,747 |
31 | | Disease-Disease | 11,072 |
32 | | Disease-Gene | 27,538,774 |
33 | | Disease-Symptom | 3,357 |
34 | | Gene-Gene | 735,156 |
35 | | DSI-Symptom | 2,093 |
36 | | DSI-Disease | 5,134 |
37 | | DSI-Drug | 3,057 |
38 | | DSI-Anatomy | 4,334 |
39 | | DSP-DSI | 689,297 |
40 | | DSI-TC | 5,430 |
41 | | Disease-Pathway | 1,941 |
42 | | Drug-Pathway | 3,231 |
43 | | Gene-Pathway | 152,243 |
44 | | Drug-Side Effect| 163,206 |
45 | | **Total Relations** | **48,194,646** |
46 |
47 | ## Neo4j Deployment
48 | We deployed iBKH on AWS using Neo4j (https://neo4j.com), a robust graph database platform. Specifically, Neo4j can take the CSV files of entities and relations we curated as input and automatically create a KG instance. In this way, iBKH can be updated efficiently and flexibly. Follow the instructions [here](https://docs.google.com/document/d/1cLDPLp_nVCJ5xrDlJ-B-Q3wf24tb-Dyq55nAXxaNgTM/edit?usp=sharing) to deploy iBKH on your AWS server.
49 |
50 | ## iBKH-based knowledge discovery
51 | We developed a knowledge discovery pipeline in iBKH. We utilized the [Deep Graph Library - Knowledge Graph Embedding (DGL-KE)](https://github.com/awslabs/dgl-ke) to learn embeddings of iBKH, from which we can derive novel biomedical knowledge. We applied the pipeline to two case studies: Alzheimer's disease drug repurposing hypothesis generation and a knowledge-enhanced EHR patient cohort study. All code for these analyses can be found [here](https://github.com/wcm-wanglab/iBKH/tree/main/Codes); please refer to the [readme for case studies](https://github.com/wcm-wanglab/iBKH/blob/main/Codes/README.md) for more details.
52 |
53 | ## Licence
54 | iBKH is licensed under [Apache-2.0](https://www.apache.org/licenses/LICENSE-2.0) and [CC BY-NC-SA 4.0](https://creativecommons.org/licenses/by-nc-sa/4.0/). iBKH integrates data from many resources, and users should consider the license for each of them (see details in the [table](https://github.com/wcm-wanglab/iBKH/blob/main/Source%20Information/README.md)). For sources with defined licenses, we apply the license attribute on a per-node and per-edge basis. For sources that do not provide a license, we have requested permission.
55 |
56 | ## Cite
57 | ```
58 | @article {Su2021.03.12.21253461,
59 | title = {Biomedical Discovery through the integrative Biomedical Knowledge Hub (iBKH)},
60 | author = {Chang Su and Yu Hou and Suraj Rajendran and Jacqueline R. M. A. Maasch and Zehra Abedi and Haotan Zhang and Zilong Bai and
61 | Anthony Cuturrufo and Winston Guo and Fayzan F. Chaudhry and Gregory Ghahramani and Jian Tang and Feixiong Cheng and
62 | Yue Li and Rui Zhang and Jiang Bian and Fei Wang},
63 | year = {2022},
64 | doi = {10.1101/2021.03.12.21253461},
65 | publisher = {Cold Spring Harbor Laboratory Press},
66 | URL = {https://www.medrxiv.org/content/10.1101/2021.03.12.21253461v4},
67 | journal = {medRxiv}
68 | }
69 |
70 | ```
71 |
72 |
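73 | As a minimal sketch of the Neo4j loading step described above (assuming the official Python `neo4j` driver; the URI, credentials, CSV name, label and properties are placeholders, and the linked deployment instructions remain the reference):
74 | ```
75 | from neo4j import GraphDatabase
76 | 
77 | driver = GraphDatabase.driver("neo4j://localhost:7687", auth=("neo4j", "password"))
78 | 
79 | # Create one Drug node per row of a curated entity CSV placed in Neo4j's import folder
80 | cypher = (
81 |     "LOAD CSV WITH HEADERS FROM 'file:///drug_vocab.csv' AS row "
82 |     "CREATE (:Drug {primary: row.primary, name: row.name})"
83 | )
84 | with driver.session() as session:
85 |     session.run(cypher)
86 | driver.close()
87 | ```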
--------------------------------------------------------------------------------
/iBKH-KD-protocol/funcs/knowledge_visualization.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | Created on Wed Jul 26 17:48:29 2023
5 |
6 | @author: changsu
7 | """
8 |
9 |
10 | import networkx as nx
11 | from neo4j import GraphDatabase
12 | import matplotlib.pyplot as plt
13 | import os
14 |
15 |
16 |
17 |
18 |
19 | def subgraph_visualization(target_type, target_list, predicted_type, predicted_list,
20 | excluded_r_type = [],
21 | neo4j_url = "neo4j://54.210.251.104:7687",
22 | username = "neo4j", password = "password",
23 | alpha = 1, k=0.3, nsize=200, target_size_ratio=2.5,
24 | with_node_label=True, node_label_size = 10,
25 | with_edge_label=True, edge_label_size = 7,
26 | figsize=(14, 10),
27 | save=True, save_path='output', save_name=None):
28 |
29 | # Connect to the Neo4j database
30 | driver = GraphDatabase.driver(neo4j_url,
31 | auth=(username, password),
32 | encrypted=False)
33 | neo4j_res_list = []
34 |
35 | # Build Cypher statement
36 | for target in target_list:
37 | for predict in predicted_list:
38 | cypher = "MATCH (e1:" + target_type + " {Name: \"" + target + "\"})"
39 | cypher += ", (e2:" + predicted_type + " {Name: \"" + predict + "\"})"
40 | cypher += ", p = allShortestPaths((e1)-[*..5]-(e2)) RETURN p LIMIT 30"
41 |
42 | # Run the Neo4j query and retrieve the results
43 | session = driver.session()
44 | neo4j_res = session.run(cypher)
45 | neo4j_res_list.append(neo4j_res)
46 |
47 | # Create a NetworkX MultiGraph object
48 | g = nx.MultiGraph()
49 |
50 | # Define node groups and their corresponding colors
51 | group_colors = {
52 | "Disease": "#E0C3FC",
53 | "Drug": "#83B5D1",
54 | "Gene": "#F28482",
55 | "Symptom": "#7B967A",
56 | "Side-effect": "#9DA1DD",
57 | "Pathway": "#94D2BD"
58 | }
59 |
60 |
61 | node_id_map = {}
62 | id_node_map = {}
63 | node_color = {}
64 |
65 | edge_label_map = {}
66 | # Iterate over the Neo4j query result and add nodes to the network
67 | idx = 0
68 | for neo4j_res in neo4j_res_list:
69 | for record in neo4j_res:
70 | path = record["p"]
71 |
72 | # adding node
73 | for node in path.nodes:
74 | node_type_list = list(node.labels)
75 | node_type = node_type_list[0] if node_type_list[0] != 'Node' else node_type_list[1]
76 | node_name = node['Name']
77 |
78 | if node_name not in node_id_map:
79 | node_id_map[node_name] = idx
80 | id_node_map[idx] = node_name
81 |
82 | g.add_node(node_name)
83 | node_color[node_name] = group_colors[node_type]
84 |
85 | idx += 1
86 |
87 | # adding edges
88 | for relation in path.relationships:
89 | start_node_type = list(relation.start_node.labels)[0]
90 | end_node_type = list(relation.end_node.labels)[0]
91 |
92 | r_type = relation.type
93 |
94 | if r_type in excluded_r_type:
95 | continue
96 |
97 | start = relation.start_node['Name']
98 | end = relation.end_node["Name"]
99 |
100 | edge_label_map[(start, end)] = r_type
101 |
102 | g.add_edge(start, end, label=r_type)
103 |
104 |
105 | color_map = []
106 | size_map = []
107 | for n in g.nodes():
108 | color_map.append(node_color[n])
109 | if (n in target_list) or (n in predicted_list):
110 | size_map.append(nsize * target_size_ratio)
111 | else:
112 | size_map.append(nsize)
113 |
114 |
115 | plt.figure(figsize=figsize)
116 |
117 | positions = nx.spring_layout(g, k=k)
118 |
119 |
120 | for node in target_list:
121 | positions[node][0] -= alpha # adjust value as needed
122 |
123 | for node in predicted_list:
124 | positions[node][0] += alpha # adjust value as needed
125 |
126 | nx.draw(g, with_labels = with_node_label,
127 | node_color=color_map, node_size=size_map,
128 | edge_color='#E0E0E0', pos=positions,
129 | font_size=node_label_size)
130 |
131 | nx.draw_networkx_edge_labels(
132 | g, positions,
133 | edge_labels=edge_label_map,
134 | font_color='blue',
135 | font_size = edge_label_size
136 | )
137 |
138 | if save:
139 |
140 | if not os.path.exists(save_path):
141 | os.makedirs(save_path)
142 |
143 | if save_name is None:
144 | save_name ='shortest_path_interpretation_%s_%s.pdf' % (target_type, predicted_type)
145 | plt.savefig(save_path + '/' + save_name)
146 |
147 |
148 | plt.show()
149 |
150 | return g
151 |
152 |
153 | #excluded_r_type = [
154 | # 'Inferred_Relation_DDi', 'Semantic_Relation_DDi',
155 | # 'Semantic_Relation_DG', 'Semantic_Relation_DiG',
156 | # 'Semantic_Relation_GG', 'Inferred_Relation_DiG']
157 |
158 |
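159 | # Example usage (illustrative; the entity names, relation filter and Neo4j
160 | # credentials below are assumptions, not outputs of this repository):
161 | # g = subgraph_visualization(
162 | #     target_type='Disease', target_list=["Alzheimer's disease"],
163 | #     predicted_type='Drug', predicted_list=['Donepezil'],
164 | #     excluded_r_type=['Inferred_Relation_DDi'],
165 | #     neo4j_url='neo4j://localhost:7687', username='neo4j', password='password',
166 | #     save=True, save_path='output')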
--------------------------------------------------------------------------------
/Source Information/README.md:
--------------------------------------------------------------------------------
1 | **Source information**
2 | 
3 | | Source | Entity Type | Entity Number | Relation Type | Relation Number | URL | License |
4 | | --- | --- | --- | --- | --- | --- | --- |
5 | | Bgee | Anatomy, Gene | 60,072 | Anatomy-Express Present-Gene, Anatomy-Express Absent-Gene | 11,731,369 | https://bgee.org/ | https://creativecommons.org/publicdomain/zero/1.0/ |
6 | | Brenda Tissue Ontology | Tissue | 6,478 | - | - | https://www.brenda-enzymes.org/index.php | https://creativecommons.org/licenses/by/4.0/ |
7 | | Cell Ontology | Cell | 2,200 | - | - | http://obofoundry.org/ontology/cl.html | https://creativecommons.org/licenses/by/4.0/ |
8 | | Comparative Toxicogenomics Database | Disease, Gene, Chemical, Pathway | 73,922 | Chemical-Gene, Chemical-Disease, Chemical-Pathway, Gene-Disease, Gene-Pathway, Disease-Pathway | 38,344,568 | http://ctdbase.org/ | Confirmed via email |
9 | | ChEMBL | Molecular | 1,940,733 | - | - | https://www.ebi.ac.uk/chembl/ | https://creativecommons.org/licenses/by-sa/3.0/ |
10 | | ChEBI | Molecular | 155,342 | - | - | https://www.ebi.ac.uk/chebi/init.do | https://creativecommons.org/licenses/by/4.0/ |
11 | | Drug Repurposing Knowledge Graph | Anatomy, Atc, Biological process, Cellular component, Compound, Disease, Gene, Molecular function, Pathway, Pharmacologic class, Side effect, Symptom, Tax | 97,238 | Gene-Gene, Compound-Gene, Disease-Gene, Atc-Compound, Compound-Compound, Compound-Disease, Gene-Tax, Biological process-Gene, Disease-Symptom, Anatomy-Disease, Disease-Disease, Anatomy-Gene, Gene-Molecular function, Compound-Pharmacologic class, Cellular component-Gene, Gene-Pathway, Compound-Side effect | 5,874,261 | https://github.com/gnn4dr/DRKG | https://www.apache.org/licenses/LICENSE-2.0 |
12 | | Disease Ontology | Disease | 10,648 | - | - | https://disease-ontology.org/ | https://creativecommons.org/publicdomain/zero/1.0/ |
13 | | DrugBank | Drug | 15,128 | Drug-Target, Drug-Enzyme, Drug-Carrier, Drug-Transporter | 28,014 | https://go.drugbank.com/ | http://creativecommons.org/licenses/by-nc/4.0/ |
14 | | Hetionet | Anatomy, Biological process, Cellular component, Compound, Disease, Gene, Molecular function, Pathway, Pharmacologic class, Side effect, Symptom | 47,031 | Anatomy–downregulates–Gene, Anatomy–expresses–Gene, Anatomy–upregulates–Gene, Compound–binds–Gene, Compound–causes–Side Effect, Compound–downregulates–Gene, Compound–palliates–Disease, Compound–resembles–Compound, Compound–treats–Disease, Compound–upregulates–Gene, Disease–associates–Gene, Disease–downregulates–Gene, Disease–localizes–Anatomy, Disease–presents–Symptom, Disease–resembles–Disease, Disease–upregulates–Gene, Gene–covaries–Gene, Gene–interacts–Gene, Gene–participates–Biological Process, Gene–participates–Cellular Component, Gene–participates–Molecular Function, Gene–participates–Pathway, Gene→regulates→Gene, Pharmacologic Class–includes–Compound | 2,250,197 | https://github.com/hetio/hetionet | https://creativecommons.org/publicdomain/zero/1.0/ |
15 | | HUGO Gene Nomenclature Committee | Gene | 41,439 | - | - | https://www.genenames.org/ | No restriction (https://www.genenames.org/about/) |
16 | | iDISK | Dietary Supplement Ingredient, Dietary Supplement Product, Disease, Drug, Anatomy, Symptom, Therapeutic Class | 144,536 | DSI-Anatomy, DSI-Symptom, DSI-Disease, DSI-Drug, DSI-DSP, DSI-TC | 705,075 | https://conservancy.umn.edu/handle/11299/204783 | https://creativecommons.org/licenses/by-sa/3.0/us/ |
17 | | PharmGKB | Genes, Variant, Drug/Chemical, Phenotype | 43,112 | Disease-Gene, Drug/Chemical-Gene, Gene-Gene, Gene-Variant, Disease-Variant, Drug/Chemical-Variant | 61,616 | https://www.pharmgkb.org/ | https://creativecommons.org/licenses/by-sa/4.0/ |
18 | | Reactome | Genes, Pathway | 13,589 | Gene-Pathway | 118,480 | https://reactome.org/ | https://creativecommons.org/licenses/by/4.0/ |
19 | | SIDER | Drug, Side effect | 5,681 | Drug-Side effect | 163,206 | http://sideeffects.embl.de/ | https://creativecommons.org/licenses/by-nc-sa/4.0/ |
20 | | TISSUE | Tissue, Gene | 26,260 | Tissue-Express-Gene | 6,788,697 | https://tissues.jensenlab.org/ | https://creativecommons.org/licenses/by/4.0/ |
21 | | Uberon | Anatomy | 14,944 | - | - | https://www.ebi.ac.uk/ols/ontologies/uberon | http://creativecommons.org/licenses/by/3.0/ |
22 | 
--------------------------------------------------------------------------------
/iBKH-KD-protocol/output/prediction_drug_top100_transE_l2.csv:
--------------------------------------------------------------------------------
1 | primary,name,drugbank_id,kegg_id,pharmgkb_id,umls_cui,mesh_id,iDISK_id,CID,id,score,score_norm
2 | DrugBank:DB01080,Vigabatrin,DB01080,D00535,PA10231,C0048044,D020888,,CID100005665,359,-0.11448057,1.0
3 | DrugBank:DB01104,Sertraline,DB01104,D02360,PA451333,C0074393,D020280,DC0480486,CID100005203,521,-0.11556599,0.9998298
4 | DrugBank:DB06155,Rimonabant,DB06155,D05731,PA152407999,C1142933,D000077285,,CID100104849,13872,-0.12436323,0.9984506
5 | DrugBank:DB00907,Cocaine,DB00907,D00110,PA449072,C0009170,D003042,DC0479713,CID100002826,8911,-0.1275249,0.9979549
6 | DrugBank:DB00472,Fluoxetine,DB00472,D00823,PA449673,C0016365,D005473,DC0480534,CID100003386,590,-0.1280578,0.99787134
7 | DrugBank:DB01041,Thalidomide,DB01041,D00754,PA451644,C0039736,D013792,,CID100005426,238,-0.13207258,0.9972419
8 | DrugBank:DB01577,Metamfetamine,DB01577,D08187,PA450403,C0025611,D008694,DC0480968,CID100001206,382,-0.13832352,0.9962619
9 | DrugBank:DB00470,Dronabinol,DB00470,D00306,PA449421,C0039663,D013759,,,1970,-0.13955317,0.9960691
10 | DrugBank:DB00898,Ethanol,DB00898,D00068,PA448073,C0001962,D000431,DC0478554,,1259,-0.14065692,0.99589604
11 | DrugBank:DB00313,Valproic acid,DB00313,D00399,PA451846,C0042291,D014635,DC0479769,,225,-0.1472781,0.99485797
12 | DrugBank:DB01149,Nefazodone,DB01149,D08257,PA450603,C0068485,C051752,DC0481580,CID100004449,2235,-0.15075247,0.99431324
13 | DrugBank:DB00715,Paroxetine,DB00715,D02362,PA450801,C0070122,D017374,DC0480716,CID100004691,476,-0.15228291,0.9940733
14 | DrugBank:DB01065,Melatonin,DB01065,D08170,PA164752558,C0025219,D008550,DC0492506,CID100000896,1380,-0.1569333,0.9933442
15 | DrugBank:DB00752,Tranylcypromine,DB00752,D08625,PA451741,C0040778,D014191,DC0479660,CID100005530,2101,-0.15885594,0.99304277
16 | DrugBank:DB01454,Midomafetamine,DB01454,D11172,PA131887008,C0115471,D018817,,,12548,-0.1606406,0.9927629
17 | DrugBank:DB13323,Trichloroethylene,DB13323,,PA166115521,C0040905,D014241,,,14834,-0.16149126,0.9926296
18 | DrugBank:DB00328,Indomethacin,DB00328,D00141,PA449982,C0021246,D007213,DC0479186,CID100003715,170,-0.16267565,0.9924439
19 | DrugBank:DB00344,Protriptyline,DB00344,D08447,PA451168,C0033743,D011530,,CID100004976,1894,-0.1631321,0.99237233
20 | DrugBank:DB00184,Nicotine,DB00184,D03365,PA450626,C0028040,D009538,DC0479431,CID100000942,428,-0.16453218,0.9921528
21 | DrugBank:DB02852,Domoic Acid,DB02852,,,C0058678,C012301,,,9792,-0.16459528,0.9921429
22 | DrugBank:DB01156,Bupropion,DB01156,D07591,PA448687,C0085208,D016642,DC0481575,CID100000444,300,-0.1654988,0.9920013
23 | DrugBank:DB00182,Amphetamine,DB00182,D07445,PA448408,C0002658,D000661,,CID100003007,497,-0.17122842,0.991103
24 | DrugBank:DB00363,Clozapine,DB00363,D00283,PA449061,C0009079,D003024,DC0479429,CID100002818,481,-0.1722086,0.9909493
25 | DrugBank:DB00734,Risperidone,DB00734,D00426,PA451257,C0073393,D018967,DC0480519,CID100005073,491,-0.174525,0.9905861
26 | DrugBank:DB00679,Thioridazine,DB00679,D00373,PA451666,C0039943,D013881,,CID100005452,496,-0.17616771,0.99032855
27 | DrugBank:DB04819,Methapyrilene,DB04819,,,C0025625,D008701,,,11847,-0.17761308,0.990102
28 | DrugBank:DB00477,Chlorpromazine,DB00477,D00270,PA448964,C0008286,D002746,,CID100002726,482,-0.17900142,0.9898843
29 | DrugBank:DB01224,Quetiapine,DB01224,D08456,PA451201,C0123091,,,CID100005002,471,-0.17924665,0.9898459
30 | DrugBank:DB00564,Carbamazepine,DB00564,D00252,PA448785,C0006949,D002220,DC0478574,CID100002554,447,-0.18068857,0.9896198
31 | DrugBank:DB01174,Phenobarbital,DB01174,D00506,PA450911,C0031412,D010634,DC0479421,CID100004763,373,-0.18244445,0.9893445
32 | DrugBank:DB00502,Haloperidol,DB00502,D00136,PA449841,C0018546,D006220,DC0479623,CID100003559,507,-0.18472835,0.98898643
33 | DrugBank:DB00554,Piroxicam,DB00554,D00127,PA450985,C0031990,D010894,,CID123690938,479,-0.1871792,0.98860216
34 | DrugBank:DB00740,Riluzole,DB00740,D00775,PA451251,C0073379,D019782,DC0479400,CID100005070,404,-0.18738426,0.98857003
35 | DrugBank:DB00831,Trifluoperazine,DB00831,D08636,PA451771,C0040979,D014268,,CID100005566,519,-0.18778177,0.9885077
36 | DrugBank:DB01202,Levetiracetam,DB01202,D00709,PA450206,C0377265,D000077287,,CID100059708,120,-0.18812412,0.98845404
37 | DrugBank:DB00924,Cyclobenzaprine,DB00924,D07758,PA449160,C0056732,C004704,,CID100002895,1170,-0.18916538,0.9882908
38 | DrugBank:DB00482,Celecoxib,DB00482,D00567,PA448871,C0538927,D000068579,,CID100002662,504,-0.19024923,0.98812085
39 | DrugBank:DB00397,Phenylpropanolamine,DB00397,D01224,PA164748965,C0031495,D010665,DC0479424,CID100004786,390,-0.1904266,0.988093
40 | DrugBank:DB04827,Urethane,DB04827,,,C0041964,D014520,,,14986,-0.19091032,0.9880172
41 | DrugBank:DB00321,Amitriptyline,DB00321,D07448,PA448385,C0002600,D000639,,CID100002160,385,-0.1919516,0.98785394
42 | DrugBank:DB00252,Phenytoin,DB00252,D00512,PA450947,C0031507,D010672,DC0479283,CID100001775,454,-0.19265485,0.9877437
43 | DrugBank:DB00370,Mirtazapine,DB00370,D00563,PA450522,C0049506,D000078785,,CID100004205,1410,-0.19276918,0.98772573
44 | DrugBank:DB00605,Sulindac,DB00605,D00120,PA451565,C0038792,D013467,,CID100005352,1532,-0.19316767,0.98766327
45 | DrugBank:DB00334,Olanzapine,DB00334,D00454,PA450688,C0171023,D000077152,,CID100004585,474,-0.19318072,0.98766124
46 | DrugBank:DB00315,Zolmitriptan,DB00315,D00415,PA451975,C0528166,C089750,,CID100005731,480,-0.19324104,0.98765177
47 | DrugBank:DB00997,Doxorubicin,DB00997,D03899,PA449412,C0013089,D004317,DC0479994,CID100001690,0,-0.19488305,0.98739433
48 | DrugBank:DB00906,Tiagabine,DB00906,D02097,PA451682,C0068897,D000078308,DC0479416,CID100005466,1552,-0.19495434,0.9873832
49 | DrugBank:DB01623,Thiothixene,DB01623,D00374,PA451669,C0039955,D013888,,CID100005454,522,-0.19659656,0.9871257
50 | DrugBank:DB00822,Disulfiram,DB00822,D00131,PA449376,C0012772,D004221,DC0479284,CID100003117,453,-0.19794677,0.98691404
51 | DrugBank:DB00747,Scopolamine,DB00747,D00138,PA451308,C0036442,D012601,DC0478717,CID100005184,1516,-0.19825011,0.9868665
52 | DrugBank:DB01216,Finasteride,DB01216,D00321,PA449627,C0060389,D018120,DC0481582,CID100003350,1274,-0.20089561,0.9864517
53 | DrugBank:DB00934,Maprotiline,DB00934,D02566,PA450322,C0024778,D008376,,CID100004011,1368,-0.20286947,0.9861422
54 | DrugBank:DB03575,Phencyclidine,DB03575,,PA128406980,C0031381,D010622,,CID100006468,13152,-0.20314965,0.9860983
55 | DrugBank:DB01234,Dexamethasone,DB01234,D00292,PA449247,C0011777,D003907,DC0480274,CID100002367,62,-0.20375183,0.9860039
56 | DrugBank:DB00188,Bortezomib,DB00188,D03150,PA10252,C1176309,D000069286,DC0480574,CID100093860,326,-0.20386507,0.9859861
57 | DrugBank:DB09167,Dosulepin,DB09167,D07872,,C0013065,D004308,,CID100003155,9806,-0.2062184,0.98561716
58 | DrugBank:DB00865,Benzphetamine,DB00865,D07514,PA448586,C0005096,D001589,,CID100002341,339,-0.20694748,0.98550284
59 | DrugBank:DB01236,Sevoflurane,DB01236,D00547,PA451341,C0074414,D000077149,DC0478680,CID100005206,14145,-0.20871544,0.9852257
60 | DrugBank:DB01151,Desipramine,DB01151,D07791,PA449233,C0011685,D003891,DC0481239,CID100002995,1203,-0.20973898,0.9850652
61 | DrugBank:DB01171,Moclobemide,DB01171,D02561,PA452615,C0066673,D020912,,CID100004235,1415,-0.20980196,0.9850553
62 | DrugBank:DB00601,Linezolid,DB00601,D00947,PA450233,C0663241,D000069349,DC0489100,CID100003929,1354,-0.21180709,0.984741
63 | DrugBank:DB00273,Topiramate,DB00273,D00537,PA451728,C0076829,D000077236,DC0479772,CID100005514,219,-0.21294571,0.98456246
64 | DrugBank:DB01176,Cyclizine,DB01176,D03621,PA164742937,C0010547,D003501,,CID100006726,2245,-0.21753024,0.9838437
65 | DrugBank:DB01198,Zopiclone,DB01198,D01372,PA10236,C0078847,C515050,,CID100005735,754,-0.21950008,0.9835348
66 | DrugBank:DB00829,Diazepam,DB00829,D00293,PA449283,C0012010,D003975,,CID100003016,452,-0.21965425,0.9835107
67 | DrugBank:DB00425,Zolpidem,DB00425,D00706,PA451976,C0078839,D000077334,DC0481605,CID100005732,922,-0.22110155,0.98328376
68 | DrugBank:DB00295,Morphine,DB00295,D08233,PA450550,C0026549,D009020,DC0490234,CID100004253,1865,-0.22138812,0.9832388
69 | DrugBank:DB01238,Aripiprazole,DB01238,D01164,PA10026,C0299792,D000068180,DC0491787,CID100060795,500,-0.22148094,0.9832243
70 | DrugBank:DB00540,Nortriptyline,DB00540,D08288,PA450657,C0028420,D009661,,CID100004543,1440,-0.22218998,0.9831131
71 | DrugBank:DB00316,Acetaminophen,DB00316,D00217,PA448015,C0000970,D000082,DC0479594,CID100001983,980,-0.2224822,0.9830673
72 | DrugBank:DB00215,Citalopram,DB00215,D07704,PA449015,C0008845,D015283,DC0481664,CID100002771,83,-0.22288562,0.98300403
73 | DrugBank:DB00850,Perphenazine,DB00850,D00503,PA450882,C0031184,D010546,,CID100004748,589,-0.22457759,0.9827388
74 | DrugBank:DB00776,Oxcarbazepine,DB00776,D00533,PA450732,C0069751,D000078330,,CID100034312,282,-0.22752447,0.98227674
75 | DrugBank:DB00091,Cyclosporine,DB00091,D00184,PA449167,C0010592,D016572,DC0479202,,97,-0.22848886,0.9821256
76 | DrugBank:DB00515,Cisplatin,DB00515,D00275,PA449014,C0008838,D002945,DC0479316,CID100002767,137,-0.22941272,0.98198074
77 | DrugBank:DB00937,Diethylpropion,DB00937,D07444,PA164778098,C0012201,D004053,,CID100007029,129,-0.23071453,0.9817766
78 | DrugBank:DB00843,Donepezil,DB00843,D00670,PA449394,C0527316,D000077265,,CID100003152,379,-0.2313868,0.9816712
79 | DrugBank:DB01050,Ibuprofen,DB01050,D00126,PA449957,C0020740,D007052,DC0480526,CID100003672,654,-0.23151053,0.98165184
80 | DrugBank:DB00332,Ipratropium,DB00332,D02212,PA450082,C0027235,D009241,,CID100003746,450,-0.23167636,0.9816258
81 | DrugBank:DB01544,Flunitrazepam,DB01544,D01230,PA164781320,C0016296,D005445,,CID100003380,3571,-0.23227203,0.98153245
82 | DrugBank:DB04017,Clorgiline,DB04017,D03248,,C0009035,D003010,,,8867,-0.23293683,0.9814282
83 | DrugBank:DB00819,Acetazolamide,DB00819,D00218,PA448018,C0000981,D000086,DC0481912,CID100001986,188,-0.23434326,0.98120767
84 | DrugBank:DB01242,Clomipramine,DB01242,D00811,PA449048,C0009010,D002997,DC0482226,CID100002801,503,-0.23548973,0.98102796
85 | DrugBank:DB00575,Clonidine,DB00575,D00281,PA449051,C0009014,D003000,DC0481087,CID100002803,304,-0.23552413,0.98102254
86 | DrugBank:DB06148,Mianserin,DB06148,D08216,PA134687937,C0025912,D008803,,CID100004184,12005,-0.23575918,0.9809857
87 | DrugBank:DB01105,Sibutramine,DB01105,D08513,PA451344,C0074493,C058254,,CID100005210,323,-0.23596597,0.9809533
88 | DrugBank:DB00555,Lamotrigine,DB00555,D00354,PA450164,C0064636,D000077213,,CID100003878,11,-0.23617615,0.9809203
89 | DrugBank:DB00652,Pentazocine,DB00652,D00498,PA164744326,C0030873,D010423,DC0478463,CID100004736,13056,-0.23656079,0.98086005
90 | DrugBank:DB01168,Procarbazine,DB01168,D08423,PA451112,C0033223,D011344,,CID100004915,246,-0.23699106,0.9807926
91 | DrugBank:DB01618,Molindone,DB01618,D08226,PA164746756,C0026388,D008972,,CID100023897,1416,-0.23750307,0.9807123
92 | DrugBank:DB00234,Reboxetine,DB00234,D02573,PA144614921,C0168388,D000077593,,CID100065856,13800,-0.23847479,0.98055995
93 | DrugBank:DB12278,Propiverine,DB12278,D08441,,C0138666,C015586,,,1500,-0.23917194,0.98045063
94 | DrugBank:DB12093,Tetrahydropalmatine,DB12093,,,C0076278,C014215,,,14608,-0.23968408,0.98037034
95 | DrugBank:DB00899,Remifentanil,DB00899,D08473,PA451232,C0246631,D000077208,,CID100060814,13810,-0.24021445,0.9802872
96 | DrugBank:DB00969,Alosetron,DB00969,D07129,PA164745502,C0291772,C090840,,CID100002099,1001,-0.24023908,0.9802833
97 | DrugBank:DB01142,Doxepin,DB01142,D07875,PA449409,C0013085,D004316,,CID100003158,1238,-0.24116267,0.98013854
98 | DrugBank:DB00593,Ethosuximide,DB00593,D00539,PA449533,C0015043,D005013,DC0479436,CID100003291,1261,-0.2412753,0.9801209
99 | DrugBank:DB00371,Meprobamate,DB00371,D00376,PA450377,C0025386,D008620,,CID100004064,1914,-0.24243839,0.9799385
100 | DrugBank:DB00497,Oxycodone,DB00497,D05312,PA450741,C0030049,D010098,DC0481597,CID100004635,1977,-0.24295442,0.9798576
101 | DrugBank:DB00949,Felbamate,DB00949,D00536,PA449590,C0060135,D000078328,DC0479407,CID100003331,224,-0.24399206,0.97969496
102 |
--------------------------------------------------------------------------------
/iBKH-KD-protocol/funcs/KG_link_pred.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | Created on Tue Jul 4 10:06:14 2023
5 |
6 | @author: changsu
7 | """
8 |
9 | import torch as th
10 | import torch.nn.functional as fn
11 |
12 | import numpy as np
13 | import pandas as pd
14 |
15 | from sklearn.preprocessing import MinMaxScaler
16 |
17 | import os
18 |
19 |
20 | """
21 | The following code defines scoring functions for knowledge graph link prediction.
22 | """
23 | # link prediction based on embeddings derived using TransE (score: gamma - ||head + rel - tail||_2)
24 | def transE_l2(head, rel, tail, gamma=12.0):
25 | # Paper link: https://papers.nips.cc/paper/5071-translating-embeddings-for-modeling-multi-relational-data
26 | score = head + rel - tail
27 |
28 | return gamma - th.norm(score, p=2, dim=-1)
29 |
30 |
31 | # link prediction based on embeddings derived using TransR (entities are projected into the relation space via 'proj' before translation)
32 | def transR(head, rel, tail, proj, rel_idx, gamma=12.0):
33 | # Paper link: https://www.aaai.org/ocs/index.php/AAAI/AAAI15/paper/download/9571/9523
34 | proj = proj.reshape(-1, head.shape[1], rel.shape[0])[rel_idx]
35 | head_r = th.einsum('ab,bc->ac', head, proj)
36 | tail_r = th.einsum('b,bc->c', th.tensor(tail), proj)
37 | score = head_r + rel - tail_r
38 |
39 | return gamma - th.norm(score, p=1, dim=-1)
40 |
41 |
42 | # link prediction based on embeddings derived using DistMult (score: sum(head * rel * tail))
43 | def DistMult(head, rel, tail):
44 | # Paper link: https://arxiv.org/abs/1412.6575
45 | score = head * rel * tail
46 |
47 | return th.sum(score, dim=-1)
48 |
49 |
50 |
51 | # link prediction based on embeddings derived using complEx (trilinear product over the real and imaginary halves of the embeddings)
52 | def complEx(head, rel, tail, gamma=12.0):
53 | # Paper link: https://arxiv.org/abs/1606.06357
54 | real_head, img_head = th.chunk(head, 2, dim=-1)
55 | real_tail, img_tail = th.chunk(th.tensor(tail), 2, dim=-1)
56 | real_rel, img_rel = th.chunk(rel, 2, dim=-1)
57 |
58 | score = real_head * real_tail * real_rel \
59 | + img_head * img_tail * real_rel \
60 | + real_head * img_tail * img_rel \
61 | - img_head * real_tail * img_rel
62 |
63 | return th.sum(score, -1)
64 |
65 |
66 | def generate_hypothesis(target_entity, candidate_entity_type, relation_type,
67 | embedding_folder='data/embeddings', method='transE_l2',
68 | kg_folder = 'data/iBKH', triplet_folder = 'data/triplets',
69 | without_any_rel=False, topK=100,
70 | save_path='output', save=True):
71 |
72 | # load entity vocab
73 | entities = {}
74 | for e in ['anatomy', 'disease', 'drug', 'dsp', 'gene',
75 | 'molecule', 'pathway', 'sdsi', 'side_effect',
76 | 'symptom', 'tc']:
77 | e_df = pd.read_csv(kg_folder + '/entity/' + e + '_vocab.csv', header=0, low_memory=False)
78 | if e == 'gene':
79 | e_df = e_df.rename(columns={'symbol':'name'})
80 | if e == 'molecule':
81 | e_df = e_df.rename(columns={'chembl_id':'name'})
82 |
83 | entities[e] = e_df
84 |
85 | # get target entity vocab
86 | target_entity_vocab = pd.DataFrame()
87 | for e in entities:
88 | e_df = entities[e][['primary', 'name']]
89 | target_entity_vocab = pd.concat([target_entity_vocab, e_df[e_df['name'].isin(target_entity)]])
90 |
91 |
92 | # load embeddings
93 | entity_emb = np.load(embedding_folder + '/' + method + '/iBKH_' + method + '_entity.npy')
94 | rel_emb = np.load(embedding_folder + '/' + method + '/iBKH_' + method + '_relation.npy')
95 | if method == 'transR':
96 | proj_np = np.load(embedding_folder + '/' + method + '/iBKH_TransRprojection.npy')
97 | proj_emb = th.tensor(proj_np)
98 |
99 | # load entity and relation embedding map
100 | entity_emb_map = pd.read_csv(embedding_folder + '/' + method + '/entities.tsv', sep='\t', header=None, low_memory=False)
101 | entity_emb_map.columns = ['id', 'primary']
102 | rel_emb_map = pd.read_csv(embedding_folder + '/' + method + '/relations.tsv', sep='\t', header=None, low_memory=False)
103 | rel_emb_map.columns = ['rid', 'relation']
104 |
105 | target_entity_vocab = pd.merge(target_entity_vocab, entity_emb_map, on='primary', how='left')
106 |
107 |
108 | target_entity_ids = []
109 | target_entity_names = []
110 | target_entity_primaries = []
111 | for idx, row in target_entity_vocab.iterrows():
112 | target_entity_ids.append(row['id'])
113 | target_entity_names.append(row['name'])
114 | target_entity_primaries.append(row['primary'])
115 |
116 |
117 | # get candidate entity embeddings
118 | candidate_entities = pd.merge(entities[candidate_entity_type], entity_emb_map, on='primary', how='inner')
119 | candidate_entity_ids = th.tensor(candidate_entities.id.tolist()).long()
120 | candidate_embs = th.tensor(entity_emb[candidate_entity_ids])
121 |
122 |
123 | # get target relation embeddings
124 | target_relations = rel_emb_map[rel_emb_map['relation'].isin(relation_type)]
125 | target_relation_ids = th.tensor(target_relations.rid.tolist()).long()
126 | target_relation_embs = [th.tensor(rel_emb[rid]) for rid in target_relation_ids]
127 |
128 |
129 |
130 |
131 | # rank candidate entities
132 | scores_per_target_ent = []
133 | candidate_ids = []
134 | for rid in range(len(target_relation_embs)):
135 |         rel_emb = target_relation_embs[rid]
136 | for target_id in target_entity_ids:
137 | target_emb = entity_emb[target_id]
138 |
139 | if method == 'transE_l2':
140 | score = fn.logsigmoid(transE_l2(candidate_embs, rel_emb, target_emb))
141 | elif method == 'transR':
142 | score = fn.logsigmoid(transR(candidate_embs, rel_emb, target_emb, proj_emb, target_relation_ids[rid]))
143 | elif method == 'complEx':
144 | score = fn.logsigmoid(complEx(candidate_embs, rel_emb, target_emb))
145 | elif method == 'DistMult':
146 | score = fn.logsigmoid(DistMult(candidate_embs, rel_emb, target_emb))
147 | else:
148 |                 raise ValueError("Unknown embedding method '%s'; expected one of: transE_l2, transR, complEx, DistMult." % method)
149 |
150 | scores_per_target_ent.append(score)
151 | candidate_ids.append(candidate_entity_ids)
152 | scores = th.cat(scores_per_target_ent)
153 | candidate_ids = th.cat(candidate_ids)
154 |
155 | idx = th.flip(th.argsort(scores), dims=[0])
156 | scores = scores[idx].numpy()
157 | candidate_ids = candidate_ids[idx].numpy()
158 |
159 |
160 |     # de-duplicate candidates (np.unique returns the index of each candidate's first, i.e., highest-scoring, occurrence)
161 | _, unique_indices = np.unique(candidate_ids, return_index=True)
162 |     # restore descending-score order among the unique candidates
163 | ranked_unique_indices = np.sort(unique_indices)
164 | proposed_candidate_ids = candidate_ids[ranked_unique_indices]
165 | proposed_scores = scores[ranked_unique_indices]
166 | proposed_scores_norm = MinMaxScaler().fit_transform(proposed_scores.reshape(-1, 1))
167 |
168 |
169 | proposed_df = pd.DataFrame()
170 | proposed_df['id'] = proposed_candidate_ids
171 | proposed_df['score'] = proposed_scores
172 | proposed_df['score_norm'] = proposed_scores_norm
173 |
174 | # proposed_df = pd.merge(proposed_df, candidate_entities, on='id', how='left')
175 | proposed_df = pd.merge(candidate_entities, proposed_df, on='id', how='right')
176 |
177 |
178 |     ### remove candidate entities that are already linked to the target entity
179 | rel_meta_type = relation_type[0].split('_')[-1] # e.g., Treats_DDi => DDi
180 | # load triplet file
181 | triplet_df = pd.read_csv(triplet_folder + '/' + rel_meta_type + '_triplet.csv', header=0, low_memory=False)
182 |     if not without_any_rel:
183 | triplet_df = triplet_df[triplet_df['Relation'].isin(relation_type)]
184 | # only keep triplets that contain target entity
185 | triplet_df = triplet_df[(triplet_df['Head'].isin(target_entity_primaries)) | (triplet_df['Tail'].isin(target_entity_primaries))]
186 | # candidate entities with known relation with target entity
187 | candidates_known = triplet_df['Head'].tolist() + triplet_df['Tail'].tolist()
188 | candidates_known = list(set(candidates_known) - set(target_entity_primaries))
189 |
190 |     # in the results, filter out candidate entities with a known relation to the target entity
191 | proposed_df = proposed_df[~proposed_df['primary'].isin(candidates_known)]
192 | proposed_df = proposed_df[~proposed_df['name'].isin(target_entity_names)]
193 |
194 | proposed_df = proposed_df.reset_index(drop=True)
195 |
196 |     if topK is not None:
197 | proposed_df = proposed_df[: topK]
198 |
199 |     if save:
200 | if not os.path.exists(save_path):
201 | os.makedirs(save_path)
202 | proposed_df.to_csv(save_path + '/prediction_%s_top%s_%s.csv' % (candidate_entity_type, topK, method), index=False)
203 |
204 | return proposed_df
205 |
206 |
207 |
208 | def generate_hypothesis_ensemble_model(target_entity, candidate_entity_type, relation_type,
209 | embedding_folder='data/embeddings',
210 | kg_folder = 'data/iBKH', triplet_folder = 'data/triplets',
211 | without_any_rel=False, topK=100,
212 | save_path='output', save=True):
213 |
214 | transE_res = generate_hypothesis(target_entity=target_entity, candidate_entity_type=candidate_entity_type,
215 | relation_type=relation_type, embedding_folder=embedding_folder, method='transE_l2',
216 | kg_folder = kg_folder, triplet_folder = triplet_folder,
217 | without_any_rel=without_any_rel, topK=None, save=False)
218 | transE_res['transE_vote'] = len(transE_res) - transE_res.index.values
219 |
220 | transR_res = generate_hypothesis(target_entity=target_entity, candidate_entity_type=candidate_entity_type,
221 | relation_type=relation_type, embedding_folder=embedding_folder, method='transR',
222 | kg_folder = kg_folder, triplet_folder = triplet_folder,
223 | without_any_rel=without_any_rel, topK=None, save=False)
224 | transR_res['transR_vote'] = len(transR_res) - transR_res.index.values
225 |
226 | complEx_res = generate_hypothesis(target_entity=target_entity, candidate_entity_type=candidate_entity_type,
227 | relation_type=relation_type, embedding_folder=embedding_folder, method='complEx',
228 | kg_folder = kg_folder, triplet_folder = triplet_folder,
229 | without_any_rel=without_any_rel, topK=None, save=False)
230 | complEx_res['complEx_vote'] = len(complEx_res) - complEx_res.index.values
231 |
232 | DistMult_res = generate_hypothesis(target_entity=target_entity, candidate_entity_type=candidate_entity_type,
233 | relation_type=relation_type, embedding_folder=embedding_folder, method='DistMult',
234 | kg_folder = kg_folder, triplet_folder = triplet_folder,
235 | without_any_rel=without_any_rel, topK=None, save=False)
236 | DistMult_res['DistMult_vote'] = len(DistMult_res) - DistMult_res.index.values
237 |
238 |
239 | combined_res = pd.merge(transE_res, transR_res[['primary', 'transR_vote']], on='primary', how='left')
240 | combined_res = pd.merge(combined_res, complEx_res[['primary', 'complEx_vote']], on='primary', how='left')
241 | combined_res = pd.merge(combined_res, DistMult_res[['primary', 'DistMult_vote']], on='primary', how='left')
242 |
243 | combined_res['vote_score'] = combined_res['transE_vote'] + combined_res['transR_vote'] + combined_res['complEx_vote'] + combined_res['DistMult_vote']
244 | combined_res['vote_score_normed'] = MinMaxScaler().fit_transform(combined_res['vote_score'].values.reshape(-1, 1))
245 |
246 | combined_res = combined_res.sort_values(by='vote_score_normed', ascending=False)
247 |
248 | combined_res = combined_res.reset_index(drop=True)
249 |
250 |     if topK is not None:
251 | combined_res = combined_res[: topK]
252 |
253 |     if save:
254 | if not os.path.exists(save_path):
255 | os.makedirs(save_path)
256 | combined_res.to_csv(save_path + '/prediction_%s_top%s_ensemble.csv' % (candidate_entity_type, topK), index=False)
257 |
258 | return combined_res
259 |
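260 | # ----------------------------------------------------------------------
261 | # Editor's note: a minimal usage sketch of the two entry points above,
262 | # not part of the original file. The disease name, relation label, and
263 | # data layout are assumptions (the relation label format follows the
264 | # 'Treats_DDi => DDi' example in generate_hypothesis); adjust to match
265 | # your local copy of the iBKH data.
266 | #
267 | # if __name__ == '__main__':
268 | #     # rank drug candidates for one disease with a single embedding model
269 | #     top_transE = generate_hypothesis(target_entity=['Parkinson disease'],
270 | #                                      candidate_entity_type='drug',
271 | #                                      relation_type=['Treats_DDi'],
272 | #                                      method='transE_l2', topK=100)
273 | #     # combine TransE, TransR, ComplEx, and DistMult by rank voting
274 | #     top_ensemble = generate_hypothesis_ensemble_model(
275 | #         target_entity=['Parkinson disease'],
276 | #         candidate_entity_type='drug',
277 | #         relation_type=['Treats_DDi'], topK=100)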
--------------------------------------------------------------------------------
/iBKH-KD-protocol/output/prediction_drug_top100_ensemble.csv:
--------------------------------------------------------------------------------
1 | primary,name,drugbank_id,kegg_id,pharmgkb_id,umls_cui,mesh_id,iDISK_id,CID,id,score,score_norm,transE_vote,transR_vote,complEx_vote,DistMult_vote,vote_score,vote_score_normed
2 | DrugBank:DB00143,Glutathione,DB00143,D00014,PA449780,C0017817,D005978,,,3092,-1.3192606,0.81111217,24263,26927,26138,24317,101645,1.0
3 | DrugBank:DB04815,Clioquinol,DB04815,D03538,PA449039,C0021978,D007464,,,1603,-0.9699813,0.86587286,24994,26876,24297,25295,101462,0.9980499344650107
4 | MeSH:D013256,Steroids,,,,C0038317,D013256,,,14378,-1.3089085,0.8127352,24275,25877,24796,26138,101086,0.994043242436836
5 | DrugBank:DB01956,Taurine,DB01956,D00047,PA451590,C0039350,D013654,,,3451,-1.1711981,0.8343257,24518,26020,25620,24751,100909,0.9921571134767644
6 | DrugBank:DB00431,Lindane,DB00431,D00360,PA164754914,C0005038,D001556,,CID100000727,1308,-0.6793266,0.9114423,25776,26705,25516,22897,100894,0.9919972720394703
7 | DrugBank:DB00132,alpha-Linolenic acid,DB00132,,PA449384,C0051405,D017962,,,6868,-1.6717641,0.7558459,23680,26141,25097,25957,100875,0.9917948062188975
8 | DrugBank:DB00179,Masoprocol,DB00179,D04862,PA164746493,C0733397,D009637,,,11706,-1.6596137,0.75775087,23697,26983,24552,25515,100747,0.9904308259539871
9 | DrugBank:DB00336,Nitrofural,DB00336,D00862,PA164754877,C0028157,D009583,,,12524,-1.3049827,0.8133507,24280,25621,25051,25766,100718,0.9901217991752183
10 | DrugBank:DB04115,Berberine,DB04115,D00092,PA165860812,C0005117,D001599,,,7823,-1.3932794,0.7995073,24156,25945,25326,25269,100696,0.9898873650671867
11 | DrugBank:DB09086,Eugenol,DB09086,D04117,,C0015153,D005054,,,10173,-1.7526864,0.7431588,23552,26093,24917,26097,100659,0.989493089521861
12 | DrugBank:DB12290,Puerarin,DB12290,,,C0072591,C033607,,,13634,-1.7054859,0.750559,23620,26102,24885,25981,100588,0.9887365067186684
13 | MeSH:D008094,Lithium,,,,C0023870,D008094,DC0478809,CID100011125,11539,-0.5504055,0.9316548,26175,26771,25921,21578,100445,0.9872126850164638
14 | DrugBank:DB11118,Ammonia,DB11118,,PA166131585,C0002607,D000641,,CID100000222,7068,-1.2208875,0.8265353,24419,26092,24718,25118,100347,0.9861683876261416
15 | DrugBank:DB02587,Colforsin,DB02587,D03584,PA146096022,C0917964,D005576,,,8994,-1.310175,0.8125366,24272,26546,25592,23826,100236,0.9849855609901645
16 | DrugBank:DB11735,Galactose,DB11735,D04291,PA449725,C0016945,D005690,,,10530,-1.7624575,0.74162686,23538,26153,23936,26573,100200,0.9846019415406584
17 | PharmGKB:PA10832,corticosteroids,,,PA10832,C0001617,D000305,DC0478594,,6611,-1.385069,0.8007946,24169,25704,24801,25497,100171,0.9842929147618895
18 | DrugBank:DB04422,Homocysteine,DB04422,,,C0019878,D006710,,,10902,-1.2244079,0.82598335,24414,26287,24528,24890,100119,0.9837387977792698
19 | DrugBank:DB09153,Sodium chloride,DB09153,D02056,PA451382,C0037494,D012965,,,1522,-1.5519575,0.7746295,23890,26184,25322,24645,100041,0.9829076223053399
20 | DrugBank:DB11672,Curcumin,DB11672,,PA151958596,C0010467,D003474,,,9140,-1.250675,0.82186514,24354,27049,26388,22138,99929,0.9817141395735431
21 | MeSH:D002331,Carnitine,,,,C0007258,D002331,,CID100000085,8437,-1.4813418,0.78570074,24020,26586,26241,22912,99759,0.9799026032842089
22 | DrugBank:DB03057,Malonaldehyde,DB03057,,,C0024643,D008315,,,11669,-2.0908349,0.69014317,22823,26283,24464,26126,99696,0.9792312692475731
23 | DrugBank:DB00746,Deferoxamine,DB00746,D03670,PA164746490,C0011145,D003676,,CID100002973,1199,-0.75536406,0.899521,25558,26389,26096,21631,99674,0.9789968351395417
24 | DrugBank:DB00171,ATP,DB00171,D08646,PA164743471,C0001480,D000255,,,6591,-1.2064426,0.82879996,24448,27085,26364,21719,99616,0.978378781582004
25 | KEGG:D03878,Dizocilpine maleate,,D03878,,C0079246,D016291,,,9775,-1.1505466,0.8375635,24561,25198,23376,26451,99586,0.9780590987074156
26 | DrugBank:DB00119,Pyruvic acid,DB00119,,PA164778686,C0072802,D019289,,,13706,-1.4370607,0.7926432,24081,25924,25727,23755,99487,0.9770041452212739
27 | DrugBank:DB06757,Manganese,DB06757,D04854,,C0024706,D008345,,,11680,-0.8935449,0.87785673,25210,26223,25240,22783,99456,0.9766738062508658
28 | DrugBank:DB06750,Ginsenoside Rg1,DB06750,,,C0074018,C035054,,,10610,-1.7730337,0.73996866,23517,25615,23643,26622,99397,0.9760450965975087
29 | DrugBank:DB04786,Suramin,DB04786,,PA10292,C0038880,D013498,,,14468,-1.4168519,0.7958116,24112,26679,25487,22965,99243,0.9744040578412883
30 | PharmGKB:PA452347,glucocorticoids,,,PA452347,C0017710,D005938,DC0481533,,10630,-1.746833,0.7440765,23562,26047,23637,25951,99197,0.9739138774335858
31 | PharmGKB:PA452233,antipsychotics,,,PA452233,C0040615,D014150,DC0479620,,7308,-1.3567141,0.8052401,24200,24500,24754,25552,99006,0.9718785631320397
32 | PharmGKB:PA449509,estrogens,,,PA449509,C0014939,D004967,DC0478555,CID100000699,10066,-0.89519787,0.8775976,25203,26280,26417,21027,98927,0.9710367315622903
33 | DrugBank:DB02772,Sucrose,DB02772,D00025,PA451525,C0038636,D013395,,,14419,-2.229847,0.66834855,22388,26152,25481,24881,98902,0.9707703291668
34 | DrugBank:DB00151,Cysteine,DB00151,D00026,PA449173,C0010654,D003545,,,3013,-2.1263072,0.68458176,22721,26370,26384,23412,98887,0.9706104877295056
35 | DrugBank:DB02315,Cyclic GMP,DB02315,,,C0018338,D006152,,,9181,-2.0006838,0.7042773,23043,25451,24994,25348,98836,0.9700670268427054
36 | MeSH:D013481,Superoxides,,,,C0038836,D013481,,,14466,-1.8277962,0.7313829,23420,26103,23652,25648,98823,0.9699284975970504
37 | DrugBank:DB01782,Pyrazolanthrone,DB01782,,,C0968382,C432165,,,13664,-2.033749,0.6990932,22961,26094,23559,26205,98819,0.9698858732137721
38 | MeSH:D014222,Triamcinolone Acetonide,,,,C0040866,D014222,,,14804,-1.8468038,0.72840285,23382,24951,23478,26851,98662,0.9682128661700926
39 | DrugBank:DB13063,Parthenolide,DB13063,,,C0070126,C002669,,,12987,-2.1425145,0.68204075,22673,24848,24131,26989,98641,0.9679890881578808
40 | MeSH:D014801,Vitamin A,,,,C0042839,D014801,DC0489740,CID100001071,15107,-1.5368314,0.77700096,23926,25370,24244,25096,98636,0.9679358076787827
41 | DrugBank:DB02266,Flufenamic acid,DB02266,D01581,PA166049190,C0016282,D005439,,,10357,-1.5224365,0.77925783,23954,25890,24214,24573,98631,0.9678825271996847
42 | DrugBank:DB15584,Luteolin,DB15584,,,C0065264,D047311,,,11604,-2.0775828,0.6922208,22852,25538,23548,26682,98620,0.9677653101456689
43 | DrugBank:DB00148,Creatine,DB00148,,PA164778930,C0010286,D003401,,,9102,-1.55466,0.77420574,23882,26593,25414,22629,98518,0.9666783883720682
44 | DrugBank:DB01914,D-glucose,DB01914,,PA449773,C0017725,D005947,,CID100000206,1296,-0.87310547,0.88106126,25256,26377,25496,21377,98506,0.9665505152222329
45 | DrugBank:DB01025,Amlexanox,DB01025,D01828,PA164745310,C0103049,C045742,,CID100002161,7058,-1.4479347,0.7909384,24063,24373,25396,24628,98460,0.9660603348145307
46 | DrugBank:DB04468,Afimoxifene,DB04468,D06551,,C2347999,C016601,,,6665,-1.1752086,0.8336969,24509,26481,25058,22395,98443,0.9658791811855973
47 | DrugBank:DB13721,Cypermethrin,DB13721,D07763,,C0056849,C017160,,,9242,-1.3620205,0.8044082,24194,25968,24217,24033,98412,0.9655488422151892
48 | MeSH:D004205,Cromolyn Sodium,,,,C0012694,D004205,,,9116,-1.991005,0.7057947,23070,24919,23665,26743,98397,0.9653890007778951
49 | DrugBank:DB14104,Linoleic acid,DB14104,,,C0023749,D019787,,,11504,-2.3294046,0.65273976,22032,25650,24142,26572,98396,0.9653783446820754
50 | DrugBank:DB05382,Iodine,DB05382,D00108,PA450049,C0021968,D007455,,,1644,-1.8361273,0.73007673,23401,25177,24176,25603,98357,0.9649627569451105
51 | MeSH:D009573,Nitrites,,,,C0028137,D009573,,CID100000946,12516,-1.677129,0.7550048,23673,23802,25641,25199,98315,0.9645152009206868
52 | PharmGKB:PA166123346,bilirubin,,,PA166123346,C0005437,D001663,,,7909,-1.9876075,0.7063274,23080,26255,23677,25292,98304,0.964397983866671
53 | DrugBank:DB04224,Oleic Acid,DB04224,D02315,,C0028928,D019301,,,12768,-1.6043189,0.7664201,23798,25321,23913,25254,98286,0.9642061741419179
54 | DrugBank:DB07352,Apigenin,DB07352,,,C0912024,D047310,,,7340,-1.84235,0.7291011,23387,25168,23479,26203,98237,0.9636840254467569
55 | DrugBank:DB08818,Hyaluronic acid,DB08818,D08043,PA165958431,C0020196,D006820,,,10921,-1.7635304,0.74145865,23534,26041,24275,24376,98226,0.9635668083927411
56 | DrugBank:DB00772,Malathion,DB00772,D00534,PA164748092,C0024547,D008294,,CID100004004,2111,-1.1328325,0.84034073,24602,25733,25305,22550,98190,0.963183188943235
57 | DrugBank:DB02010,Staurosporine,DB02010,,PA165109623,C0075193,D019311,,,14370,-1.932675,0.71493983,23203,24550,24279,26119,98151,0.96276760120627
58 | DrugBank:DB02375,Myricetin,DB02375,,,C0067067,C040015,,,12180,-2.1731112,0.6772437,22568,25747,23222,26587,98124,0.9624798866191404
59 | DrugBank:DB03310,Glutathione disulfide,DB03310,D00031,,C0061516,D019803,,,10651,-1.9640263,0.7100245,23134,25693,23026,26248,98101,0.9622347964152893
60 | DrugBank:DB11656,Rebamipide,DB11656,D01121,,C0069562,C052785,,,3058,-1.6594839,0.75777125,23698,25614,22950,25783,98045,0.961638055049391
61 | DrugBank:DB00756,Hexachlorophene,DB00756,D00859,PA449871,C0019435,D006582,,CID100003598,2105,-0.64097184,0.9174557,25876,26128,24219,21807,98030,0.9614782136120967
62 | PharmGKB:PA133822447,catecholamines,,,PA133822447,C0007412,D002395,,,8460,-1.395436,0.7991692,24150,26108,22783,24966,98007,0.9612331234082456
63 | DrugBank:DB14154,Gold,DB14154,,,C0018026,D006046,,CID100022318,10687,-0.7806944,0.89554965,25480,24740,25761,21959,97940,0.9605191649883316
64 | DrugBank:DB03904,Urea,DB03904,D00023,PA451831,C0041942,D014508,,,3161,-1.6132491,0.76502,23784,26524,24742,22845,97895,0.9600396406764491
65 | DrugBank:DB02679,Cyanamide,DB02679,D00123,,C0010502,D003484,,,3781,-1.4897939,0.7843756,24014,24719,23454,25639,97826,0.9593043700648958
66 | MeSH:C013592,mangiferin,,,,C0065654,C013592,,,11693,-1.8169029,0.73309076,23441,24587,23350,26426,97804,0.9590699359568642
67 | DrugBank:DB13242,Bucladesine,DB13242,D07546,,C0012054,D003994,,,8127,-1.5925876,0.7682594,23823,25973,24687,23282,97765,0.9586543482198993
68 | KEGG:D00767,Vecuronium bromide,,D00767,,C0042435,D014673,,,15056,-1.2183957,0.82692593,24423,26837,23105,23396,97761,0.9586117238366207
69 | DrugBank:DB12695,Phenethyl Isothiocyanate,DB12695,,,C0070558,C058305,,,13157,-1.5559322,0.7740063,23880,26328,25704,21837,97749,0.9584838506867854
70 | DrugBank:DB11846,Creatinine,DB11846,D03600,,C0010294,D003404,,,9105,-2.3239262,0.65359867,22048,26272,24596,24797,97713,0.9581002312372793
71 | DrugBank:DB11831,Dinitrochlorobenzene,DB11831,,,C0012460,D004137,,,9694,-1.8853838,0.7223542,23299,25764,24966,23679,97708,0.9580469507581812
72 | DrugBank:DB00643,Mebendazole,DB00643,D00368,PA164776669,C0025023,D008463,,CID100004030,806,-0.5924298,0.9250662,26028,26789,25705,19183,97705,0.9580149824707225
73 | DrugBank:DB14574,Cobalt,DB14574,,,C0009148,D003035,,,8897,-2.1436973,0.6818553,22669,25897,23404,25728,97698,0.9579403897999852
74 | DrugBank:DB00403,Ceruletide,DB00403,D03442,PA164774919,C0006639,D002108,,,8552,-2.0493033,0.69665456,22922,24122,24357,26293,97694,0.9578977654167067
75 | DrugBank:DB02530,gamma-Aminobutyric acid,DB02530,D00058,,C0016904,D005680,,,10543,-1.0121074,0.85926825,24880,25520,26253,20938,97591,0.9568001875472865
76 | DrugBank:DB00848,Levamisole,DB00848,D08114,PA450205,C0023556,D007978,,,11459,-1.6001302,0.76707685,23804,25536,23189,25038,97567,0.9565444412476158
77 | DrugBank:DB11263,Polydatin,DB11263,,,C0071538,C058229,,,13400,-2.0231712,0.70075166,22990,24229,23849,26434,97502,0.9558517950193408
78 | DrugBank:DB16101,Baicalein,DB16101,,,C0052927,C006680,,,7655,-2.1985002,0.6732632,22496,24826,23079,27029,97430,0.9550845561203287
79 | DrugBank:DB03796,Palmitic Acid,DB03796,D05341,,C0030234,D019308,,,2343,-1.2327998,0.8246677,24398,24630,24269,24116,97413,0.9549034024913952
80 | DrugBank:DB04173,Fructose,DB04173,D00114,PA449716,C0016745,D005632,,,1630,-1.612917,0.7650721,23785,24169,23703,25721,97378,0.9545304391377087
81 | KEGG:D04970,Methacholine chloride,,D04970,PA450398,C0079829,D016210,,,11828,-2.16005,0.6792915,22626,25365,23356,26025,97372,0.9544665025627912
82 | DrugBank:DB11588,Carbon monoxide,DB11588,D09706,,C0007018,D002248,,,8398,-2.3546972,0.64877427,21926,25392,23663,26378,97359,0.9543279733171361
83 | MeSH:C093642,SB 203580,,,,C0297666,C093642,,,14045,-2.1055696,0.6878331,22781,25073,23313,26060,97227,0.952921368668947
84 | DrugBank:DB12029,Chlorogenic Acid,DB12029,,,C0008240,D002726,,,8647,-1.9954457,0.7050985,23055,23907,24553,25711,97226,0.9529107125731275
85 | MeSH:D008628,Mercury,,,,C0025424,D008628,,,11803,-1.2403855,0.82347834,24372,25102,24252,23499,97225,0.9529000564773078
86 | KEGG:D03625,Cycloheximide,,D03625,,C0010572,D003513,,,9191,-1.7577664,0.7423623,23545,24809,23931,24935,97220,0.9528467759982098
87 | DrugBank:DB13366,Hydrochloric acid,DB13366,D02057,,C0020259,D006851,,,10932,-2.2469037,0.6656744,22345,24687,23626,26419,97077,0.9513229542960051
88 | DrugBank:DB09526,Hydroquinone,DB09526,D00073,PA449924,C0020306,C031927,,,1313,-1.8660864,0.7253797,23344,25658,24823,23246,97071,0.9512590177210873
89 | DrugBank:DB00050,Cetrorelix,DB00050,D07665,PA164764506,C0209366,C062876,,CID125074886,8562,-1.4797181,0.7859553,24026,26225,25323,21477,97051,0.951045895804695
90 | DrugBank:DB07795,Fisetin,DB07795,,,C0060397,C017875,,,10318,-1.9922534,0.70559895,23064,24741,22719,26486,97010,0.950608995876091
91 | PharmGKB:PA451776,triglycerides,,,PA451776,C0041004,D014280,,,14863,-2.4083483,0.64036274,21715,25268,23187,26838,97008,0.9505876836844518
92 | MeSH:D007306,Insecticides,,,,C0021576,D007306,,,11144,-2.16775,0.67808425,22600,24143,23269,26994,97006,0.9505663714928125
93 | DrugBank:DB09028,Cytisine,DB09028,D07770,PA166153416,C0056913,C004712,,,9268,-1.7558718,0.74265933,23548,24573,24143,24727,96991,0.9504065300555182
94 | DrugBank:DB01917,Putrescine,DB01917,,,C0034170,D011700,,,13645,-2.062014,0.69466174,22894,25899,23990,24184,96967,0.9501507837558476
95 | DrugBank:DB11134,Cupric oxide,DB11134,,,C0056598,C030973,,,9136,-1.2453867,0.82269424,24365,25525,24645,22400,96935,0.94980978868962
96 | DrugBank:DB04942,Tamibarotene,DB04942,D01418,PA164743464,C1567753,C061133,,,3643,-1.1598458,0.8361055,24542,25712,24806,21867,96927,0.9497245399230629
97 | KEGG:D02413,Butylated hydroxytoluene,,D02413,PA448704,C0006507,D002084,,,8190,-1.800713,0.7356291,23459,24490,23174,25798,96921,0.9496606033481454
98 | DrugBank:DB04652,Corticosterone,DB04652,,,C0010124,D003345,,,9060,-1.5524349,0.7745546,23888,25748,25584,21681,96901,0.9494474814317531
99 | DrugBank:DB08842,Acetylcarnitine,DB08842,,,C0001040,D000108,,,6491,-1.8005266,0.7356583,23460,24755,23700,24979,96894,0.9493728887610158
100 | DrugBank:DB02342,2-Methoxyestradiol,DB02342,,PA13496724,C0046319,D000077584,,,5432,-1.9019606,0.7197553,23266,25054,24082,24462,96864,0.9490532058864274
101 | PharmGKB:PA128406983,pesticides,,,PA128406983,C0031253,D010575,,,13118,-1.7440054,0.7445198,23567,25509,23656,24113,96845,0.9488507400658548
102 |
--------------------------------------------------------------------------------
/Codes_Term Harmonization/Relation_Integration/integrate_drug_disease.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 |
4 | folder = ''
5 | CTD_folder = '../CTD/'
6 |
7 |
8 | pd.set_option('display.max_columns', None)
9 |
10 |
11 | def integrate_Hetionet_KEGG():
12 | hetionet_DDi = pd.read_csv(folder + 'hetionet_DDi.csv')
13 | hetionet_DDi = hetionet_DDi.rename(columns={'source': 'Drug', 'target': 'Disease'})
14 | hetionet_DDi['Drug'] = hetionet_DDi['Drug'].str.replace('Compound::', '')
15 | hetionet_DDi['Disease'] = hetionet_DDi['Disease'].str.replace('Disease::', '')
16 |
17 | drug_vocab = pd.read_csv(folder + 'drug_vocab.csv')
18 | db_vocab = drug_vocab.dropna(subset=['drugbank_id'])
19 | db_primary_dict = db_vocab.set_index('drugbank_id')['primary'].to_dict()
20 | kegg_drug_vocab = drug_vocab.dropna(subset=['kegg_id'])
21 | kegg_drug_primary_dict = kegg_drug_vocab.set_index('kegg_id')['primary'].to_dict()
22 |
23 | disease_vocab = pd.read_csv(folder + 'disease_vocab.csv')
24 | do_vocab = disease_vocab.dropna(subset=['do_id'])
25 | do_primary_dict = do_vocab.set_index('do_id')['primary'].to_dict()
26 | kegg_disease_vocab = disease_vocab.dropna(subset=['kegg_id'])
27 | kegg_disease_primary_dict = kegg_disease_vocab.set_index('kegg_id')['primary'].to_dict()
28 |
29 | hetionet_ddi_ctd = hetionet_DDi[hetionet_DDi['metaedge'] == 'CtD']
30 | hetionet_ddi_ctd = hetionet_ddi_ctd.replace({'Drug': db_primary_dict, 'Disease': do_primary_dict})
31 | hetionet_ddi_ctd = hetionet_ddi_ctd[['Drug', 'Disease']]
32 | hetionet_ddi_ctd['Treats_Hetionet'] = [1] * len(hetionet_ddi_ctd)
33 | hetionet_ddi_ctd['Palliates_Hetionet'] = [0] * len(hetionet_ddi_ctd)
34 |
35 | hetionet_ddi_cpd = hetionet_DDi[hetionet_DDi['metaedge'] == 'CpD']
36 | hetionet_ddi_cpd = hetionet_ddi_cpd.replace({'Drug': db_primary_dict, 'Disease': do_primary_dict})
37 | hetionet_ddi_cpd = hetionet_ddi_cpd[['Drug', 'Disease']]
38 | hetionet_ddi_cpd['Treats_Hetionet'] = [0] * len(hetionet_ddi_cpd)
39 | hetionet_ddi_cpd['Palliates_Hetionet'] = [1] * len(hetionet_ddi_cpd)
40 |
41 | DDi_res = pd.concat((hetionet_ddi_ctd, hetionet_ddi_cpd))
42 | DDi_res.loc[DDi_res.duplicated(subset=['Drug', 'Disease'], keep=False), 'Palliates_Hetionet'] = 1
43 | DDi_res = DDi_res.drop_duplicates(subset=['Drug', 'Disease'], keep='first')
44 |
45 | DDi_res['Source'] = ['Hetionet'] * len(DDi_res)
46 | print(DDi_res)
47 | DDi_res['Effect_KEGG'] = [0] * len(DDi_res)
48 | kegg_df = pd.read_csv(folder + 'kegg_drug_disease.csv')
49 | kegg_df = kegg_df.rename(columns={'drug': 'Drug', 'disease': 'Disease'})
50 | kegg_df = kegg_df.replace({'Drug': kegg_drug_primary_dict, 'Disease': kegg_disease_primary_dict})
51 | kegg_df['Treats_Hetionet'] = [0] * len(kegg_df)
52 | kegg_df['Palliates_Hetionet'] = [0] * len(kegg_df)
53 | kegg_df['Source'] = ['KEGG'] * len(kegg_df)
54 | kegg_df['Effect_KEGG'] = [1] * len(kegg_df)
55 |
56 | DDi_res = pd.concat((DDi_res, kegg_df))
57 | DDi_res.loc[DDi_res.duplicated(subset=['Drug', 'Disease'], keep=False), 'Effect_KEGG'] = 1
58 | DDi_res['Source'] = np.where(DDi_res.duplicated(subset=['Drug', 'Disease'], keep=False),
59 | DDi_res['Source'].astype(str) + ';KEGG', DDi_res['Source'].astype(str) + '')
60 | DDi_res = DDi_res.drop_duplicates(subset=['Drug', 'Disease'], keep='first')
61 | DDi_res_col = list(DDi_res.columns)
62 | DDi_res_col_new = DDi_res_col[:-2] + DDi_res_col[-1:] + DDi_res_col[-2:-1]
63 | DDi_res = DDi_res[DDi_res_col_new]
64 | DDi_res['Source'] = DDi_res['Source'].apply(lambda x: ';'.join(sorted(set(x.split(';')))))
65 | print(DDi_res)
66 | DDi_res.to_csv(folder + 'DDi_res.csv', index=False)
67 | with open(folder + 'integration_notes.txt', 'w') as f:
68 | f.write('DDi_res: Hetionet (Treats and Palliates) and KEGG (Effect).\n')
69 | f.close()
70 |
71 |
72 | def extract_PharmGKB_DDi():
73 | pharmgkb_rel = pd.read_table(folder + 'pharmgkb_rel.tsv')
74 | pharmgkb_rel = pharmgkb_rel[pharmgkb_rel['Association'] == 'associated']
75 | pharmgkb_rel = pharmgkb_rel.reset_index(drop=True)
76 | res = pd.DataFrame(columns=['Drug', 'Disease'])
77 | idx = 0
78 | for i in range(len(pharmgkb_rel)):
79 | p1_id = pharmgkb_rel.loc[i, 'Entity1_id']
80 | p1_type = pharmgkb_rel.loc[i, 'Entity1_type']
81 | p2_id = pharmgkb_rel.loc[i, 'Entity2_id']
82 | p2_type = pharmgkb_rel.loc[i, 'Entity2_type']
83 | if p1_type == 'Chemical' and p2_type == 'Disease':
84 | drug = p1_id
85 | disease = p2_id
86 | elif p2_type == 'Chemical' and p1_type == 'Disease':
87 | drug = p2_id
88 | disease = p1_id
89 | else:
90 | continue
91 | res.loc[idx] = [drug, disease]
92 | idx += 1
93 | res.to_csv(folder + 'pharmgkb_drug_disease.csv', index=False)
94 |
95 |
96 | def integrate_CTD_DDi_curated():
97 | chem_disease = pd.read_csv(CTD_folder + 'CTD_chemicals_diseases.csv', header=27)
98 | chem_disease = chem_disease.dropna(subset=['ChemicalID', 'DiseaseID'])
99 | chem_disease = chem_disease.drop_duplicates(subset=['ChemicalID', 'DiseaseID'])
100 | chem_disease = chem_disease.reset_index(drop=True)
101 | chem_disease = chem_disease.rename(columns={'ChemicalID': 'Drug', 'DiseaseID': 'Disease'})
102 | chem_disease_curated = chem_disease[pd.isnull(chem_disease['InferenceScore'])]
103 |
104 | chem_disease_curated = chem_disease_curated[['Drug', 'Disease']]
105 | chem_disease_curated = chem_disease_curated.reset_index(drop=True)
106 |
107 | drug_vocab = pd.read_csv(folder + 'drug_vocab.csv')
108 | mesh_drug_vocab = drug_vocab.dropna(subset=['mesh_id'])
109 |     mesh_drug_primary_dict = mesh_drug_vocab.set_index('mesh_id')['primary'].to_dict()
110 |
111 | disease_vocab = pd.read_csv(folder + 'disease_vocab.csv')
112 | mesh_disease_vocab = disease_vocab.dropna(subset=['mesh_id'])
113 | mesh_disease_primary_dict = mesh_disease_vocab.set_index('mesh_id')['primary'].to_dict()
114 | omim_vocab = disease_vocab.dropna(subset=['omim_id'])
115 | omim_vocab['omim_id'] = omim_vocab['omim_id'].astype(int).astype(str)
116 | omim_primary_dict = omim_vocab.set_index('omim_id')['primary'].to_dict()
117 |
118 | DDi_res = pd.read_csv(folder + 'DDi_res.csv')
119 | DDi_res_col = list(DDi_res.columns)[2:]
120 | DDi_res['Associate_CTD'] = [0] * len(DDi_res)
121 |
122 | for i in range(len(chem_disease_curated)):
123 | drug_id = chem_disease_curated.loc[i, 'Drug']
124 | disease_id = chem_disease_curated.loc[i, 'Disease']
125 |
126 |         chem_disease_curated.loc[i, 'Drug'] = mesh_drug_primary_dict[drug_id]
127 | if 'MESH' in disease_id:
128 | disease_id = disease_id.replace('MESH:', '')
129 | chem_disease_curated.loc[i, 'Disease'] = mesh_disease_primary_dict[disease_id]
130 | else:
131 | disease_id = disease_id.replace('OMIM:', '')
132 | chem_disease_curated.loc[i, 'Disease'] = omim_primary_dict[disease_id]
133 | print(i + 1, '/', len(chem_disease_curated), 'Completed...')
134 | print(chem_disease_curated)
135 |
136 | for col in DDi_res_col[:-1]:
137 | chem_disease_curated[col] = [0] * len(chem_disease_curated)
138 | chem_disease_curated['Source'] = ['CTD'] * len(chem_disease_curated)
139 | chem_disease_curated['Associate_CTD'] = [1] * len(chem_disease_curated)
140 | DDi_res = pd.concat((DDi_res, chem_disease_curated))
141 | DDi_res.loc[DDi_res.duplicated(subset=['Drug', 'Disease'], keep=False), 'Associate_CTD'] = 1
142 | DDi_res['Source'] = np.where(DDi_res.duplicated(subset=['Drug', 'Disease'], keep=False),
143 | DDi_res['Source'].astype(str) + ';CTD', DDi_res['Source'].astype(str) + '')
144 | DDi_res = DDi_res.drop_duplicates(subset=['Drug', 'Disease'], keep='first')
145 | DDi_res_col = list(DDi_res.columns)
146 | DDi_res_col_new = DDi_res_col[:-2] + DDi_res_col[-1:] + DDi_res_col[-2:-1]
147 | DDi_res = DDi_res[DDi_res_col_new]
148 | DDi_res['Source'] = DDi_res['Source'].apply(lambda x: ';'.join(sorted(set(x.split(';')))))
149 | DDi_res.to_csv(folder + 'DDi_res_2.csv', index=False)
150 | with open(folder + 'integration_notes.txt', 'a') as f:
151 | f.write('DDi_res_2: Hetionet, KEGG and CTD (Associate).\n')
152 | f.close()
153 |
154 |
155 | def integrate_CTD_DDi_inferred():
156 | DDi_res = pd.read_csv(folder + 'DDi_res_2.csv')
157 | DDi_res_col = list(DDi_res.columns)[2:]
158 | DDi_res['Inferred_Relation'] = [0] * len(DDi_res)
159 | DDi_res['Inference_Score'] = [''] * len(DDi_res)
160 |
161 | chem_disease_inferred = pd.read_csv(folder + 'CTD_chem_disease_inferred.csv')
162 |
163 | for col in DDi_res_col[:-1]:
164 | chem_disease_inferred[col] = [0] * len(chem_disease_inferred)
165 | chem_disease_inferred['Source'] = ['CTD'] * len(chem_disease_inferred)
166 | chem_disease_inferred['Inferred_Relation'] = [1] * len(chem_disease_inferred)
167 | temp_col = list(chem_disease_inferred.columns)
168 | chem_disease_inferred_col = temp_col[:2] + temp_col[3:] + temp_col[2:3]
169 | chem_disease_inferred = chem_disease_inferred[chem_disease_inferred_col]
170 | print(list(chem_disease_inferred.columns))
171 | DDi_res = pd.concat((DDi_res, chem_disease_inferred))
172 | DDi_res.loc[DDi_res.duplicated(subset=['Drug', 'Disease'], keep=False), 'Inferred_Relation'] = 1
173 | DDi_res['Source'] = np.where(DDi_res.duplicated(subset=['Drug', 'Disease'], keep=False),
174 | DDi_res['Source'].astype(str) + ';CTD', DDi_res['Source'].astype(str) + '')
175 | DDi_res = DDi_res.drop_duplicates(subset=['Drug', 'Disease'], keep='first')
176 | DDi_res_col = list(DDi_res.columns)
177 | DDi_res_col_new = DDi_res_col[:-3] + DDi_res_col[-2:-1] + DDi_res_col[-3:-2] + DDi_res_col[-1:]
178 | DDi_res = DDi_res[DDi_res_col_new]
179 | DDi_res['Source'] = DDi_res['Source'].apply(lambda x: ';'.join(sorted(set(x.split(';')))))
180 | DDi_res.to_csv(folder + 'DDi_res_3.csv', index=False)
181 | with open(folder + 'integration_notes.txt', 'a') as f:
182 | f.write('DDi_res_3: Hetionet, KEGG and CTD (Inferred_Relation).\n')
183 | f.close()
184 |
185 |
186 | def integrate_DRKG_DDi():
187 | DDi_res = pd.read_csv(folder + 'DDi_res_3.csv')
188 | DDi_res_col = list(DDi_res.columns)[2:]
189 |
190 |     drkg_DDi = pd.read_csv(folder + 'drkg_DDi.csv')
191 |
192 | drkg_DDi = drkg_DDi.rename(columns={'entity_1': 'Drug', 'entity_2': 'Disease'})
193 | drkg_DDi['Drug'] = drkg_DDi['Drug'].str.replace('Compound::', '')
194 | drkg_DDi['Disease'] = drkg_DDi['Disease'].str.replace('Disease::', '')
195 | ddi_relation_list = list(drkg_DDi.drop_duplicates(subset='relation', keep='first')['relation'])
196 | # ddi_source_list = list(drkg_DDi.drop_duplicates(subset='source', keep='first')['source'])
197 | # print(ddi_relation_list)
198 | # print(ddi_source_list)
199 | # print(drkg_DDi.drop_duplicates(subset='relation', keep='first'))
200 |
201 | drug_vocab = pd.read_csv(folder + 'drug_vocab.csv')
202 | db_vocab = drug_vocab.dropna(subset=['drugbank_id'])
203 | db_primary_dict = db_vocab.set_index('drugbank_id')['primary'].to_dict()
204 | mesh_drug_vocab = drug_vocab.dropna(subset=['mesh_id'])
205 |     mesh_drug_primary_dict = mesh_drug_vocab.set_index('mesh_id')['primary'].to_dict()
206 |
207 | disease_vocab = pd.read_csv(folder + 'disease_vocab.csv')
208 | mesh_disease_vocab = disease_vocab.dropna(subset=['mesh_id'])
209 | mesh_disease_primary_dict = mesh_disease_vocab.set_index('mesh_id')['primary'].to_dict()
210 | omim_vocab = disease_vocab.dropna(subset=['omim_id'])
211 | omim_vocab['omim_id'] = omim_vocab['omim_id'].astype(int).astype(str)
212 | omim_primary_dict = omim_vocab.set_index('omim_id')['primary'].to_dict()
213 |
214 | for drkg_rel in ddi_relation_list:
215 | print(drkg_rel)
216 | DDi_res[drkg_rel] = [0] * len(DDi_res)
217 | drkg_DDi_temp = drkg_DDi[drkg_DDi['relation'] == drkg_rel]
218 | drkg_DDi_temp = drkg_DDi_temp[['Drug', 'Disease']]
219 | drkg_DDi_temp = drkg_DDi_temp.reset_index(drop=True)
220 | drkg_DDi_temp_primary = pd.DataFrame(columns=['Drug', 'Disease'])
221 | idx = 0
222 | for i in range(len(drkg_DDi_temp)):
223 | drug_id = drkg_DDi_temp.loc[i, 'Drug']
224 | disease_id = drkg_DDi_temp.loc[i, 'Disease']
225 |
226 | if 'DB' in drug_id:
227 | drug_id_primary = db_primary_dict[drug_id]
228 | elif 'MESH' in drug_id:
229 | drug_id = drug_id.replace('MESH:', '')
230 |                 if drug_id in mesh_drug_primary_dict:
231 |                     drug_id_primary = mesh_drug_primary_dict[drug_id]
232 | else:
233 | continue
234 | else:
235 | continue
236 |
237 | if 'MESH' in disease_id:
238 | disease_id = disease_id.replace('MESH:', '')
239 | disease_id_primary = mesh_disease_primary_dict[disease_id]
240 | else:
241 | disease_id = disease_id.replace('OMIM:', '')
242 | disease_id_primary = omim_primary_dict[disease_id]
243 |
244 | drkg_DDi_temp_primary.loc[idx] = [drug_id_primary, disease_id_primary]
245 | idx += 1
246 | for col in DDi_res_col[:-2]:
247 | drkg_DDi_temp_primary[col] = [0] * len(drkg_DDi_temp_primary)
248 | drkg_DDi_temp_primary['Source'] = ['DRKG'] * len(drkg_DDi_temp_primary)
249 | drkg_DDi_temp_primary['Inference_Score'] = [''] * len(drkg_DDi_temp_primary)
250 | drkg_DDi_temp_primary[drkg_rel] = [1] * len(drkg_DDi_temp_primary)
251 | DDi_res = pd.concat((DDi_res, drkg_DDi_temp_primary))
252 | DDi_res.loc[DDi_res.duplicated(subset=['Drug', 'Disease'], keep=False), drkg_rel] = 1
253 | DDi_res['Source'] = np.where(DDi_res.duplicated(subset=['Drug', 'Disease'], keep=False),
254 | DDi_res['Source'].astype(str) + ';DRKG', DDi_res['Source'].astype(str) + '')
255 | DDi_res = DDi_res.drop_duplicates(subset=['Drug', 'Disease'], keep='first')
256 | DDi_res_col = list(DDi_res.columns)
257 | DDi_res_col_new = DDi_res_col[:-3] + DDi_res_col[-1:] + DDi_res_col[-3:-1]
258 | DDi_res = DDi_res[DDi_res_col_new]
259 | DDi_res_col = DDi_res_col_new[2:]
260 | DDi_res['Source'] = DDi_res['Source'].apply(lambda x: ';'.join(sorted(set(x.split(';')))))
261 |
262 | DDi_res = DDi_res.rename(columns={'Compound treats the disease': 'Treats_DRKG'})
263 | DDi_res.to_csv(folder + 'DDi_res_4.csv', index=False)
264 | with open(folder + 'integration_notes.txt', 'a') as f:
265 | f.write('DDi_res_4: Hetionet, KEGG, CTD and DRKG (Treats and Semantic Relations).\n')
266 | f.close()
267 |
268 |
269 | def main():
270 | # integrate_Hetionet_KEGG()
271 | # extract_PharmGKB_DDi()
272 | integrate_CTD_DDi_curated()
273 | integrate_CTD_DDi_inferred()
274 | integrate_DRKG_DDi()
275 |
276 |
277 | if __name__ == '__main__':
278 | main()
279 |
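280 | # ----------------------------------------------------------------------
281 | # Editor's note: a toy sketch (made-up rows, not part of the original
282 | # file) of the merge idiom used by every integrate_* function above:
283 | # concatenate the accumulated table with a new source, flag pairs that
284 | # appear in both via duplicated(), extend 'Source', then keep the first
285 | # occurrence so each (Drug, Disease) pair ends up as a single row with
286 | # per-source indicator columns.
287 | #
288 | # old = pd.DataFrame({'Drug': ['DrugBank:DB00001'], 'Disease': ['DOID:1'],
289 | #                     'Treats_Hetionet': [1], 'Source': ['Hetionet']})
290 | # new = pd.DataFrame({'Drug': ['DrugBank:DB00001'], 'Disease': ['DOID:1'],
291 | #                     'Treats_Hetionet': [0], 'Source': ['KEGG']})
292 | # merged = pd.concat((old, new))
293 | # dup = merged.duplicated(subset=['Drug', 'Disease'], keep=False)
294 | # merged['Source'] = np.where(dup, merged['Source'] + ';KEGG', merged['Source'])
295 | # merged = merged.drop_duplicates(subset=['Drug', 'Disease'], keep='first')
296 | # # one row remains: Treats_Hetionet == 1, Source == 'Hetionet;KEGG'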
--------------------------------------------------------------------------------
/Codes_Term Harmonization/Relation_Integration/integrate_drug_gene.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 |
4 | folder = ''
5 | CTD_folder = '../CTD/'
6 |
7 | # pd.set_option('display.max_columns', None)
8 | # pd.set_option('display.max_rows', None)
9 |
10 |
11 | def integrate_DrugBank_KEGG():
12 | DG_res = pd.read_csv(folder + '/DGres_DrugBank_regulate.csv')
13 | DG_res_col = list(DG_res.columns)
14 | DG_res_col = [col_name.replace('_DrugBank', '') for col_name in DG_res_col]
15 | DG_res.columns = DG_res_col
16 | rel_list = list(DG_res_col)[2:]
17 | DG_res['Associate_KEGG'] = [0] * len(DG_res)
18 | DG_res['Source'] = ['DrugBank'] * len(DG_res)
19 | # print(DG_res)
20 | kegg_res = pd.read_csv(folder + '/kegg_drug_gene.csv')
21 |
22 | drug_vocab = pd.read_csv(folder + '/drug_vocab.csv')
23 | kegg_vocab = drug_vocab.dropna(subset=['kegg_id'])
24 | kegg_primary_dict = kegg_vocab.set_index('kegg_id')['primary'].to_dict()
25 |
26 | gene_vocab = pd.read_csv(folder + '/gene_vocab_2.csv')
27 | ncbi_vocab = gene_vocab.dropna(subset=['ncbi_id'])
28 | ncbi_primary_dict = ncbi_vocab.set_index('ncbi_id')['primary'].to_dict()
29 |
30 | kegg_res = kegg_res.replace({'drug': kegg_primary_dict, 'gene': ncbi_primary_dict})
31 | kegg_res = kegg_res.rename(columns={'drug': 'Drug', 'gene': 'Gene'})
32 | kegg_res = kegg_res[['Drug', 'Gene']]
33 | for col in rel_list:
34 | kegg_res[col] = [0] * len(kegg_res)
35 | kegg_res['Associate_KEGG'] = [1] * len(kegg_res)
36 | kegg_res['Source'] = ['KEGG'] * len(kegg_res)
37 | # print(kegg_res)
38 | DG_res = pd.concat((DG_res, kegg_res))
39 | DG_res.loc[DG_res.duplicated(subset=['Drug', 'Gene'], keep=False), 'Associate_KEGG'] = 1
40 | DG_res.loc[DG_res.duplicated(subset=['Drug', 'Gene'], keep=False), 'Source'] = 'DrugBank;KEGG'
41 | DG_res = DG_res.drop_duplicates(subset=['Drug', 'Gene'], keep='first')
42 | DG_res.to_csv(folder + '/DG_res.csv', index=False)
43 | with open(folder + '/integration_notes.txt', 'w') as f:
44 | f.write('DG_res: DrugBank (Target, Transporter, Enzyme, Carrier, Downregulates and Upregulates) and KEGG (Associate).\n')
45 | f.close()
46 |
47 |
48 | def extract_PharmGKB_DG():
49 |     pharmgkb_rel = pd.read_table(folder + '/pharmgkb_rel.tsv')
50 | pharmgkb_rel = pharmgkb_rel[pharmgkb_rel['Association'] == 'associated']
51 | pharmgkb_rel = pharmgkb_rel.reset_index(drop=True)
52 | res = pd.DataFrame(columns=['Drug', 'Gene'])
53 | idx = 0
54 | for i in range(len(pharmgkb_rel)):
55 | p1_id = pharmgkb_rel.loc[i, 'Entity1_id']
56 | p1_type = pharmgkb_rel.loc[i, 'Entity1_type']
57 | p2_id = pharmgkb_rel.loc[i, 'Entity2_id']
58 | p2_type = pharmgkb_rel.loc[i, 'Entity2_type']
59 | if p1_type == 'Chemical' and p2_type == 'Gene':
60 | drug = p1_id
61 | gene = p2_id
62 | elif p2_type == 'Chemical' and p1_type == 'Gene':
63 | drug = p2_id
64 | gene = p1_id
65 | else:
66 | continue
67 | res.loc[idx] = [drug, gene]
68 | idx += 1
69 | res.to_csv(folder + '/pharmgkb_drug_gene.csv', index=False)
70 |
71 |
72 | def integrate_PharmGKB():
73 | DG_res = pd.read_csv(folder + '/DG_res.csv')
74 | DG_res_col = list(DG_res.columns)[2:]
75 | DG_res['Associate_PharmGKB'] = [0] * len(DG_res)
76 | # print(DG_res)
77 | pharmgkb_res = pd.read_csv(folder + '/pharmgkb_drug_gene.csv')
78 |
79 | drug_vocab = pd.read_csv(folder + '/drug_vocab.csv')
80 | pharmgkb_drug_vocab = drug_vocab.dropna(subset=['pharmgkb_id'])
81 | pharmgkb_drug_primary_dict = pharmgkb_drug_vocab.set_index('pharmgkb_id')['primary'].to_dict()
82 |
83 | gene_vocab = pd.read_csv(folder + '/gene_vocab_2.csv')
84 | pharmgkb_gene_vocab = gene_vocab.dropna(subset=['pharmgkb_id'])
85 | pharmgkb_gene_primary_dict = pharmgkb_gene_vocab.set_index('pharmgkb_id')['primary'].to_dict()
86 |
87 | pharmgkb_res = pharmgkb_res.replace({'Drug': pharmgkb_drug_primary_dict, 'Gene': pharmgkb_gene_primary_dict})
88 | for col in DG_res_col[:-1]:
89 | pharmgkb_res[col] = [0] * len(pharmgkb_res)
90 | pharmgkb_res['Source'] = ['PharmGKB'] * len(pharmgkb_res)
91 | pharmgkb_res['Associate_PharmGKB'] = [1] * len(pharmgkb_res)
92 | # print(pharmgkb_res)
93 | DG_res = pd.concat((DG_res, pharmgkb_res))
94 | DG_res.loc[DG_res.duplicated(subset=['Drug', 'Gene'], keep=False), 'Associate_PharmGKB'] = 1
95 | DG_res['Source'] = np.where(DG_res.duplicated(subset=['Drug', 'Gene'], keep=False), DG_res['Source'].astype(str) + ';PharmGKB', DG_res['Source'].astype(str) + '')
96 | DG_res = DG_res.drop_duplicates(subset=['Drug', 'Gene'], keep='first')
97 | DG_res_col = list(DG_res.columns)
98 | DG_res_col_new = DG_res_col[:-2] + DG_res_col[-1:] + DG_res_col[-2:-1]
99 | DG_res = DG_res[DG_res_col_new]
100 | DG_res['Source'] = DG_res['Source'].apply(lambda x: ';'.join(sorted(set(x.split(';')))))
101 | DG_res.to_csv(folder + '/DG_res_2.csv', index=False)
102 | with open(folder + '/integration_notes.txt', 'a') as f:
103 | f.write('DG_res_2: DrugBank, KEGG and PharmGKB (Associate).\n')
104 | f.close()
105 |
106 |
107 | def integrate_Hetionet():
108 | DG_res = pd.read_csv(folder + '/DG_res_2.csv')
109 | DG_res_col = list(DG_res.columns)[2:]
110 | DG_res['Binds_Hetionet'] = [0] * len(DG_res)
111 |
112 | hetionet_DG = pd.read_csv(folder + '/hetionet_DG.csv')
113 | hetionet_DG = hetionet_DG.rename(columns={'source': 'Drug', 'target': 'Gene'})
114 | hetionet_DG['Drug'] = hetionet_DG['Drug'].str.replace('Compound::', '')
115 | hetionet_DG['Gene'] = hetionet_DG['Gene'].str.replace('Gene::', '')
116 |
117 | drug_vocab = pd.read_csv(folder + '/drug_vocab.csv')
118 | db_vocab = drug_vocab.dropna(subset=['drugbank_id'])
119 | db_primary_dict = db_vocab.set_index('drugbank_id')['primary'].to_dict()
120 |
121 | gene_vocab = pd.read_csv(folder + '/gene_vocab_2.csv')
122 | ncbi_vocab = gene_vocab.dropna(subset=['ncbi_id'])
123 | ncbi_vocab['ncbi_id'] = ncbi_vocab['ncbi_id'].astype(int).astype(str)
124 | ncbi_primary_dict = ncbi_vocab.set_index('ncbi_id')['primary'].to_dict()
125 |
126 | # integrate binds
127 | hetionet_binds = hetionet_DG[hetionet_DG['metaedge'] == 'CbG']
128 | hetionet_binds = hetionet_binds.replace({'Drug': db_primary_dict, 'Gene': ncbi_primary_dict})
129 | hetionet_binds = hetionet_binds[['Drug', 'Gene']]
130 | for col in DG_res_col[:-1]:
131 | hetionet_binds[col] = [0] * len(hetionet_binds)
132 |     hetionet_binds['Source'] = ['Hetionet'] * len(hetionet_binds)
133 | hetionet_binds['Binds_Hetionet'] = [1] * len(hetionet_binds)
134 | DG_res = pd.concat((DG_res, hetionet_binds))
135 | DG_res.loc[DG_res.duplicated(subset=['Drug', 'Gene'], keep=False), 'Binds_Hetionet'] = 1
136 |     DG_res['Source'] = np.where(DG_res.duplicated(subset=['Drug', 'Gene'], keep=False), DG_res['Source'].astype(str) + ';Hetionet', DG_res['Source'].astype(str) + '')
137 | DG_res = DG_res.drop_duplicates(subset=['Drug', 'Gene'], keep='first')
138 | DG_res_col = list(DG_res.columns)
139 | DG_res_col_new = DG_res_col[:-2] + DG_res_col[-1:] + DG_res_col[-2:-1]
140 | DG_res = DG_res[DG_res_col_new]
141 | DG_res['Source'] = DG_res['Source'].apply(lambda x: ';'.join(sorted(set(x.split(';')))))
142 | print(DG_res)
143 |
144 | # integrate Downregulates
145 | DG_res_col = list(DG_res.columns)[2:]
146 | DG_res['Downregulates_Hetionet'] = [0] * len(DG_res)
147 | hetionet_downregulates = hetionet_DG[hetionet_DG['metaedge'] == 'CdG']
148 | hetionet_downregulates = hetionet_downregulates.replace({'Drug': db_primary_dict, 'Gene': ncbi_primary_dict})
149 | hetionet_downregulates = hetionet_downregulates[['Drug', 'Gene']]
150 | for col in DG_res_col[:-1]:
151 | hetionet_downregulates[col] = [0] * len(hetionet_downregulates)
152 |     hetionet_downregulates['Source'] = ['Hetionet'] * len(hetionet_downregulates)
153 | hetionet_downregulates['Downregulates_Hetionet'] = [1] * len(hetionet_downregulates)
154 | DG_res = pd.concat((DG_res, hetionet_downregulates))
155 | DG_res.loc[DG_res.duplicated(subset=['Drug', 'Gene'], keep=False), 'Downregulates_Hetionet'] = 1
156 | DG_res['Source'] = np.where(DG_res.duplicated(subset=['Drug', 'Gene'], keep=False), DG_res['Source'].astype(str) + ';Hetionet', DG_res['Source'].astype(str) + '')
157 | DG_res = DG_res.drop_duplicates(subset=['Drug', 'Gene'], keep='first')
158 | DG_res_col = list(DG_res.columns)
159 | DG_res_col_new = DG_res_col[:-2] + DG_res_col[-1:] + DG_res_col[-2:-1]
160 | DG_res = DG_res[DG_res_col_new]
161 | DG_res['Source'] = DG_res['Source'].apply(lambda x: ';'.join(sorted(set(x.split(';')))))
162 | print(DG_res)
163 |
164 | # integrate Upregulates
165 | DG_res_col = list(DG_res.columns)[2:]
166 | DG_res['Upregulates_Hetionet'] = [0] * len(DG_res)
167 | hetionet_upregulates = hetionet_DG[hetionet_DG['metaedge'] == 'CuG']
168 | hetionet_upregulates = hetionet_upregulates.replace({'Drug': db_primary_dict, 'Gene': ncbi_primary_dict})
169 | hetionet_upregulates = hetionet_upregulates[['Drug', 'Gene']]
170 | for col in DG_res_col[:-1]:
171 | hetionet_upregulates[col] = [0] * len(hetionet_upregulates)
172 | hetionet_upregulates['Source'] = ['Hetionet'] * len(hetionet_upregulates)
173 | hetionet_upregulates['Upregulates_Hetionet'] = [1] * len(hetionet_upregulates)
174 | DG_res = pd.concat((DG_res, hetionet_upregulates))
175 | DG_res.loc[DG_res.duplicated(subset=['Drug', 'Gene'], keep=False), 'Upregulates_Hetionet'] = 1
176 | DG_res['Source'] = np.where(DG_res.duplicated(subset=['Drug', 'Gene'], keep=False),
177 | DG_res['Source'].astype(str) + ';Hetionet', DG_res['Source'].astype(str) + '')
178 | DG_res = DG_res.drop_duplicates(subset=['Drug', 'Gene'], keep='first')
179 | DG_res_col = list(DG_res.columns)
180 | DG_res_col_new = DG_res_col[:-2] + DG_res_col[-1:] + DG_res_col[-2:-1]
181 | DG_res = DG_res[DG_res_col_new]
182 | DG_res['Source'] = DG_res['Source'].apply(lambda x: ';'.join(sorted(set(x.split(';')))))
183 | print(DG_res)
184 | DG_res.to_csv(folder + '/DG_res_3.csv', index=False)
185 | with open(folder + '/integration_notes.txt', 'a') as f:
186 | f.write('DG_res_3: DrugBank, KEGG, PharmGKB and Hetionet (Binds, Downregulates and Upregulates).\n')
187 | f.close()
188 |
189 |
190 | def integrate_CTD_DG():
191 | DG_res = pd.read_csv(folder + '/DG_res_3.csv')
192 | DG_res_col = list(DG_res.columns)[2:]
193 | DG_res['Interaction_CTD'] = [0] * len(DG_res)
194 |
195 | chem_gene = pd.read_csv(CTD_folder + 'CTD_chem_gene_ixns.csv', header=27)
196 | chem_gene = chem_gene[['ChemicalID', 'GeneID']].dropna()
197 | chem_gene = chem_gene.drop_duplicates(subset=['ChemicalID', 'GeneID'])
198 | chem_gene = chem_gene.reset_index(drop=True)
199 | chem_gene = chem_gene.rename(columns={'ChemicalID': 'Drug', 'GeneID': 'Gene'})
200 |
201 | drug_vocab = pd.read_csv(folder + '/drug_vocab.csv')
202 | mesh_vocab = drug_vocab.dropna(subset=['mesh_id'])
203 | mesh_primary_dict = mesh_vocab.set_index('mesh_id')['primary'].to_dict()
204 |
205 | gene_vocab = pd.read_csv(folder + '/gene_vocab_2.csv')
206 | ncbi_vocab = gene_vocab.dropna(subset=['ncbi_id'])
207 | ncbi_primary_dict = ncbi_vocab.set_index('ncbi_id')['primary'].to_dict()
208 |
209 | # chem_gene = chem_gene.replace({'Drug': mesh_primary_dict, 'Gene': ncbi_primary_dict})  # vectorized alternative; the loop below raises KeyError on unmapped IDs
210 |
211 | for i in range(len(chem_gene)):
212 | drug_id = chem_gene.loc[i, 'Drug']
213 | gene_id = chem_gene.loc[i, 'Gene']
214 |
215 | chem_gene.loc[i, 'Drug'] = mesh_primary_dict[drug_id]
216 | chem_gene.loc[i, 'Gene'] = ncbi_primary_dict[gene_id]
217 | print(i + 1, '/', len(chem_gene), 'Completed...')
218 | print(chem_gene)
219 |
220 | for col in DG_res_col[:-1]:
221 | chem_gene[col] = [0] * len(chem_gene)
222 | chem_gene['Source'] = ['CTD'] * len(chem_gene)
223 | chem_gene['Interaction_CTD'] = [1] * len(chem_gene)
224 | DG_res = pd.concat((DG_res, chem_gene))
225 | DG_res.loc[DG_res.duplicated(subset=['Drug', 'Gene'], keep=False), 'Interaction_CTD'] = 1
226 | DG_res['Source'] = np.where(DG_res.duplicated(subset=['Drug', 'Gene'], keep=False),
227 | DG_res['Source'].astype(str) + ';CTD', DG_res['Source'].astype(str) + '')
228 | DG_res = DG_res.drop_duplicates(subset=['Drug', 'Gene'], keep='first')
229 | DG_res_col = list(DG_res.columns)
230 | DG_res_col_new = DG_res_col[:-2] + DG_res_col[-1:] + DG_res_col[-2:-1]
231 | DG_res = DG_res[DG_res_col_new]
232 | DG_res['Source'] = DG_res['Source'].apply(lambda x: ';'.join(sorted(set(x.split(';')))))
233 | DG_res.to_csv(folder + '/DG_res_4.csv', index=False)
234 | with open(folder + '/integration_notes.txt', 'a') as f:
235 | f.write('DG_res_4: DrugBank, KEGG, PharmGKB, Hetionet and CTD (Interaction).\n')
236 | f.close()
237 |
238 |
239 | def integrate_DRKG_DG():
240 | DG_res = pd.read_csv(folder + '/DG_res_4.csv')
241 | DG_res_col = list(DG_res.columns)[2:]
242 |
243 | drkg_DG = pd.read_csv('drug/drkg_DG.csv')
244 | drkg_DG = drkg_DG.rename(columns={'entity_1': 'Drug', 'entity_2': 'Gene'})
245 | drkg_DG['Drug'] = drkg_DG['Drug'].str.replace('Compound::', '')
246 | drkg_DG['Gene'] = drkg_DG['Gene'].str.replace('Gene::', '')
247 | dg_relation_list = list(drkg_DG.drop_duplicates(subset='relation', keep='first')['relation'])
248 | # dg_source_list = list(drkg_DG.drop_duplicates(subset='source', keep='first')['source'])
249 | # print(dg_relation_list)
250 | # print(dg_source_list)
251 | # print(drkg_DG.drop_duplicates(subset='relation', keep='first'))
252 |
253 | drug_vocab = pd.read_csv(folder + '/drug_vocab.csv')
254 | db_vocab = drug_vocab.dropna(subset=['drugbank_id'])
255 | db_primary_dict = db_vocab.set_index('drugbank_id')['primary'].to_dict()
256 |
257 | gene_vocab = pd.read_csv(folder + '/gene_vocab_2.csv')
258 | ncbi_vocab = gene_vocab.dropna(subset=['ncbi_id'])
259 | ncbi_vocab['ncbi_id'] = ncbi_vocab['ncbi_id'].astype(int).astype(str)
260 | ncbi_primary_dict = ncbi_vocab.set_index('ncbi_id')['primary'].to_dict()
261 |
262 | for drkg_rel in dg_relation_list:
263 | print(drkg_rel)
264 | DG_res[drkg_rel] = [0] * len(DG_res)
265 | drkg_DG_temp = drkg_DG[drkg_DG['relation'] == drkg_rel]
266 | drkg_DG_temp = drkg_DG_temp.replace({'Drug': db_primary_dict, 'Gene': ncbi_primary_dict})
267 | drkg_DG_temp = drkg_DG_temp[['Drug', 'Gene']]
268 | for col in DG_res_col[:-1]:
269 | drkg_DG_temp[col] = [0] * len(drkg_DG_temp)
270 | drkg_DG_temp['Source'] = ['DRKG'] * len(drkg_DG_temp)
271 | drkg_DG_temp[drkg_rel] = [1] * len(drkg_DG_temp)
272 | DG_res = pd.concat((DG_res, drkg_DG_temp))
273 | DG_res.loc[DG_res.duplicated(subset=['Drug', 'Gene'], keep=False), drkg_rel] = 1
274 | DG_res['Source'] = np.where(DG_res.duplicated(subset=['Drug', 'Gene'], keep=False),
275 | DG_res['Source'].astype(str) + ';DRKG', DG_res['Source'].astype(str) + '')
276 | DG_res = DG_res.drop_duplicates(subset=['Drug', 'Gene'], keep='first')
277 | DG_res_col = list(DG_res.columns)
278 | DG_res_col_new = DG_res_col[:-2] + DG_res_col[-1:] + DG_res_col[-2:-1]
279 | DG_res = DG_res[DG_res_col_new]
280 | DG_res_col = DG_res_col_new[2:]
281 | DG_res['Source'] = DG_res['Source'].apply(lambda x: ';'.join(sorted(set(x.split(';')))))
282 |
283 | DG_res = DG_res.rename(columns={'association': 'Associate_DRKG', 'direct interation': 'Interaction_DRKG'})  # keys must match the relation labels as they appear in drkg_DG.csv
284 | DG_res.to_csv(folder + '/DG_res_5.csv', index=False)
285 | with open(folder + '/integration_notes.txt', 'a') as f:
286 | f.write('DG_res_5: DrugBank, KEGG, PharmGKB, Hetionet, CTD and DRKG (Semantic Relations, Interaction and Associate).\n')
287 | f.close()
288 |
289 |
290 | def main():
291 | # integrate_DrugBank_KEGG()
292 | # # extract_PharmGKB_DG()
293 | # integrate_PharmGKB()
294 | integrate_Hetionet()
295 | integrate_CTD_DG()
296 | integrate_DRKG_DG()
297 |
298 |
299 | if __name__ == '__main__':
300 | main()
301 |
--------------------------------------------------------------------------------
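The relation-integration scripts in this directory all repeat one pandas merge pattern: map source-specific IDs to primary iBKH IDs, append the new pairs with a fresh 0/1 relation column, flag pairs that already exist, extend the Source field, and keep a single row per pair. Below is a minimal, self-contained sketch of that pattern with toy data; the IDs, column names, and source labels are illustrative, not the actual iBKH schema.

```python
import pandas as pd
import numpy as np

# Accumulated drug-gene pairs, one 0/1 column per relation type (toy data).
DG_res = pd.DataFrame({'Drug': ['DB01'], 'Gene': ['G1'],
                       'Binds': [1], 'Source': ['DrugBank']})

# New evidence from a second source (toy data).
new = pd.DataFrame({'Drug': ['DB01', 'DB02'], 'Gene': ['G1', 'G2']})
new['Binds'] = 0                 # existing relation columns default to 0
new['Interacts'] = 1             # the relation this source contributes
new['Source'] = 'CTD'
DG_res['Interacts'] = 0          # add the new relation column to the old table

DG_res = pd.concat((DG_res, new))
dup = DG_res.duplicated(subset=['Drug', 'Gene'], keep=False)
DG_res.loc[dup, 'Interacts'] = 1                    # pair confirmed by both sources
DG_res['Source'] = np.where(dup, DG_res['Source'] + ';CTD', DG_res['Source'])
DG_res = DG_res.drop_duplicates(subset=['Drug', 'Gene'], keep='first')
DG_res['Source'] = DG_res['Source'].apply(
    lambda x: ';'.join(sorted(set(x.split(';')))))  # dedupe and sort source labels
print(DG_res)  # DB01/G1 -> Binds=1, Interacts=1, Source='CTD;DrugBank'
```

Because drop_duplicates keeps the first occurrence, the relation flags and the Source string must be updated on the duplicated rows before the drop, which is exactly the order the functions above follow.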
/Codes_Term Harmonization/Relation_Integration/integrate_gene_related.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 |
4 | pd.set_option('display.max_columns', None)
5 |
6 | folder = ''
7 |
8 |
9 | def integrate_Hetionet_GG():
10 | hetionet_GG = pd.read_csv(folder + '/hetionet_GG.csv')
11 | hetionet_GG = hetionet_GG.rename(columns={'source': 'Gene_1', 'target': 'Gene_2'})
12 | hetionet_GG['Gene_1'] = hetionet_GG['Gene_1'].str.replace('Gene::', '')
13 | hetionet_GG['Gene_2'] = hetionet_GG['Gene_2'].str.replace('Gene::', '')
14 |
15 | gene_vocab = pd.read_csv(folder + '/gene_vocab_2.csv')
16 | ncbi_vocab = gene_vocab.dropna(subset=['ncbi_id'])
17 | ncbi_vocab['ncbi_id'] = ncbi_vocab['ncbi_id'].astype(int).astype(str)
18 | ncbi_primary_dict = ncbi_vocab.set_index('ncbi_id')['primary'].to_dict()
19 |
20 | hetionet_GcG = hetionet_GG[hetionet_GG['metaedge'] == 'GcG']
21 | print(hetionet_GcG)
22 | hetionet_GcG = hetionet_GcG.replace({'Gene_1': ncbi_primary_dict, 'Gene_2': ncbi_primary_dict})
23 | hetionet_GcG = hetionet_GcG[['Gene_1', 'Gene_2']]
24 | print(hetionet_GcG)
25 | hetionet_GcG['Covaries'] = [1] * len(hetionet_GcG)
26 | hetionet_GcG['Interacts'] = [0] * len(hetionet_GcG)
27 |
28 | hetionet_GiG = hetionet_GG[hetionet_GG['metaedge'] == 'GiG']
29 | hetionet_GiG = hetionet_GiG.replace({'Gene_1': ncbi_primary_dict, 'Gene_2': ncbi_primary_dict})
30 | hetionet_GiG = hetionet_GiG[['Gene_1', 'Gene_2']]
31 | hetionet_GiG['Covaries'] = [0] * len(hetionet_GiG)
32 | hetionet_GiG['Interacts'] = [1] * len(hetionet_GiG)
33 |
34 | GG_res = pd.concat((hetionet_GcG, hetionet_GiG))
35 | GG_res.loc[GG_res.duplicated(subset=['Gene_1', 'Gene_2'], keep=False), 'Interacts'] = 1
36 | GG_res = GG_res.drop_duplicates(subset=['Gene_1', 'Gene_2'], keep='first')
37 |
38 | GG_res['Regulates'] = [0] * len(GG_res)
39 |
40 | hetionet_GrG = hetionet_GG[hetionet_GG['metaedge'] == 'Gr>G']
41 | hetionet_GrG = hetionet_GrG.replace({'Gene_1': ncbi_primary_dict, 'Gene_2': ncbi_primary_dict})
42 | hetionet_GrG = hetionet_GrG[['Gene_1', 'Gene_2']]
43 | hetionet_GrG['Covaries'] = [0] * len(hetionet_GrG)
44 | hetionet_GrG['Interacts'] = [0] * len(hetionet_GrG)
45 | hetionet_GrG['Regulates'] = [1] * len(hetionet_GrG)
46 |
47 | GG_res = pd.concat((GG_res, hetionet_GrG))
48 | GG_res.loc[GG_res.duplicated(subset=['Gene_1', 'Gene_2'], keep=False), 'Regulates'] = 1
49 | GG_res = GG_res.drop_duplicates(subset=['Gene_1', 'Gene_2'], keep='first')
50 |
51 | GG_res['Source'] = ['Hetionet'] * len(GG_res)
52 | print(GG_res)
53 | GG_res.to_csv(folder + '/GG_res.csv', index=False)
54 | with open(folder + '/integration_notes.txt', 'w') as f:
55 | f.write('GG_res: Hetionet (Covaries, Interacts and Regulates).\n')
56 | f.close()
57 |
58 |
59 | def extract_PharmGKB_GG():
60 | pharmgkb_rel = pd.read_table(folder + 'pharmgkb_rel.tsv')
61 | pharmgkb_rel = pharmgkb_rel[pharmgkb_rel['Association'] == 'associated']
62 | pharmgkb_rel = pharmgkb_rel.reset_index(drop=True)
63 | res = pd.DataFrame(columns=['Gene_1', 'Gene_2'])
64 | idx = 0
65 | for i in range(len(pharmgkb_rel)):
66 | p1_id = pharmgkb_rel.loc[i, 'Entity1_id']
67 | p1_type = pharmgkb_rel.loc[i, 'Entity1_type']
68 | p2_id = pharmgkb_rel.loc[i, 'Entity2_id']
69 | p2_type = pharmgkb_rel.loc[i, 'Entity2_type']
70 | if p1_type == 'Gene' and p2_type == 'Gene':
71 | gene_1 = p1_id
72 | gene_2 = p2_id
73 | else:
74 | continue
75 | res.loc[idx] = [gene_1, gene_2]
76 | idx += 1
77 | res.to_csv(folder + '/pharmgkb_gene_gene.csv', index=False)
78 |
79 |
80 | def integrate_PharmGKB_GG():
81 | GG_res = pd.read_csv(folder + '/GG_res.csv')
82 | GG_res_cols = list(GG_res.columns)[2:]
83 | GG_res['Associate'] = [0] * len(GG_res)
84 |
85 | pharmgkb_res = pd.read_csv(folder + '/pharmgkb_gene_gene.csv')
86 |
87 | gene_vocab = pd.read_csv(folder + '/gene_vocab_2.csv')
88 | pharmgkb_gene_vocab = gene_vocab.dropna(subset=['pharmgkb_id'])
89 | pharmgkb_gene_primary_dict = pharmgkb_gene_vocab.set_index('pharmgkb_id')['primary'].to_dict()
90 |
91 | pharmgkb_res = pharmgkb_res.replace({'Gene_1': pharmgkb_gene_primary_dict, 'Gene_2': pharmgkb_gene_primary_dict})
92 | for col in GG_res_cols[:-1]:
93 | pharmgkb_res[col] = [0] * len(pharmgkb_res)
94 | pharmgkb_res['Source'] = ['PharmGKB'] * len(pharmgkb_res)
95 | pharmgkb_res['Associate'] = [1] * len(pharmgkb_res)
96 | GG_res = pd.concat((GG_res, pharmgkb_res))
97 | GG_res.loc[GG_res.duplicated(subset=['Gene_1', 'Gene_2'], keep=False), 'Associate'] = 1
98 | GG_res['Source'] = np.where(GG_res.duplicated(subset=['Gene_1', 'Gene_2'], keep=False),
99 | GG_res['Source'].astype(str) + ';PharmGKB', GG_res['Source'].astype(str) + '')
100 | GG_res = GG_res.drop_duplicates(subset=['Gene_1', 'Gene_2'], keep='first')
101 | GG_res_col = list(GG_res.columns)
102 | GG_res_col_new = GG_res_col[:-2] + GG_res_col[-1:] + GG_res_col[-2:-1]
103 | GG_res = GG_res[GG_res_col_new]
104 | GG_res['Source'] = GG_res['Source'].apply(lambda x: ';'.join(sorted(set(x.split(';')))))
105 | GG_res.to_csv(folder + '/GG_res_2.csv', index=False)
106 | with open(folder + '/integration_notes.txt', 'a') as f:
107 | f.write('GG_res_2: Hetionet and PharmGKB (Associate).\n')
108 | f.close()
109 |
110 |
111 | def integrate_DRKG_GG():
112 | drkg_GG = pd.read_csv(folder + '/drkg_GG.csv')
113 | drkg_GG = drkg_GG[(drkg_GG['source'] == 'GNBR') | (drkg_GG['source'] == 'IntAct')]
114 | drkg_GG = drkg_GG[['entity_1', 'relation', 'entity_2']]
115 | drkg_GG = drkg_GG[~((drkg_GG['entity_1'] == 'Gene::') | (drkg_GG['entity_2'] == 'Gene::'))]
116 | drkg_GG = drkg_GG.drop_duplicates(subset=['entity_1', 'entity_2'])
117 | drkg_GG = drkg_GG.reset_index(drop=True)
118 | drkg_GG = drkg_GG.rename(columns={'entity_1': 'Gene_1', 'entity_2': 'Gene_2'})
119 | drkg_GG['Gene_1'] = drkg_GG['Gene_1'].str.replace('Gene::', '')
120 | drkg_GG['Gene_2'] = drkg_GG['Gene_2'].str.replace('Gene::', '')
121 | gg_relation_list = list(drkg_GG.drop_duplicates(subset='relation', keep='first')['relation'])
122 | # gg_source_list = list(drkg_GG.drop_duplicates(subset='source', keep='first')['source'])
123 | # print(gg_relation_list)
124 | # print(gg_source_list)
125 | # print(drkg_GG.drop_duplicates(subset='relation', keep='first'))
126 |
127 | gene_vocab = pd.read_csv(folder + '/gene_vocab_2.csv')
128 | ncbi_vocab = gene_vocab.dropna(subset=['ncbi_id'])
129 | ncbi_vocab['ncbi_id'] = ncbi_vocab['ncbi_id'].astype(int).astype(str)
130 | ncbi_primary_dict = ncbi_vocab.set_index('ncbi_id')['primary'].to_dict()
131 |
132 | GG_res = pd.read_csv(folder + '/GG_res_2.csv')
133 | GG_res_cols = list(GG_res.columns)[2:]
134 |
135 | for drkg_rel in gg_relation_list:
136 | print(drkg_rel)
137 | GG_res[drkg_rel] = [0] * len(GG_res)
138 | drkg_GG_temp = drkg_GG[drkg_GG['relation'] == drkg_rel]
139 | drkg_GG_temp = drkg_GG_temp.replace({'Gene_1': ncbi_primary_dict, 'Gene_2': ncbi_primary_dict})
140 | drkg_GG_temp = drkg_GG_temp[['Gene_1', 'Gene_2']]
141 | for col in GG_res_cols[:-1]:
142 | drkg_GG_temp[col] = [0] * len(drkg_GG_temp)
143 | drkg_GG_temp['Source'] = ['DRKG'] * len(drkg_GG_temp)
144 | drkg_GG_temp[drkg_rel] = [1] * len(drkg_GG_temp)
145 | GG_res = pd.concat((GG_res, drkg_GG_temp))
146 | GG_res.loc[GG_res.duplicated(subset=['Gene_1', 'Gene_2'], keep=False), drkg_rel] = 1
147 | GG_res['Source'] = np.where(GG_res.duplicated(subset=['Gene_1', 'Gene_2'], keep=False),
148 | GG_res['Source'].astype(str) + ';DRKG', GG_res['Source'].astype(str) + '')
149 | GG_res = GG_res.drop_duplicates(subset=['Gene_1', 'Gene_2'], keep='first')
150 | GG_res_col = list(GG_res.columns)
151 | GG_res_col_new = GG_res_col[:-2] + GG_res_col[-1:] + GG_res_col[-2:-1]
152 | GG_res = GG_res[GG_res_col_new]
153 | GG_res_cols = GG_res_col_new[2:]
154 | GG_res['Source'] = GG_res['Source'].apply(lambda x: ';'.join(sorted(set(x.split(';')))))
155 | GG_res.to_csv(folder + '/GG_res_3.csv', index=False)
156 | with open(folder + '/integration_notes.txt', 'a') as f:
157 | f.write('GG_res_3: Hetionet, PharmGKB and DRKG.\n')
158 | f.close()
159 |
160 |
161 | def integrate_GA_Bgee_present():
162 | Bgee_present = pd.read_csv('/processed_Bgee_present.csv')
163 |
164 | ncbi_df = pd.read_table('/gene2ensembl')
165 | ensembl_ncbi_dict = ncbi_df.set_index('Ensembl_gene_identifier')['GeneID'].to_dict()
166 |
167 | gene_vocab = pd.read_csv('/gene_vocab_2.csv')
168 | ncbi_vocab = gene_vocab.dropna(subset=['ncbi_id'])
169 | ncbi_primary_dict = ncbi_vocab.set_index('ncbi_id')['primary'].to_dict()
170 |
171 | anatomy_vocab = pd.read_csv('/anatomy_res_3.csv')
172 | uberon_vocab = anatomy_vocab.dropna(subset=['uberon_id'])
173 | uberon_primary_dict = uberon_vocab.set_index('uberon_id')['primary'].to_dict()
174 |
175 | gene_list = []
176 | anatomy_list = []
177 | for i in range(len(Bgee_present)):
178 | gene_id = Bgee_present.loc[i, 'Gene ID']
179 | anatomy_id = Bgee_present.loc[i, 'Anatomical entity ID']
180 |
181 | if gene_id in ensembl_ncbi_dict:
182 | ncbi_id = ensembl_ncbi_dict[gene_id]
183 | gene_primary = ncbi_primary_dict[ncbi_id]
184 | else:
185 | continue
186 |
187 | anatomy_primary = uberon_primary_dict[anatomy_id]
188 | gene_list.append(gene_primary)
189 | anatomy_list.append(anatomy_primary)
190 | print(i + 1, '/', len(Bgee_present), 'Completed (Bgee present)...')
191 | GA_res = pd.DataFrame({'Gene': gene_list, 'Anatomy': anatomy_list, 'Present': [1] * len(gene_list), 'Source': ['Reactome'] * len(gene_list)})  # 'Reactome' is the wrong label for Bgee data; later fixed by modify_res()
192 | GA_res.to_csv('/GA_res.csv', index=False)
193 |
194 |
195 | def integrate_GA_Bgee_absent():
196 | GA_res = pd.read_csv('/GA_res.csv')
197 | GA_res = GA_res.rename(columns={'Present': 'Express'})
198 | GA_res['Absent'] = [0] * len(GA_res)
199 | print(list(GA_res.columns))
200 | Bgee_absent = pd.read_csv('/processed_Bgee_absent.csv')
201 |
202 | ncbi_df = pd.read_table('/gene2ensembl')
203 | ensembl_ncbi_dict = ncbi_df.set_index('Ensembl_gene_identifier')['GeneID'].to_dict()
204 |
205 | gene_vocab = pd.read_csv('/gene_vocab_2.csv')
206 | ncbi_vocab = gene_vocab.dropna(subset=['ncbi_id'])
207 | ncbi_primary_dict = ncbi_vocab.set_index('ncbi_id')['primary'].to_dict()
208 |
209 | anatomy_vocab = pd.read_csv('res/entity/anatomy_res_3.csv')
210 | uberon_vocab = anatomy_vocab.dropna(subset=['uberon_id'])
211 | uberon_primary_dict = uberon_vocab.set_index('uberon_id')['primary'].to_dict()
212 |
213 | gene_list = []
214 | anatomy_list = []
215 | for i in range(len(Bgee_absent)):
216 | gene_id = Bgee_absent.loc[i, 'Gene ID']
217 | anatomy_id = Bgee_absent.loc[i, 'Anatomical entity ID']
218 |
219 | if gene_id in ensembl_ncbi_dict:
220 | ncbi_id = ensembl_ncbi_dict[gene_id]
221 | gene_primary = ncbi_primary_dict[ncbi_id]
222 | else:
223 | continue
224 |
225 | anatomy_primary = uberon_primary_dict[anatomy_id]
226 | gene_list.append(gene_primary)
227 | anatomy_list.append(anatomy_primary)
228 | print(i + 1, '/', len(Bgee_absent), 'Completed (Bgee absent)...')
229 | Bgee_absent = pd.DataFrame({'Gene': gene_list, 'Anatomy': anatomy_list, 'Express': [0] * len(gene_list),
230 | 'Source': ['Reactome'] * len(gene_list), 'Absent': [1] * len(gene_list)})  # mislabeled 'Reactome' again; see modify_res()
231 | print(Bgee_absent)
232 | GA_res = pd.concat((GA_res, Bgee_absent))
233 | GA_res.loc[GA_res.duplicated(subset=['Gene', 'Anatomy'], keep=False), 'Absent'] = 1
234 | GA_res = GA_res.drop_duplicates(subset=['Gene', 'Anatomy'], keep='first')
235 | GA_res = GA_res[['Gene', 'Anatomy', 'Express', 'Absent', 'Source']]
236 | GA_res.to_csv('/GA_res_2.csv', index=False)
237 |
238 |
239 | def integrate_GA_TISSUE():
240 | GA_res = pd.read_csv('/GA_res_2.csv')
241 | GA_res['Express_TISSUE'] = [0] * len(GA_res)
242 | print(list(GA_res.columns))
243 | tissue_df = pd.read_csv('/processed_TISSUE.csv')
244 |
245 | gene_vocab = pd.read_csv('/gene_vocab_2.csv')
246 | ncbi_vocab = gene_vocab.dropna(subset=['ncbi_id'])
247 | ncbi_vocab['ncbi_id'] = ncbi_vocab['ncbi_id'].astype(int).astype(str)
248 | ncbi_primary_dict = ncbi_vocab.set_index('ncbi_id')['primary'].to_dict()
249 |
250 | anatomy_vocab = pd.read_csv('/anatomy_res_3.csv')
251 | bto_vocab = anatomy_vocab.dropna(subset=['bto_id'])
252 | bto_primary_dict = bto_vocab.set_index('bto_id')['primary'].to_dict()
253 |
254 | gene_list = []
255 | anatomy_list = []
256 | for i in range(len(tissue_df)):
257 | gene_id = tissue_df.loc[i, 'gene_id'].replace('NCBI:', '')
258 | anatomy_id = tissue_df.loc[i, 'tissue_id']
259 |
260 | gene_primary = ncbi_primary_dict[gene_id] if gene_id in ncbi_primary_dict else gene_id
261 | gene_list.append(gene_primary)
262 | anatomy_list.append(bto_primary_dict[anatomy_id])
263 |
264 | print(i + 1, '/', len(tissue_df), 'Completed (TISSUE)...')
265 | tissue_res = pd.DataFrame({'Gene': gene_list, 'Anatomy': anatomy_list, 'Express': [0] * len(gene_list),
266 | 'Absent': [0] * len(gene_list), 'Source': ['TISSUE'] * len(gene_list),
267 | 'Express_TISSUE': [1] * len(gene_list)})
268 | print(tissue_res)
269 | GA_res = pd.concat((GA_res, tissue_res))
270 | GA_res.loc[GA_res.duplicated(subset=['Gene', 'Anatomy'], keep=False), 'Express_TISSUE'] = 1
271 | GA_res['Source'] = np.where(GA_res.duplicated(subset=['Gene', 'Anatomy'], keep=False),
272 | GA_res['Source'].astype(str) + ';TISSUE', GA_res['Source'].astype(str) + '')
273 | GA_res = GA_res.drop_duplicates(subset=['Gene', 'Anatomy'], keep='first')
274 | GA_res['Source'] = GA_res['Source'].apply(lambda x: ';'.join(sorted(set(x.split(';')))))
275 | GA_res['Express'] = GA_res['Express'] + GA_res['Express_TISSUE']
276 | GA_res.loc[GA_res['Express'] != 0, 'Express'] = 1
277 | GA_res_col = list(GA_res.columns)
278 | GA_res_col.remove('Express_TISSUE')
279 | GA_res = GA_res[GA_res_col]
280 | GA_res.to_csv('/GA_res_3.csv', index=False)
281 |
282 |
283 | def integrate_GPwy_Reactome():
284 | gpwy_Reactome = pd.read_table(folder + 'NCBI2Reactome_All_Levels.txt', header=None)
285 | homo_Reactome = gpwy_Reactome[gpwy_Reactome[5] == 'Homo sapiens']
286 | homo_Reactome = homo_Reactome[homo_Reactome[0].astype(str).str.isdigit()]
287 | homo_Reactome[0] = homo_Reactome[0].astype(int).astype(str)
288 | homo_Reactome = homo_Reactome.drop_duplicates(subset=[0, 1], keep='first')
289 | homo_Reactome = homo_Reactome.reset_index(drop=True)
290 | homo_Reactome = homo_Reactome[[0, 1]]
291 | homo_Reactome = homo_Reactome.rename(columns={0: 'Gene', 1: 'Pathway'})
292 | print(homo_Reactome)
293 | gene_vocab = pd.read_csv(folder + 'gene_vocab.csv')
294 | ncbi_vocab = gene_vocab.dropna(subset=['ncbi_id'])
295 | ncbi_vocab['ncbi_id'] = ncbi_vocab['ncbi_id'].astype(int).astype(str)
296 | ncbi_primary_dict = ncbi_vocab.set_index('ncbi_id')['primary'].to_dict()
297 |
298 | pwy_vocab = pd.read_csv(folder + 'pathway_vocab.csv')
299 | reactome_vocab = pwy_vocab.dropna(subset=['reactome_id'])
300 | reactome_primary_dict = reactome_vocab.set_index('reactome_id')['primary'].to_dict()
301 |
302 | homo_Reactome = homo_Reactome.replace({'Gene': ncbi_primary_dict, 'Pathway': reactome_primary_dict})
303 | print(homo_Reactome)
304 | homo_Reactome['Reaction'] = [1] * len(homo_Reactome)
305 | homo_Reactome['Source'] = ['Reactome'] * len(homo_Reactome)
306 | homo_Reactome.to_csv(folder + 'GPwy_res.csv', index=False)
307 |
308 |
309 | def integrate_GPwy_KEGG():
310 | GPwy_res = pd.read_csv(folder + 'GPwy_res.csv')
311 | GPwy_res['Associate'] = [0] * len(GPwy_res)
312 |
313 | kegg_GPwy = pd.read_csv(folder + '/kegg_gene_pathway.csv')
314 | kegg_GPwy = kegg_GPwy.rename(columns={'pathway_id': 'Pathway', 'ncbi_id': 'Gene'})
315 | kegg_GPwy = kegg_GPwy[['Gene', 'Pathway']]
316 | print(kegg_GPwy)
317 | gene_vocab = pd.read_csv(folder + 'gene_vocab.csv')
318 | ncbi_vocab = gene_vocab.dropna(subset=['ncbi_id'])
319 | ncbi_primary_dict = ncbi_vocab.set_index('ncbi_id')['primary'].to_dict()
320 |
321 | pwy_vocab = pd.read_csv(folder + 'pathway_vocab.csv')
322 | kegg_vocab = pwy_vocab.dropna(subset=['kegg_id'])
323 | kegg_primary_dict = kegg_vocab.set_index('kegg_id')['primary'].to_dict()
324 |
325 | kegg_GPwy = kegg_GPwy.replace({'Gene': ncbi_primary_dict, 'Pathway': kegg_primary_dict})
326 | print(kegg_GPwy)
327 | kegg_GPwy['Reaction'] = [0] * len(kegg_GPwy)
328 | kegg_GPwy['Source'] = ['KEGG'] * len(kegg_GPwy)
329 | kegg_GPwy['Associate'] = [1] * len(kegg_GPwy)
330 |
331 | GPwy_res = pd.concat((GPwy_res, kegg_GPwy))
332 | GPwy_res.loc[GPwy_res.duplicated(subset=['Gene', 'Pathway'], keep=False), 'Associate'] = 1
333 | GPwy_res['Source'] = np.where(GPwy_res.duplicated(subset=['Gene', 'Pathway'], keep=False),
334 | GPwy_res['Source'].astype(str) + ';KEGG', GPwy_res['Source'].astype(str) + '')
335 | GPwy_res = GPwy_res.drop_duplicates(subset=['Gene', 'Pathway'], keep='first')
336 | GPwy_res['Source'] = GPwy_res['Source'].apply(lambda x: ';'.join(sorted(set(x.split(';')))))
337 | GPwy_res_cols = list(GPwy_res.columns)
338 | GPwy_res_cols_new = GPwy_res_cols[:-2] + GPwy_res_cols[-1:] + GPwy_res_cols[-2:-1]
339 | GPwy_res = GPwy_res[GPwy_res_cols_new]
340 | GPwy_res.to_csv(folder + 'GPwy_res_2.csv', index=False)
341 |
342 |
343 | def modify_res():
344 | AG_res = pd.read_csv(folder + 'A_G_res.csv')
345 | AG_res['Source'] = AG_res['Source'].str.replace('Reactome', 'Bgee')  # the Bgee expression edges above were mislabeled as 'Reactome'
346 | print(AG_res)
347 | AG_res.to_csv(folder + 'A_G_res.csv', index=False)
348 |
349 |
350 | def main():
351 | # integrate_Hetionet_GG()
352 | # extract_PharmGKB_GG()
353 | # integrate_PharmGKB_GG()
354 | # integrate_DRKG_GG()
355 | # integrate_GA_Bgee_present()
356 | # integrate_GA_Bgee_absent()
357 | # integrate_GA_TISSUE()
358 | # integrate_GPwy_Reactome()
359 | # integrate_GPwy_KEGG()
360 |
361 | modify_res()
362 |
363 |
364 | if __name__ == '__main__':
365 | main()
366 |
--------------------------------------------------------------------------------
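The scripts here resolve source-specific identifiers through vocabulary tables (set_index(...)['primary'].to_dict()) and then call DataFrame.replace with those dicts; replace silently leaves unmapped IDs untouched, which is why some functions use explicit per-row loops to surface missing mappings as KeyError. Below is a small sketch, with hypothetical vocabulary rows, of an equivalent and typically faster vectorized lookup using Series.map:

```python
import pandas as pd

# Hypothetical vocabulary rows: each entity has a primary iBKH identifier.
gene_vocab = pd.DataFrame({'ncbi_id': ['7157', '1956'],
                           'primary': ['iBKH:G0001', 'iBKH:G0002']})
ncbi_primary_dict = gene_vocab.set_index('ncbi_id')['primary'].to_dict()

edges = pd.DataFrame({'Gene_1': ['7157', '9999'],
                      'Gene_2': ['1956', '7157']})

# Series.map is a vectorized dict lookup; fillna keeps unmapped IDs as-is,
# mirroring the pass-through behavior of DataFrame.replace in the scripts above.
for col in ['Gene_1', 'Gene_2']:
    edges[col] = edges[col].map(ncbi_primary_dict).fillna(edges[col])
print(edges)  # '9999' has no mapping and passes through unchanged
```

For million-row edge tables such as CTD_chem_gene_ixns.csv, map with a dict scales much better than the per-row dict indexing used in integrate_CTD_DG, while fillna preserves the behavior for IDs absent from the vocabulary.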
/Codes_Term Harmonization/Relation_Integration/integrate_disease_gene.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 |
4 | pd.set_option('display.max_columns', None)
5 |
6 | folder = ''
7 | CTD_folder = '../CTD/'
8 |
9 |
10 | def integrate_Hetionet():
11 | hetionet_DiG = pd.read_csv(folder + 'hetionet_DiG.csv')
12 | hetionet_DiG = hetionet_DiG.rename(columns={'source': 'Disease', 'target': 'Gene'})
13 | hetionet_DiG['Disease'] = hetionet_DiG['Disease'].str.replace('Disease::', '')
14 | hetionet_DiG['Gene'] = hetionet_DiG['Gene'].str.replace('Gene::', '')
15 |
16 | disease_vocab = pd.read_csv(folder + 'disease_vocab.csv')
17 | do_vocab = disease_vocab.dropna(subset=['do_id'])
18 | do_primary_dict = do_vocab.set_index('do_id')['primary'].to_dict()
19 |
20 | gene_vocab = pd.read_csv(folder + 'gene_vocab_2.csv')
21 | ncbi_vocab = gene_vocab.dropna(subset=['ncbi_id'])
22 | ncbi_vocab['ncbi_id'] = ncbi_vocab['ncbi_id'].astype(int).astype(str)
23 | ncbi_primary_dict = ncbi_vocab.set_index('ncbi_id')['primary'].to_dict()
24 |
25 | hetionet_dig_dag = hetionet_DiG[hetionet_DiG['metaedge'] == 'DaG']
26 | hetionet_dig_dag = hetionet_dig_dag.replace({'Disease': do_primary_dict, 'Gene': ncbi_primary_dict})
27 | hetionet_dig_dag = hetionet_dig_dag[['Disease', 'Gene']]
28 | hetionet_dig_dag['Associate_Hetionet'] = [1] * len(hetionet_dig_dag)
29 | hetionet_dig_dag['Downregulates_Hetionet'] = [0] * len(hetionet_dig_dag)
30 |
31 | hetionet_dig_ddg = hetionet_DiG[hetionet_DiG['metaedge'] == 'DdG']
32 | hetionet_dig_ddg = hetionet_dig_ddg.replace({'Disease': do_primary_dict, 'Gene': ncbi_primary_dict})
33 | hetionet_dig_ddg = hetionet_dig_ddg[['Disease', 'Gene']]
34 | hetionet_dig_ddg['Associate_Hetionet'] = [0] * len(hetionet_dig_ddg)
35 | hetionet_dig_ddg['Downregulates_Hetionet'] = [1] * len(hetionet_dig_ddg)
36 |
37 | DiG_res = pd.concat((hetionet_dig_dag, hetionet_dig_ddg))
38 | DiG_res.loc[DiG_res.duplicated(subset=['Disease', 'Gene'], keep=False), 'Downregulates_Hetionet'] = 1
39 | DiG_res = DiG_res.drop_duplicates(subset=['Disease', 'Gene'], keep='first')
40 |
41 | DiG_res['Upregulates_Hetionet'] = [0] * len(DiG_res)
42 |
43 | hetionet_dig_dug = hetionet_DiG[hetionet_DiG['metaedge'] == 'DuG']
44 | hetionet_dig_dug = hetionet_dig_dug.replace({'Disease': do_primary_dict, 'Gene': ncbi_primary_dict})
45 | hetionet_dig_dug = hetionet_dig_dug[['Disease', 'Gene']]
46 | hetionet_dig_dug['Associate_Hetionet'] = [0] * len(hetionet_dig_dug)
47 | hetionet_dig_dug['Downregulates_Hetionet'] = [0] * len(hetionet_dig_dug)
48 | hetionet_dig_dug['Upregulates_Hetionet'] = [1] * len(hetionet_dig_dug)
49 |
50 | DiG_res = pd.concat((DiG_res, hetionet_dig_dug))
51 | DiG_res.loc[DiG_res.duplicated(subset=['Disease', 'Gene'], keep=False), 'Upregulates_Hetionet'] = 1
52 | DiG_res = DiG_res.drop_duplicates(subset=['Disease', 'Gene'], keep='first')
53 |
54 | DiG_res['Source'] = ['Hetionet'] * len(DiG_res)
55 | print(DiG_res)
56 | DiG_res.to_csv(folder + 'DiG_res.csv', index=False)
57 | with open(folder + 'integration_notes.txt', 'w') as f:
58 | f.write('DiG_res: Hetionet (Associate, Downregulates and Upregulates).\n')
59 | f.close()
60 |
61 |
62 | def integrate_KEGG():
63 | DiG_res = pd.read_csv(folder + 'DiG_res.csv')
64 | DiG_res_cols = list(DiG_res.columns)[2:]
65 | DiG_res['Associate_KEGG'] = [0] * len(DiG_res)
66 |
67 | disease_vocab = pd.read_csv(folder + 'disease_vocab.csv')
68 | kegg_disease_vocab = disease_vocab.dropna(subset=['kegg_id'])
69 | kegg_disease_primary_dict = kegg_disease_vocab.set_index('kegg_id')['primary'].to_dict()
70 |
71 | gene_vocab = pd.read_csv(folder + 'gene_vocab_2.csv')
72 | ncbi_vocab = gene_vocab.dropna(subset=['ncbi_id'])
73 | ncbi_primary_dict = ncbi_vocab.set_index('ncbi_id')['primary'].to_dict()
74 |
75 | kegg_df = pd.read_csv(folder + 'kegg_disease_gene.csv')
76 | kegg_df = kegg_df.rename(columns={'disease': 'Disease', 'gene': 'Gene'})
77 | kegg_df = kegg_df.replace({'Disease': kegg_disease_primary_dict, 'Gene': ncbi_primary_dict})
78 |
79 | for col in DiG_res_cols[:-1]:
80 | kegg_df[col] = [0] * len(kegg_df)
81 | kegg_df['Source'] = ['KEGG'] * len(kegg_df)
82 | kegg_df['Associate_KEGG'] = [1] * len(kegg_df)
83 | DiG_res = pd.concat((DiG_res, kegg_df))
84 | DiG_res.loc[DiG_res.duplicated(subset=['Disease', 'Gene'], keep=False), 'Associate_KEGG'] = 1
85 | DiG_res['Source'] = np.where(DiG_res.duplicated(subset=['Disease', 'Gene'], keep=False),
86 | DiG_res['Source'].astype(str) + ';KEGG', DiG_res['Source'].astype(str) + '')
87 | DiG_res = DiG_res.drop_duplicates(subset=['Disease', 'Gene'], keep='first')
88 | DiG_res_col = list(DiG_res.columns)
89 | DiG_res_col_new = DiG_res_col[:-2] + DiG_res_col[-1:] + DiG_res_col[-2:-1]
90 | DiG_res = DiG_res[DiG_res_col_new]
91 | print(DiG_res)
92 | DiG_res.to_csv(folder + 'DiG_res_2.csv', index=False)
93 | with open(folder + 'integration_notes.txt', 'a') as f:
94 | f.write('DiG_res_2: Hetionet and KEGG (Associate).\n')
95 | f.close()
96 |
97 |
98 | def extract_PharmGKB_DiG():
99 | pharmgkb_rel = pd.read_table(folder + 'pharmgkb_rel.tsv')
100 | pharmgkb_rel = pharmgkb_rel[pharmgkb_rel['Association'] == 'associated']
101 | pharmgkb_rel = pharmgkb_rel.reset_index(drop=True)
102 | res = pd.DataFrame(columns=['Disease', 'Gene'])
103 | idx = 0
104 | for i in range(len(pharmgkb_rel)):
105 | p1_id = pharmgkb_rel.loc[i, 'Entity1_id']
106 | p1_type = pharmgkb_rel.loc[i, 'Entity1_type']
107 | p2_id = pharmgkb_rel.loc[i, 'Entity2_id']
108 | p2_type = pharmgkb_rel.loc[i, 'Entity2_type']
109 | if p1_type == 'Disease' and p2_type == 'Gene':
110 | disease = p1_id
111 | gene = p2_id
112 | elif p2_type == 'Disease' and p1_type == 'Gene':
113 | disease = p2_id
114 | gene = p1_id
115 | else:
116 | continue
117 | res.loc[idx] = [disease, gene]
118 | idx += 1
119 | res.to_csv(folder + 'pharmgkb_disease_gene.csv', index=False)
120 |
121 |
122 | def integrate_PharmGKB():
123 | DiG_res = pd.read_csv(folder + 'DiG_res_2.csv')
124 | DiG_res_cols = list(DiG_res.columns)[2:]
125 | DiG_res['Associate_PharmGKB'] = [0] * len(DiG_res)
126 |
127 | pharmgkb_res = pd.read_csv(folder + 'pharmgkb_disease_gene.csv')
128 |
129 | gene_vocab = pd.read_csv(folder + 'gene_vocab_2.csv')
130 | pharmgkb_gene_vocab = gene_vocab.dropna(subset=['pharmgkb_id'])
131 | pharmgkb_gene_primary_dict = pharmgkb_gene_vocab.set_index('pharmgkb_id')['primary'].to_dict()
132 |
133 | disease_vocab = pd.read_csv(folder + 'disease_vocab.csv')
134 | pharmgkb_disease_vocab = disease_vocab.dropna(subset=['pharmgkb_id'])
135 | pharmgkb_disease_primary_dict = pharmgkb_disease_vocab.set_index('pharmgkb_id')['primary'].to_dict()
136 |
137 | pharmgkb_res = pharmgkb_res.replace({'Disease': pharmgkb_disease_primary_dict, 'Gene': pharmgkb_gene_primary_dict})
138 | for col in DiG_res_cols[:-1]:
139 | pharmgkb_res[col] = [0] * len(pharmgkb_res)
140 | pharmgkb_res['Source'] = ['PharmGKB'] * len(pharmgkb_res)
141 | pharmgkb_res['Associate_PharmGKB'] = [1] * len(pharmgkb_res)
142 | DiG_res = pd.concat((DiG_res, pharmgkb_res))
143 | DiG_res.loc[DiG_res.duplicated(subset=['Disease', 'Gene'], keep=False), 'Associate_PharmGKB'] = 1
144 | DiG_res['Source'] = np.where(DiG_res.duplicated(subset=['Disease', 'Gene'], keep=False),
145 | DiG_res['Source'].astype(str) + ';PharmGKB', DiG_res['Source'].astype(str) + '')
146 | DiG_res = DiG_res.drop_duplicates(subset=['Disease', 'Gene'], keep='first')
147 | DiG_res_col = list(DiG_res.columns)
148 | DiG_res_col_new = DiG_res_col[:-2] + DiG_res_col[-1:] + DiG_res_col[-2:-1]
149 | DiG_res = DiG_res[DiG_res_col_new]
150 | DiG_res['Source'] = DiG_res['Source'].apply(lambda x: ';'.join(sorted(set(x.split(';')))))
151 | DiG_res.to_csv(folder + 'DiG_res_3.csv', index=False)
152 | with open(folder + 'integration_notes.txt', 'a') as f:
153 | f.write('DiG_res_3: Hetionet, KEGG and PharmGKB (Associate).\n')
154 | f.close()
155 |
156 |
157 | def integrate_CTD_DiG_curated():
158 | disease_gene = pd.read_csv(CTD_folder + 'CTD_genes_diseases.csv', header=27)
159 | disease_gene = disease_gene.dropna(subset=['GeneID', 'DiseaseID'])
160 | disease_gene = disease_gene.drop_duplicates(subset=['GeneID', 'DiseaseID'])
161 | disease_gene = disease_gene.reset_index(drop=True)
162 | disease_gene = disease_gene.rename(columns={'DiseaseID': 'Disease', 'GeneID': 'Gene'})
163 | disease_gene_curated = disease_gene[pd.isnull(disease_gene['InferenceScore'])]  # curated associations carry no InferenceScore
164 |
165 | disease_gene_curated = disease_gene_curated[['Disease', 'Gene']]
166 | disease_gene_curated = disease_gene_curated.reset_index(drop=True)
167 |
168 | disease_vocab = pd.read_csv(folder + 'disease_vocab.csv')
169 | mesh_disease_vocab = disease_vocab.dropna(subset=['mesh_id'])
170 | mesh_disease_primary_dict = mesh_disease_vocab.set_index('mesh_id')['primary'].to_dict()
171 | omim_vocab = disease_vocab.dropna(subset=['omim_id'])
172 | omim_vocab['omim_id'] = omim_vocab['omim_id'].astype(int).astype(str)
173 | omim_primary_dict = omim_vocab.set_index('omim_id')['primary'].to_dict()
174 |
175 | gene_vocab = pd.read_csv(folder + 'gene_vocab_2.csv')
176 | ncbi_vocab = gene_vocab.dropna(subset=['ncbi_id'])
177 | ncbi_primary_dict = ncbi_vocab.set_index('ncbi_id')['primary'].to_dict()
178 |
179 | DiG_res = pd.read_csv(folder + 'DiG_res_3.csv')
180 | DiG_res_col = list(DiG_res.columns)[2:]
181 | DiG_res['Associate_CTD'] = [0] * len(DiG_res)
182 | print(disease_gene_curated)
183 | disease_list = []
184 | gene_list = []
185 | for i in range(len(disease_gene_curated)):
186 | disease_id = disease_gene_curated.loc[i, 'Disease']
187 | gene_id = disease_gene_curated.loc[i, 'Gene']
188 |
189 | gene_list.append(ncbi_primary_dict[gene_id])
190 | if 'MESH' in disease_id:
191 | disease_id = disease_id.replace('MESH:', '')
192 | disease_list.append(mesh_disease_primary_dict[disease_id])
193 | else:
194 | disease_id = disease_id.replace('OMIM:', '')
195 | disease_list.append(omim_primary_dict[disease_id])
196 | print(i + 1, '/', len(disease_gene_curated), 'Completed...')
197 | disease_gene_curated = pd.DataFrame({'Disease': disease_list, 'Gene': gene_list})
198 | print(disease_gene_curated)
199 |
200 | for col in DiG_res_col[:-1]:
201 | disease_gene_curated[col] = [0] * len(disease_gene_curated)
202 | disease_gene_curated['Source'] = ['CTD'] * len(disease_gene_curated)
203 | disease_gene_curated['Associate_CTD'] = [1] * len(disease_gene_curated)
204 | DiG_res = pd.concat((DiG_res, disease_gene_curated))
205 | DiG_res.loc[DiG_res.duplicated(subset=['Disease', 'Gene'], keep=False), 'Associate_CTD'] = 1
206 | DiG_res['Source'] = np.where(DiG_res.duplicated(subset=['Disease', 'Gene'], keep=False),
207 | DiG_res['Source'].astype(str) + ';CTD', DiG_res['Source'].astype(str) + '')
208 | DiG_res = DiG_res.drop_duplicates(subset=['Disease', 'Gene'], keep='first')
209 | DiG_res_col = list(DiG_res.columns)
210 | DiG_res_col_new = DiG_res_col[:-2] + DiG_res_col[-1:] + DiG_res_col[-2:-1]
211 | DiG_res = DiG_res[DiG_res_col_new]
212 | DiG_res['Source'] = DiG_res['Source'].apply(lambda x: ';'.join(sorted(set(x.split(';')))))
213 | DiG_res.to_csv(folder + 'DiG_res_4.csv', index=False)
214 | with open(folder + 'integration_notes.txt', 'a') as f:
215 | f.write('DiG_res_4: Hetionet, KEGG, PharmGKB and CTD_curated (Associate).\n')
216 | f.close()
217 |
218 |
219 | def integrate_CTD_DiG_inferred():
220 | DiG_res = pd.read_csv(folder + 'DiG_res_4.csv')
221 | DiG_res_col = list(DiG_res.columns)[2:]
222 | DiG_res['Inferred_Relation'] = [0] * len(DiG_res)
223 | DiG_res['Inference_Score'] = [''] * len(DiG_res)
224 |
225 | disease_gene_inferred = pd.read_csv(folder + 'CTD_disease_gene_inferred.csv')
226 |
227 | for col in DiG_res_col[:-1]:
228 | disease_gene_inferred[col] = [0] * len(disease_gene_inferred)
229 | disease_gene_inferred['Source'] = ['CTD'] * len(disease_gene_inferred)
230 | disease_gene_inferred['Inferred_Relation'] = [1] * len(disease_gene_inferred)
231 | temp_col = list(disease_gene_inferred.columns)
232 | disease_gene_inferred_col = temp_col[:2] + temp_col[3:] + temp_col[2:3]
233 | disease_gene_inferred = disease_gene_inferred[disease_gene_inferred_col]
234 | print(list(disease_gene_inferred.columns))
235 | DiG_res = pd.concat((DiG_res, disease_gene_inferred))
236 | DiG_res.loc[DiG_res.duplicated(subset=['Disease', 'Gene'], keep=False), 'Inferred_Relation'] = 1
237 | DiG_res['Source'] = np.where(DiG_res.duplicated(subset=['Disease', 'Gene'], keep=False),
238 | DiG_res['Source'].astype(str) + ';CTD', DiG_res['Source'].astype(str) + '')
239 | DiG_res = DiG_res.drop_duplicates(subset=['Disease', 'Gene'], keep='first')
240 | DiG_res_col = list(DiG_res.columns)
241 | DiG_res_col_new = DiG_res_col[:-3] + DiG_res_col[-2:-1] + DiG_res_col[-3:-2] + DiG_res_col[-1:]
242 | DiG_res = DiG_res[DiG_res_col_new]
243 | DiG_res['Source'] = DiG_res['Source'].apply(lambda x: ';'.join(sorted(set(x.split(';')))))
244 | DiG_res.to_csv(folder + 'DiG_res_5.csv', index=False)
245 | with open(folder + 'integration_notes.txt', 'a') as f:
246 | f.write('DiG_res_5: Hetionet, KEGG, PharmGKB, CTD_curated and CTD (Inferred_Relation).\n')
247 | f.close()
248 |
249 |
250 | def integrate_DRKG_DiG():
251 | DiG_res = pd.read_csv(folder + 'DiG_res_5.csv')
252 | DiG_res_col = list(DiG_res.columns)[2:]
253 |
254 | drkg_DiG = pd.read_csv('drkg_DiG.csv')
255 | # drkg_DDi = pd.read_csv('/drkg_DDi.csv')
256 | drkg_DiG = drkg_DiG.rename(columns={'entity_1': 'Disease', 'entity_2': 'Gene'})
257 | drkg_DiG['Disease'] = drkg_DiG['Disease'].str.replace('Disease::', '')
258 | drkg_DiG['Gene'] = drkg_DiG['Gene'].str.replace('Gene::', '')
259 | dig_relation_list = list(drkg_DiG.drop_duplicates(subset='relation', keep='first')['relation'])
260 | # dig_source_list = list(drkg_DiG.drop_duplicates(subset='source', keep='first')['source'])
261 | # print(dig_relation_list)
262 | # print(dig_source_list)
263 | # print(drkg_DiG.drop_duplicates(subset='relation', keep='first'))
264 |
265 | disease_vocab = pd.read_csv(folder + 'disease_vocab.csv')
266 | mesh_disease_vocab = disease_vocab.dropna(subset=['mesh_id'])
267 | mesh_disease_primary_dict = mesh_disease_vocab.set_index('mesh_id')['primary'].to_dict()
268 | omim_vocab = disease_vocab.dropna(subset=['omim_id'])
269 | omim_vocab['omim_id'] = omim_vocab['omim_id'].astype(int).astype(str)
270 | omim_primary_dict = omim_vocab.set_index('omim_id')['primary'].to_dict()
271 |
272 | gene_vocab = pd.read_csv(folder + 'gene_vocab_2.csv')
273 | ncbi_vocab = gene_vocab.dropna(subset=['ncbi_id'])
274 | ncbi_vocab['ncbi_id'] = ncbi_vocab['ncbi_id'].astype(int).astype(str)
275 | ncbi_primary_dict = ncbi_vocab.set_index('ncbi_id')['primary'].to_dict()
276 |
277 | for drkg_rel in dig_relation_list:
278 | print(drkg_rel)
279 | DiG_res[drkg_rel] = [0] * len(DiG_res)
280 | drkg_DiG_temp = drkg_DiG[drkg_DiG['relation'] == drkg_rel]
281 | drkg_DiG_temp = drkg_DiG_temp[['Disease', 'Gene']]
282 | drkg_DiG_temp = drkg_DiG_temp.reset_index(drop=True)
283 |
284 | disease_list = []
285 | gene_list = []
286 | for i in range(len(drkg_DiG_temp)):
287 | disease_id = drkg_DiG_temp.loc[i, 'Disease']
288 | gene_id = drkg_DiG_temp.loc[i, 'Gene']
289 |
290 | if gene_id in ncbi_primary_dict:
291 | gene_list.append(ncbi_primary_dict[gene_id])
292 | else:
293 | continue
294 |
295 | if 'MESH' in disease_id:
296 | disease_id = disease_id.replace('MESH:', '')
297 | disease_list.append(mesh_disease_primary_dict[disease_id])
298 | else:
299 | disease_id = disease_id.replace('OMIM:', '')
300 | disease_list.append(omim_primary_dict[disease_id])
301 |
302 | print(i + 1, '/', len(drkg_DiG_temp), 'Completed...')
303 |
304 | drkg_DiG_temp_primary = pd.DataFrame({'Disease': disease_list, 'Gene': gene_list})
305 |
306 | for col in DiG_res_col[:-2]:
307 | drkg_DiG_temp_primary[col] = [0] * len(drkg_DiG_temp_primary)
308 | drkg_DiG_temp_primary['Source'] = ['DRKG'] * len(drkg_DiG_temp_primary)
309 | drkg_DiG_temp_primary['Inference_Score'] = [''] * len(drkg_DiG_temp_primary)
310 | drkg_DiG_temp_primary[drkg_rel] = [1] * len(drkg_DiG_temp_primary)
311 | DiG_res = pd.concat((DiG_res, drkg_DiG_temp_primary))
312 | DiG_res.loc[DiG_res.duplicated(subset=['Disease', 'Gene'], keep=False), drkg_rel] = 1
313 | DiG_res['Source'] = np.where(DiG_res.duplicated(subset=['Disease', 'Gene'], keep=False),
314 | DiG_res['Source'].astype(str) + ';DRKG', DiG_res['Source'].astype(str) + '')
315 | DiG_res = DiG_res.drop_duplicates(subset=['Disease', 'Gene'], keep='first')
316 | DiG_res_col = list(DiG_res.columns)
317 | DiG_res_col_new = DiG_res_col[:-3] + DiG_res_col[-1:] + DiG_res_col[-3:-1]
318 | DiG_res = DiG_res[DiG_res_col_new]
319 | DiG_res_col = DiG_res_col_new[2:]
320 | DiG_res['Source'] = DiG_res['Source'].apply(lambda x: ';'.join(sorted(set(x.split(';')))))
321 |
322 | DiG_res.to_csv(folder + 'DiG_res_6.csv', index=False)
323 | with open(folder + 'integration_notes.txt', 'a') as f:
324 | f.write('DiG_res_6: Hetionet, KEGG, PharmGKB, CTD and DRKG (Semantic Relations).\n')
325 | f.close()
326 |
327 |
328 | def main():
329 | integrate_Hetionet()
330 | integrate_KEGG()
331 | # extract_PharmGKB_DiG()
332 | integrate_PharmGKB()
333 | integrate_CTD_DiG_curated()
334 | integrate_CTD_DiG_inferred()
335 | integrate_DRKG_DiG()
336 |
337 | # DiG = pd.read_csv(res_folder + 'relation/Di_G_res_6.csv')
338 | # print(len(DiG), len(DiG.drop_duplicates(subset=['Disease', 'Gene'], keep='first')))
339 | # DiG_raw = pd.read_csv(res_folder + 'relation/Di_G_res.csv')
340 | # print(len(DiG_raw), len(DiG_raw.drop_duplicates(subset=['Disease', 'Gene'], keep='first')))
341 |
342 |
343 | if __name__ == '__main__':
344 | main()
345 |
--------------------------------------------------------------------------------
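integrate_CTD_DiG_curated and integrate_DRKG_DiG above dispatch on the ID prefix ('MESH:' vs. 'OMIM:') and index the primary-ID dicts directly, so any disease ID missing from the vocabulary raises a KeyError mid-loop. Below is a defensive sketch of that prefix dispatch; the lookup tables are toy placeholders, not the real vocabularies.

```python
# Toy stand-ins for mesh_disease_primary_dict / omim_primary_dict.
mesh_primary = {'D010300': 'iBKH:Di0001'}   # MeSH D010300 is Parkinson disease
omim_primary = {'168600': 'iBKH:Di0002'}


def resolve_disease(raw_id):
    """Map a prefixed CTD/DRKG disease ID to a primary iBKH ID, or None."""
    if raw_id.startswith('MESH:'):
        return mesh_primary.get(raw_id[len('MESH:'):])
    if raw_id.startswith('OMIM:'):
        return omim_primary.get(raw_id[len('OMIM:'):])
    return None


for rid in ['MESH:D010300', 'OMIM:168600', 'OMIM:000000']:
    print(rid, '->', resolve_disease(rid))  # unmapped IDs yield None, not a crash
```

Returning None for unmapped IDs lets the caller decide whether to skip or log the row instead of failing partway through the integration.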
/iBKH-KD-protocol/Knowledge_Discovery_Pipeline.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# iBKH-based Knowledge Discovery Pipeline"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "This is the implementation of Knowledge Discovery pipeline in our iBKH portal at http://ibkh.ai/.\n",
15 | "\n",
16 | "Given a target entity of interest, the task is to discover the Top-N entities from different entity types (currently supporting gene, drug, symptom, and pathway entities) that potentially link to the target entity. \n",
17 | "\n",
18 | "\n",
19 | "Generally, the pipeline contains 3 steps, including: \n",
20 | "1. Data preparation (triplets generation); \n",
21 | "\n",
22 | "2. Knowledge graph embedding learning; \n",
23 | "\n",
24 | "3. Knowledge discovery based on link prediction – predicting drug entities that potentially link to AD. "
25 | ]
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "metadata": {},
30 | "source": [
31 | "### Step 1 – Data preparation (triplets generation)"
32 | ]
33 | },
34 | {
35 | "cell_type": "markdown",
36 | "metadata": {},
37 | "source": [
38 | "###### Collecting iBKH knowledge graph source data\n",
39 | "\n",
40 | "Download the latest version of iBKH knowledge graph data (entities and relations) at: https://github.com/wcm-wanglab/iBKH/tree/main/iBKH\n",
41 | "\n",
42 | "\n",
43 | "Please make sure putting the downloaded files following the structure below.\n",
44 | "\n",
45 | "```\n",
46 | ".\n",
47 | "├── Case Study-AD Drug Repurposing.ipynb\n",
48 | "├── Data\n",
49 | "│ ├── iBKH \n",
50 | "│ │ ├── Entity\n",
51 | "│ │ │ ├── anatomy_vocab.csv\n",
52 | "│ │ │ ├── disease_vocab.csv\n",
53 | "│ │ │ ├── drug_vocab.csv\n",
54 | "│ │ │ ├── dsp_vocab.csv\n",
55 | "│ │ │ ├── gene_vocab.csv\n",
56 | "│ │ │ ├── molecule_vocab.csv\n",
57 | "│ │ │ ├── pathway_vocab.csv\n",
58 | "│ │ │ ├── sdsi_vocab.csv\n",
59 | "│ │ │ ├── side_effect_vocab.csv\n",
60 | "│ │ │ ├── symptom_vocab.csv\n",
61 | "│ │ │ ├── tc_vocab.csv\n",
62 | "│ │ │ ├── ...\n",
63 | "│ │ │ │ \n",
64 | "│ │ ├── Relation\n",
65 | "│ │ │ ├── A_G_res.csv\n",
66 | "│ │ │ ├── D_D_res.csv\n",
67 | "│ │ │ ├── D_Di_res.csv\n",
68 | "│ │ │ ├── D_G_res.csv\n",
69 | "│ │ │ ├── D_Pwy_res.csv\n",
70 | "│ │ │ ├── D_SE_res.csv\n",
71 | "│ │ │ ├── Di_Di_res.csv\n",
72 | "│ │ │ ├── Di_G_res.csv\n",
73 | "│ │ │ ├── Di_Pwy_res.csv\n",
74 | "│ │ │ ├── Di_Sy_res.csv\n",
75 | "│ │ │ ├── DSP_SDSI_res.csv\n",
76 | "│ │ │ ├── G_G_res.csv\n",
77 | "│ │ │ ├── G_Pwy_res.csv\n",
78 | "│ │ │ ├── SDSI_A_res.csv\n",
79 | "│ │ │ ├── SDSI_D_res.csv\n",
80 | "│ │ │ ├── SDSI_Di_res.csv\n",
81 | "│ │ │ ├── SDSI_Sy.csv\n",
82 | "│ │ │ ├── SDSI_TC_res.csv\n",
83 | "│ │ │ ├── ...\n",
84 | "│ │ │ └── \n",
85 | "│ │ └── \n",
86 | "│ └── ...\n",
87 | "└── ...\n",
88 | "```"
89 | ]
90 | },
91 | {
92 | "cell_type": "code",
93 | "execution_count": null,
94 | "metadata": {},
95 | "outputs": [],
96 | "source": [
97 | "# import required packages\n",
98 | "\n",
99 | "import warnings\n",
100 | "warnings.filterwarnings(\"ignore\")\n",
101 | "\n",
102 | "import pandas as pd\n",
103 | "import numpy as np\n",
104 | "import pickle\n",
105 | "\n",
106 | "import torch as th\n",
107 | "import torch.nn.functional as fn\n",
108 | "\n",
109 | "from sklearn.preprocessing import MinMaxScaler\n",
110 | "\n",
111 | "import os\n",
112 | "\n",
113 | "import sys\n",
114 | "sys.path.append('.') # Use only with Jupyter Notebook\n",
115 | "\n",
116 | "import funcs.KG_processing as KG_processing"
117 | ]
118 | },
119 | {
120 | "cell_type": "markdown",
121 | "metadata": {},
122 | "source": [
123 | "### Step 1: Generate Triplet Set from iBKH "
124 | ]
125 | },
126 | {
127 | "cell_type": "markdown",
128 | "metadata": {},
129 | "source": [
130 | "A triplet, i.e., (h, r, t), is the basic unit for a knowledge graph. We generate triplet set from iBKH, which will be used for knowledge graph embedding learning."
131 | ]
132 | },
133 | {
134 | "cell_type": "code",
135 | "execution_count": null,
136 | "metadata": {},
137 | "outputs": [],
138 | "source": [
139 | "kg_folder = 'data/iBKH/' # The folder is used to store the iBKH-KG data\n",
140 | "triplet_path = 'data/triplets/' # The folder is used to store processed results\n",
141 | "if not os.path.exists(triplet_path):\n",
142 | " os.makedirs(triplet_path) \n",
143 | "output_path = 'data/dataset/' # Output folder\n",
144 | "if not os.path.exists(output_path):\n",
145 | " os.makedirs(output_path)"
146 | ]
147 | },
148 | {
149 | "cell_type": "markdown",
150 | "metadata": {},
151 | "source": [
152 | "Generating triplets for different entity type pairs."
153 | ]
154 | },
155 | {
156 | "cell_type": "code",
157 | "execution_count": null,
158 | "metadata": {},
159 | "outputs": [],
160 | "source": [
161 | "KG_processing.DDi_triplets(kg_folder, triplet_path)\n",
162 | "KG_processing.DG_triplets(kg_folder, triplet_path)\n",
163 | "KG_processing.DPwy_triplets(kg_folder, triplet_path)\n",
164 | "KG_processing.DSE_triplets(kg_folder, triplet_path)\n",
165 | "KG_processing.DiDi_triplets(kg_folder, triplet_path)\n",
166 | "KG_processing.DiG_triplets(kg_folder, triplet_path)\n",
167 | "KG_processing.DiPwy_triplets(kg_folder, triplet_path)\n",
168 | "KG_processing.DiSy_triplets(kg_folder, triplet_path)\n",
169 | "KG_processing.GG_triplets(kg_folder, triplet_path)\n",
170 | "KG_processing.GPwy_triplets(kg_folder, triplet_path)\n",
171 | "KG_processing.DD_triplets(kg_folder, triplet_path)"
172 | ]
173 | },
174 | {
175 | "cell_type": "markdown",
176 | "metadata": {},
177 | "source": [
178 | "Combine all the triplets set extracted from the relation results among the entities, then convert the triplet set from .csv format to the .tsv format based on the DGL input requirement."
179 | ]
180 | },
181 | {
182 | "cell_type": "code",
183 | "execution_count": null,
184 | "metadata": {},
185 | "outputs": [],
186 | "source": [
187 | "# Specifying triplet type you want to use.\n",
188 | "included_pair_type = ['DDi', 'DiG', 'DG', 'GG', 'DD', 'DiDi',\n",
189 | " 'GPwy', 'DiPwy', 'DPwy', 'DiSy', 'DSE']\n",
190 | "\n",
191 | "# Running below script will return a csv file, which combines all triplets extracted from the above functions.\n",
192 | "KG_processing.generate_triplet_set(triplet_path=triplet_path) "
193 | ]
194 | },
195 | {
196 | "cell_type": "code",
197 | "execution_count": null,
198 | "metadata": {},
199 | "outputs": [],
200 | "source": [
201 | "# Split the data into training, validation, and testing sets.\n",
202 | "# And convert data to TSV files following DGK-KE requirements.\n",
203 | "KG_processing.generate_DGL_data_set(triplet_path=triplet_path, \n",
204 | " output_path=output_path, \n",
205 | " train_val_test_ratio=[.9, .05, .05])"
206 | ]
207 | },
208 | {
209 | "cell_type": "markdown",
210 | "metadata": {},
211 | "source": [
212 | "### Step 2: Knowledge graph embedding"
213 | ]
214 | },
215 | {
216 | "cell_type": "markdown",
217 | "metadata": {},
218 | "source": [
219 | "We invoke the command line toolkit provided by DGL-KE to learn the embedding of entities and relations in iBKH. Here, we use four different models to learn the entity and edge representations of iBKH, namely TransE, TransR, DistMult, and ComplEx. To use other KGE model or AWS instances please refer to DGL-KE’s Document.\n"
220 | ]
221 | },
222 | {
223 | "cell_type": "markdown",
224 | "metadata": {},
225 | "source": [
226 | "Open command line (Windows OS and UNIX OS) or terminal (MAC OS) and change directory as "
227 | ]
228 | },
229 | {
230 | "cell_type": "code",
231 | "execution_count": null,
232 | "metadata": {},
233 | "outputs": [],
234 | "source": [
235 | "cd [your file path]/iBKH-KD-protocol"
236 | ]
237 | },
238 | {
239 | "cell_type": "markdown",
240 | "metadata": {},
241 | "source": [
242 | "Train and evaluate the knowledge graph embedding model by running the command below."
243 | ]
244 | },
245 | {
246 | "cell_type": "code",
247 | "execution_count": null,
248 | "metadata": {},
249 | "outputs": [],
250 | "source": [
251 | "DGLBACKEND=pytorch \\\n",
252 | "dglke_train --dataset iBKH --data_path ./data/dataset \\\n",
253 | " --data_files training_triplets.tsv \\\n",
254 | " validation_triplets.tsv \\\n",
255 | " testing_triplets.tsv \\\n",
256 | " --format raw_udd_hrt --model_name [model name] \\\n",
257 | " --batch_size [batch size] --hidden_dim [hidden dim] \\\n",
258 | " --neg_sample_size [neg sample size] --gamma [gamma] \\\n",
259 | " --lr [learning rate] --max_step [max step] \\\n",
260 | " --log_interval [log interval] \\\n",
261 | " --batch_size_eval [batch size eval] \\\n",
262 | " -adv --regularization_coef [regularization coef] \\\n",
263 | " --num_thread [num thread] --num_proc [num proc] \\\n",
264 | " --neg_sample_size_eval [neg sample size eval] \\\n",
265 | " --save_path ./data/embeddings --test"
266 | ]
267 | },
268 | {
269 | "cell_type": "markdown",
270 | "metadata": {},
271 | "source": [
272 | "Running above command will train the specific knowledge graph embedding model in the training dataset and evaluate the model performance in link prediction task in the testing set. This will result in multiple metrics including: Hit@k (the average number of times the positive triplet is among the k highest ranked triplets); Mean Rank (MR, the average rank of the positive triplets); Mean Reciprocal Rank (MRR, the average reciprocal rank of the positive instances). Higher values of Hit@k and MRR and a lower value of MR indicate good performance, and vice versa.\n",
273 | "\n",
274 | "\n",
275 | "Of note, the user can use above command to find optimal hyperparameters of the model. For simplicity, the user can also use our suggested hyperparameters as below.\n",
276 | "\n",
277 | "```\n",
278 | "Arguments \t TransE\t TransR\t ComplEx\t DistMult\n",
279 | "--model_name\t TransE_l2\t TransR\t ComplEx\t DistMult\n",
280 | "--batch_size\t 1024\t 1024\t 1024\t 1024\n",
281 | "--batch_size_eval\t 1000\t 1000\t 1000\t 1000\n",
282 | "--neg_sample_size\t 256\t 256\t 256\t 256\n",
283 | "--neg_sample_size_eval\t1000\t 1000\t 1000\t 1000\n",
284 | "--hidden_dim\t 400\t 200\t 200\t 400\n",
285 | "--gamma\t 12.0\t 12.0\t 12.0\t 12.0\n",
286 | "--lr\t 0.1\t 0.005\t 0.005\t 0.005\n",
287 | "--max_step\t 10000\t 10000\t 10000\t 10000\n",
288 | "--log_interval \t100\t 100\t 100\t 100\n",
289 | "--regularization_coef\t1.00E-09\t 1.00E-07\t 1.00E-07\t 1.00E-07\n",
290 | "\n",
291 | "```"
292 | ]
293 | },
294 | {
295 | "cell_type": "markdown",
296 | "metadata": {},
297 | "source": [
298 | "After determining hyperparameters that can lead to desirable performance, we then re-train the model using the whole dataset by running"
299 | ]
300 | },
301 | {
302 | "cell_type": "code",
303 | "execution_count": null,
304 | "metadata": {},
305 | "outputs": [],
306 | "source": [
307 | "DGLBACKEND=pytorch \\\n",
308 | "dglke_train --dataset iBKH --data_path ./data/dataset \\\n",
309 | " --data_files whole_triplets.tsv \\\n",
310 | " --format raw_udd_hrt --model_name [model name] \\\n",
311 | " --batch_size [batch size] --hidden_dim [hidden dim] \\\n",
312 | " --neg_sample_size [neg sample size] --gamma [gamma] \\\n",
313 | " --lr [learning rate] --max_step [max step] \\\n",
314 | " --log_interval [log interval] \\\n",
315 | " -adv --regularization_coef [regularization coef] \\\n",
316 | " --num_thread [num thread] --num_proc [num proc] \\\n",
317 | " --save_path ./data/embeddings"
318 | ]
319 | },
320 | {
321 | "cell_type": "markdown",
322 | "metadata": {},
323 | "source": [
324 | "This will generate two output files for each model: “iBKH_[model name]\\_entity.npy”, containing the low dimension embeddings of entities in iBKH and “iBKH_[model name]\\_relation.npy”, containing the low dimension embeddings of relations in iBKH. These embeddings can be used in downstream knowledge discovery tasks."
325 | ]
326 | },
327 | {
328 | "cell_type": "markdown",
329 | "metadata": {},
330 | "source": [
331 | "### Step 3: Knowledge Discovery Based on iBKH - Hypothesis Generation"
332 | ]
333 | },
334 | {
335 | "cell_type": "markdown",
336 | "metadata": {},
337 | "source": [
338 | "This step conducts knowledge discovery based on iBKH. \n",
339 | "\n",
340 | "We showcases an example -- drug repurposing hypothesis generation for Parkinson's disease."
341 | ]
342 | },
343 | {
344 | "cell_type": "code",
345 | "execution_count": null,
346 | "metadata": {},
347 | "outputs": [],
348 | "source": [
349 | "from funcs.KG_link_pred import generate_hypothesis,\\\n",
350 | " generate_hypothesis_ensemble_model"
351 | ]
352 | },
353 | {
354 | "cell_type": "code",
355 | "execution_count": null,
356 | "metadata": {},
357 | "outputs": [],
358 | "source": [
359 | "PD = [\"parkinson's disease\", \"late onset parkinson's disease\"]"
360 | ]
361 | },
362 | {
363 | "cell_type": "code",
364 | "execution_count": null,
365 | "metadata": {},
366 | "outputs": [],
367 | "source": [
368 | "r_type = [\"Treats_DDi\", \"Palliates_DDi\"]"
369 | ]
370 | },
371 | {
372 | "cell_type": "markdown",
373 | "metadata": {},
374 | "source": [
375 | "###### Drug repurposing hypothesis generation based on graph embedding using the TransE model."
376 | ]
377 | },
378 | {
379 | "cell_type": "code",
380 | "execution_count": null,
381 | "metadata": {},
382 | "outputs": [],
383 | "source": [
384 | "proposed_df = generate_hypothesis(target_entity=PD, candidate_entity_type='drug',\n",
385 | " relation_type=r_type, embedding_folder='data/embeddings',\n",
386 | " method='transE_l2', kg_folder = 'data/iBKH', \n",
387 | " triplet_folder = 'data/triplets', topK=100, \n",
388 | " save_path='output', save=True,\n",
389 | " without_any_rel=False)"
390 | ]
391 | },
392 | {
393 | "cell_type": "markdown",
394 | "metadata": {},
395 | "source": [
396 | "This will result in an output CSV file stored in the \"output\" folder."
397 | ]
398 | },
399 | {
400 | "cell_type": "code",
401 | "execution_count": null,
402 | "metadata": {},
403 | "outputs": [],
404 | "source": [
405 | "# print the predicted drugs.\n",
406 | "\n",
407 | "proposed_df"
408 | ]
409 | },
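410 | {
411 | "cell_type": "markdown",
412 | "metadata": {},
413 | "source": [
414 | "The saved results can also be reloaded later with pandas. This is a minimal sketch; the file name is an assumption based on the settings above (topK=100, method='transE_l2'), so check the \"output\" folder for the exact name."
415 | ]
416 | },
417 | {
418 | "cell_type": "code",
419 | "execution_count": null,
420 | "metadata": {},
421 | "outputs": [],
422 | "source": [
423 | "import pandas as pd\n",
424 | "\n",
425 | "# Hypothetical file name; check the \"output\" folder for the exact name.\n",
426 | "saved_df = pd.read_csv('output/prediction_drug_top100_transE_l2.csv')\n",
427 | "saved_df.head(10)"
428 | ]
429 | },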
410 | {
411 | "cell_type": "markdown",
412 | "metadata": {},
413 | "source": [
414 | "We provide an ensemble model that integrates TransE, TransR, complEx, and DistMult to generate hypotheses."
415 | ]
416 | },
417 | {
418 | "cell_type": "code",
419 | "execution_count": null,
420 | "metadata": {},
421 | "outputs": [],
422 | "source": [
423 | "ensemble_proposed_df = generate_hypothesis_ensemble_model(target_entity=PD, candidate_entity_type='drug',\n",
424 | " relation_type=r_type, \n",
425 | " embedding_folder='data/embeddings',\n",
426 | " kg_folder = 'data/iBKH', \n",
427 | " triplet_folder = 'data/triplets',\n",
428 | " topK=100, save_path='output', save=True, \n",
429 | " without_any_rel=False)"
430 | ]
431 | },
432 | {
433 | "cell_type": "code",
434 | "execution_count": null,
435 | "metadata": {},
436 | "outputs": [],
437 | "source": [
438 | "# print the predicted drugs using ensemble method\n",
439 | "ensemble_proposed_df"
440 | ]
441 | },
442 | {
443 | "cell_type": "markdown",
444 | "metadata": {},
445 | "source": [
446 | "###### Interpreting prediction results in knowledge graph."
447 | ]
448 | },
449 | {
450 | "cell_type": "markdown",
451 | "metadata": {},
452 | "source": [
453 | "Finally, we interpret predicted repurposing drug candidates using knowledge graph. We can extract intermediate entities that construct the shortest paths linking the target entity (i.e., Parkinson's disease) and the predicted drug candidates."
454 | ]
455 | },
456 | {
457 | "cell_type": "markdown",
458 | "metadata": {},
459 | "source": [
460 | "1. To achive this goal, we first deploy the iBKH knoweldge graph using Neo4j with an AWS server. Please refer the following instruction to set up the knoweldge graph: https://docs.google.com/document/d/1cLDPLp_nVCJ5xrDlJ-B-Q3wf24tb-Dyq55nAXxaNgTM/edit"
461 | ]
462 | },
463 | {
464 | "cell_type": "markdown",
465 | "metadata": {},
466 | "source": [
467 | "2. Interpreting repurposing drug candidates."
468 | ]
469 | },
470 | {
471 | "cell_type": "code",
472 | "execution_count": null,
473 | "metadata": {},
474 | "outputs": [],
475 | "source": [
476 | "import funcs.knowledge_visualization as knowledge_visualization"
477 | ]
478 | },
479 | {
480 | "cell_type": "code",
481 | "execution_count": null,
482 | "metadata": {},
483 | "outputs": [],
484 | "source": [
485 | "# List of predicted repurposing drug candidates to interprete\n",
486 | "\n",
487 | "drug_list = ['Glutathione', 'Clioquinol', 'Steroids', 'Taurine']"
488 | ]
489 | },
490 | {
491 | "cell_type": "code",
492 | "execution_count": null,
493 | "metadata": {},
494 | "outputs": [],
495 | "source": [
496 | "knowledge_visualization.subgraph_visualization(target_type='Disease', target_list=PD,\n",
497 | " predicted_type='Drug', predicted_list=drug_list, \n",
498 | " neo4j_url = \"neo4j://54.210.251.104:7687\", \n",
499 | " username = \"neo4j\", password = \"password\",\n",
500 | " alpha=1.5, k=0.8, figsize=(15, 10), save=True)"
501 | ]
502 | },
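503 | {
504 | "cell_type": "markdown",
505 | "metadata": {},
506 | "source": [
507 | "For reference, the shortest paths visualized above can also be queried directly with the neo4j Python driver. The cell below is a minimal sketch rather than part of the protocol: the node labels (Disease, Drug), the name property, and the path-length bound are assumptions about the deployed graph's schema, so adjust them to match your Neo4j setup."
508 | ]
509 | },
510 | {
511 | "cell_type": "code",
512 | "execution_count": null,
513 | "metadata": {},
514 | "outputs": [],
515 | "source": [
516 | "from neo4j import GraphDatabase\n",
517 | "\n",
518 | "# Connection details mirror the subgraph_visualization() call above.\n",
519 | "driver = GraphDatabase.driver(\"neo4j://54.210.251.104:7687\",\n",
520 | "                              auth=(\"neo4j\", \"password\"))\n",
521 | "\n",
522 | "# Assumed schema: (:Disease {name: ...}) and (:Drug {name: ...}).\n",
523 | "query = (\n",
524 | "    \"MATCH (d:Disease), (g:Drug) \"\n",
525 | "    \"WHERE toLower(d.name) IN $diseases AND toLower(g.name) = $drug \"\n",
526 | "    \"MATCH p = shortestPath((d)-[*..4]-(g)) \"\n",
527 | "    \"RETURN [n IN nodes(p) | n.name] AS path_nodes\"\n",
528 | ")\n",
529 | "\n",
530 | "with driver.session() as session:\n",
531 | "    for record in session.run(query, diseases=PD, drug='glutathione'):\n",
532 | "        print(record['path_nodes'])\n",
533 | "driver.close()"
534 | ]
535 | }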
503 | ],
504 | "metadata": {
505 | "kernelspec": {
506 | "display_name": "Python 3",
507 | "language": "python",
508 | "name": "python3"
509 | },
510 | "language_info": {
511 | "codemirror_mode": {
512 | "name": "ipython",
513 | "version": 3
514 | },
515 | "file_extension": ".py",
516 | "mimetype": "text/x-python",
517 | "name": "python",
518 | "nbconvert_exporter": "python",
519 | "pygments_lexer": "ipython3",
520 | "version": "3.7.3"
521 | }
522 | },
523 | "nbformat": 4,
524 | "nbformat_minor": 5
525 | }
526 |
--------------------------------------------------------------------------------
/Codes_Term Harmonization/Entity_Integration/entity_anatomy.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | import string
4 | import requests
5 | from lxml.html import fromstring
6 |
7 | pd.set_option('display.max_columns', None)
8 | pd.set_option('display.max_rows', None)
9 |
10 |
11 | term_type_list = ['AC', 'BD', 'BN', 'BPCK', 'BR', 'CC', 'CDC', 'CDO', 'CD', 'CMN', 'CN', 'CPR', 'CP', 'CR', 'CSY', 'CV',
12 | 'CX', 'DC10', 'DC9', 'DE', 'DFG', 'DF', 'DI', 'DP', 'FI', 'FN', 'GLP', 'GN', 'GO', 'GPCK', 'HTJKN1',
13 | 'HTJKN', 'HTN', 'HT', 'ID', 'IN', 'IVC', 'IV', 'LA', 'LC', 'LG', 'LN', 'LPDN', 'LPN', 'LVDN', 'MD',
14 | 'MH', 'MIN', 'MS', 'MTH_CN', 'MTH_FN', 'MTH_LN', 'MTH_OAP', 'MTH_OPN', 'MTH_OP', 'MTH_PTGB',
15 | 'MTH_PTN', 'MTH_PT', 'MTH_RXN_BD', 'MTH_RXN_CDC', 'MTH_RXN_CD', 'MTH_RXN_DP', 'MTH_SI', 'MTH_SMQ',
16 | 'MV', 'NM', 'OC', 'OPN', 'OP', 'OR', 'OSN', 'PCE', 'PC', 'PEP', 'PHENO', 'PIN', 'PN', 'POS', 'PR',
17 | 'PSC', 'PSN', 'PTAV', 'PTCS', 'PTGB', 'PTJKN1', 'PTJKN', 'PTN', 'PT', 'PX', 'RPT', 'RXN_IN', 'RXN_PT',
18 | 'SBDC', 'SBDF', 'SBDG', 'SBD', 'SCDC', 'SCDF', 'SCDG', 'SCD', 'SCN', 'SD', 'SI', 'SMQ', 'SP', 'ST',
19 | 'SU', 'TA', 'TG', 'TQ', 'UCN', 'USN', 'VPT', 'VS', 'XD']
20 |
21 |
22 | def get_UMLS_tgt(apikey):
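23 |     """Request a ticket-granting ticket (TGT) from the UMLS (UTS) authentication service."""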
23 | uri = "https://utslogin.nlm.nih.gov"
24 | auth_endpoint = "/cas/v1/api-key"
25 | params = {'apikey': apikey}
26 | h = {"Content-type": "application/x-www-form-urlencoded", "Accept": "text/plain", "User-Agent": "python"}
27 | r = requests.post(uri + auth_endpoint, data=params, headers=h)
28 | response = fromstring(r.text)
29 | tgt = response.xpath('//form/@action')[0]
30 | return tgt
31 |
32 |
33 | def get_UMLS_ts(tgt):
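34 |     """Exchange a TGT for a single-use service ticket for the UMLS REST API."""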
34 | service = "http://umlsks.nlm.nih.gov"
35 | params = {'service': service}
36 | h = {"Content-type": "application/x-www-form-urlencoded", "Accept": "text/plain", "User-Agent": "python"}
37 | r = requests.post(tgt, data=params, headers=h)
38 | st = r.text
39 | return st
40 |
41 |
42 | def get_UMLS_name(tgt, umls_cui):
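43 |     """Return the preferred name of a UMLS concept, or '' if the CUI cannot be resolved."""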
43 | st = get_UMLS_ts(tgt)
44 | url = 'https://uts-ws.nlm.nih.gov/rest/content/current/CUI/' + umls_cui + '?ticket=' + st
45 | resp = requests.get(url)
46 | name = ''
47 | if 'error' not in resp.json():
48 | content = resp.json()['result']
49 | name = content['name']
50 |
51 | return name
52 |
53 |
54 | def access_UMLS_CUI(tgt, id_type, entity_id):
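55 |     """Map an identifier from a source vocabulary (e.g., FMA) to a UMLS CUI."""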
55 | st = get_UMLS_ts(tgt)
56 | umls_url = 'https://uts-ws.nlm.nih.gov/rest/content/current/source/' + id_type + '/' + entity_id + \
57 | '/atoms?ttys=MH,NM,PT&ticket=' + st
58 | resp = requests.get(umls_url)
59 | umls_cui = ''
60 | if 'error' not in resp.json():
61 | content = resp.json()['result'][0]
62 | umls_cui = content['concept'].replace('https://uts-ws.nlm.nih.gov/rest/content/2020AB/CUI/', '')
63 | # print(umls_cui)
64 | return umls_cui
65 |
66 |
67 | def access_UMLS_CUI_name(tgt, name):
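68 |     """Search UMLS by name; return a CUI only when its preferred name or one of
69 |     its atoms matches the query token-for-token (ignoring case, punctuation, 'and')."""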
68 | name = name.lower()
69 | name = name.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
70 | name_set = set(filter(None, name.split(' ')))
71 | if 'and' in name_set:
72 | name_set.remove('and')
73 | st = get_UMLS_ts(tgt)
74 | db_url = 'https://uts-ws.nlm.nih.gov/rest/search/current?string=' + name + '&ticket=' + st
75 | db_resp = requests.get(db_url)
76 | db_content_list = db_resp.json()['result']['results']
77 | res_umls = ''
78 | exact_match = False
79 | for db_content in db_content_list:
80 | umls_cui = db_content['ui']
81 | umls_name = db_content['name'].lower()
82 | umls_name = umls_name.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
83 | umls_name_set = set(filter(None, umls_name.split(' ')))
84 | if umls_name_set == name_set:
85 | res_umls = umls_cui
86 | exact_match = True
87 | if res_umls == '':
88 | res_umls = db_content_list[0]['ui']
89 | res_umls = res_umls if res_umls != 'NONE' else ''
90 | # print(res_umls, res_umls_name, exact_match)
91 | if not exact_match:
92 | st = get_UMLS_ts(tgt)
93 | url = 'https://uts-ws.nlm.nih.gov/rest/content/current/CUI/' + res_umls + '/atoms?ticket=' + st
94 | resp = requests.get(url)
95 | if 'error' not in resp.json():
96 | pageCount = int(resp.json()['pageCount'])
97 | for page in range(1, pageCount + 1):
98 | st = get_UMLS_ts(tgt)
99 | page_url = 'https://uts-ws.nlm.nih.gov/rest/content/current/CUI/' + res_umls + '/atoms?pageNumber=' + str(
100 | page) + '&ticket=' + st
101 | page_resp = requests.get(page_url)
102 | content = page_resp.json()['result']
103 | for res in content:
104 | if res['termType'] in term_type_list:
105 | disease_name = res['name'].lower().replace('to ', '').translate(
106 | str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
107 | disease_name_set = set(filter(None, disease_name.split(' ')))
108 | if 'and' in disease_name_set:
109 | disease_name_set.remove('and')
110 | exact_match = name_set == disease_name_set
111 | if exact_match:
112 | break
113 | if exact_match:
114 | break
115 | # print(res_umls, res_umls_name, exact_match)
116 | return res_umls if exact_match else ''
117 |
118 |
119 | def UMLS2MeSH(tgt, umls_cui):
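120 |     """Map a UMLS CUI to a MeSH descriptor ID via the concept's MSH atoms."""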
120 | st = get_UMLS_ts(tgt)
121 | mesh_url = 'https://uts-ws.nlm.nih.gov/rest/content/current/CUI/' + umls_cui + '/atoms?sabs=MSH&ttys=MH,NM,PT&ticket=' + st
122 | mesh_resp = requests.get(mesh_url)
123 | mesh_id = ''
124 | if 'error' not in mesh_resp.json():
125 | mesh_content = mesh_resp.json()['result']
126 | mesh_id = mesh_content[0]['code'].replace(
127 | 'https://uts-ws.nlm.nih.gov/rest/content/2020AB/source/MSH/', '')
128 | return mesh_id
129 |
130 |
131 | def refine_res_2():
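132 |     """Resolve duplicated BTO/MeSH/UMLS cross-references in the anatomy vocabulary:
133 |     when several entries share one external ID, keep it only on the entry whose name
134 |     matches the external term exactly, falling back to the first entry otherwise."""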
132 | anatomy_res = pd.read_csv('anatomy_res_2.csv')
133 | anatomy_res = anatomy_res[['primary', 'name', 'uberon_id', 'bto_id', 'mesh_id', 'umls_cui']]
134 |
135 | anatomy_res['mesh_id'] = anatomy_res['mesh_id'].str.replace('MESH:', '')
136 | anatomy_res['umls_cui'] = anatomy_res['umls_cui'].str.replace('UMLS:', '')
137 |
138 | mesh_anatomy = pd.read_csv('anatomy_mesh.csv')
139 | mesh_anatomy['mesh_id'] = mesh_anatomy['mesh_id'].str.replace('MESH:', '')
140 | mesh_name_dict = mesh_anatomy.set_index('mesh_id')['mesh_term'].to_dict()
141 |
142 | bto = pd.read_csv('bto.csv')
143 | bto_name_dict = bto.set_index('bto_id')['name'].to_dict()
144 |
145 | apikey = '9a095f1e-f79f-4958-bfdd-2bcba5f134d6'
146 | tgt = get_UMLS_tgt(apikey)
147 |
148 | for i in range(len(anatomy_res)):
149 | mesh_id = anatomy_res.loc[i, 'mesh_id']
150 | umls_cui = anatomy_res.loc[i, 'umls_cui']
151 | bto_id = anatomy_res.loc[i, 'bto_id']
152 |
153 | if not pd.isnull(bto_id):
154 | temp_df = anatomy_res[anatomy_res['bto_id'] == bto_id]
155 | if len(temp_df) > 1:
156 | bto_name = bto_name_dict[bto_id]
157 | temp_bto_name = bto_name.lower()
158 | temp_bto_name = temp_bto_name.translate(
159 | str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
160 | bto_name_set = set(filter(None, temp_bto_name.split(' ')))
161 | for j in range(len(temp_df)):
162 | name = temp_df.iloc[j, 1]
163 | temp_name = name.lower()
164 | temp_name = temp_name.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
165 | name_set = set(filter(None, temp_name.split(' ')))
166 | if name_set != bto_name_set:
167 | anatomy_res.loc[anatomy_res['name'] == name, 'bto_id'] = np.nan
168 | temp_2 = anatomy_res[anatomy_res['bto_id'] == bto_id]
169 | if len(temp_2) == 0:
170 | anatomy_res.loc[anatomy_res['name'] == temp_df.iloc[0, 1], 'bto_id'] = bto_id
171 | temp_df_2 = anatomy_res[anatomy_res['bto_id'] == bto_id]
172 | if len(temp_df_2) > 1:
173 | for j in range(1, len(temp_df_2)):
174 | temp_primary = temp_df_2.iloc[j, 0]
175 | anatomy_res.loc[anatomy_res['primary'] == temp_primary, 'bto_id'] = np.nan
176 |
177 | if not pd.isnull(mesh_id):
178 | temp_df = anatomy_res[anatomy_res['mesh_id'] == mesh_id]
179 | if len(temp_df) > 1:
180 | mesh_term = mesh_name_dict[mesh_id] if mesh_id in mesh_name_dict else ''
181 | temp_mesh_term = mesh_term.lower()
182 | temp_mesh_term = temp_mesh_term.translate(
183 | str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
184 | mesh_term_set = set(filter(None, temp_mesh_term.split(' ')))
185 | for j in range(len(temp_df)):
186 | name = temp_df.iloc[j, 1]
187 | temp_name = name.lower()
188 | temp_name = temp_name.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
189 | name_set = set(filter(None, temp_name.split(' ')))
190 | if name_set != mesh_term_set:
191 | anatomy_res.loc[anatomy_res['name'] == name, 'mesh_id'] = np.nan
192 | temp_2 = anatomy_res[anatomy_res['mesh_id'] == mesh_id]
193 | if len(temp_2) == 0:
194 | anatomy_res.loc[anatomy_res['name'] == temp_df.iloc[0, 1], 'mesh_id'] = mesh_id
195 | temp_df_2 = anatomy_res[anatomy_res['mesh_id'] == mesh_id]
196 | if len(temp_df_2) > 1:
197 | for j in range(1, len(temp_df_2)):
198 | temp_primary = temp_df_2.iloc[j, 0]
199 | anatomy_res.loc[anatomy_res['primary'] == temp_primary, 'mesh_id'] = np.nan
200 |
201 | if not pd.isnull(umls_cui):
202 | temp_df = anatomy_res[anatomy_res['umls_cui'] == umls_cui]
203 | if len(temp_df) > 1:
204 | umls_name = get_UMLS_name(tgt, umls_cui)
205 | temp_umls_name = umls_name.lower()
206 | temp_umls_name = temp_umls_name.translate(
207 | str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
208 | umls_name_set = set(filter(None, temp_umls_name.split(' ')))
209 | for j in range(len(temp_df)):
210 | name = temp_df.iloc[j, 1]
211 | temp_name = name.lower()
212 | temp_name = temp_name.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
213 | name_set = set(filter(None, temp_name.split(' ')))
214 | if name_set != umls_name_set:
215 | anatomy_res.loc[anatomy_res['name'] == name, 'umls_cui'] = np.nan
216 | temp_2 = anatomy_res[anatomy_res['umls_cui'] == umls_cui]
217 | if len(temp_2) == 0:
218 | anatomy_res.loc[anatomy_res['name'] == temp_df.iloc[0, 1], 'umls_cui'] = umls_cui
219 | temp_df_2 = anatomy_res[anatomy_res['umls_cui'] == umls_cui]
220 | if len(temp_df_2) > 1:
221 | for j in range(1, len(temp_df_2)):
222 | temp_primary = temp_df_2.iloc[j, 0]
223 | anatomy_res.loc[anatomy_res['primary'] == temp_primary, 'umls_cui'] = np.nan
224 | print(i + 1, '/', len(anatomy_res), 'Completed...')
225 | anatomy_res.to_csv('anatomy_res_2_refined.csv', index=False)
226 |
227 |
228 | def enriche_CL():
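229 |     """Enrich Cell Ontology (CL) entries with UMLS CUIs (via FMA ID or name search) and MeSH IDs."""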
229 | cl_df = pd.read_csv('cl.csv')
230 |
231 | apikey = '9a095f1e-f79f-4958-bfdd-2bcba5f134d6'
232 | tgt = get_UMLS_tgt(apikey)
233 |
234 | for i in range(len(cl_df)):
235 | fma_id = cl_df.loc[i, 'fma']
236 | mesh_id = cl_df.loc[i, 'mesh_id']
237 | umls_cui = cl_df.loc[i, 'umls_cui']
238 | name = cl_df.loc[i, 'name']
239 |
240 | if pd.isnull(umls_cui):
241 | if not pd.isnull(fma_id):
242 | temp_umls = access_UMLS_CUI(tgt, 'FMA', fma_id)
243 | else:
244 | temp_umls = access_UMLS_CUI_name(tgt, name)
245 | cl_df.loc[i, 'umls_cui'] = temp_umls
246 |
247 | if not pd.isnull(umls_cui) and pd.isnull(mesh_id):
248 | temp_mesh = UMLS2MeSH(tgt, umls_cui)
249 | cl_df.loc[i, 'mesh_id'] = temp_mesh
250 | print(i + 1, '/', len(cl_df), 'Completed...')
251 | cl_df.to_csv('cl_enriched.csv', index=False)
252 |
253 |
254 | def refine_CL():
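255 |     """Apply the same duplicate cross-reference resolution as refine_res_2() to the enriched CL table."""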
255 | cl_df = pd.read_csv('cl_enriched.csv')
256 | cl_df = cl_df[['cl_id', 'name', 'mesh_id', 'umls_cui', 'fma', 'bto_id']]
257 |
258 | mesh_anatomy = pd.read_csv('anatomy_mesh.csv')
259 | mesh_anatomy['mesh_id'] = mesh_anatomy['mesh_id'].str.replace('MESH:', '')
260 | mesh_name_dict = mesh_anatomy.set_index('mesh_id')['mesh_term'].to_dict()
261 |
262 | bto = pd.read_csv('bto.csv')
263 | bto_name_dict = bto.set_index('bto_id')['name'].to_dict()
264 |
265 | apikey = '9a095f1e-f79f-4958-bfdd-2bcba5f134d6'
266 | tgt = get_UMLS_tgt(apikey)
267 |
268 | for i in range(len(cl_df)):
269 | mesh_id = cl_df.loc[i, 'mesh_id']
270 | umls_cui = cl_df.loc[i, 'umls_cui']
271 | bto_id = cl_df.loc[i, 'bto_id']
272 |
273 | if not pd.isnull(bto_id):
274 | temp_df = cl_df[cl_df['bto_id'] == bto_id]
275 | if len(temp_df) > 1:
276 | bto_name = bto_name_dict[bto_id]
277 | temp_bto_name = bto_name.lower()
278 | temp_bto_name = temp_bto_name.translate(
279 | str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
280 | bto_name_set = set(filter(None, temp_bto_name.split(' ')))
281 | for j in range(len(temp_df)):
282 | name = temp_df.iloc[j, 1]
283 | temp_name = name.lower()
284 | temp_name = temp_name.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
285 | name_set = set(filter(None, temp_name.split(' ')))
286 | if name_set != bto_name_set:
287 | cl_df.loc[cl_df['name'] == name, 'bto_id'] = np.nan
288 | temp_2 = cl_df[cl_df['bto_id'] == bto_id]
289 | if len(temp_2) == 0:
290 | cl_df.loc[cl_df['name'] == temp_df.iloc[0, 1], 'bto_id'] = bto_id
291 | temp_df_2 = cl_df[cl_df['bto_id'] == bto_id]
292 | if len(temp_df_2) > 1:
293 | for j in range(1, len(temp_df_2)):
294 | temp_primary = temp_df_2.iloc[j, 0]
295 | cl_df.loc[cl_df['cl_id'] == temp_primary, 'bto_id'] = np.nan
296 |
297 | if not pd.isnull(mesh_id):
298 | temp_df = cl_df[cl_df['mesh_id'] == mesh_id]
299 | if len(temp_df) > 1:
300 | mesh_term = mesh_name_dict[mesh_id] if mesh_id in mesh_name_dict else ''
301 | temp_mesh_term = mesh_term.lower()
302 | temp_mesh_term = temp_mesh_term.translate(
303 | str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
304 | mesh_term_set = set(filter(None, temp_mesh_term.split(' ')))
305 | for j in range(len(temp_df)):
306 | name = temp_df.iloc[j, 1]
307 | temp_name = name.lower()
308 | temp_name = temp_name.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
309 | name_set = set(filter(None, temp_name.split(' ')))
310 | if name_set != mesh_term_set:
311 | cl_df.loc[cl_df['name'] == name, 'mesh_id'] = np.nan
312 | temp_2 = cl_df[cl_df['mesh_id'] == mesh_id]
313 | if len(temp_2) == 0:
314 | cl_df.loc[cl_df['name'] == temp_df.iloc[0, 1], 'mesh_id'] = mesh_id
315 | temp_df_2 = cl_df[cl_df['mesh_id'] == mesh_id]
316 | if len(temp_df_2) > 1:
317 | for j in range(1, len(temp_df_2)):
318 | temp_primary = temp_df_2.iloc[j, 0]
319 | cl_df.loc[cl_df['cl_id'] == temp_primary, 'mesh_id'] = np.nan
320 |
321 | if not pd.isnull(umls_cui):
322 | temp_df = cl_df[cl_df['umls_cui'] == umls_cui]
323 | if len(temp_df) > 1:
324 | umls_name = get_UMLS_name(tgt, umls_cui)
325 | temp_umls_name = umls_name.lower()
326 | temp_umls_name = temp_umls_name.translate(
327 | str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
328 | umls_name_set = set(filter(None, temp_umls_name.split(' ')))
329 | for j in range(len(temp_df)):
330 | name = temp_df.iloc[j, 1]
331 | temp_name = name.lower()
332 | temp_name = temp_name.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
333 | name_set = set(filter(None, temp_name.split(' ')))
334 | if name_set != umls_name_set:
335 | cl_df.loc[cl_df['name'] == name, 'umls_cui'] = np.nan
336 | temp_2 = cl_df[cl_df['umls_cui'] == umls_cui]
337 | if len(temp_2) == 0:
338 | cl_df.loc[cl_df['name'] == temp_df.iloc[0, 1], 'umls_cui'] = umls_cui
339 | temp_df_2 = cl_df[cl_df['umls_cui'] == umls_cui]
340 | if len(temp_df_2) > 1:
341 | for j in range(1, len(temp_df_2)):
342 | temp_primary = temp_df_2.iloc[j, 0]
343 | cl_df.loc[cl_df['cl_id'] == temp_primary, 'umls_cui'] = np.nan
344 | print(i + 1, '/', len(cl_df), 'Completed...')
345 | cl_df.to_csv('cl_refined.csv', index=False)
346 |
347 |
348 | def integrate_CL():
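349 |     """Merge refined CL entries into the anatomy vocabulary, matching on BTO, MeSH,
350 |     or UMLS IDs and appending unmatched CL terms as new rows."""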
349 | anatomy_res = pd.read_csv('anatomy_res_2_refined.csv')
350 | anatomy_res['cl_id'] = [''] * len(anatomy_res)
351 | idx = len(anatomy_res)
352 |
353 | anatomy_res['mesh_id'] = anatomy_res['mesh_id'].str.replace('MESH:', '')
354 | anatomy_res['umls_cui'] = anatomy_res['umls_cui'].str.replace('UMLS:', '')
355 |
356 | bto_list_res = list(anatomy_res.dropna(subset=['bto_id'])['bto_id'])
357 | mesh_list_res = list(anatomy_res.dropna(subset=['mesh_id'])['mesh_id'])
358 | umls_list_res = list(anatomy_res.dropna(subset=['umls_cui'])['umls_cui'])
359 |
360 | cl_res = pd.read_csv('cl_refined.csv')
361 | for i in range(len(cl_res)):
362 | cl_id = cl_res.loc[i, 'cl_id']
363 | cl_name = cl_res.loc[i, 'name']
364 | mesh_id = cl_res.loc[i, 'mesh_id']
365 | umlc_cui = cl_res.loc[i, 'umls_cui']
366 | bto_id = cl_res.loc[i, 'bto_id']
367 |
368 | if bto_id in bto_list_res:
369 | anatomy_res.loc[anatomy_res['bto_id'] == bto_id, 'cl_id'] = cl_id
370 | elif mesh_id in mesh_list_res:
371 | anatomy_res.loc[anatomy_res['mesh_id'] == mesh_id, 'cl_id'] = cl_id
372 | elif umlc_cui in umls_list_res:
373 | anatomy_res.loc[anatomy_res['umls_cui'] == umlc_cui, 'cl_id'] = cl_id
374 | else:
375 | anatomy_res.loc[idx] = [cl_id, cl_name, '', '', mesh_id, umlc_cui, cl_id]
376 | idx += 1
377 | print(i + 1, '/', len(cl_res), 'Completed...')
378 | anatomy_res.to_csv('anatomy_res_3.csv', index=False)
379 |
380 |
381 | def main():
382 | refine_res_2()
383 | # enriche_CL()
384 | # refine_CL()
385 | integrate_CL()
386 |
387 | an_vocab = pd.read_csv('anatomy_res_3.csv')
388 | print(len(an_vocab), len(an_vocab.drop_duplicates(subset='primary', keep='first')))
389 | mesh_vocab = an_vocab.dropna(subset=['mesh_id'])
390 | print(len(mesh_vocab), len(mesh_vocab.drop_duplicates(subset='mesh_id', keep='first')))
391 | bto_vocab = an_vocab.dropna(subset=['bto_id'])
392 | print(len(bto_vocab), len(bto_vocab.drop_duplicates(subset='bto_id', keep='first')))
393 | cl_vocab = an_vocab.dropna(subset=['cl_id'])
394 | print(len(cl_vocab), len(cl_vocab.drop_duplicates(subset='cl_id', keep='first')))
395 | umls_vocab = an_vocab.dropna(subset=['umls_cui'])
396 | print(len(umls_vocab), len(umls_vocab.drop_duplicates(subset='umls_cui', keep='first')))
397 |
398 | # apikey = '9a095f1e-f79f-4958-bfdd-2bcba5f134d6'
399 | # tgt = get_UMLS_tgt(apikey)
400 | # # umls_cui = access_UMLS_CUI(tgt, 'FMA', '68646')
401 | # umls_cui = access_UMLS_CUI_name(tgt, 'cell')
402 | # print(umls_cui)
403 |
404 |
405 | if __name__ == '__main__':
406 | main()
407 |
--------------------------------------------------------------------------------
/Codes_Term Harmonization/Entity_Integration/entity_disease.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | import requests
4 | from lxml.html import fromstring
5 | import string
6 |
7 | folder = ''
8 | term_type_list = ['AC', 'BD', 'BN', 'BPCK', 'BR', 'CC', 'CDC', 'CDO', 'CD', 'CMN', 'CN', 'CPR', 'CP', 'CR', 'CSY', 'CV',
9 | 'CX', 'DC10', 'DC9', 'DE', 'DFG', 'DF', 'DI', 'DP', 'FI', 'FN', 'GLP', 'GN', 'GO', 'GPCK', 'HTJKN1',
10 | 'HTJKN', 'HTN', 'HT', 'ID', 'IN', 'IVC', 'IV', 'LA', 'LC', 'LG', 'LN', 'LPDN', 'LPN', 'LVDN', 'MD',
11 | 'MH', 'MIN', 'MS', 'MTH_CN', 'MTH_FN', 'MTH_LN', 'MTH_OAP', 'MTH_OPN', 'MTH_OP', 'MTH_PTGB',
12 | 'MTH_PTN', 'MTH_PT', 'MTH_RXN_BD', 'MTH_RXN_CDC', 'MTH_RXN_CD', 'MTH_RXN_DP', 'MTH_SI', 'MTH_SMQ',
13 | 'MV', 'NM', 'OC', 'OPN', 'OP', 'OR', 'OSN', 'PCE', 'PC', 'PEP', 'PHENO', 'PIN', 'PN', 'POS', 'PR',
14 | 'PSC', 'PSN', 'PTAV', 'PTCS', 'PTGB', 'PTJKN1', 'PTJKN', 'PTN', 'PT', 'PX', 'RPT', 'RXN_IN', 'RXN_PT',
15 | 'SBDC', 'SBDF', 'SBDG', 'SBD', 'SCDC', 'SCDF', 'SCDG', 'SCD', 'SCN', 'SD', 'SI', 'SMQ', 'SP', 'ST',
16 | 'SU', 'TA', 'TG', 'TQ', 'UCN', 'USN', 'VPT', 'VS', 'XD']
17 |
18 | def get_UMLS_tgt(apikey):
19 | uri = "https://utslogin.nlm.nih.gov"
20 | auth_endpoint = "/cas/v1/api-key"
21 | params = {'apikey': apikey}
22 | h = {"Content-type": "application/x-www-form-urlencoded", "Accept": "text/plain", "User-Agent": "python"}
23 | r = requests.post(uri + auth_endpoint, data=params, headers=h)
24 | response = fromstring(r.text)
25 | tgt = response.xpath('//form/@action')[0]
26 | return tgt
27 |
28 |
29 | def get_UMLS_ts(tgt):
30 | service = "http://umlsks.nlm.nih.gov"
31 | params = {'service': service}
32 | h = {"Content-type": "application/x-www-form-urlencoded", "Accept": "text/plain", "User-Agent": "python"}
33 | r = requests.post(tgt, data=params, headers=h)
34 | st = r.text
35 | return st
36 |
37 |
38 | def access_UMLS_CUI(tgt, id_type, entity_id):
39 | st = get_UMLS_ts(tgt)
40 | umls_url = 'https://uts-ws.nlm.nih.gov/rest/content/current/source/' + id_type + '/' + entity_id + \
41 | '/atoms?ttys=MH,NM,PT&ticket=' + st
42 | resp = requests.get(umls_url)
43 | umls_cui = ''
44 | if 'error' not in resp.json():
45 | content = resp.json()['result'][0]
46 | umls_cui = content['concept'].replace('https://uts-ws.nlm.nih.gov/rest/content/2020AB/CUI/', '')
47 | # print(umls_cui)
48 | return umls_cui
49 |
50 |
51 | def UMLS2MeSH(tgt, umls_cui):
52 | st = get_UMLS_ts(tgt)
53 | mesh_url = 'https://uts-ws.nlm.nih.gov/rest/content/current/CUI/' + umls_cui + '/atoms?sabs=MSH&ttys=MH,NM,PT&ticket=' + st
54 | mesh_resp = requests.get(mesh_url)
55 | mesh_id = ''
56 | if 'error' not in mesh_resp.json():
57 | mesh_content = mesh_resp.json()['result']
58 | mesh_id = mesh_content[0]['code'].replace(
59 | 'https://uts-ws.nlm.nih.gov/rest/content/2020AB/source/MSH/', '')
60 | return mesh_id
61 |
62 |
63 | def get_UMLS_name(tgt, umls_cui):
64 | st = get_UMLS_ts(tgt)
65 | url = 'https://uts-ws.nlm.nih.gov/rest/content/current/CUI/' + umls_cui + '?ticket=' + st
66 | resp = requests.get(url)
67 | name = ''
68 | if 'error' not in resp.json():
69 | content = resp.json()['result']
70 | name = content['name']
71 |
72 | return name
73 |
74 |
75 | def access_UMLS_CUI_name(tgt, name):
76 | name = name.lower()
77 | name = name.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
78 | name_set = set(filter(None, name.split(' ')))
79 | if 'and' in name_set:
80 | name_set.remove('and')
81 | st = get_UMLS_ts(tgt)
82 | db_url = 'https://uts-ws.nlm.nih.gov/rest/search/current?string=' + name + '&ticket=' + st
83 | db_resp = requests.get(db_url)
84 | db_content_list = db_resp.json()['result']['results']
85 | res_umls = ''
86 | exact_match = False
87 | for db_content in db_content_list:
88 | umls_cui = db_content['ui']
89 | umls_name = db_content['name'].lower()
90 | umls_name = umls_name.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
91 | umls_name_set = set(filter(None, umls_name.split(' ')))
92 | if umls_name_set == name_set:
93 | res_umls = umls_cui
94 | exact_match = True
95 | if res_umls == '':
96 | res_umls = db_content_list[0]['ui']
97 | res_umls = res_umls if res_umls != 'NONE' else ''
98 | # print(res_umls, res_umls_name, exact_match)
99 | if not exact_match:
100 | st = get_UMLS_ts(tgt)
101 | url = 'https://uts-ws.nlm.nih.gov/rest/content/current/CUI/' + res_umls + '/atoms?ticket=' + st
102 | resp = requests.get(url)
103 | if 'error' not in resp.json():
104 | pageCount = int(resp.json()['pageCount'])
105 | for page in range(1, pageCount + 1):
106 | st = get_UMLS_ts(tgt)
107 | page_url = 'https://uts-ws.nlm.nih.gov/rest/content/current/CUI/' + res_umls + '/atoms?pageNumber=' + str(
108 | page) + '&ticket=' + st
109 | page_resp = requests.get(page_url)
110 | content = page_resp.json()['result']
111 | for res in content:
112 | if res['termType'] in term_type_list:
113 | disease_name = res['name'].lower().replace('to ', '').translate(
114 | str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
115 | disease_name_set = set(filter(None, disease_name.split(' ')))
116 | if 'and' in disease_name_set:
117 | disease_name_set.remove('and')
118 | exact_match = name_set == disease_name_set
119 | if exact_match:
120 | break
121 | if exact_match:
122 | break
123 | # print(res_umls, res_umls_name, exact_match)
124 | return res_umls if exact_match else ''
125 |
126 |
127 | def enrich_DO():
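128 |     """Enrich Disease Ontology entries with UMLS CUIs (via ICD-10/ICD-9/SNOMED CT codes or name search) and MeSH IDs."""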
128 | do_df = pd.read_csv(folder + 'do.csv')
129 | apikey = '9a095f1e-f79f-4958-bfdd-2bcba5f134d6'
130 | tgt = get_UMLS_tgt(apikey)
131 |
132 | for i in range(len(do_df)):
133 | umls_cui = do_df.loc[i, 'umls_cui']
134 | mesh_id = do_df.loc[i, 'mesh_id']
135 | if pd.isnull(umls_cui):
136 | icd_10 = do_df.loc[i, 'icd_10']
137 | icd_10 = str(icd_10) if not pd.isnull(icd_10) else ''
138 | icd_9 = do_df.loc[i, 'icd_9']
139 | icd_9 = str(icd_9) if not pd.isnull(icd_9) else ''
140 | snomedct_id = do_df.loc[i, 'snomedct_id']
141 | snomedct_id = str(snomedct_id) if not pd.isnull(snomedct_id) else ''
142 | name = do_df.loc[i, 'disease_name']
143 | umls_cui = access_UMLS_CUI(tgt, 'ICD10CM', icd_10)
144 | if umls_cui == '':
145 | umls_cui = access_UMLS_CUI(tgt, 'ICD9CM', icd_9)
146 | if umls_cui == '':
147 | umls_cui = access_UMLS_CUI(tgt, 'SNOMEDCT_US', snomedct_id)
148 | if umls_cui == '':
149 | umls_cui = access_UMLS_CUI_name(tgt, name)
150 | do_df.loc[i, 'umls_cui'] = umls_cui
151 | if not pd.isnull(umls_cui) and pd.isnull(mesh_id):
152 | mesh_id = UMLS2MeSH(tgt, umls_cui)
153 | do_df.loc[i, 'mesh_id'] = mesh_id
154 | print(i + 1, '/', len(do_df), 'Completed...')
155 | # print(do_df[['doid', 'umls_cui', 'mesh_id']])
156 | do_df.to_csv(folder + 'do_enriched.csv', index=False)
157 |
158 |
159 | def refine_DO():
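160 |     """Resolve duplicated MeSH/UMLS cross-references among DO entries by exact name match."""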
160 | do_df = pd.read_csv(folder + 'do_enriched.csv')
161 | apikey = '9a095f1e-f79f-4958-bfdd-2bcba5f134d6'
162 | tgt = get_UMLS_tgt(apikey)
163 |
164 | mesh_disease = pd.read_csv(folder + 'mesh_disease.csv')
165 | mesh_disease['mesh_id'] = mesh_disease['mesh_id'].str.replace('MESH:', '')
166 | mesh_name_dict = mesh_disease.set_index('mesh_id')['mesh_term'].to_dict()
167 |
168 | for i in range(len(do_df)):
169 | mesh_id = do_df.loc[i, 'mesh_id']
170 | umls_cui = do_df.loc[i, 'umls_cui']
171 | if not pd.isnull(mesh_id):
172 | temp_df = do_df[do_df['mesh_id'] == mesh_id]
173 | if len(temp_df) > 1:
174 | mesh_term = mesh_name_dict[mesh_id] if mesh_id in mesh_name_dict else ''
175 | temp_mesh_term = mesh_term.lower()
176 | temp_mesh_term = temp_mesh_term.translate(
177 | str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
178 | mesh_term_set = set(filter(None, temp_mesh_term.split(' ')))
179 | for j in range(len(temp_df)):
180 | name = temp_df.iloc[j, 1]
181 | temp_name = name.lower()
182 | temp_name = temp_name.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
183 | name_set = set(filter(None, temp_name.split(' ')))
184 | if name_set != mesh_term_set:
185 | do_df.loc[do_df['disease_name'] == name, 'mesh_id'] = np.nan
186 | temp_2 = do_df[do_df['mesh_id'] == mesh_id]
187 | if len(temp_2) == 0:
188 | do_df.loc[do_df['disease_name'] == temp_df.iloc[0, 1], 'mesh_id'] = mesh_id
189 | temp_df_2 = do_df[do_df['mesh_id'] == mesh_id]
190 | if len(temp_df_2) > 1:
191 | for j in range(1, len(temp_df_2)):
192 | temp_primary = temp_df_2.iloc[j, 0]
193 | do_df.loc[do_df['doid'] == temp_primary, 'mesh_id'] = np.nan
194 |
195 | if not pd.isnull(umls_cui):
196 | temp_df = do_df[do_df['umls_cui'] == umls_cui]
197 | if len(temp_df) > 1:
198 | umls_name = get_UMLS_name(tgt, umls_cui)
199 | temp_umls_name = umls_name.lower()
200 | temp_umls_name = temp_umls_name.translate(
201 | str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
202 | umls_name_set = set(filter(None, temp_umls_name.split(' ')))
203 | for j in range(len(temp_df)):
204 | name = temp_df.iloc[j, 1]
205 | temp_name = name.lower()
206 | temp_name = temp_name.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
207 | name_set = set(filter(None, temp_name.split(' ')))
208 | if name_set != umls_name_set:
209 | do_df.loc[do_df['disease_name'] == name, 'umls_cui'] = np.nan
210 | temp_2 = do_df[do_df['umls_cui'] == umls_cui]
211 | if len(temp_2) == 0:
212 | do_df.loc[do_df['disease_name'] == temp_df.iloc[0, 1], 'umls_cui'] = umls_cui
213 | temp_df_2 = do_df[do_df['umls_cui'] == umls_cui]
214 | if len(temp_df_2) > 1:
215 | for j in range(1, len(temp_df_2)):
216 | temp_primary = temp_df_2.iloc[j, 0]
217 | do_df.loc[do_df['doid'] == temp_primary, 'umls_cui'] = np.nan
218 | print(i + 1, '/', len(do_df), 'Completed...')
219 | do_df.to_csv(folder + 'do_enriched_refined.csv', index=False)
220 |
221 |
222 | def enrich_KEGG():
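223 |     """Enrich KEGG disease entries with UMLS CUIs (via MeSH ID, ICD-10 code, or name search) and backfill missing MeSH IDs."""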
223 | kegg_df = pd.read_csv(folder + 'kegg_disease.csv')
224 | kegg_df = kegg_df.dropna(subset=['name'])
225 | kegg_df = kegg_df.reset_index(drop=True)
226 | apikey = '9a095f1e-f79f-4958-bfdd-2bcba5f134d6'
227 | tgt = get_UMLS_tgt(apikey)
228 |
229 | umls_list = []
230 | for i in range(len(kegg_df)):
231 | names = kegg_df.loc[i, 'name']
232 | mesh_id = kegg_df.loc[i, 'mesh_id']
233 | icd_10 = kegg_df.loc[i, 'icd_10']
234 | icd_10 = icd_10.split(' ')[0] if not pd.isnull(icd_10) else ''
235 | name = names.split('; ')[0]
236 |         # Strip a trailing parenthetical only if present; str.find() returns -1
237 |         # when ' (' is absent, which would drop the last character of the name.
238 |         if ' (' in name:
239 |             name = name[:name.find(' (')]
237 | umls_cui = ''
238 | if not pd.isnull(mesh_id):
239 | umls_cui = access_UMLS_CUI(tgt, 'MSH', mesh_id)
240 | if umls_cui == '':
241 | umls_cui = access_UMLS_CUI(tgt, 'ICD10CM', icd_10)
242 | if umls_cui == '':
243 | umls_cui = access_UMLS_CUI_name(tgt, name)
244 | umls_list.append(umls_cui)
245 | if umls_cui != '' and pd.isnull(mesh_id):
246 | mesh_id = UMLS2MeSH(tgt, umls_cui)
247 | kegg_df.loc[i, 'mesh_id'] = mesh_id
248 | print(i + 1, '/', len(kegg_df), 'Completed...')
249 | kegg_df['umls_cui'] = umls_list
250 | print(kegg_df[['kegg_id', 'mesh_id', 'umls_cui']])
251 | kegg_df.to_csv(folder + 'kegg_disease_enriched.csv', index=False)
252 |
253 |
254 | def refine_KEGG():
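255 |     """Resolve duplicated MeSH/UMLS cross-references among KEGG disease entries by exact name match."""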
255 | kegg_df = pd.read_csv(folder + 'kegg_disease_enriched.csv')
256 | mesh_disease = pd.read_csv(folder + 'mesh_disease.csv')
257 | mesh_disease['mesh_id'] = mesh_disease['mesh_id'].str.replace('MESH:', '')
258 | mesh_name_dict = mesh_disease.set_index('mesh_id')['mesh_term'].to_dict()
259 |
260 | apikey = '9a095f1e-f79f-4958-bfdd-2bcba5f134d6'
261 | tgt = get_UMLS_tgt(apikey)
262 |
263 | for i in range(len(kegg_df)):
264 | mesh_id = kegg_df.loc[i, 'mesh_id']
265 | umls_cui = kegg_df.loc[i, 'umls_cui']
266 |
267 | if not pd.isnull(mesh_id):
268 | temp_df = kegg_df[kegg_df['mesh_id'] == mesh_id]
269 | if len(temp_df) > 1:
270 | mesh_term = mesh_name_dict[mesh_id] if mesh_id in mesh_name_dict else ''
271 | temp_mesh_term = mesh_term.lower()
272 | temp_mesh_term = temp_mesh_term.translate(
273 | str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
274 | mesh_term_set = set(filter(None, temp_mesh_term.split(' ')))
275 | for j in range(len(temp_df)):
276 | name = temp_df.iloc[j, 1]
277 | temp_name = name.lower()
278 | temp_name = temp_name.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
279 | name_set = set(filter(None, temp_name.split(' ')))
280 | if name_set != mesh_term_set:
281 | kegg_df.loc[kegg_df['name'] == name, 'mesh_id'] = np.nan
282 | temp_2 = kegg_df[kegg_df['mesh_id'] == mesh_id]
283 | if len(temp_2) == 0:
284 | kegg_df.loc[kegg_df['name'] == temp_df.iloc[0, 1], 'mesh_id'] = mesh_id
285 | temp_df_2 = kegg_df[kegg_df['mesh_id'] == mesh_id]
286 | if len(temp_df_2) > 1:
287 | for j in range(1, len(temp_df_2)):
288 | temp_primary = temp_df_2.iloc[j, 0]
289 | kegg_df.loc[kegg_df['kegg_id'] == temp_primary, 'mesh_id'] = np.nan
290 |
291 | if not pd.isnull(umls_cui):
292 | temp_df = kegg_df[kegg_df['umls_cui'] == umls_cui]
293 | if len(temp_df) > 1:
294 | umls_name = get_UMLS_name(tgt, umls_cui)
295 | temp_umls_name = umls_name.lower()
296 | temp_umls_name = temp_umls_name.translate(
297 | str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
298 | umls_name_set = set(filter(None, temp_umls_name.split(' ')))
299 | for j in range(len(temp_df)):
300 | name = temp_df.iloc[j, 1]
301 | temp_name = name.lower()
302 | temp_name = temp_name.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
303 | name_set = set(filter(None, temp_name.split(' ')))
304 | if name_set != umls_name_set:
305 | kegg_df.loc[kegg_df['name'] == name, 'umls_cui'] = np.nan
306 | temp_2 = kegg_df[kegg_df['umls_cui'] == umls_cui]
307 | if len(temp_2) == 0:
308 | kegg_df.loc[kegg_df['name'] == temp_df.iloc[0, 1], 'umls_cui'] = umls_cui
309 | temp_df_2 = kegg_df[kegg_df['umls_cui'] == umls_cui]
310 | if len(temp_df_2) > 1:
311 | for j in range(1, len(temp_df_2)):
312 | temp_primary = temp_df_2.iloc[j, 0]
313 | kegg_df.loc[kegg_df['kegg_id'] == temp_primary, 'umls_cui'] = np.nan
314 | print(i + 1, '/', len(kegg_df), 'Completed...')
315 | kegg_df.to_csv(folder + 'kegg_disease_enriched_refined.csv', index=False)
316 |
317 |
318 | def enrich_PharmGKB():
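319 |     """Enrich PharmGKB disease entries with UMLS CUIs (via SNOMED CT code or name search) and MeSH IDs."""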
319 | pharmgkb_df = pd.read_csv(folder + 'pharmgkb_disease_res.csv')
320 | apikey = '9a095f1e-f79f-4958-bfdd-2bcba5f134d6'
321 | tgt = get_UMLS_tgt(apikey)
322 |
323 | for i in range(len(pharmgkb_df)):
324 | mesh_id = pharmgkb_df.loc[i, 'mesh_id']
325 | umls_cui = pharmgkb_df.loc[i, 'umls_cui']
326 |
327 | if pd.isnull(umls_cui):
328 | snomedct_id = pharmgkb_df.loc[i, 'snomedct_id']
329 | snomedct_id = str(snomedct_id) if not pd.isnull(snomedct_id) else ''
330 | name = pharmgkb_df.loc[i, 'name']
331 | umls_cui = access_UMLS_CUI(tgt, 'SNOMEDCT_US', snomedct_id)
332 | if umls_cui == '':
333 | umls_cui = access_UMLS_CUI_name(tgt, name)
334 | pharmgkb_df.loc[i, 'umls_cui'] = umls_cui
335 | if not pd.isnull(umls_cui) and pd.isnull(mesh_id):
336 | mesh_id = UMLS2MeSH(tgt, umls_cui)
337 | pharmgkb_df.loc[i, 'mesh_id'] = mesh_id
338 | print(i + 1, '/', len(pharmgkb_df), 'Completed...')
339 |
340 | pharmgkb_df.to_csv(folder + 'pharmgkb_disease_enriched.csv', index=False)
341 |
342 |
343 | def refine_PharmGKB():
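344 |     """Resolve duplicated MeSH/UMLS cross-references among PharmGKB entries by exact name match."""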
344 | pharmgkb_df = pd.read_csv(folder + 'pharmgkb_disease_enriched.csv')
345 | apikey = '9a095f1e-f79f-4958-bfdd-2bcba5f134d6'
346 | tgt = get_UMLS_tgt(apikey)
347 |
348 | mesh_disease = pd.read_csv(folder + 'mesh_disease.csv')
349 | mesh_disease['mesh_id'] = mesh_disease['mesh_id'].str.replace('MESH:', '')
350 | mesh_name_dict = mesh_disease.set_index('mesh_id')['mesh_term'].to_dict()
351 |
352 | for i in range(len(pharmgkb_df)):
353 | mesh_id = pharmgkb_df.loc[i, 'mesh_id']
354 | umls_cui = pharmgkb_df.loc[i, 'umls_cui']
355 | if not pd.isnull(mesh_id):
356 | temp_df = pharmgkb_df[pharmgkb_df['mesh_id'] == mesh_id]
357 | if len(temp_df) > 1:
358 | mesh_term = mesh_name_dict[mesh_id] if mesh_id in mesh_name_dict else ''
359 | temp_mesh_term = mesh_term.lower()
360 | temp_mesh_term = temp_mesh_term.translate(
361 | str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
362 | mesh_term_set = set(filter(None, temp_mesh_term.split(' ')))
363 | for j in range(len(temp_df)):
364 | name = temp_df.iloc[j, 1]
365 | temp_name = name.lower()
366 | temp_name = temp_name.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
367 | name_set = set(filter(None, temp_name.split(' ')))
368 | if name_set != mesh_term_set:
369 | pharmgkb_df.loc[pharmgkb_df['name'] == name, 'mesh_id'] = np.nan
370 | temp_2 = pharmgkb_df[pharmgkb_df['mesh_id'] == mesh_id]
371 | if len(temp_2) == 0:
372 | pharmgkb_df.loc[pharmgkb_df['name'] == temp_df.iloc[0, 1], 'mesh_id'] = mesh_id
373 | temp_df_2 = pharmgkb_df[pharmgkb_df['mesh_id'] == mesh_id]
374 | if len(temp_df_2) > 1:
375 | for j in range(1, len(temp_df_2)):
376 | temp_primary = temp_df_2.iloc[j, 0]
377 | pharmgkb_df.loc[pharmgkb_df['pharmgkb_id'] == temp_primary, 'mesh_id'] = np.nan
378 | if not pd.isnull(umls_cui):
379 | temp_df = pharmgkb_df[pharmgkb_df['umls_cui'] == umls_cui]
380 | if len(temp_df) > 1:
381 | umls_name = get_UMLS_name(tgt, umls_cui)
382 | temp_umls_name = umls_name.lower()
383 | temp_umls_name = temp_umls_name.translate(
384 | str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
385 | umls_name_set = set(filter(None, temp_umls_name.split(' ')))
386 | for j in range(len(temp_df)):
387 | name = temp_df.iloc[j, 1]
388 | temp_name = name.lower()
389 | temp_name = temp_name.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
390 | name_set = set(filter(None, temp_name.split(' ')))
391 | if name_set != umls_name_set:
392 | pharmgkb_df.loc[pharmgkb_df['name'] == name, 'umls_cui'] = np.nan
393 | temp_2 = pharmgkb_df[pharmgkb_df['umls_cui'] == umls_cui]
394 | if len(temp_2) == 0:
395 | pharmgkb_df.loc[pharmgkb_df['name'] == temp_df.iloc[0, 1], 'umls_cui'] = umls_cui
396 | temp_df_2 = pharmgkb_df[pharmgkb_df['umls_cui'] == umls_cui]
397 | if len(temp_df_2) > 1:
398 | for j in range(1, len(temp_df_2)):
399 | temp_primary = temp_df_2.iloc[j, 0]
400 | pharmgkb_df.loc[pharmgkb_df['pharmgkb_id'] == temp_primary, 'umls_cui'] = np.nan
401 | print(i + 1, '/', len(pharmgkb_df), 'Completed...')
402 | pharmgkb_df.to_csv(folder + 'pharmgkb_disease_enriched_refined.csv', index=False)
403 |
404 |
405 | def refine_CTD_disease():
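406 |     """Clear MeSH/UMLS cross-references on OMIM-derived CTD entries when another entry already carries the same mapping."""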
406 | CTD_disease = pd.read_csv(folder + 'CTD_disease_enriched.csv')
407 | apikey = '9a095f1e-f79f-4958-bfdd-2bcba5f134d6'
408 | tgt = get_UMLS_tgt(apikey)
409 |
410 | for i in range(len(CTD_disease)):
411 | disease_id = CTD_disease.loc[i, 'disease_id']
412 | mesh_id = CTD_disease.loc[i, 'mesh_id']
413 | umls_cui = CTD_disease.loc[i, 'umls_cui']
414 | if 'OMIM' in disease_id:
415 | if not pd.isnull(mesh_id):
416 | temp_df = CTD_disease[CTD_disease['mesh_id'] == mesh_id]
417 | if len(temp_df) > 1:
418 | for j in range(len(temp_df)):
419 | temp_primary = temp_df.iloc[j, 0]
420 | if 'OMIM' in temp_primary:
421 | CTD_disease.loc[CTD_disease['disease_id'] == temp_primary, 'mesh_id'] = np.nan
422 | if not pd.isnull(umls_cui):
423 | temp_df = CTD_disease[CTD_disease['umls_cui'] == umls_cui]
424 | if len(temp_df) > 1:
425 | for j in range(len(temp_df)):
426 | temp_primary = temp_df.iloc[j, 0]
427 | if 'OMIM' in temp_primary:
428 | CTD_disease.loc[CTD_disease['disease_id'] == temp_primary, 'umls_cui'] = np.nan
429 |
430 | CTD_disease.to_csv(folder + 'CTD_disease_enriched_refined.csv', index=False)
431 |
432 |
433 | def enrich_DRKG_DDi():
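434 |     """Inspect the unique disease entities in the DRKG drug-disease (DDi) edge list."""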
434 | drkg_DDi = pd.read_csv('/Users/yuhou/Documents/Knowledge_Graph/knowledge_bases_integration/stage_2/drkg_DDi.csv')
435 |     # Work on an explicit copy to avoid pandas' SettingWithCopyWarning.
436 |     drkg_DDi_disease = drkg_DDi.drop_duplicates(subset='entity_2', keep='first').copy()
436 | drkg_DDi_disease['entity_2'] = drkg_DDi_disease['entity_2'].str.replace('Disease::', '')
437 | drkg_DDi_disease = drkg_DDi_disease.reset_index(drop=True)[['entity_2']]
438 | print(drkg_DDi_disease)
439 |
440 |
441 | def main():
442 | # enrich_DO()
443 | # enrich_KEGG()
444 | # enrich_PharmGKB()
445 | enrich_DRKG_DDi()
446 |
447 | # refine_DO()
448 | # refine_KEGG()
449 | # refine_PharmGKB()
450 | # refine_CTD_disease()
451 |
452 | # apikey = '9a095f1e-f79f-4958-bfdd-2bcba5f134d6'
453 | # tgt = get_UMLS_tgt(apikey)
454 | # umls_cui = access_UMLS_CUI(tgt, 'ICD10CM', 'C83.8')
455 | # # name = 'Water'
456 | # # umls_cui = access_UMLS_CUI_name(tgt, name)
457 | # print(umls_cui)
458 | # umls_name = get_UMLS_name(tgt, 'C0265318')
459 | # print(umls_name)
460 |
461 |
462 | if __name__ == '__main__':
463 | main()
464 |
--------------------------------------------------------------------------------