├── LICENSE
├── README.md
├── drugbank
│   ├── README.md
│   ├── drugbank.py
│   └── drugbank.zip
└── gnbr
    ├── concat.py
    ├── prepare_che_dis.py
    ├── prepare_che_gene.py
    ├── prepare_gen_dis.py
    ├── prepare_gen_gen.py
    └── relations.tsv

/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2020 ChengF-Lab

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
## Requirements
- python
- pandas

## Data preprocessing

### 1. GNBR
Original GNBR datasets (v7): https://zenodo.org/record/1035500#.XlcypZgzZPY

The chemical_disease, chemical_gene, gene_disease, and gene_gene triples and their entities are prepared separately (prepare_che_dis.py, prepare_che_gene.py, prepare_gen_dis.py, prepare_gen_gen.py).

concat.py: merges all entities and triples (see the sketch below).
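
The merged triples come out as a four-column TSV: head, tail, relation, and a normalized theme score. A minimal sanity check, assuming the scripts above have already been run inside `gnbr/` and you are in the repository root:

```python
import pandas as pd

# Column order follows the groupby in concat.py's prepare(): head, tail, rel, score.
triples = pd.read_csv("gnbr/final_gnbr_triples.tsv", sep="\t",
                      names=["head", "tail", "rel", "score"])
print(triples.head())
print(triples["rel"].value_counts())  # triples per GNBR theme code
```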
### 2. DrugBank
drugbank.py: merges the DrugBank data set with the GNBR data set (unzip drugbank/drugbank.zip first).

## Train
We used the knowledge graph embedding model RotatE in the DGL framework:
https://github.com/dmlc/dgl/tree/master/apps/kg
--------------------------------------------------------------------------------
/drugbank/README.md:
--------------------------------------------------------------------------------
## Note: Unzip the drugbank.zip file in this folder first
--------------------------------------------------------------------------------
/drugbank/drugbank.py:
--------------------------------------------------------------------------------
import pandas as pd

pd.set_option("display.precision", 10)


def dbgn():
    db = pd.read_csv("drugbank_all_triples.tsv", delimiter='\t', names=["db", "rel", "ent2"])
    # DrugBank entity list (loaded for reference; not used further below)
    db_ent = pd.read_csv("drugbank_entity.tsv", names=["db"], delimiter='\t')

    # Part 1: find every MESH id and CHEBI id entity in GNBR that has a
    # corresponding DrugBank id.
    # MESH-to-DrugBank mapping file
    mesh_db = pd.read_csv("mesh_drugank.tsv", delimiter='\t', names=["db", "gnbrid"])

    GNBR_ent = pd.read_csv("../entity_GNBRid.tsv", delimiter='\t', names=["id", "name", "gnbrid"])
    mesh = GNBR_ent[GNBR_ent["gnbrid"].str.contains("C:MESH:")]
    gnbr_mesh_db = mesh.merge(mesh_db, on="gnbrid", how='left')
    in_gnbr_mesh_db = gnbr_mesh_db[~gnbr_mesh_db["db"].isnull()]
    ss = in_gnbr_mesh_db[["gnbrid", "db"]]
    ss.to_csv("gnbr_mesh_db.tsv", sep='\t', header=True, index=False)

    chebi = GNBR_ent[GNBR_ent["gnbrid"].str.contains("C:CHEBI:")]
    # CHEBI-to-DrugBank mapping file
    chebi_db = pd.read_csv('../drug_finding_6_rel_filter_middle/drugbank_chebi.tsv', delimiter='\t')
    chebi_db["gnbrid"] = chebi_db["gnbrid"].apply(lambda x: "C:CHEBI:" + str(x))
    gnbr_chebi_db = chebi.merge(chebi_db, on="gnbrid", how="left")
    in_gnbr_chebi_db = gnbr_chebi_db[~gnbr_chebi_db["db"].isnull()]
    qq = in_gnbr_chebi_db[["gnbrid", "db"]]
    qq.to_csv("gnbr_chebi_db.tsv", sep='\t', header=True, index=False)

    # Part 2: merge all entities that have a DrugBank id and drop duplicates.
    mesh_in = pd.read_csv("gnbr_mesh_db.tsv", delimiter='\t')
    chebi_in = pd.read_csv("gnbr_chebi_db.tsv", delimiter='\t')
    db_in_gnbr = pd.concat([mesh_in, chebi_in], axis=0)
    db_in_gnbr.drop_duplicates(["db"], keep="first", inplace=True)
    db_in_gnbr.to_csv("gnbr_all_db.tsv", sep='\t', header=False, index=False)
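
    # At this point gnbr_all_db.tsv maps each GNBR chemical id to a DrugBank id,
    # one tab-separated pair per line, e.g. (illustrative values only, not taken
    # from the real mapping files):
    #   C:MESH:D014635    DB00313
    #   C:CHEBI:28077     DB01045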
pd.read_csv("shuxin_triples.tsv",delimiter='\t',names=["h","r","t"]) 100 | print(shu) 101 | gnbr = pd.read_csv("final_gnbr_triples.tsv",delimiter='\t',names=["h","t","r","s"]) 102 | print(gnbr) 103 | gnbl = gnbr[["h","r","t","s"]] 104 | print(gnbl) 105 | all_tri = pd.concat([ddi,shu,gnbl],axis=0) 106 | all_tri.to_csv("./dbgn/dbgn_triples.tsv",sep='\t',header=False,index=False) 107 | print(all_tri) 108 | 109 | # 所有实体合并:新增的只有属性中出现的实体 110 | new_add = pd.read_csv("new_add_ents.tsv",delimiter='\t',names=["h"]) 111 | print(new_add) 112 | gn = pd.read_csv("final_gnbr_entities.tsv",delimiter='\t',names=["h"]) 113 | print(gn) 114 | all_ent = pd.concat([new_add,gn],axis=0) 115 | all_ent.to_csv("./dbgn/dbgn_entities.tsv",sep='\t',header=False,index=True) 116 | print(all_ent) 117 | C = all_ent[all_ent["h"].str.contains("C:")] 118 | print(C) 119 | G= all_ent[all_ent["h"].str.contains("G:")] 120 | print(G) 121 | D= all_ent[all_ent["h"].str.contains("D:")] 122 | print(D) 123 | #改变id 顺序 124 | 125 | new_add = pd.read_csv("./dbgn/dbgn_entities.tsv",delimiter='\t',names=["id","h"]) 126 | all_ent = new_add[["h","id"]] 127 | all_ent.to_csv("./dbgn/dbgn_entities.tsv",sep='\t',header=False,index=False) 128 | 129 | #所有关系合并 130 | relation = pd.read_csv("../gnbr/relations.tsv", delimiter='\t', names=["rel","id"]) 131 | rel1 = relation[["rel"]] 132 | rel2 = pd.read_csv("new_add_relations.tsv", delimiter='\t', names=["rel"]) 133 | all_rel = pd.concat([rel1, rel2], axis=0) 134 | all_rel.to_csv("./dbgn/dbgn_relations.tsv", sep='\t', header=False, index=True) 135 | 136 | #统计各类关系三元组数量 137 | # gnbr = pd.read_csv("final_0326_gnbr.tsv",delimiter='\t',names=["h","t","r","s"]) 138 | # cg = gnbr[gnbr["h"].str.contains("C:") & gnbr["t"].str.contains("G:")] 139 | # print("cg",cg) 140 | # che_dis = gnbr[gnbr["h"].str.contains("C:") & gnbr["t"].str.contains("D:")] 141 | # print("che_dis",che_dis) 142 | # gen_gen = gnbr[gnbr["h"].str.contains("G:") & gnbr["t"].str.contains("G:")] 143 | # print("gen_gen",gen_gen) 144 | # gen_dis = gnbr[gnbr["h"].str.contains("G:") & gnbr["t"].str.contains("D:")] 145 | # print("gen_dis",gen_dis) 146 | 147 | 148 | if __name__ == '__main__': 149 | dbgn() -------------------------------------------------------------------------------- /drugbank/drugbank.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChengF-Lab/CoV-KGE/3fdf7109ac5c0f82bb5a7b3e8bb928ae5a8beeac/drugbank/drugbank.zip -------------------------------------------------------------------------------- /gnbr/concat.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | pd.set_option("precision",10) 3 | def triples(): 4 | che_dis = pd.read_csv("triples_che_rel_dis", delimiter='\t', names=["ent1", 'rel', 'ent2', 'score']) 5 | 6 | che_gen = pd.read_csv("triples_che_rel_gen", delimiter='\t', names=["ent1", 'rel', 'ent2', 'score']) 7 | gen_dis = pd.read_csv("triples_gen_rel_dis.tsv", delimiter='\t', names=["ent1", 'rel', 'ent2', 'score']) 8 | gen_gen = pd.read_csv("triples_gen_rel_gen.tsv", delimiter='\t', names=["ent1", 'rel', 'ent2', 'score']) 9 | 10 | triples =pd.concat([che_dis,che_gen,gen_dis,gen_gen]) 11 | print("done") 12 | triples.to_csv("triples.tsv", sep="\t", header=False, index=False) 13 | 14 | #得到所有实体 15 | # dis 16 | entity_cd_disease = pd.read_csv("original entitys/entity_cd_disease.tsv", delimiter="\t", names=None) 17 | print("done") 18 | print(entity_cd_disease) 19 | entity_gd_disease = 
pd.read_csv("original entitys/entity_gd_disease.tsv", delimiter="\t", names=None) 20 | print("done") 21 | print(entity_gd_disease) 22 | 23 | new_dis = pd.concat([entity_cd_disease, entity_gd_disease], axis=0) 24 | new_dis.rename(columns={'Entity2': 'entity', 'DB_ID2': 'GNBR_id'}, inplace=True) 25 | filter_disease = new_dis[["entity", 'GNBR_id']].drop_duplicates(['GNBR_id']) 26 | print(filter_disease) 27 | 28 | ###################### 29 | # chemical 30 | entity_cd_chemical = pd.read_csv("original entitys/entity_cd_chemical.tsv", delimiter="\t", names=None) 31 | print("done") 32 | print(entity_cd_chemical) 33 | entity_cg_chemical = pd.read_csv("original entitys/entity_cg_chemical.tsv", delimiter="\t", names=None) 34 | print("done") 35 | print(entity_cg_chemical) 36 | 37 | new_che = pd.concat([entity_cd_chemical, entity_cg_chemical], axis=0) 38 | new_che.rename(columns={'Entity1': 'entity', 'DB_ID1': 'GNBR_id'}, inplace=True) 39 | filter_chemical = new_che[["entity", 'GNBR_id']].drop_duplicates(['GNBR_id']) 40 | print(filter_chemical) 41 | 42 | ############################## 43 | # gene 44 | entity_cg_gene = pd.read_csv("original entitys/entity_cg_gene.tsv", delimiter="\t") 45 | entity_cg_gene.rename(columns={'Entity2': "entity", 'DB_ID2': 'GNBR_id'}, inplace=True) 46 | print("done") 47 | print(entity_cg_gene) 48 | entity_gd_gene = pd.read_csv("original entitys/entity_gd_gene.tsv", delimiter="\t") 49 | entity_gd_gene.rename(columns={'Entity1': "entity", 'DB_ID1': 'GNBR_id'}, inplace=True) 50 | print("done") 51 | print(entity_gd_gene) 52 | 53 | entity_gg_gene1 = pd.read_csv("original entitys/entity_gg_gene1.tsv", delimiter="\t") 54 | entity_gg_gene1.rename(columns={'Entity1': "entity", 'DB_ID1': 'GNBR_id'}, inplace=True) 55 | print("done") 56 | print(entity_gg_gene1) 57 | entity_gg_gene2 = pd.read_csv("original entitys/entity_gg_gene2.tsv", delimiter="\t") 58 | entity_gg_gene2.rename(columns={'Entity2': "entity", 'DB_ID2': 'GNBR_id'}, inplace=True) 59 | print("done") 60 | print(entity_gg_gene2) 61 | new_gene = pd.concat([entity_cg_gene, entity_gd_gene, entity_gg_gene1, entity_gg_gene2], axis=0) 62 | filter_gene = new_gene[["entity", 'GNBR_id']].drop_duplicates(['GNBR_id']) 63 | print(filter_gene) 64 | entity_to_GNBRid = pd.concat([filter_disease, filter_chemical, filter_gene], axis=0) 65 | print("all_entities:", entity_to_GNBRid) 66 | entity_to_GNBRid.to_csv("entity_GNBRid.tsv", sep="\t", header=True, index=False) 67 | 68 | entities_id = entity_to_GNBRid["GNBR_id"] 69 | 70 | entities_id.to_csv("gnbr_entities.tsv", sep="\t", header=True, index=False) 71 | 72 | def prepare(): 73 | #去重特殊的chemical实体 74 | gnbr_tri = pd.read_csv("triples.tsv", delimiter='\t', names=["h", "r", "t", "score"]) 75 | gnbr_ent1 = pd.read_csv("gnbr_entities.tsv", delimiter='\t', names=["gnbrid"]) 76 | gnbr_ent1 = gnbr_ent1[["gnbrid","id"]] 77 | 78 | not_gnbrc = gnbr_ent1[~gnbr_ent1["gnbrid"].str.contains("C:")] 79 | gnbr_ent = gnbr_ent1[gnbr_ent1["gnbrid"].str.contains("C:")] 80 | 81 | c_ent = gnbr_ent[~(gnbr_ent["gnbrid"].str.contains("C:MESH") | gnbr_ent["gnbrid"].str.contains("C:CHEBI"))] 82 | no_c_ent = gnbr_ent[(gnbr_ent["gnbrid"].str.contains("C:MESH") | gnbr_ent["gnbrid"].str.contains("C:CHEBI"))] 83 | print(c_ent) 84 | cs = c_ent[["gnbrid"]].iloc[:, :].values 85 | 86 | c_ent["gnbrid"] = c_ent["gnbrid"].apply(lambda x: x.replace(":", ":MESH:")) 87 | print(c_ent) 88 | all_ent = pd.concat([no_c_ent, not_gnbrc, c_ent], axis=0) 89 | print(all_ent) 90 | all_ent.drop_duplicates(["gnbrid"], keep="first", inplace=True) 91 | 
all_ent = all_ent["gnbrid"] 92 | print(all_ent) 93 | all_ent.to_csv("final_gnbr_entities.tsv", sep='\t', header=False, index=False) 94 | # 把C开头的三元组提取出来 95 | # not_c = gnbr_tri[~(gnbr_tri["h"].str.contains("C:") | gnbr_tri["t"].str.contains("C:"))] 96 | 97 | not_ctri = gnbr_tri[~gnbr_tri["h"].isin(cs[:, 0])] 98 | print(not_ctri) 99 | c_tri = gnbr_tri[gnbr_tri["h"].isin(cs[:, 0])] 100 | print(c_tri) 101 | c_tri["h"] = c_tri["h"].apply(lambda x: x.replace(":", ":MESH:")) 102 | print(c_tri) 103 | all_tri = pd.concat([not_ctri, c_tri], axis=0) 104 | print(all_tri) 105 | final_tri = all_tri.sort_values(by=["score"], ascending=False).groupby(by=["h", "t", "r"]).first().reset_index() 106 | print(final_tri) 107 | final_tri.to_csv("final_gnbr_triples.tsv", sep='\t', header=False, index=False) 108 | 109 | 110 | if __name__ == '__main__': 111 | triples() 112 | prepare() 113 | 114 | 115 | -------------------------------------------------------------------------------- /gnbr/prepare_che_dis.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from pandas import Series,DataFrame 3 | pd.set_option("precision",10) 4 | def part1_pat2_concat(): 5 | parti_che_dis = pd.read_csv("original resource/part-i-chemical-disease-path-theme-distributions.txt", 6 | delimiter="\t") 7 | print("done") 8 | parti_che_dis.rename(columns={'path': 'Dependence_path'}, inplace=True) 9 | 10 | print(parti_che_dis) 11 | # max 12 | che_dis_name = ['PubMed_ID', 'Sentence_number', 'Entity1', 'Loc1', 'Entity2', 'Loc2', 'Entity_Raw_str1', 13 | 'Entity_Raw_str2', 'DB_ID1', 'DB_ID2', 'Entity1_type', 'Entity2_type', 'Dependence_path', 14 | 'Sentence'] 15 | partii_che_dis = pd.read_csv("original resource/part-ii-dependency-paths-chemical-disease-sorted-with-themes.txt", 16 | delimiter="\t", names=che_dis_name, dtype={'DEVICE_ADDRESS': 'str'}) 17 | print("done") 18 | 19 | partii_che_dis = partii_che_dis[['Entity1', 'Entity2', 'DB_ID1', 'DB_ID2', 'Dependence_path']] 20 | 21 | #将依赖路劲改成小写: 22 | parti_che_dis['Dependence_path'] = parti_che_dis['Dependence_path'].map(lambda x: x.lower()) 23 | partii_che_dis['Dependence_path'] = partii_che_dis['Dependence_path'].map(lambda x: x.lower()) 24 | print('lower done') 25 | 26 | #merge two tables 27 | he_che_gen = pd.merge(partii_che_dis, parti_che_dis, how='left', on='Dependence_path') 28 | print("merge done") 29 | 30 | 31 | # filter null relation 32 | T_require = he_che_gen['T'].map(lambda x: pd.notnull(x)) 33 | C_require = he_che_gen['C'].map(lambda x: pd.notnull(x)) 34 | Sa_require = he_che_gen['Sa'].map(lambda x: pd.notnull(x)) 35 | Pr_require =he_che_gen['Pr'].map(lambda x: pd.notnull(x)) 36 | Pa_require =he_che_gen['Pa'].map(lambda x: pd.notnull(x)) 37 | J_require =he_che_gen['J'].map(lambda x: pd.notnull(x)) 38 | Mp_require =he_che_gen['Mp'].map(lambda x: pd.notnull(x)) 39 | 40 | some = he_che_gen[T_require |C_require| Sa_require | Pr_require | Pa_require|J_require|Mp_require] 41 | print(some.describe()) 42 | 43 | #delete DBID is null 44 | Db1_require = some['DB_ID1'].map(lambda x: x!='null') 45 | Db2_require = some['DB_ID2'].map(lambda x: x!='null') 46 | 47 | triples_che_dis = some[Db1_require & Db2_require] 48 | print("filter done") 49 | 50 | triples_che_dis.to_csv("triples/triples_che_dis_themes.tsv",sep='\t',header=True,index=False) 51 | 52 | # get entities and rename it C:chemical D:disease 53 | triples_che_dis ["DB_ID1"] = triples_che_dis ["DB_ID1"] .apply(lambda x :"C:"+str(x)) 54 | triples_che_dis ["DB_ID2"] = triples_che_dis 
["DB_ID2"] .apply(lambda x :"D:"+str(x)) 55 | 56 | triples_che_dis.to_csv("triples/triples_che_dis_themes.tsv",sep="\t",header=True,index=False) 57 | print(triples_che_dis) 58 | #drop duplicate entities 59 | che_dis_chemical= triples_che_dis[["Entity1",'DB_ID1']].drop_duplicates(['DB_ID1']) 60 | che_dis_disease = triples_che_dis[["Entity2",'DB_ID2']].drop_duplicates(['DB_ID2']) 61 | 62 | print(che_dis_chemical) 63 | print(che_dis_disease) 64 | che_dis_chemical.to_csv("original entitys/entity_cd_chemical.tsv",sep='\t',header=True,index=False) 65 | print("to_csv chemical") 66 | che_dis_disease.to_csv("original entitys/entity_cd_disease.tsv",sep='\t',header=True,index=False) 67 | print("to_csv disease") 68 | 69 | 70 | #形成三元组 71 | def relation_normalization(): 72 | 73 | triple_che_dis = pd.read_csv("triples/triples_che_dis_themes.tsv",delimiter='\t') 74 | print(triple_che_dis.describe()) 75 | 76 | print("最大值:",max(triple_che_dis[['T']].values),max(triple_che_dis[['C']].values)) 77 | #248178 78 | #将关系进行归一化: 79 | triple_che_dis ["T"] = triple_che_dis ["T"] .apply(lambda x : x/248178.0) 80 | triple_che_dis ["C"] = triple_che_dis ["C"] .apply(lambda x : x/248178.0) 81 | triple_che_dis ["Sa"] = triple_che_dis ["Sa"] .apply(lambda x : x/248178.0) 82 | triple_che_dis ["Pr"] = triple_che_dis ["Pr"] .apply(lambda x : x/248178.0) 83 | triple_che_dis ["Pa"] = triple_che_dis ["Pa"] .apply(lambda x : x/248178.0) 84 | triple_che_dis ["J"] = triple_che_dis ["J"] .apply(lambda x : x/248178.0) 85 | triple_che_dis ["Mp"] = triple_che_dis ["Mp"] .apply(lambda x : x/248178.0) 86 | print("归一化完成") 87 | #每一个不为0的主题都成为成为三元组 88 | triples= [] 89 | def load(): 90 | for index ,row in triple_che_dis.iterrows(): 91 | if index %50000 ==0: 92 | print("读入",index/50000,"行") 93 | entity_1 = row["DB_ID1"] 94 | entity_2 = row["DB_ID2"] 95 | T = row['T'] 96 | C = row['C'] 97 | Sa = row['Sa'] 98 | Pr = row['Pr'] 99 | Pa = row['Pa'] 100 | J = row['J'] 101 | Mp = row['Mp'] 102 | if T!=0.0: 103 | triples.append([entity_1,"T",entity_2,T]) 104 | if C != 0.0: 105 | triples.append([entity_1, "C", entity_2, C]) 106 | if Sa != 0.0: 107 | triples.append([entity_1, "Sa", entity_2, Sa]) 108 | if Pr != 0.0: 109 | triples.append([entity_1, "Pr", entity_2, Pr]) 110 | if Pa != 0.0: 111 | triples.append([entity_1, "Pa", entity_2, Pa]) 112 | if J != 0.0: 113 | triples.append([entity_1, "J", entity_2, J]) 114 | if Mp != 0.0: 115 | triples.append([entity_1, "Mp", entity_2, Mp]) 116 | print("read done") 117 | che_dis_triples = DataFrame(triples,columns=["che","rel","dis","score"]) 118 | 119 | #去重: 120 | che_dis_triples.drop_duplicates(["che","rel","dis","score"],inplace=True) 121 | 122 | final_che_dis_triples= che_dis_triples.sort_values('score', ascending=False).groupby(["ent1", "rel", "ent2"]).first().reset_index() 123 | final_che_dis_triples.to_csv("triples_che_rel_dis.tsv",sep='\t',header=False,index=False) 124 | print(final_che_dis_triples) 125 | 126 | if __name__ == '__main__': 127 | part1_pat2_concat() 128 | relation_normalization() -------------------------------------------------------------------------------- /gnbr/prepare_che_gene.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from pandas import Series,DataFrame 3 | pd.set_option("precision",10) 4 | def part1_pat2_concat(): 5 | parti_che_gene = pd.read_csv("original resource/part-i-chemical-gene-path-theme-distributions.txt", delimiter="\t") 6 | print("done") 7 | 

    che_gen_name = ['PubMed_ID', 'Sentence_number', 'Entity1', 'Loc1', 'Entity2', 'Loc2', 'Entity_Raw_str1',
                    'Entity_Raw_str2', 'DB_ID1', 'DB_ID2', 'Entity1_type', 'Entity2_type', 'Dependence_path',
                    'Sentence']
    partii_che_gene = pd.read_csv("original resource/part-ii-dependency-paths-chemical-gene-sorted-with-themes.txt",
                                  delimiter="\t", names=che_gen_name)
    partii_che_gene = partii_che_gene[['Entity1', 'Entity2', 'DB_ID1', 'DB_ID2', 'Dependence_path']].copy()

    # lowercase the dependency paths
    parti_che_gene['Dependence_path'] = parti_che_gene['Dependence_path'].map(lambda x: str(x).lower())
    partii_che_gene['Dependence_path'] = partii_che_gene['Dependence_path'].map(lambda x: str(x).lower())
    print('lowercase done')

    # merge the two tables on the dependency path
    he_che_gen = pd.merge(partii_che_gene, parti_che_gene, how='left', on='Dependence_path')
    print("merge done")

    # keep rows where at least one theme score is present
    some = he_che_gen[he_che_gen[['A+', 'A-', 'B', 'E+', 'E-', 'N', 'O', 'K', 'Z']].notnull().any(axis=1)]

    # drop pairs whose DB id is null
    triples_che_gen = some[(some['DB_ID1'] != 'null') & (some['DB_ID2'] != 'null')].copy()
    print("filter done")

    # prefix the entity ids: C: chemical, G: gene
    triples_che_gen["DB_ID1"] = triples_che_gen["DB_ID1"].apply(lambda x: "C:" + str(x))
    triples_che_gen["DB_ID2"] = triples_che_gen["DB_ID2"].apply(lambda x: "G:" + str(x))
    triples_che_gen.to_csv("triples/triples_che_gen_themes.tsv", sep="\t", header=True, index=False)

    # de-duplicate the entities
    che_gen_chemical = triples_che_gen[["Entity1", 'DB_ID1']].drop_duplicates(['DB_ID1'])
    che_gen_gene = triples_che_gen[["Entity2", 'DB_ID2']].drop_duplicates(['DB_ID2'])
    che_gen_chemical.to_csv("original entitys/entity_cg_chemical.tsv", sep='\t', header=True, index=False)
    che_gen_gene.to_csv("original entitys/entity_cg_gene.tsv", sep='\t', header=True, index=False)
    print("entity done")


def relation_normalization():
    triple_che_gen = pd.read_csv("triples/triples_che_gen_themes.tsv", delimiter='\t')

    print("max value:", triple_che_gen['N'].max())
    # normalize the theme scores by the maximum theme count, 45222
    # theme columns: A+ A- B E+ E- E N O K Z (each also has an .ind variant)
    for theme in ["A+", "A-", "B", "E+", "E-", "E", "N", "O", "K", "Z"]:
        triple_che_gen[theme] = triple_che_gen[theme].apply(lambda x: x / 45222.0)
    print("normalization done")
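    # GNBR chemical-gene theme codes (per the GNBR paper's glossary): A+ agonism/
    # activation, A- antagonism/blocking, B binding, E+ increases expression,
    # E- decreases expression, E affects expression, N inhibits, O transport/
    # channels, K metabolism/pharmacokinetics, Z enzyme activity.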
    # every non-zero theme becomes its own triple
    triples = []
    for index, row in triple_che_gen.iterrows():
        if index % 50000 == 0:
            print("read", index, "rows")
        for theme in ["A+", "A-", "B", "E+", "E-", "E", "N", "O", "K", "Z"]:
            if row[theme] != 0.0:
                triples.append([row["DB_ID1"], theme, row["DB_ID2"], row[theme]])
    print("read done")
    che_gen_triples = DataFrame(triples, columns=["che", "rel", "gen", "score"])

    # de-duplicate, then keep only the highest score per (head, relation, tail)
    che_gen_triples.drop_duplicates(["che", "rel", "gen", "score"], inplace=True)
    final_che_gen_triples = che_gen_triples.sort_values('score', ascending=False).groupby(
        ["che", "rel", "gen"]).first().reset_index()
    final_che_gen_triples.to_csv("triples_che_rel_gen.tsv", sep='\t', header=False, index=False)
    print(final_che_gen_triples)


if __name__ == '__main__':
    part1_pat2_concat()
    relation_normalization()
--------------------------------------------------------------------------------
/gnbr/prepare_gen_dis.py:
--------------------------------------------------------------------------------
import pandas as pd
from pandas import DataFrame

pd.set_option("display.precision", 10)


def part1_pat2_concat():
    parti_gen_dis = pd.read_csv("original resource/part-i-gene-disease-path-theme-distributions.txt", delimiter="\t")
    parti_gen_dis.rename(columns={'path': 'Dependence_path'}, inplace=True)

    gen_dis_name = ['PubMed_ID', 'Sentence_number', 'Entity1', 'Loc1', 'Entity2', 'Loc2', 'Entity_Raw_str1',
                    'Entity_Raw_str2', 'DB_ID1', 'DB_ID2', 'Entity1_type', 'Entity2_type', 'Dependence_path',
                    'Sentence']
    partii_gen_dis = pd.read_csv("original resource/part-ii-dependency-paths-gene-disease-sorted-with-themes.txt",
                                 delimiter="\t", names=gen_dis_name)
    partii_gen_dis = partii_gen_dis[['Entity1', 'Entity2', 'DB_ID1', 'DB_ID2', 'Dependence_path']].copy()

    # lowercase the dependency paths in both tables
    partii_gen_dis['Dependence_path'] = partii_gen_dis['Dependence_path'].map(lambda x: str(x).lower())
    parti_gen_dis['Dependence_path'] = parti_gen_dis['Dependence_path'].map(lambda x: str(x).lower())
    print('lowercase done')

    # merge the two tables on the dependency path
    he_gen_dis = pd.merge(partii_gen_dis, parti_gen_dis, how='left', on='Dependence_path')
    print("merge done")

    # keep rows where at least one theme score is present
    # theme columns: U Ud D J Te Y G Md X L (each also has an .ind variant)
    some = he_gen_dis[he_gen_dis[['U', 'Ud', 'D', 'J', 'Te', 'Y', 'G', 'Md', 'X', 'L']].notnull().any(axis=1)]

    # drop pairs whose DB id is null
    triples_gen_dis = some[(some['DB_ID1'] != 'null') & (some['DB_ID2'] != 'null')].copy()
    print("filter done")

    # prefix the entity ids (G: gene, D: disease) before writing the themes file,
    # so relation_normalization() reads the prefixed ids
    triples_gen_dis["DB_ID1"] = triples_gen_dis["DB_ID1"].apply(lambda x: "G:" + str(x))
    triples_gen_dis["DB_ID2"] = triples_gen_dis["DB_ID2"].apply(lambda x: "D:" + str(x))
    triples_gen_dis.to_csv("triples/triples_gen_dis_themes.tsv", sep="\t", header=True, index=False)

    # de-duplicate the entities
    gen_dis_gene = triples_gen_dis[["Entity1", 'DB_ID1']].drop_duplicates(['DB_ID1'])
    gen_dis_disease = triples_gen_dis[["Entity2", 'DB_ID2']].drop_duplicates(['DB_ID2'])
    gen_dis_gene.to_csv("original entitys/entity_gd_gene.tsv", sep='\t', header=True, index=False)
    gen_dis_disease.to_csv("original entitys/entity_gd_disease.tsv", sep='\t', header=True, index=False)
    print("entity done")


def relation_normalization():
    triple_gen_dis = pd.read_csv("triples/triples_gen_dis_themes.tsv", delimiter='\t')

    print("max value:", triple_gen_dis['L'].max())
    # normalize the theme scores by the maximum theme count, 68514
    for theme in ["U", "Ud", "D", "J", "Te", "Y", "G", "Md", "X", "L"]:
        triple_gen_dis[theme] = triple_gen_dis[theme].apply(lambda x: x / 68514.0)
    print("normalization done")
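    # GNBR gene-disease theme codes (per the GNBR paper's glossary): U causal
    # mutations, Ud mutations affecting disease course, D drug targets, J role in
    # pathogenesis, Te possible therapeutic effect, Y polymorphisms alter risk,
    # G promotes progression, Md biomarkers (diagnostic), X overexpression in
    # disease, L improper regulation linked to disease.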
triples.append([entity_1,"U",entity_2,U]) 102 | if Ud != 0.0: 103 | triples.append([entity_1, "Ud", entity_2, Ud]) 104 | if D != 0.0: 105 | triples.append([entity_1, "D", entity_2, D]) 106 | if J != 0.0: 107 | triples.append([entity_1, "J", entity_2, J]) 108 | if Te != 0.0: 109 | triples.append([entity_1, "Te", entity_2, Te]) 110 | if Y != 0.0: 111 | triples.append([entity_1, "Y", entity_2, Y]) 112 | if G != 0.0: 113 | triples.append([entity_1, "G", entity_2, G]) 114 | if Md != 0.0: 115 | triples.append([entity_1, "Md", entity_2, Md]) 116 | if X != 0.0: 117 | triples.append([entity_1, "X", entity_2, X]) 118 | if L != 0.0: 119 | triples.append([entity_1, "L", entity_2, L]) 120 | print("read done") 121 | gen_dis_triples = DataFrame(triples,columns=["gene", "rel", "dis", "score"]) 122 | 123 | gen_dis_triples.drop_duplicates(["gene", "rel", "dis", "score"],inplace=True) 124 | 125 | final_gen_dis_triples = gen_dis_triples.groupby(["gene", "rel", "dis"]).apply(lambda x: x.sort_values('score', ascending=False)).groupby( 126 | ["gene", "rel", "dis"]).first().reset_index() 127 | 128 | final_gen_dis_triples.to_csv("triples_gen_rel_dis.tsv",sep='\t',header=False,index=False) 129 | print(gen_dis_triples) 130 | 131 | 132 | if __name__ == '__main__': 133 | part1_pat2_concat() 134 | relation_normalization() -------------------------------------------------------------------------------- /gnbr/prepare_gen_gen.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from pandas import Series,DataFrame 3 | 4 | pd.set_option("precision",10) 5 | def part1_pat2_concat(): 6 | parti_gen_gen = pd.read_csv("original resource/part-i-gene-gene-path-theme-distributions.txt", delimiter="\t") 7 | print("done") 8 | print(parti_gen_gen) 9 | parti_gen_gen.rename(columns={'path':'Dependence_path'},inplace=True) 10 | 11 | gen_gen_name = ['PubMed_ID', 'Sentence_number', 'Entity1', 'Loc1','Entity2','Loc2','Entity_Raw_str1','Entity_Raw_str2','DB_ID1','DB_ID2','Entity1_type','Entity2_type','Dependence_path','Sentence'] 12 | print("name") 13 | partii_gen_gen =pd.read_csv("original resource/part-ii-dependency-paths-gene-gene-sorted-with-themes.txt", delimiter="\t", names = gen_gen_name, dtype={'DEVICE_ADDRESS': 'str'}) 14 | print("done") 15 | new_partii_gen_gen = partii_gen_gen[['Entity1','Entity2','DB_ID1','DB_ID2','Dependence_path']] 16 | print(new_partii_gen_gen) 17 | print("过滤") 18 | #将依赖路劲改成小写: 19 | new_partii_gen_gen['Dependence_path'] = new_partii_gen_gen['Dependence_path'].map(lambda x: str(x).lower()) 20 | parti_gen_gen['Dependence_path'] = parti_gen_gen['Dependence_path'].map(lambda x: str(x).lower()) 21 | print('小写完成') 22 | 23 | #将两个表合并 24 | he_gen_gen = pd.merge(new_partii_gen_gen, parti_gen_gen, how='left', on='Dependence_path') 25 | print("合并完成") 26 | 27 | #过滤合并后的表格 28 | 29 | T_require = he_gen_gen['B'].map(lambda x: pd.notnull(x)) 30 | C_require = he_gen_gen['W'].map(lambda x: pd.notnull(x)) 31 | Sa_require = he_gen_gen['V+'].map(lambda x: pd.notnull(x)) 32 | Pr_require =he_gen_gen['E+'].map(lambda x: pd.notnull(x)) 33 | Pa_require =he_gen_gen['E'].map(lambda x: pd.notnull(x)) 34 | J_require =he_gen_gen['I'].map(lambda x: pd.notnull(x)) 35 | Mp_require =he_gen_gen['H'].map(lambda x: pd.notnull(x)) 36 | J_require1=he_gen_gen['Rg'].map(lambda x: pd.notnull(x)) 37 | Mp_require1 =he_gen_gen['Q'].map(lambda x: pd.notnull(x)) 38 | some = he_gen_gen[T_require |C_require| Sa_require | Pr_require | Pa_require|J_require|Mp_require|J_require1|Mp_require1] 39 | 
print("过滤为空的主题") 40 | # 41 | #删除DBID为空的对 42 | Db1_require = some['DB_ID1'].map(lambda x: x!='null') 43 | Db2_require = some['DB_ID2'].map(lambda x: x!='null') 44 | 45 | triples_gen_gen = some[Db1_require & Db2_require] 46 | # 47 | print("过滤合并完成") 48 | 49 | # 50 | #提取出实体,并修改实体DB名称 51 | triples_gen_gen ["DB_ID1"] = triples_gen_gen ["DB_ID1"] .apply(lambda x :"G:"+str(x)) 52 | triples_gen_gen ["DB_ID2"] = triples_gen_gen ["DB_ID2"] .apply(lambda x :"G:"+str(x)) 53 | 54 | new_triples_gen_gen = triples_gen_gen[["Entity1","DB_ID1","Entity2","DB_ID2",'B','B.ind','W','W.ind','V+','V+.ind','E+','E+.ind','E','E.ind','I','I.ind','H','H.ind','Rg','Rg.ind','Q','Q.ind']] 55 | new_triples_gen_gen.to_csv("triples/triples_gen_gen_themes.tsv",sep="\t",header=True) 56 | print(new_triples_gen_gen) 57 | 58 | gen_gen_gene1= triples_gen_gen[["Entity1",'DB_ID1']].drop_duplicates(['DB_ID1']) 59 | gen_gen_gene2 = triples_gen_gen[["Entity2",'DB_ID2']].drop_duplicates(['DB_ID2']) 60 | print("gene1:") 61 | print(gen_gen_gene1) 62 | print("gene2:") 63 | print(gen_gen_gene2) 64 | gen_gen_gene1.to_csv("original entitys/entity_gg_gene1.tsv",sep="\t",header=True,index=False) 65 | gen_gen_gene2.to_csv("original entitys/entity_gg_gene2.tsv",sep="\t",header=True,index=False) 66 | #提取三元组 67 | 68 | # relation2id = pd.read_csv("Relation2Id.tsv",delimiter='\t') 69 | # 70 | def relation_normalization(): 71 | triple_gen_gen= pd.read_csv("triples/triples_gen_gen_themes.tsv",delimiter='\t') 72 | print(triple_gen_gen.describe()) 73 | max = max(triple_gen_gen[['Q']].values) 74 | print("max values:",max) 75 | #515159 76 | #将关系进行归一化: 77 | #B B.ind W W.ind V+ V+.ind E+ E+.ind E E.ind I I.ind H H.ind Rg Rg.ind Q Q.ind 78 | 79 | triple_gen_gen ["B"] = triple_gen_gen ["B"] .apply(lambda x : x/max) 80 | triple_gen_gen ["W"] = triple_gen_gen ["W"] .apply(lambda x : x/max) 81 | triple_gen_gen ["V+"] = triple_gen_gen ["V+"] .apply(lambda x : x/max) 82 | triple_gen_gen ["E+"] = triple_gen_gen ["E+"] .apply(lambda x : x/max) 83 | triple_gen_gen ["E"] = triple_gen_gen ["E"] .apply(lambda x : x/max) 84 | triple_gen_gen ["I"] = triple_gen_gen ["I"] .apply(lambda x : x/max) 85 | triple_gen_gen ["H"] = triple_gen_gen ["H"] .apply(lambda x : x/max) 86 | triple_gen_gen ["Rg"] = triple_gen_gen ["Rg"] .apply(lambda x : x/max) 87 | triple_gen_gen ["Q"] = triple_gen_gen ["Q"] .apply(lambda x : x/max) 88 | 89 | print("归一化完成") 90 | triples= [] 91 | for index ,row in triple_gen_gen.iterrows(): 92 | if index %50000 ==0: 93 | print("读入",index/50000,"行") 94 | entity_1 = row["DB_ID1"] 95 | entity_2 = row["DB_ID2"] 96 | B = row['B'] 97 | W = row['W'] 98 | V1 = row['V+'] 99 | E1 = row['E+'] 100 | E = row['E'] 101 | I = row['I'] 102 | H = row['H'] 103 | Rg = row['Rg'] 104 | Q = row['Q'] 105 | 106 | if B!=0.0: 107 | triples.append([entity_1,"B",entity_2,B]) 108 | if W != 0.0: 109 | triples.append([entity_1, "W", entity_2, W]) 110 | if V1 != 0.0: 111 | triples.append([entity_1, "V+", entity_2, V1]) 112 | if E1 != 0.0: 113 | triples.append([entity_1, "E+", entity_2, E1]) 114 | if E != 0.0: 115 | triples.append([entity_1, "E", entity_2, E]) 116 | if I != 0.0: 117 | triples.append([entity_1, "I", entity_2, I]) 118 | if H != 0.0: 119 | triples.append([entity_1, "H", entity_2, H]) 120 | if Rg != 0.0: 121 | triples.append([entity_1, "Rg", entity_2, Rg]) 122 | if Q != 0.0: 123 | triples.append([entity_1, "Q", entity_2, Q]) 124 | 125 | print("read done") 126 | gen_gen_triples = DataFrame(triples,columns=["gene", "rel", "gen", "score"]) 127 | 128 | 
    # every non-zero theme becomes its own triple
    triples = []
    for index, row in triple_gen_gen.iterrows():
        if index % 50000 == 0:
            print("read", index, "rows")
        for theme in ["B", "W", "V+", "E+", "E", "I", "H", "Rg", "Q"]:
            if row[theme] != 0.0:
                triples.append([row["DB_ID1"], theme, row["DB_ID2"], row[theme]])
    print("read done")
    gen_gen_triples = DataFrame(triples, columns=["gene", "rel", "gen", "score"])

    # de-duplicate, then keep only the highest score per (head, relation, tail)
    gen_gen_triples.drop_duplicates(["gene", "rel", "gen", "score"], inplace=True)
    final_gen_gen_triples = gen_gen_triples.sort_values('score', ascending=False).groupby(
        ["gene", "rel", "gen"]).first().reset_index()
    final_gen_gen_triples.to_csv("triples_gen_rel_gen.tsv", sep='\t', header=False, index=False)
    print(final_gen_gen_triples)


if __name__ == '__main__':
    part1_pat2_concat()
    relation_normalization()
--------------------------------------------------------------------------------
/gnbr/relations.tsv:
--------------------------------------------------------------------------------
T	0
C	1
Sa	2
Pr	3
Pa	4
J	5
Mp	6
Md	7
X	8
L	9
U	10
Ud	11
D	12
Te	13
Y	14
G	15
A+	16
A-	17
B	18
E+	19
E-	20
E	21
N	22
O	23
K	24
Z	25
W	26
V+	27
I	28
H	29
Rg	30
Q	31
--------------------------------------------------------------------------------