├── LICENSE
├── README.md
├── drugbank
│   ├── README.md
│   ├── drugbank.py
│   └── drugbank.zip
└── gnbr
    ├── concat.py
    ├── prepare_che_dis.py
    ├── prepare_che_gene.py
    ├── prepare_gen_dis.py
    ├── prepare_gen_gen.py
    └── relations.tsv

/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2020 ChengF-Lab

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
## Requirements
- python
- pandas

## Data preprocessing

### 1. GNBR
Original GNBR datasets (v7): https://zenodo.org/record/1035500#.XlcypZgzZPY

The chemical_disease, chemical_gene, gene_disease, and gene_gene triples and their entities are prepared separately (prepare_che_dis.py, prepare_che_gene.py, prepare_gen_dis.py, prepare_gen_gen.py).

concat.py: merges all entities and triples (see the sketch below).
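
The merged triples come out as a four-column TSV: head, tail, relation, and a normalized theme score. A minimal sanity check, assuming the scripts above have already been run inside `gnbr/` and you are in the repository root:

```python
import pandas as pd

# Column order follows the groupby in concat.py's prepare(): head, tail, rel, score.
triples = pd.read_csv("gnbr/final_gnbr_triples.tsv", sep="\t",
                      names=["head", "tail", "rel", "score"])
print(triples.head())
print(triples["rel"].value_counts())  # triples per GNBR theme code
```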
### 2. DrugBank
drugbank.py: merges the DrugBank data set with the GNBR data set (unzip drugbank/drugbank.zip first).

## Train
We used the knowledge graph embedding model RotatE in the DGL framework:
https://github.com/dmlc/dgl/tree/master/apps/kg
--------------------------------------------------------------------------------
/drugbank/README.md:
--------------------------------------------------------------------------------
## Note: Unzip the drugbank.zip file in this folder first
--------------------------------------------------------------------------------
/drugbank/drugbank.py:
--------------------------------------------------------------------------------
import pandas as pd

pd.set_option("display.precision", 10)


def dbgn():
    db = pd.read_csv("drugbank_all_triples.tsv", delimiter='\t', names=["db", "rel", "ent2"])
    # DrugBank entity list (loaded for reference; not used further below)
    db_ent = pd.read_csv("drugbank_entity.tsv", names=["db"], delimiter='\t')

    # Part 1: find every MESH id and CHEBI id entity in GNBR that has a
    # corresponding DrugBank id.
    # MESH-to-DrugBank mapping file
    mesh_db = pd.read_csv("mesh_drugank.tsv", delimiter='\t', names=["db", "gnbrid"])

    GNBR_ent = pd.read_csv("../entity_GNBRid.tsv", delimiter='\t', names=["id", "name", "gnbrid"])
    mesh = GNBR_ent[GNBR_ent["gnbrid"].str.contains("C:MESH:")]
    gnbr_mesh_db = mesh.merge(mesh_db, on="gnbrid", how='left')
    in_gnbr_mesh_db = gnbr_mesh_db[~gnbr_mesh_db["db"].isnull()]
    ss = in_gnbr_mesh_db[["gnbrid", "db"]]
    ss.to_csv("gnbr_mesh_db.tsv", sep='\t', header=True, index=False)

    chebi = GNBR_ent[GNBR_ent["gnbrid"].str.contains("C:CHEBI:")]
    # CHEBI-to-DrugBank mapping file
    chebi_db = pd.read_csv('../drug_finding_6_rel_filter_middle/drugbank_chebi.tsv', delimiter='\t')
    chebi_db["gnbrid"] = chebi_db["gnbrid"].apply(lambda x: "C:CHEBI:" + str(x))
    gnbr_chebi_db = chebi.merge(chebi_db, on="gnbrid", how="left")
    in_gnbr_chebi_db = gnbr_chebi_db[~gnbr_chebi_db["db"].isnull()]
    qq = in_gnbr_chebi_db[["gnbrid", "db"]]
    qq.to_csv("gnbr_chebi_db.tsv", sep='\t', header=True, index=False)

    # Part 2: merge all entities that have a DrugBank id and drop duplicates.
    mesh_in = pd.read_csv("gnbr_mesh_db.tsv", delimiter='\t')
    chebi_in = pd.read_csv("gnbr_chebi_db.tsv", delimiter='\t')
    db_in_gnbr = pd.concat([mesh_in, chebi_in], axis=0)
    db_in_gnbr.drop_duplicates(["db"], keep="first", inplace=True)
    db_in_gnbr.to_csv("gnbr_all_db.tsv", sep='\t', header=False, index=False)
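
    # At this point gnbr_all_db.tsv maps each GNBR chemical id to a DrugBank id,
    # one tab-separated pair per line, e.g. (illustrative values only, not taken
    # from the real mapping files):
    #   C:MESH:D014635    DB00313
    #   C:CHEBI:28077     DB01045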
pd.read_csv("shuxin_triples.tsv",delimiter='\t',names=["h","r","t"]) 100 | print(shu) 101 | gnbr = pd.read_csv("final_gnbr_triples.tsv",delimiter='\t',names=["h","t","r","s"]) 102 | print(gnbr) 103 | gnbl = gnbr[["h","r","t","s"]] 104 | print(gnbl) 105 | all_tri = pd.concat([ddi,shu,gnbl],axis=0) 106 | all_tri.to_csv("./dbgn/dbgn_triples.tsv",sep='\t',header=False,index=False) 107 | print(all_tri) 108 | 109 | # 所有实体合并:新增的只有属性中出现的实体 110 | new_add = pd.read_csv("new_add_ents.tsv",delimiter='\t',names=["h"]) 111 | print(new_add) 112 | gn = pd.read_csv("final_gnbr_entities.tsv",delimiter='\t',names=["h"]) 113 | print(gn) 114 | all_ent = pd.concat([new_add,gn],axis=0) 115 | all_ent.to_csv("./dbgn/dbgn_entities.tsv",sep='\t',header=False,index=True) 116 | print(all_ent) 117 | C = all_ent[all_ent["h"].str.contains("C:")] 118 | print(C) 119 | G= all_ent[all_ent["h"].str.contains("G:")] 120 | print(G) 121 | D= all_ent[all_ent["h"].str.contains("D:")] 122 | print(D) 123 | #改变id 顺序 124 | 125 | new_add = pd.read_csv("./dbgn/dbgn_entities.tsv",delimiter='\t',names=["id","h"]) 126 | all_ent = new_add[["h","id"]] 127 | all_ent.to_csv("./dbgn/dbgn_entities.tsv",sep='\t',header=False,index=False) 128 | 129 | #所有关系合并 130 | relation = pd.read_csv("../gnbr/relations.tsv", delimiter='\t', names=["rel","id"]) 131 | rel1 = relation[["rel"]] 132 | rel2 = pd.read_csv("new_add_relations.tsv", delimiter='\t', names=["rel"]) 133 | all_rel = pd.concat([rel1, rel2], axis=0) 134 | all_rel.to_csv("./dbgn/dbgn_relations.tsv", sep='\t', header=False, index=True) 135 | 136 | #统计各类关系三元组数量 137 | # gnbr = pd.read_csv("final_0326_gnbr.tsv",delimiter='\t',names=["h","t","r","s"]) 138 | # cg = gnbr[gnbr["h"].str.contains("C:") & gnbr["t"].str.contains("G:")] 139 | # print("cg",cg) 140 | # che_dis = gnbr[gnbr["h"].str.contains("C:") & gnbr["t"].str.contains("D:")] 141 | # print("che_dis",che_dis) 142 | # gen_gen = gnbr[gnbr["h"].str.contains("G:") & gnbr["t"].str.contains("G:")] 143 | # print("gen_gen",gen_gen) 144 | # gen_dis = gnbr[gnbr["h"].str.contains("G:") & gnbr["t"].str.contains("D:")] 145 | # print("gen_dis",gen_dis) 146 | 147 | 148 | if __name__ == '__main__': 149 | dbgn() -------------------------------------------------------------------------------- /drugbank/drugbank.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChengF-Lab/CoV-KGE/3fdf7109ac5c0f82bb5a7b3e8bb928ae5a8beeac/drugbank/drugbank.zip -------------------------------------------------------------------------------- /gnbr/concat.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | pd.set_option("precision",10) 3 | def triples(): 4 | che_dis = pd.read_csv("triples_che_rel_dis", delimiter='\t', names=["ent1", 'rel', 'ent2', 'score']) 5 | 6 | che_gen = pd.read_csv("triples_che_rel_gen", delimiter='\t', names=["ent1", 'rel', 'ent2', 'score']) 7 | gen_dis = pd.read_csv("triples_gen_rel_dis.tsv", delimiter='\t', names=["ent1", 'rel', 'ent2', 'score']) 8 | gen_gen = pd.read_csv("triples_gen_rel_gen.tsv", delimiter='\t', names=["ent1", 'rel', 'ent2', 'score']) 9 | 10 | triples =pd.concat([che_dis,che_gen,gen_dis,gen_gen]) 11 | print("done") 12 | triples.to_csv("triples.tsv", sep="\t", header=False, index=False) 13 | 14 | #得到所有实体 15 | # dis 16 | entity_cd_disease = pd.read_csv("original entitys/entity_cd_disease.tsv", delimiter="\t", names=None) 17 | print("done") 18 | print(entity_cd_disease) 19 | entity_gd_disease = 
pd.read_csv("original entitys/entity_gd_disease.tsv", delimiter="\t", names=None) 20 | print("done") 21 | print(entity_gd_disease) 22 | 23 | new_dis = pd.concat([entity_cd_disease, entity_gd_disease], axis=0) 24 | new_dis.rename(columns={'Entity2': 'entity', 'DB_ID2': 'GNBR_id'}, inplace=True) 25 | filter_disease = new_dis[["entity", 'GNBR_id']].drop_duplicates(['GNBR_id']) 26 | print(filter_disease) 27 | 28 | ###################### 29 | # chemical 30 | entity_cd_chemical = pd.read_csv("original entitys/entity_cd_chemical.tsv", delimiter="\t", names=None) 31 | print("done") 32 | print(entity_cd_chemical) 33 | entity_cg_chemical = pd.read_csv("original entitys/entity_cg_chemical.tsv", delimiter="\t", names=None) 34 | print("done") 35 | print(entity_cg_chemical) 36 | 37 | new_che = pd.concat([entity_cd_chemical, entity_cg_chemical], axis=0) 38 | new_che.rename(columns={'Entity1': 'entity', 'DB_ID1': 'GNBR_id'}, inplace=True) 39 | filter_chemical = new_che[["entity", 'GNBR_id']].drop_duplicates(['GNBR_id']) 40 | print(filter_chemical) 41 | 42 | ############################## 43 | # gene 44 | entity_cg_gene = pd.read_csv("original entitys/entity_cg_gene.tsv", delimiter="\t") 45 | entity_cg_gene.rename(columns={'Entity2': "entity", 'DB_ID2': 'GNBR_id'}, inplace=True) 46 | print("done") 47 | print(entity_cg_gene) 48 | entity_gd_gene = pd.read_csv("original entitys/entity_gd_gene.tsv", delimiter="\t") 49 | entity_gd_gene.rename(columns={'Entity1': "entity", 'DB_ID1': 'GNBR_id'}, inplace=True) 50 | print("done") 51 | print(entity_gd_gene) 52 | 53 | entity_gg_gene1 = pd.read_csv("original entitys/entity_gg_gene1.tsv", delimiter="\t") 54 | entity_gg_gene1.rename(columns={'Entity1': "entity", 'DB_ID1': 'GNBR_id'}, inplace=True) 55 | print("done") 56 | print(entity_gg_gene1) 57 | entity_gg_gene2 = pd.read_csv("original entitys/entity_gg_gene2.tsv", delimiter="\t") 58 | entity_gg_gene2.rename(columns={'Entity2': "entity", 'DB_ID2': 'GNBR_id'}, inplace=True) 59 | print("done") 60 | print(entity_gg_gene2) 61 | new_gene = pd.concat([entity_cg_gene, entity_gd_gene, entity_gg_gene1, entity_gg_gene2], axis=0) 62 | filter_gene = new_gene[["entity", 'GNBR_id']].drop_duplicates(['GNBR_id']) 63 | print(filter_gene) 64 | entity_to_GNBRid = pd.concat([filter_disease, filter_chemical, filter_gene], axis=0) 65 | print("all_entities:", entity_to_GNBRid) 66 | entity_to_GNBRid.to_csv("entity_GNBRid.tsv", sep="\t", header=True, index=False) 67 | 68 | entities_id = entity_to_GNBRid["GNBR_id"] 69 | 70 | entities_id.to_csv("gnbr_entities.tsv", sep="\t", header=True, index=False) 71 | 72 | def prepare(): 73 | #去重特殊的chemical实体 74 | gnbr_tri = pd.read_csv("triples.tsv", delimiter='\t', names=["h", "r", "t", "score"]) 75 | gnbr_ent1 = pd.read_csv("gnbr_entities.tsv", delimiter='\t', names=["gnbrid"]) 76 | gnbr_ent1 = gnbr_ent1[["gnbrid","id"]] 77 | 78 | not_gnbrc = gnbr_ent1[~gnbr_ent1["gnbrid"].str.contains("C:")] 79 | gnbr_ent = gnbr_ent1[gnbr_ent1["gnbrid"].str.contains("C:")] 80 | 81 | c_ent = gnbr_ent[~(gnbr_ent["gnbrid"].str.contains("C:MESH") | gnbr_ent["gnbrid"].str.contains("C:CHEBI"))] 82 | no_c_ent = gnbr_ent[(gnbr_ent["gnbrid"].str.contains("C:MESH") | gnbr_ent["gnbrid"].str.contains("C:CHEBI"))] 83 | print(c_ent) 84 | cs = c_ent[["gnbrid"]].iloc[:, :].values 85 | 86 | c_ent["gnbrid"] = c_ent["gnbrid"].apply(lambda x: x.replace(":", ":MESH:")) 87 | print(c_ent) 88 | all_ent = pd.concat([no_c_ent, not_gnbrc, c_ent], axis=0) 89 | print(all_ent) 90 | all_ent.drop_duplicates(["gnbrid"], keep="first", inplace=True) 91 | 
all_ent = all_ent["gnbrid"] 92 | print(all_ent) 93 | all_ent.to_csv("final_gnbr_entities.tsv", sep='\t', header=False, index=False) 94 | # 把C开头的三元组提取出来 95 | # not_c = gnbr_tri[~(gnbr_tri["h"].str.contains("C:") | gnbr_tri["t"].str.contains("C:"))] 96 | 97 | not_ctri = gnbr_tri[~gnbr_tri["h"].isin(cs[:, 0])] 98 | print(not_ctri) 99 | c_tri = gnbr_tri[gnbr_tri["h"].isin(cs[:, 0])] 100 | print(c_tri) 101 | c_tri["h"] = c_tri["h"].apply(lambda x: x.replace(":", ":MESH:")) 102 | print(c_tri) 103 | all_tri = pd.concat([not_ctri, c_tri], axis=0) 104 | print(all_tri) 105 | final_tri = all_tri.sort_values(by=["score"], ascending=False).groupby(by=["h", "t", "r"]).first().reset_index() 106 | print(final_tri) 107 | final_tri.to_csv("final_gnbr_triples.tsv", sep='\t', header=False, index=False) 108 | 109 | 110 | if __name__ == '__main__': 111 | triples() 112 | prepare() 113 | 114 | 115 | -------------------------------------------------------------------------------- /gnbr/prepare_che_dis.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from pandas import Series,DataFrame 3 | pd.set_option("precision",10) 4 | def part1_pat2_concat(): 5 | parti_che_dis = pd.read_csv("original resource/part-i-chemical-disease-path-theme-distributions.txt", 6 | delimiter="\t") 7 | print("done") 8 | parti_che_dis.rename(columns={'path': 'Dependence_path'}, inplace=True) 9 | 10 | print(parti_che_dis) 11 | # max 12 | che_dis_name = ['PubMed_ID', 'Sentence_number', 'Entity1', 'Loc1', 'Entity2', 'Loc2', 'Entity_Raw_str1', 13 | 'Entity_Raw_str2', 'DB_ID1', 'DB_ID2', 'Entity1_type', 'Entity2_type', 'Dependence_path', 14 | 'Sentence'] 15 | partii_che_dis = pd.read_csv("original resource/part-ii-dependency-paths-chemical-disease-sorted-with-themes.txt", 16 | delimiter="\t", names=che_dis_name, dtype={'DEVICE_ADDRESS': 'str'}) 17 | print("done") 18 | 19 | partii_che_dis = partii_che_dis[['Entity1', 'Entity2', 'DB_ID1', 'DB_ID2', 'Dependence_path']] 20 | 21 | #将依赖路劲改成小写: 22 | parti_che_dis['Dependence_path'] = parti_che_dis['Dependence_path'].map(lambda x: x.lower()) 23 | partii_che_dis['Dependence_path'] = partii_che_dis['Dependence_path'].map(lambda x: x.lower()) 24 | print('lower done') 25 | 26 | #merge two tables 27 | he_che_gen = pd.merge(partii_che_dis, parti_che_dis, how='left', on='Dependence_path') 28 | print("merge done") 29 | 30 | 31 | # filter null relation 32 | T_require = he_che_gen['T'].map(lambda x: pd.notnull(x)) 33 | C_require = he_che_gen['C'].map(lambda x: pd.notnull(x)) 34 | Sa_require = he_che_gen['Sa'].map(lambda x: pd.notnull(x)) 35 | Pr_require =he_che_gen['Pr'].map(lambda x: pd.notnull(x)) 36 | Pa_require =he_che_gen['Pa'].map(lambda x: pd.notnull(x)) 37 | J_require =he_che_gen['J'].map(lambda x: pd.notnull(x)) 38 | Mp_require =he_che_gen['Mp'].map(lambda x: pd.notnull(x)) 39 | 40 | some = he_che_gen[T_require |C_require| Sa_require | Pr_require | Pa_require|J_require|Mp_require] 41 | print(some.describe()) 42 | 43 | #delete DBID is null 44 | Db1_require = some['DB_ID1'].map(lambda x: x!='null') 45 | Db2_require = some['DB_ID2'].map(lambda x: x!='null') 46 | 47 | triples_che_dis = some[Db1_require & Db2_require] 48 | print("filter done") 49 | 50 | triples_che_dis.to_csv("triples/triples_che_dis_themes.tsv",sep='\t',header=True,index=False) 51 | 52 | # get entities and rename it C:chemical D:disease 53 | triples_che_dis ["DB_ID1"] = triples_che_dis ["DB_ID1"] .apply(lambda x :"C:"+str(x)) 54 | triples_che_dis ["DB_ID2"] = triples_che_dis 
["DB_ID2"] .apply(lambda x :"D:"+str(x)) 55 | 56 | triples_che_dis.to_csv("triples/triples_che_dis_themes.tsv",sep="\t",header=True,index=False) 57 | print(triples_che_dis) 58 | #drop duplicate entities 59 | che_dis_chemical= triples_che_dis[["Entity1",'DB_ID1']].drop_duplicates(['DB_ID1']) 60 | che_dis_disease = triples_che_dis[["Entity2",'DB_ID2']].drop_duplicates(['DB_ID2']) 61 | 62 | print(che_dis_chemical) 63 | print(che_dis_disease) 64 | che_dis_chemical.to_csv("original entitys/entity_cd_chemical.tsv",sep='\t',header=True,index=False) 65 | print("to_csv chemical") 66 | che_dis_disease.to_csv("original entitys/entity_cd_disease.tsv",sep='\t',header=True,index=False) 67 | print("to_csv disease") 68 | 69 | 70 | #形成三元组 71 | def relation_normalization(): 72 | 73 | triple_che_dis = pd.read_csv("triples/triples_che_dis_themes.tsv",delimiter='\t') 74 | print(triple_che_dis.describe()) 75 | 76 | print("最大值:",max(triple_che_dis[['T']].values),max(triple_che_dis[['C']].values)) 77 | #248178 78 | #将关系进行归一化: 79 | triple_che_dis ["T"] = triple_che_dis ["T"] .apply(lambda x : x/248178.0) 80 | triple_che_dis ["C"] = triple_che_dis ["C"] .apply(lambda x : x/248178.0) 81 | triple_che_dis ["Sa"] = triple_che_dis ["Sa"] .apply(lambda x : x/248178.0) 82 | triple_che_dis ["Pr"] = triple_che_dis ["Pr"] .apply(lambda x : x/248178.0) 83 | triple_che_dis ["Pa"] = triple_che_dis ["Pa"] .apply(lambda x : x/248178.0) 84 | triple_che_dis ["J"] = triple_che_dis ["J"] .apply(lambda x : x/248178.0) 85 | triple_che_dis ["Mp"] = triple_che_dis ["Mp"] .apply(lambda x : x/248178.0) 86 | print("归一化完成") 87 | #每一个不为0的主题都成为成为三元组 88 | triples= [] 89 | def load(): 90 | for index ,row in triple_che_dis.iterrows(): 91 | if index %50000 ==0: 92 | print("读入",index/50000,"行") 93 | entity_1 = row["DB_ID1"] 94 | entity_2 = row["DB_ID2"] 95 | T = row['T'] 96 | C = row['C'] 97 | Sa = row['Sa'] 98 | Pr = row['Pr'] 99 | Pa = row['Pa'] 100 | J = row['J'] 101 | Mp = row['Mp'] 102 | if T!=0.0: 103 | triples.append([entity_1,"T",entity_2,T]) 104 | if C != 0.0: 105 | triples.append([entity_1, "C", entity_2, C]) 106 | if Sa != 0.0: 107 | triples.append([entity_1, "Sa", entity_2, Sa]) 108 | if Pr != 0.0: 109 | triples.append([entity_1, "Pr", entity_2, Pr]) 110 | if Pa != 0.0: 111 | triples.append([entity_1, "Pa", entity_2, Pa]) 112 | if J != 0.0: 113 | triples.append([entity_1, "J", entity_2, J]) 114 | if Mp != 0.0: 115 | triples.append([entity_1, "Mp", entity_2, Mp]) 116 | print("read done") 117 | che_dis_triples = DataFrame(triples,columns=["che","rel","dis","score"]) 118 | 119 | #去重: 120 | che_dis_triples.drop_duplicates(["che","rel","dis","score"],inplace=True) 121 | 122 | final_che_dis_triples= che_dis_triples.sort_values('score', ascending=False).groupby(["ent1", "rel", "ent2"]).first().reset_index() 123 | final_che_dis_triples.to_csv("triples_che_rel_dis.tsv",sep='\t',header=False,index=False) 124 | print(final_che_dis_triples) 125 | 126 | if __name__ == '__main__': 127 | part1_pat2_concat() 128 | relation_normalization() -------------------------------------------------------------------------------- /gnbr/prepare_che_gene.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from pandas import Series,DataFrame 3 | pd.set_option("precision",10) 4 | def part1_pat2_concat(): 5 | parti_che_gene = pd.read_csv("original resource/part-i-chemical-gene-path-theme-distributions.txt", delimiter="\t") 6 | print("done") 7 | 

    che_gen_name = ['PubMed_ID', 'Sentence_number', 'Entity1', 'Loc1', 'Entity2', 'Loc2', 'Entity_Raw_str1',
                    'Entity_Raw_str2', 'DB_ID1', 'DB_ID2', 'Entity1_type', 'Entity2_type', 'Dependence_path',
                    'Sentence']
    partii_che_gene = pd.read_csv("original resource/part-ii-dependency-paths-chemical-gene-sorted-with-themes.txt",
                                  delimiter="\t", names=che_gen_name)
    partii_che_gene = partii_che_gene[['Entity1', 'Entity2', 'DB_ID1', 'DB_ID2', 'Dependence_path']].copy()

    # lowercase the dependency paths
    parti_che_gene['Dependence_path'] = parti_che_gene['Dependence_path'].map(lambda x: str(x).lower())
    partii_che_gene['Dependence_path'] = partii_che_gene['Dependence_path'].map(lambda x: str(x).lower())
    print('lowercase done')

    # merge the two tables on the dependency path
    he_che_gen = pd.merge(partii_che_gene, parti_che_gene, how='left', on='Dependence_path')
    print("merge done")

    # keep rows where at least one theme score is present
    some = he_che_gen[he_che_gen[['A+', 'A-', 'B', 'E+', 'E-', 'N', 'O', 'K', 'Z']].notnull().any(axis=1)]

    # drop pairs whose DB id is null
    triples_che_gen = some[(some['DB_ID1'] != 'null') & (some['DB_ID2'] != 'null')].copy()
    print("filter done")

    # prefix the entity ids: C: chemical, G: gene
    triples_che_gen["DB_ID1"] = triples_che_gen["DB_ID1"].apply(lambda x: "C:" + str(x))
    triples_che_gen["DB_ID2"] = triples_che_gen["DB_ID2"].apply(lambda x: "G:" + str(x))
    triples_che_gen.to_csv("triples/triples_che_gen_themes.tsv", sep="\t", header=True, index=False)

    # de-duplicate the entities
    che_gen_chemical = triples_che_gen[["Entity1", 'DB_ID1']].drop_duplicates(['DB_ID1'])
    che_gen_gene = triples_che_gen[["Entity2", 'DB_ID2']].drop_duplicates(['DB_ID2'])
    che_gen_chemical.to_csv("original entitys/entity_cg_chemical.tsv", sep='\t', header=True, index=False)
    che_gen_gene.to_csv("original entitys/entity_cg_gene.tsv", sep='\t', header=True, index=False)
    print("entity done")


def relation_normalization():
    triple_che_gen = pd.read_csv("triples/triples_che_gen_themes.tsv", delimiter='\t')

    print("max value:", triple_che_gen['N'].max())
    # normalize the theme scores by the maximum theme count, 45222
    # theme columns: A+ A- B E+ E- E N O K Z (each also has an .ind variant)
    for theme in ["A+", "A-", "B", "E+", "E-", "E", "N", "O", "K", "Z"]:
        triple_che_gen[theme] = triple_che_gen[theme].apply(lambda x: x / 45222.0)
    print("normalization done")
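    # GNBR chemical-gene theme codes (per the GNBR paper's glossary): A+ agonism/
    # activation, A- antagonism/blocking, B binding, E+ increases expression,
    # E- decreases expression, E affects expression, N inhibits, O transport/
    # channels, K metabolism/pharmacokinetics, Z enzyme activity.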
    # every non-zero theme becomes its own triple
    triples = []
    for index, row in triple_che_gen.iterrows():
        if index % 50000 == 0:
            print("read", index, "rows")
        for theme in ["A+", "A-", "B", "E+", "E-", "E", "N", "O", "K", "Z"]:
            if row[theme] != 0.0:
                triples.append([row["DB_ID1"], theme, row["DB_ID2"], row[theme]])
    print("read done")
    che_gen_triples = DataFrame(triples, columns=["che", "rel", "gen", "score"])

    # de-duplicate, then keep only the highest score per (head, relation, tail)
    che_gen_triples.drop_duplicates(["che", "rel", "gen", "score"], inplace=True)
    final_che_gen_triples = che_gen_triples.sort_values('score', ascending=False).groupby(
        ["che", "rel", "gen"]).first().reset_index()
    final_che_gen_triples.to_csv("triples_che_rel_gen.tsv", sep='\t', header=False, index=False)
    print(final_che_gen_triples)


if __name__ == '__main__':
    part1_pat2_concat()
    relation_normalization()
--------------------------------------------------------------------------------
/gnbr/prepare_gen_dis.py:
--------------------------------------------------------------------------------
import pandas as pd
from pandas import DataFrame

pd.set_option("display.precision", 10)


def part1_pat2_concat():
    parti_gen_dis = pd.read_csv("original resource/part-i-gene-disease-path-theme-distributions.txt", delimiter="\t")
    parti_gen_dis.rename(columns={'path': 'Dependence_path'}, inplace=True)

    gen_dis_name = ['PubMed_ID', 'Sentence_number', 'Entity1', 'Loc1', 'Entity2', 'Loc2', 'Entity_Raw_str1',
                    'Entity_Raw_str2', 'DB_ID1', 'DB_ID2', 'Entity1_type', 'Entity2_type', 'Dependence_path',
                    'Sentence']
    partii_gen_dis = pd.read_csv("original resource/part-ii-dependency-paths-gene-disease-sorted-with-themes.txt",
                                 delimiter="\t", names=gen_dis_name)
    partii_gen_dis = partii_gen_dis[['Entity1', 'Entity2', 'DB_ID1', 'DB_ID2', 'Dependence_path']].copy()

    # lowercase the dependency paths in both tables
    partii_gen_dis['Dependence_path'] = partii_gen_dis['Dependence_path'].map(lambda x: str(x).lower())
    parti_gen_dis['Dependence_path'] = parti_gen_dis['Dependence_path'].map(lambda x: str(x).lower())
    print('lowercase done')

    # merge the two tables on the dependency path
    he_gen_dis = pd.merge(partii_gen_dis, parti_gen_dis, how='left', on='Dependence_path')
    print("merge done")

    # keep rows where at least one theme score is present
    # theme columns: U Ud D J Te Y G Md X L (each also has an .ind variant)
    some = he_gen_dis[he_gen_dis[['U', 'Ud', 'D', 'J', 'Te', 'Y', 'G', 'Md', 'X', 'L']].notnull().any(axis=1)]

    # drop pairs whose DB id is null
    triples_gen_dis = some[(some['DB_ID1'] != 'null') & (some['DB_ID2'] != 'null')].copy()
    print("filter done")

    # prefix the entity ids (G: gene, D: disease) before writing the themes file,
    # so relation_normalization() reads the prefixed ids
    triples_gen_dis["DB_ID1"] = triples_gen_dis["DB_ID1"].apply(lambda x: "G:" + str(x))
    triples_gen_dis["DB_ID2"] = triples_gen_dis["DB_ID2"].apply(lambda x: "D:" + str(x))
    triples_gen_dis.to_csv("triples/triples_gen_dis_themes.tsv", sep="\t", header=True, index=False)

    # de-duplicate the entities
    gen_dis_gene = triples_gen_dis[["Entity1", 'DB_ID1']].drop_duplicates(['DB_ID1'])
    gen_dis_disease = triples_gen_dis[["Entity2", 'DB_ID2']].drop_duplicates(['DB_ID2'])
    gen_dis_gene.to_csv("original entitys/entity_gd_gene.tsv", sep='\t', header=True, index=False)
    gen_dis_disease.to_csv("original entitys/entity_gd_disease.tsv", sep='\t', header=True, index=False)
    print("entity done")


def relation_normalization():
    triple_gen_dis = pd.read_csv("triples/triples_gen_dis_themes.tsv", delimiter='\t')

    print("max value:", triple_gen_dis['L'].max())
    # normalize the theme scores by the maximum theme count, 68514
    for theme in ["U", "Ud", "D", "J", "Te", "Y", "G", "Md", "X", "L"]:
        triple_gen_dis[theme] = triple_gen_dis[theme].apply(lambda x: x / 68514.0)
    print("normalization done")
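    # GNBR gene-disease theme codes (per the GNBR paper's glossary): U causal
    # mutations, Ud mutations affecting disease course, D drug targets, J role in
    # pathogenesis, Te possible therapeutic effect, Y polymorphisms alter risk,
    # G promotes progression, Md biomarkers (diagnostic), X overexpression in
    # disease, L improper regulation linked to disease.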
triples.append([entity_1,"U",entity_2,U]) 102 | if Ud != 0.0: 103 | triples.append([entity_1, "Ud", entity_2, Ud]) 104 | if D != 0.0: 105 | triples.append([entity_1, "D", entity_2, D]) 106 | if J != 0.0: 107 | triples.append([entity_1, "J", entity_2, J]) 108 | if Te != 0.0: 109 | triples.append([entity_1, "Te", entity_2, Te]) 110 | if Y != 0.0: 111 | triples.append([entity_1, "Y", entity_2, Y]) 112 | if G != 0.0: 113 | triples.append([entity_1, "G", entity_2, G]) 114 | if Md != 0.0: 115 | triples.append([entity_1, "Md", entity_2, Md]) 116 | if X != 0.0: 117 | triples.append([entity_1, "X", entity_2, X]) 118 | if L != 0.0: 119 | triples.append([entity_1, "L", entity_2, L]) 120 | print("read done") 121 | gen_dis_triples = DataFrame(triples,columns=["gene", "rel", "dis", "score"]) 122 | 123 | gen_dis_triples.drop_duplicates(["gene", "rel", "dis", "score"],inplace=True) 124 | 125 | final_gen_dis_triples = gen_dis_triples.groupby(["gene", "rel", "dis"]).apply(lambda x: x.sort_values('score', ascending=False)).groupby( 126 | ["gene", "rel", "dis"]).first().reset_index() 127 | 128 | final_gen_dis_triples.to_csv("triples_gen_rel_dis.tsv",sep='\t',header=False,index=False) 129 | print(gen_dis_triples) 130 | 131 | 132 | if __name__ == '__main__': 133 | part1_pat2_concat() 134 | relation_normalization() -------------------------------------------------------------------------------- /gnbr/prepare_gen_gen.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from pandas import Series,DataFrame 3 | 4 | pd.set_option("precision",10) 5 | def part1_pat2_concat(): 6 | parti_gen_gen = pd.read_csv("original resource/part-i-gene-gene-path-theme-distributions.txt", delimiter="\t") 7 | print("done") 8 | print(parti_gen_gen) 9 | parti_gen_gen.rename(columns={'path':'Dependence_path'},inplace=True) 10 | 11 | gen_gen_name = ['PubMed_ID', 'Sentence_number', 'Entity1', 'Loc1','Entity2','Loc2','Entity_Raw_str1','Entity_Raw_str2','DB_ID1','DB_ID2','Entity1_type','Entity2_type','Dependence_path','Sentence'] 12 | print("name") 13 | partii_gen_gen =pd.read_csv("original resource/part-ii-dependency-paths-gene-gene-sorted-with-themes.txt", delimiter="\t", names = gen_gen_name, dtype={'DEVICE_ADDRESS': 'str'}) 14 | print("done") 15 | new_partii_gen_gen = partii_gen_gen[['Entity1','Entity2','DB_ID1','DB_ID2','Dependence_path']] 16 | print(new_partii_gen_gen) 17 | print("过滤") 18 | #将依赖路劲改成小写: 19 | new_partii_gen_gen['Dependence_path'] = new_partii_gen_gen['Dependence_path'].map(lambda x: str(x).lower()) 20 | parti_gen_gen['Dependence_path'] = parti_gen_gen['Dependence_path'].map(lambda x: str(x).lower()) 21 | print('小写完成') 22 | 23 | #将两个表合并 24 | he_gen_gen = pd.merge(new_partii_gen_gen, parti_gen_gen, how='left', on='Dependence_path') 25 | print("合并完成") 26 | 27 | #过滤合并后的表格 28 | 29 | T_require = he_gen_gen['B'].map(lambda x: pd.notnull(x)) 30 | C_require = he_gen_gen['W'].map(lambda x: pd.notnull(x)) 31 | Sa_require = he_gen_gen['V+'].map(lambda x: pd.notnull(x)) 32 | Pr_require =he_gen_gen['E+'].map(lambda x: pd.notnull(x)) 33 | Pa_require =he_gen_gen['E'].map(lambda x: pd.notnull(x)) 34 | J_require =he_gen_gen['I'].map(lambda x: pd.notnull(x)) 35 | Mp_require =he_gen_gen['H'].map(lambda x: pd.notnull(x)) 36 | J_require1=he_gen_gen['Rg'].map(lambda x: pd.notnull(x)) 37 | Mp_require1 =he_gen_gen['Q'].map(lambda x: pd.notnull(x)) 38 | some = he_gen_gen[T_require |C_require| Sa_require | Pr_require | Pa_require|J_require|Mp_require|J_require1|Mp_require1] 39 | 
print("过滤为空的主题") 40 | # 41 | #删除DBID为空的对 42 | Db1_require = some['DB_ID1'].map(lambda x: x!='null') 43 | Db2_require = some['DB_ID2'].map(lambda x: x!='null') 44 | 45 | triples_gen_gen = some[Db1_require & Db2_require] 46 | # 47 | print("过滤合并完成") 48 | 49 | # 50 | #提取出实体,并修改实体DB名称 51 | triples_gen_gen ["DB_ID1"] = triples_gen_gen ["DB_ID1"] .apply(lambda x :"G:"+str(x)) 52 | triples_gen_gen ["DB_ID2"] = triples_gen_gen ["DB_ID2"] .apply(lambda x :"G:"+str(x)) 53 | 54 | new_triples_gen_gen = triples_gen_gen[["Entity1","DB_ID1","Entity2","DB_ID2",'B','B.ind','W','W.ind','V+','V+.ind','E+','E+.ind','E','E.ind','I','I.ind','H','H.ind','Rg','Rg.ind','Q','Q.ind']] 55 | new_triples_gen_gen.to_csv("triples/triples_gen_gen_themes.tsv",sep="\t",header=True) 56 | print(new_triples_gen_gen) 57 | 58 | gen_gen_gene1= triples_gen_gen[["Entity1",'DB_ID1']].drop_duplicates(['DB_ID1']) 59 | gen_gen_gene2 = triples_gen_gen[["Entity2",'DB_ID2']].drop_duplicates(['DB_ID2']) 60 | print("gene1:") 61 | print(gen_gen_gene1) 62 | print("gene2:") 63 | print(gen_gen_gene2) 64 | gen_gen_gene1.to_csv("original entitys/entity_gg_gene1.tsv",sep="\t",header=True,index=False) 65 | gen_gen_gene2.to_csv("original entitys/entity_gg_gene2.tsv",sep="\t",header=True,index=False) 66 | #提取三元组 67 | 68 | # relation2id = pd.read_csv("Relation2Id.tsv",delimiter='\t') 69 | # 70 | def relation_normalization(): 71 | triple_gen_gen= pd.read_csv("triples/triples_gen_gen_themes.tsv",delimiter='\t') 72 | print(triple_gen_gen.describe()) 73 | max = max(triple_gen_gen[['Q']].values) 74 | print("max values:",max) 75 | #515159 76 | #将关系进行归一化: 77 | #B B.ind W W.ind V+ V+.ind E+ E+.ind E E.ind I I.ind H H.ind Rg Rg.ind Q Q.ind 78 | 79 | triple_gen_gen ["B"] = triple_gen_gen ["B"] .apply(lambda x : x/max) 80 | triple_gen_gen ["W"] = triple_gen_gen ["W"] .apply(lambda x : x/max) 81 | triple_gen_gen ["V+"] = triple_gen_gen ["V+"] .apply(lambda x : x/max) 82 | triple_gen_gen ["E+"] = triple_gen_gen ["E+"] .apply(lambda x : x/max) 83 | triple_gen_gen ["E"] = triple_gen_gen ["E"] .apply(lambda x : x/max) 84 | triple_gen_gen ["I"] = triple_gen_gen ["I"] .apply(lambda x : x/max) 85 | triple_gen_gen ["H"] = triple_gen_gen ["H"] .apply(lambda x : x/max) 86 | triple_gen_gen ["Rg"] = triple_gen_gen ["Rg"] .apply(lambda x : x/max) 87 | triple_gen_gen ["Q"] = triple_gen_gen ["Q"] .apply(lambda x : x/max) 88 | 89 | print("归一化完成") 90 | triples= [] 91 | for index ,row in triple_gen_gen.iterrows(): 92 | if index %50000 ==0: 93 | print("读入",index/50000,"行") 94 | entity_1 = row["DB_ID1"] 95 | entity_2 = row["DB_ID2"] 96 | B = row['B'] 97 | W = row['W'] 98 | V1 = row['V+'] 99 | E1 = row['E+'] 100 | E = row['E'] 101 | I = row['I'] 102 | H = row['H'] 103 | Rg = row['Rg'] 104 | Q = row['Q'] 105 | 106 | if B!=0.0: 107 | triples.append([entity_1,"B",entity_2,B]) 108 | if W != 0.0: 109 | triples.append([entity_1, "W", entity_2, W]) 110 | if V1 != 0.0: 111 | triples.append([entity_1, "V+", entity_2, V1]) 112 | if E1 != 0.0: 113 | triples.append([entity_1, "E+", entity_2, E1]) 114 | if E != 0.0: 115 | triples.append([entity_1, "E", entity_2, E]) 116 | if I != 0.0: 117 | triples.append([entity_1, "I", entity_2, I]) 118 | if H != 0.0: 119 | triples.append([entity_1, "H", entity_2, H]) 120 | if Rg != 0.0: 121 | triples.append([entity_1, "Rg", entity_2, Rg]) 122 | if Q != 0.0: 123 | triples.append([entity_1, "Q", entity_2, Q]) 124 | 125 | print("read done") 126 | gen_gen_triples = DataFrame(triples,columns=["gene", "rel", "gen", "score"]) 127 | 128 | 
    # every non-zero theme becomes its own triple
    triples = []
    for index, row in triple_gen_gen.iterrows():
        if index % 50000 == 0:
            print("read", index, "rows")
        for theme in ["B", "W", "V+", "E+", "E", "I", "H", "Rg", "Q"]:
            if row[theme] != 0.0:
                triples.append([row["DB_ID1"], theme, row["DB_ID2"], row[theme]])
    print("read done")
    gen_gen_triples = DataFrame(triples, columns=["gene", "rel", "gen", "score"])

    # de-duplicate, then keep only the highest score per (head, relation, tail)
    gen_gen_triples.drop_duplicates(["gene", "rel", "gen", "score"], inplace=True)
    final_gen_gen_triples = gen_gen_triples.sort_values('score', ascending=False).groupby(
        ["gene", "rel", "gen"]).first().reset_index()
    final_gen_gen_triples.to_csv("triples_gen_rel_gen.tsv", sep='\t', header=False, index=False)
    print(final_gen_gen_triples)


if __name__ == '__main__':
    part1_pat2_concat()
    relation_normalization()
--------------------------------------------------------------------------------
/gnbr/relations.tsv:
--------------------------------------------------------------------------------
T	0
C	1
Sa	2
Pr	3
Pa	4
J	5
Mp	6
Md	7
X	8
L	9
U	10
Ud	11
D	12
Te	13
Y	14
G	15
A+	16
A-	17
B	18
E+	19
E-	20
E	21
N	22
O	23
K	24
Z	25
W	26
V+	27
I	28
H	29
Rg	30
Q	31
--------------------------------------------------------------------------------