├── .gitattributes ├── check_doc_db.py ├── data_load.py ├── doc_chek_hou.py ├── doc_db.py ├── doc_find_hou.py ├── doc_query.py ├── doc_query_no_beg_end.py ├── handleDBIdName.py ├── handle_db_entity.py ├── handle_min_db.py ├── merge_all_data_db_id.py ├── myCNN.py ├── process_wiki.py ├── produce_entity_index.py ├── produce_min_db.py └── word2vec.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /check_doc_db.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding:utf8 -*- 3 | 4 | def check(docPath,dbPath): 5 | db_id_list=[] 6 | db_name_list=[] 7 | count = 0 8 | with open(dbPath,encoding='utf-8') as f: 9 | line = f.readline().strip('\n') 10 | while line: 11 | 12 | if (count % 2 == 0): 13 | db_id_list.append(line) 14 | # print(line) 15 | if (count % 2 == 1): 16 | db_name_list.append(line) 17 | 18 | count = count + 1 19 | line = f.readline().strip('\n') 20 | 21 | pass 22 | 23 | 24 | with open(docPath,encoding='utf-8') as docF: 25 | doc_line=docF.readline().strip('\n') 26 | while doc_line: 27 | pos=doc_line.find('E0') 28 | answerId=doc_line[pos:pos+8] 29 | 30 | findFlag=True 31 | if(len(answerId)==8): 32 | 33 | for i in range(len(db_id_list)): 34 | db_id=db_id_list[i] 35 | if(db_id==answerId): 36 | # print("find!!! "+answerId) 37 | # print(doc_line) 38 | # print(answerId) 39 | findFlag=False 40 | break 41 | if(findFlag): 42 | print(doc_line) 43 | print(answerId) 44 | print("not find!!! "+answerId) 45 | 46 | 47 | 48 | doc_line = docF.readline().strip('\n') 49 | 50 | 51 | 52 | pass 53 | 54 | 55 | 56 | 57 | if __name__=='__main__': 58 | 59 | docPath=u'E:\mypython_Linking\data_handle\docQuery\doc_' 60 | dbPath=u'E:\mypython_Linking\data_handle\dbEntity\db_' 61 | check(docPath+'1.txt',dbPath+'1.txt') 62 | pass 63 | -------------------------------------------------------------------------------- /data_load.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | #加载Google训练的词向量 3 | import numpy as np 4 | import tensorflow as tf 5 | 6 | queryIndexMap=dict() 7 | queryNameIndexMap=dict() 8 | dbIndexMap=dict() 9 | dbNameIndexMap=dict() 10 | dbTrainIndexMap=dict() 11 | dbNameTrainIndexMap=dict() 12 | queryTrainIndexMap=dict() 13 | queryNameTrainIndexMap=dict() 14 | 15 | 16 | # 加载所有的词向量 17 | def load_word_vectors(file_path): 18 | 19 | print ('loading word vectors...') 20 | f = open(file_path, 'r', encoding='utf-8') 21 | m =f.readlines() 22 | i = 0 23 | for li in m: 24 | m[i] = m[i].strip().split(' ') 25 | i = i + 1 26 | 27 | num_words=int(len(m))#词向量表的大小 28 | vec_len=int(len(m[0][1:]))#词向量的大小 29 | print (num_words,vec_len) 30 | words = [] 31 | word_vecs = np.zeros((num_words + 1, vec_len))#词向量表初始化为全0,在0位置处,表示找到的词的向量为0 32 | 33 | for i in range(num_words): 34 | if i == 0: 35 | words.append(m[i][0]) 36 | words.append(m[i][0]) 37 | word_vecs[i + 1] = np.array(m[i][1:],dtype=np.float32) 38 | 39 | f.close() 40 | print ('done.') 41 | 42 | return words, word_vecs 43 | 44 | def readQueryIndex(local,year,queryPath): 45 | global queryIndexMap 46 | global queryNameIndexMap 47 | if(local): 48 | path='H:\yaojuan\QUERY\\'+year+'\eval\\test\\test_queryIndex.txt' 49 | else: 50 | path = queryPath 51 | with open(path,encoding='utf-8') as f: 52 | line=f.readline().strip(' ').strip('\n') 53 | lineCount=0 
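        # --- added explanatory comment ---
        # The loop that follows assumes every query occupies two consecutive lines of
        # test_queryIndex.txt:
        #   even line : "<queryId> <word indices of the surrounding context window>"
        #   odd  line : "<queryId> <word indices of the mention name itself>"
        # (the notes in __main__ at the bottom of this file report a context length of 20).
        # A minimal equivalent sketch of the same parse, assuming that two-line layout;
        # the zip_longest call and local names are illustrative, not part of this script:
        #
        #   from itertools import zip_longest
        #   with open(path, encoding='utf-8') as fh:
        #       raw = [l.strip() for l in fh if l.strip()]
        #   for ctx_line, name_line in zip_longest(raw[0::2], raw[1::2], fillvalue=''):
        #       qid, _, ctx  = ctx_line.partition(' ')
        #       _,   _, name = name_line.partition(' ')
        #       queryIndexMap[qid]     = ctx.strip()
        #       queryNameIndexMap[qid] = name.strip()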
54 | while line: 55 | if(lineCount%2==0): 56 | ##第一行,上下文 57 | queryId=line.split(' ')[0] 58 | queryIndex=line[len(line.split(' ')[0])+1:].strip(' ') 59 | queryIndexMap[queryId]=queryIndex 60 | if(lineCount%2==1): 61 | ##第二行,name 62 | queryId=line.split(' ')[0] 63 | queryIndex=line[len(line.split(' ')[0])+1:].strip(' ') 64 | queryNameIndexMap[queryId]=queryIndex 65 | lineCount=lineCount+1 66 | line=f.readline().strip(' ').strip('\n') 67 | pass 68 | 69 | def readMinDBIndex(local,year,dbPath): 70 | global dbIndexMap 71 | global dbNameIndexMap 72 | if(local): 73 | path='H:\yaojuan\QUERY\\'+year+'\eval\\test_mindbIndex.txt' 74 | else: 75 | path = dbPath 76 | 77 | with open(path,encoding='utf-8') as f: 78 | line = f.readline().strip('\n').strip(' ') 79 | lineCount=0 80 | while line: 81 | if(lineCount%2==0): 82 | dbId = line.split(' ')[0] 83 | dbIndex = line[len(line.split(' ')[0]) + 1:] 84 | dbIndexMap[dbId] = dbIndex 85 | if(lineCount%2==1): 86 | dbId = line.split(' ')[0] 87 | dbIndex = line[len(line.split(' ')[0]) + 1:] 88 | dbNameIndexMap[dbId] = dbIndex 89 | 90 | lineCount=lineCount+1 91 | line = f.readline().strip('\n').strip(' ') 92 | 93 | pass 94 | 95 | def readTrainMinDBIndex(local): 96 | global dbTrainIndexMap 97 | global dbNameTrainIndexMap 98 | if(local): 99 | ####将2014的训练数据当做模型的训练集 100 | path='H:\yaojuan\QUERY\\2014\\training\\train_mindbIndex.txt' 101 | else: 102 | path = 'train_mindbIndex.txt' 103 | 104 | with open(path,encoding='utf-8') as f: 105 | line = f.readline().strip(' ').strip('\n') 106 | lineCount=0 107 | while line: 108 | if(lineCount%2==0): 109 | dbId = line.split(' ')[0] 110 | dbIndex = line[len(line.split(' ')[0]) + 1:] 111 | dbTrainIndexMap[dbId] = dbIndex 112 | if (lineCount%2==1): 113 | dbId = line.split(' ')[0] 114 | dbIndex = line[len(line.split(' ')[0]) + 1:] 115 | dbNameTrainIndexMap[dbId] = dbIndex 116 | 117 | lineCount=lineCount+1 118 | line = f.readline().strip(' ').strip('\n') 119 | 120 | pass 121 | 122 | def readTrainQueryIndex(local): 123 | global queryTrainIndexMap 124 | global queryNameTrainIndexMap 125 | if(local): 126 | ####将2014的训练数据当做模型的训练集 127 | path='H:\yaojuan\QUERY\\2014\\training\\train\\train_queryIndex.txt' 128 | else: 129 | path = 'train_queryNounIndex.txt' 130 | with open(path,encoding='utf-8') as f: 131 | line=f.readline().strip(' ').strip('\n') 132 | lineCount=0 133 | while line: 134 | if(lineCount%2==0): 135 | queryId=line.split(' ')[0] 136 | queryIndex=line[len(line.split(' ')[0])+1:].strip(' ') 137 | queryTrainIndexMap[queryId]=queryIndex 138 | if(lineCount%2==1): 139 | queryId = line.split(' ')[0] 140 | queryIndex = line[len(line.split(' ')[0]) + 1:].strip(' ') 141 | queryNameTrainIndexMap[queryId] = queryIndex 142 | 143 | 144 | lineCount=lineCount+1 145 | line=f.readline().strip(' ').strip('\n') 146 | pass 147 | 148 | def readTrainAllData(local): 149 | words, word_vecs = load_word_vectors(u'glove.6B.100d.txt') 150 | if(local): 151 | #######这个query——entity对 152 | alldataPath = 'E:\mypython_Linking\\data_handle\\train_all_data.txt' 153 | else: 154 | alldataPath = 'train_all_data.txt' 155 | ####训练集不需要传year年份 156 | readTrainQueryIndex(local) 157 | readTrainMinDBIndex(local) 158 | queryIdList=[] 159 | queryIndexList=[] 160 | queryNameIndexList=[] 161 | dbIdList=[] 162 | dbIndexList=[] 163 | dbNameIndexList=[] 164 | lableList=[] 165 | maxLength=0 166 | minLength=1000 167 | lineCount=0 168 | with open(alldataPath, encoding='utf-8') as f: 169 | line = f.readline().strip('\n') 170 | 171 | while line: 172 | lineCount=lineCount+1 173 | dbId = 
line.split(' ')[1] 174 | if (len(dbTrainIndexMap[dbId].strip(' ').split(' ')) > maxLength): 175 | maxLength = len(dbTrainIndexMap[dbId].strip(' ').split(' ')) 176 | if (len(dbTrainIndexMap[dbId].strip(' ').split(' ')) < minLength): 177 | minLength = len(dbTrainIndexMap[dbId].strip(' ').split(' ')) 178 | line = f.readline().strip('\n') 179 | print('train_maxLength=' + str(maxLength)) 180 | print('train_minLength=' + str(minLength)) 181 | print('train_lineCount=' + str(lineCount)) 182 | # trainLen = lineCount // 3 183 | 184 | 185 | with open(alldataPath,encoding='utf-8') as f: 186 | line=f.readline().strip('\n') 187 | lineCount=0 188 | while line: 189 | lineCount=lineCount+1 190 | line=line.split(' ') 191 | queryId=line[0] 192 | dbId=line[1] 193 | lable=line[2] 194 | # print('queryId='+queryId) 195 | # print('queryIndex='+queryIndexMap[queryId]) 196 | # print('dbId='+dbId) 197 | # print('dbIndex='+dbIndexMap[dbId]) 198 | # print('lable='+lable) 199 | queryIdList.append(queryId) 200 | queryIndex=queryTrainIndexMap[queryId].strip(' ').split(' ') 201 | queryNameIndex=queryNameTrainIndexMap[queryId].strip(' ').split(' ') 202 | textVec=[] 203 | for i in queryIndex: 204 | textVec.append(word_vecs[int(i)]) 205 | queryIndexList.append(np.asarray(textVec,dtype=np.float32)) 206 | 207 | nameVec=[] 208 | for i in queryNameIndex: 209 | nameVec.append(word_vecs[int(i)]) 210 | queryNameIndexList.append(np.asarray(nameVec,dtype=np.float32).mean(axis=0)) 211 | 212 | 213 | dbIdList.append(dbId) 214 | dbIndex=dbTrainIndexMap[dbId] 215 | dbNameIndex = dbTrainIndexMap[dbId].strip(' ').split(' ') 216 | for i in range(len(dbTrainIndexMap[dbId].strip(' ').split(' ')),160): 217 | dbIndex=dbIndex.strip(' ')+' 0' 218 | dbIndex=dbIndex.strip(' ').split(' ') 219 | 220 | dbVec=[] 221 | for i in dbIndex: 222 | dbVec.append(word_vecs[int(i)]) 223 | dbIndexList.append(np.asarray(dbVec,dtype=np.float32)) 224 | 225 | nameVec = [] 226 | for i in dbNameIndex: 227 | nameVec.append(word_vecs[int(i)]) 228 | dbNameIndexList.append(np.asarray(nameVec, dtype=np.float32).mean(axis=0)) 229 | 230 | lableList.append(lable) 231 | 232 | # if(lineCount>=90): 233 | # 234 | # break 235 | 236 | line=f.readline().strip('\n') 237 | 238 | query=np.asarray(queryIndexList,dtype=np.float32) 239 | queryName=np.asarray(queryNameIndexList,dtype=np.float32)[:,np.newaxis,:] 240 | db=np.asarray(dbIndexList,dtype=np.float32) 241 | dbName=np.asarray(dbNameIndexList,dtype=np.float32)[:,np.newaxis,:] 242 | lab=np.asarray(lableList,dtype=np.float32) 243 | 244 | print(query.shape) 245 | print(queryName.shape) 246 | print(db.shape) 247 | print(dbName.shape) 248 | print(lab.shape) 249 | 250 | 251 | return (query,queryName,db,dbName,lab) 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | def readAllData(local,year,queryPath,dbPath): 260 | words, word_vecs = load_word_vectors(u'glove.6B.100d.txt') 261 | if(local): 262 | alldataPath = 'H:\yaojuan\QUERY\\'+year+'\eval\\test_all_data.txt' 263 | else: 264 | alldataPath = year+'_test_all_data.txt' 265 | readQueryIndex(local,year,queryPath) 266 | readMinDBIndex(local,year,dbPath) 267 | queryIdList=[] 268 | queryIndexList=[] 269 | queryNameIndexList=[] 270 | dbIdList=[] 271 | dbIndexList=[] 272 | dbNameIndexList=[] 273 | lableList=[] 274 | maxLength=0 275 | minLength=1000 276 | lineCount=0 277 | with open(alldataPath, encoding='utf-8') as f: 278 | line = f.readline().strip('\n') 279 | 280 | while line: 281 | lineCount=lineCount+1 282 | dbId = line.split(' ')[1] 283 | if (len(dbIndexMap[dbId].strip(' ').split(' ')) > 
maxLength): 284 | maxLength = len(dbIndexMap[dbId].strip(' ').split(' ')) 285 | if (len(dbIndexMap[dbId].strip(' ').split(' ')) < minLength): 286 | minLength = len(dbIndexMap[dbId].strip(' ').split(' ')) 287 | # if(lineCount>=90): 288 | # break 289 | line = f.readline().strip('\n') 290 | print('test_maxLength=' + str(maxLength)) 291 | print('test_minLength=' + str(minLength)) 292 | print('test_lineCount=' + str(lineCount)) 293 | 294 | 295 | with open(alldataPath,encoding='utf-8') as f: 296 | line=f.readline().strip('\n') 297 | lineCount=0 298 | while line: 299 | lineCount=lineCount+1 300 | if(lineCount>0): 301 | line=line.split(' ') 302 | queryId=line[0] 303 | dbId=line[1] 304 | lable=line[2] 305 | # print('queryId='+queryId) 306 | # print('queryIndex='+queryIndexMap[queryId]) 307 | # print('dbId='+dbId) 308 | # print('dbIndex='+dbIndexMap[dbId]) 309 | # print('lable='+lable) 310 | queryIdList.append(queryId) 311 | queryIndex=queryIndexMap[queryId].strip(' ').split(' ') 312 | queryNameIndex=queryNameIndexMap[queryId].strip(' ').split(' ') 313 | textVec=[] 314 | for i in queryIndex: 315 | textVec.append(word_vecs[int(i)]) 316 | queryIndexList.append(np.asarray(textVec,dtype=np.float32)) 317 | 318 | nameVec = [] 319 | for i in queryNameIndex: 320 | nameVec.append(word_vecs[int(i)]) 321 | queryNameIndexList.append(np.asarray(nameVec, dtype=np.float32).mean(axis=0)) 322 | 323 | dbIdList.append(dbId) 324 | dbIndex=dbIndexMap[dbId] 325 | dbNameIndex=dbNameIndexMap[dbId].strip(' ').split(' ') 326 | for i in range(len(dbIndexMap[dbId].strip(' ').split(' ')),160): 327 | dbIndex=dbIndex.strip(' ')+' 0' 328 | dbIndex=dbIndex.strip(' ').split(' ') 329 | dbVec=[] 330 | for i in dbIndex: 331 | dbVec.append(word_vecs[int(i)]) 332 | dbIndexList.append(np.asarray(dbVec,dtype=np.float32)) 333 | 334 | nameVec = [] 335 | for i in dbNameIndex: 336 | nameVec.append(word_vecs[int(i)]) 337 | dbNameIndexList.append(np.asarray(nameVec, dtype=np.float32).mean(axis=0)) 338 | 339 | lableList.append(lable) 340 | # if(lineCount>=200000): 341 | # 342 | # break 343 | 344 | line=f.readline().strip('\n') 345 | 346 | query=np.asarray(queryIndexList,dtype=np.float32) 347 | queryName=np.asarray(queryNameIndexList,dtype=np.float32)[:,np.newaxis,:] 348 | db=np.asarray(dbIndexList,dtype=np.float32) 349 | dbName=np.asarray(dbNameIndexList,dtype=np.float32)[:,np.newaxis,:] 350 | lab=np.asarray(lableList,dtype=np.float32) 351 | print(query.shape) 352 | print(queryName.shape) 353 | print(db.shape) 354 | print(dbName.shape) 355 | print(lab.shape) 356 | 357 | 358 | return (query,queryName,db,dbName,lab) 359 | 360 | 361 | 362 | 363 | 364 | if __name__=='__main__': 365 | # readQueryIndex(True) #输出显示test_query的上下文长度是20 366 | # readMinDBIndex(True) #输出显示test_db的上下文长度是768 367 | # readTrainQueryIndex(True) #输出显示train_query的上下文长度是20 368 | # readTrainMinDBIndex(True) #输出显示train_db的上下文长度是1478 369 | 370 | 371 | # readAllData(local=True,year='2014') 372 | readTrainAllData(local=True) 373 | # readTrainAllData(local=True) 374 | pass 375 | 376 | 377 | 378 | 379 | 380 | 381 | 382 | 383 | 384 | 385 | 386 | 387 | #################################################################################################### 388 | 389 | # SentLength=10 390 | # 391 | # sentence1='I love you very much x_x' 392 | # sentence2='I like you' 393 | # 394 | # s_vec1 = sentence2vec(sentence1,SentLength) 395 | # s_vec2 = sentence2vec(sentence2,SentLength) 396 | # 397 | # # print(s_vec1) 398 | # print(s_vec1.shape) 399 | # 400 | # # 准备已有数据 401 | # x_data1 = 
tf.constant(s_vec1[np.newaxis,:,:,np.newaxis],dtype=tf.float32) 402 | # x_data2 = tf.constant(s_vec2[np.newaxis,:,:,np.newaxis],dtype=tf.float32) 403 | # y_data = [1] 404 | # print(x_data1.shape) 405 | # 406 | # # 定义placeholder 407 | # x1 = tf.placeholder(tf.float32, [None, 1]) 408 | # x2 = tf.placeholder(tf.float32, [None, 1]) 409 | # y = tf.placeholder(tf.float32, [None, 1]) 410 | # 411 | # 412 | # # [batch, in_height, in_width, in_channels] 1,20,50,1 413 | # input_arg1 = tf.Variable(s_vec1) 414 | # input_arg2 = tf.Variable(s_vec2) 415 | # # [filter_height, filter_width, in_channels, out_channels] 416 | # filter_arg1 = tf.Variable(tf.ones([3, 3, 1, 1])) 417 | # filter_arg2 = tf.Variable(tf.ones([3, 3, 1, 1])) 418 | # op1 = tf.nn.relu(tf.nn.conv2d(x_data1, filter_arg1, strides=[1, 1, 4, 1], use_cudnn_on_gpu=False, padding='SAME')) 419 | # pool1=tf.nn.max_pool(op1, ksize=[1, 2, 4, 1],strides=[1, 2, 4, 1], padding='SAME') 420 | # # softmax1=tf.nn.softmax(pool1) 421 | # 422 | # # connected=tf.nn.con 423 | # # op2 = tf.nn.conv2d(input_arg2, filter_arg2, strides=[1, 2, 2, 1], use_cudnn_on_gpu=False, padding='SAME') 424 | # # # 求模` 425 | # # x1_norm = tf.sqrt(tf.reduce_sum(tf.square(op1), axis=2)) 426 | # # 427 | # # x2_norm = tf.sqrt(tf.reduce_sum(tf.square(op2), axis=2)) 428 | # # x1_x2=tf.reduce_sum(tf.multiply(x1, x2), axis=2) 429 | # # 430 | # # cosin = x1_x2 / (x1_norm * x2_norm) 431 | # # 432 | # # cosin1 = tf.pide(x1_x2, tf.multiply(x1_norm, x2_norm)) 433 | # # 434 | # # 435 | # 436 | # 437 | # 438 | 439 | # with tf.Session() as a_sess: 440 | # a_sess.run(tf.global_variables_initializer()) 441 | # # op1,op2,a, b, c, d, e = a_sess.run([op1,op2,x1_norm, x2_norm, x1_x2, cosin, cosin1]) 442 | # 443 | # 444 | # print("----------{}---------".format("case1")) 445 | # a_op1=a_sess.run(pool1) 446 | # writer = tf.summary.FileWriter('tensorflow/', a_sess.graph) 447 | # print(a_op1) 448 | # print(a_op1.shape) 449 | # KKK=tf.reshape(a_op1,(1,20)) 450 | # print(KKK) 451 | # 452 | # print('---------------------\n\n') 453 | # 454 | # pass -------------------------------------------------------------------------------- /doc_chek_hou.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding:utf8 -*- 3 | 4 | 5 | T=0 #待链接实体总数(答案) 6 | T1=0 #待链接实体NIL总数(答案) 7 | T2=0 #待链接实体NOT_NIL总数(答案) 8 | S=0 #待链接实体总数(自己) 9 | S1=0 #待链接实体NIL总数(自己) 10 | S2=0 #待链接实体NOT_NIL总数(自己) 11 | 12 | NIL_Dui=0 #自己判空,且判对 13 | NER_Dui=0 #自己判非空,且链接对 14 | 15 | 16 | def get_T_S(answerPath): 17 | global T,T1,T2 18 | global S,S1,S2 19 | global NIL_Dui,NER_Dui 20 | with open(answerPath,encoding='utf-8') as f: 21 | line=f.readline().strip('\n') 22 | lineCount=0 23 | while line: 24 | if(lineCount%2==0): 25 | TempLine=line 26 | if(lineCount%2==1): 27 | T=T+1 28 | S=S+1 29 | ans = line.split(' ')[0] 30 | if(line.find('NIL')==-1): 31 | print(line) 32 | ansNum = int(line.split(' ')[1]) 33 | if(ans.find('NIL')==-1): 34 | ####NOT_NIL 35 | T2=T2+1 36 | else: 37 | ####NIL 38 | T1=T1+1 39 | 40 | if(ansNum==0): 41 | ####自己判空NIL 42 | S1=S1+1 43 | if(ans.find('NIL')!=-1): 44 | # 自己判空,且判对 45 | NIL_Dui=NIL_Dui+1 46 | else: 47 | ####自己判非空NOT_NIL 48 | S2=S2+1 49 | #if(ans==line.split(' ')[2]): 50 | if (int(line.split(' ')[1])<=5): 51 | # 自己判非空,且链接对 52 | NER_Dui=NER_Dui+1 53 | 54 | lineCount = lineCount + 1 55 | line=f.readline().strip('\n') 56 | 57 | 58 | 59 | if __name__=='__main__': 60 | 61 | year='2009'; fileNum=3695 62 | # year='2010'; fileNum=2231 63 | # year='2011'; fileNum=2231 64 | # 
year='2012'; fileNum=2016 65 | # year='2013'; fileNum=1820 66 | # year='2014'; fileNum=138 67 | for i in range(fileNum): 68 | answerPath='H:\yaojuan\QUERY\\'+year+'\\eval\\test_dbEntity\db_new_'+str(i+1)+'.txt' 69 | # answerPath = 'H:\yaojuan\QUERY\\' + year + '\\eval\\test_dbEntity\db_' + str(i + 1) + '.txt' 70 | # answerPath = 'H:\yaojuan\QUERY\\2014\eval\\test_dbEntity\db_' + str(i + 1) + '.txt' 71 | get_T_S(answerPath) 72 | print(' 总 空 非空') 73 | print(T,T1,T2) 74 | print(S,S1,S2) 75 | print(NIL_Dui+NER_Dui,NIL_Dui,NER_Dui) 76 | print(NER_Dui/T2) 77 | # print(NIL_Dui/T1) 78 | Micro_accuracy_avrage=(NIL_Dui+NER_Dui)/T*100 79 | Precision=NER_Dui/S2*100 80 | Recall=NER_Dui/T2*100 81 | F1=2*Precision*Recall/(Precision+Recall) 82 | print('Micro_accuracy_avrage:'+str(Micro_accuracy_avrage)+'%') 83 | print('Precision:'+str(Precision)+'%') 84 | print('Recall:'+str(Recall)+'%') 85 | print('F1:'+str(F1)+'%') 86 | 87 | 88 | 89 | pass -------------------------------------------------------------------------------- /doc_db.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding:utf8 -*- 3 | 4 | import os 5 | import sys 6 | allFileNum = 0 7 | 8 | 9 | def printPath(level, path): 10 | global allFileNum 11 | ''''' 12 | 打印一个目录下的所有文件夹和文件 13 | ''' 14 | # 所有文件夹,第一个字段是次目录的级别 15 | dirList = [] 16 | # 所有文件 17 | fileList = [] 18 | # 返回一个列表,其中包含在目录条目的名称(google翻译) 19 | files = os.listdir(path) 20 | # 先添加目录级别 21 | dirList.append(str(level)) 22 | of = open('DocFileName.txt', 'w') 23 | 24 | 25 | for f in files: 26 | #判断是不是文件夹 27 | if (os.path.isdir(path + '/' + f)): 28 | # 排除隐藏文件夹。因为隐藏文件夹过多 29 | if (f[0] == '.'): 30 | pass 31 | else: 32 | # 添加非隐藏文件夹 33 | dirList.append(f) 34 | #判断是不是文件 35 | if (os.path.isfile(path + '/' + f)): 36 | # 添加文件 37 | fileList.append(f) 38 | doc_id=f 39 | print(f) 40 | #将DB的文件名写入‘DBFileName.txt’中 41 | of.write(f+'\n') 42 | 43 | 44 | 45 | 46 | 47 | 48 | of.close( ) 49 | # 当一个标志使用,文件夹列表第一个级别不打印 50 | i_dl = 0 51 | for dl in dirList: 52 | if (i_dl == 0): 53 | i_dl = i_dl + 1 54 | else: 55 | # 打印至控制台,不是第一个的目录 56 | print('#########' * (int(dirList[0])), dl) 57 | 58 | # 打印目录下的所有文件夹和文件,目录级别+1 59 | printPath((int(dirList[0]) + 1), path + '/' + dl) 60 | 61 | for fl in fileList: 62 | # 打印文件 63 | # print '-------' * (int(dirList[0])), fl 64 | # 顺便计算一下有多少个文件 65 | allFileNum = allFileNum + 1 66 | 67 | 68 | if __name__=='__main__': 69 | path=u'H:\\yaojuan\\QUERY\\2014\\eval\\source_documents' 70 | printPath(1,path) 71 | 72 | pass -------------------------------------------------------------------------------- /doc_find_hou.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding:utf8 -*- 3 | 4 | import os 5 | import sys 6 | 7 | 8 | NotFindQID=[] 9 | NotFindQNAME=[] 10 | DBIdNameText='' 11 | 12 | def edit(str1, str2): 13 | matrix = [[i + j for j in range(len(str2) + 1)] for i in range(len(str1) + 1)] 14 | 15 | for i in range(1, len(str1) + 1): 16 | for j in range(1, len(str2) + 1): 17 | if str1[i - 1] == str2[j - 1]: 18 | d = 0 19 | else: 20 | d = 1 21 | matrix[i][j] = min(matrix[i - 1][j] + 1, matrix[i][j - 1] + 1, matrix[i - 1][j - 1] + d) 22 | 23 | return matrix[len(str1)][len(str2)] 24 | 25 | 26 | 27 | def DBIdName2Text(): 28 | 29 | path='DBIdName.txt' 30 | count=0 31 | T=open('DBNameText.txt','w',encoding='utf-8') 32 | with open(path,encoding='utf-8') as f: 33 | line=f.readline().strip('\n') 34 | tempText='' 35 | while line: 36 | 37 | bath=int(count/20000) 38 | 
if(bath<=(count/20000)2000): 58 | # break 59 | print(count) 60 | line=f.readline().strip('\n') 61 | T.close() 62 | 63 | pass 64 | 65 | 66 | 67 | def ReadDBNameText(): 68 | global DBIdNameText 69 | 70 | with open('DBNameText.txt',encoding='utf-8') as f: 71 | line=f.readline().strip('\n') 72 | while line: 73 | DBIdNameText=DBIdNameText+' '+line 74 | line = f.readline().strip('\n') 75 | 76 | pass 77 | 78 | 79 | 80 | 81 | 82 | def WanQuanYingPiPei(docQueryPath,dbEntityPath): 83 | ansf=open(dbEntityPath,'w',encoding='utf-8') 84 | with open(docQueryPath,encoding='utf-8') as f: 85 | line=f.readline().strip('\n') 86 | while line: 87 | if(line.find(' NIL')==-1): 88 | pos=line.find(' E0') 89 | docId=line.split(' ')[0] 90 | query=line[len(docId)+1:pos] 91 | ansId=line[pos+1:pos+9] 92 | entity=line[pos+10:] 93 | else: 94 | pos=line.find(' NIL') 95 | docId=line.split(' ')[0] 96 | query=line[len(docId)+1:pos+1] 97 | end=line.find(' XXX') 98 | ansId=line[pos+1:end] 99 | entity=line[pos+ 9:] 100 | print(docId) 101 | # print(query)###### mention 102 | # print(ansId)###### 正确答案的id 103 | # print(entity)####### answer 104 | ###############我现在是想non-NIL的候选集找30个,NIL的可以不管 105 | houxuanId='' 106 | querypos=DBIdNameText.find(query.lower()) 107 | houCount=0 108 | while querypos!=-1: 109 | 110 | begpos=DBIdNameText.find('E0',querypos-10) 111 | endpos=DBIdNameText.find('E0',querypos) 112 | if(begpos!=-1): 113 | findStr=DBIdNameText[begpos:endpos] 114 | kuohaoPos=findStr.find('(') 115 | # print(findStr) 116 | if(kuohaoPos!=-1 and findStr.find(query.lower())]*>','',text) 43 | emoticons = re.findall('(?::|;|=)(?:\)|\(|D|P)',text) 44 | text = re.sub('[\W]+',' ',text.lower())+''.join(emoticons).replace('-','') 45 | return text.strip(' ') 46 | 47 | 48 | def WordTokener( sent): # 将单句字符串分割成词 49 | result = '' 50 | wordsInStr = nltk.word_tokenize(sent) 51 | return wordsInStr 52 | 53 | 54 | def RemoveStopWords(sent): 55 | stopwords = {}.fromkeys([line.rstrip() for line in open('stopwords.txt',encoding='utf-8')]) 56 | segs = jieba.cut(sent, cut_all=False) 57 | final = '' 58 | for seg in segs: 59 | if seg not in stopwords: 60 | final += seg 61 | return final 62 | 63 | 64 | def DocWordfrequency(doc): 65 | words = doc.strip('\n').split(' ') 66 | word_count = {} 67 | word_list=[] 68 | 69 | for w in words: 70 | if w in word_count: 71 | word_count[w] += 1 72 | else: 73 | word_count[w] = 1 74 | 75 | 76 | for w in sorted(zip(word_count.values(), word_count.keys()), reverse=True): # 安装词频排序 77 | # print(w) 78 | if(len(w[1].strip(' '))>0): 79 | wStr=w[1]+' '+str(w[0]) 80 | word_list.append(wStr) 81 | 82 | return word_list 83 | 84 | 85 | def NewText(word_tag): 86 | 87 | String='' 88 | for i in word_tag: 89 | 90 | word=i[0] 91 | tag=i[1] 92 | # if(tag.find('N')!=-1): 93 | # print(i) 94 | ###去掉一些消歧无意义的单词 95 | #去掉CD、PRP、VBD、CC、WDT,IN,RP,TO,DT 96 | hold=['NN','NNS','NNP'] 97 | if tag in hold: 98 | String=String+word+' ' 99 | 100 | return String 101 | 102 | 103 | def getContentIndex(currentText,currentQuery,docIndex,doc_id): 104 | # print("currentText:"+currentText) 105 | # print("currentQuery:"+currentQuery) 106 | # print(docIndex) 107 | # print(doc_id) 108 | # print(len(currentText)) 109 | 110 | if (currentText[-(len(currentQuery)+2):].find(currentQuery) != -1): 111 | CurText = preprocessor(currentText) 112 | CurQuery = preprocessor(currentQuery) 113 | print('****'+currentText[-(len(currentQuery)):]) 114 | print('****'+currentQuery) 115 | 116 | 117 | if (CurText.find(CurQuery) != -1): 118 | print('####'+CurText[-(len(CurQuery)):]) 119 | 
print('####'+CurQuery) 120 | 121 | textSplit = CurText.split(' ') 122 | querySplit = CurQuery.split(' ') 123 | index_beg = len(textSplit)-1 - len(querySplit) 124 | index_end = len(textSplit)-1 125 | print(CurQuery) 126 | print(textSplit[index_beg:index_end]) 127 | 128 | curTextIndex = getWordIndex(CurText) 129 | curQueryIndex = getWordIndex(CurQuery) 130 | if (curTextIndex.find(curQueryIndex) != -1): 131 | 132 | contentIndex = '' 133 | for i in range(index_beg - 10, index_beg): 134 | if (i < 0): 135 | print(i) 136 | print(len(docIndex)) 137 | contentIndex = contentIndex+'0' + ' ' 138 | else: 139 | print(i) 140 | print(len(docIndex)) 141 | contentIndex = contentIndex + docIndex[i] + ' ' 142 | 143 | 144 | for i in range(index_end + 1, index_end + 11): 145 | if (i >= len(docIndex)): 146 | contentIndex = contentIndex + '0' + ' ' 147 | else: 148 | contentIndex = contentIndex + docIndex[i] + ' ' 149 | 150 | print(docIndex[index_beg-1:index_end]) 151 | entityIndex='' 152 | for i in range(index_beg,index_end): 153 | entityIndex=entityIndex+docIndex[i]+' ' 154 | print('entityIndex:'+entityIndex) 155 | print(doc_id) 156 | print('contentIndex:' + contentIndex) 157 | else: 158 | print("转换成索引后,找不到了!!!!!") 159 | 160 | else: 161 | print("找不到了!!!!!") 162 | 163 | else: 164 | print(doc_id) 165 | print(currentText[-(len(currentQuery) + 2):]) 166 | print(currentQuery) 167 | print("转化后找不到了!!!!") 168 | 169 | 170 | return entityIndex,contentIndex,index_beg,index_end 171 | 172 | 173 | 174 | def getContentNounIndex(currentText,hou_halfText,currentQuery,docIndex,doc_id): 175 | print("currentText:"+currentText) 176 | print("hou_halfText:"+hou_halfText) 177 | print("currentQuery:"+currentQuery) 178 | print(docIndex) 179 | print(doc_id) 180 | print(len(currentText)) 181 | 182 | if (currentText[-(len(currentQuery)+2):].find(currentQuery) != -1): 183 | CurText = preprocessor(currentText) 184 | CurQuery = preprocessor(currentQuery) 185 | print('****'+currentText[-(len(currentQuery)):]) 186 | print('****'+currentQuery) 187 | 188 | 189 | if (CurText.find(CurQuery) != -1): 190 | print('####'+CurText[-(len(CurQuery)):]) 191 | print('####'+CurQuery) 192 | 193 | textSplit = CurText.split(' ') 194 | querySplit = CurQuery.split(' ') 195 | index_beg = len(textSplit)-1 - len(querySplit) 196 | index_end = len(textSplit)-1 197 | print(CurQuery) 198 | print(textSplit[index_beg:index_end]) 199 | 200 | curTextIndex = getWordIndex(CurText) 201 | curQueryIndex = getWordIndex(CurQuery) 202 | if (curTextIndex.find(curQueryIndex) != -1): 203 | 204 | contentIndex = '' 205 | 206 | soup1 = BeautifulSoup(currentText, 'html.parser') 207 | words1 = nltk.word_tokenize(soup1.get_text()) 208 | word_tag1 = nltk.pos_tag(words1) 209 | newText1 = NewText(word_tag1) 210 | textIndex1 = getWordIndex(newText1).strip(' ') 211 | textIndex1 = '0 0 0 0 0 0 0 0 0 0 '+textIndex1 212 | print('textIndex1:'+textIndex1) 213 | Index1 = textIndex1.split(' ')[-10:] 214 | print(Index1) 215 | 216 | 217 | soup2 = BeautifulSoup(hou_halfText, 'html.parser') 218 | words2 = nltk.word_tokenize(soup2.get_text()) 219 | word_tag2 = nltk.pos_tag(words2) 220 | newText2 = NewText(word_tag2) 221 | textIndex2 = getWordIndex(newText2).strip(' ') 222 | print('textIndex2:'+textIndex2) 223 | textIndex2 = textIndex2+' 0 0 0 0 0 0 0 0 0 0' 224 | Index2 = textIndex2.split(' ')[:10] 225 | print(Index2) 226 | 227 | for i in Index1: 228 | contentIndex=contentIndex+' '+i 229 | for j in Index2: 230 | contentIndex=contentIndex+' '+j 231 | 232 | contentIndex=contentIndex.strip(' ') 233 | print("find 
noun index:"+contentIndex) 234 | 235 | print(docIndex[index_beg-1:index_end]) 236 | entityIndex='' 237 | for i in range(index_beg,index_end): 238 | entityIndex=entityIndex+docIndex[i]+' ' 239 | print('entityIndex:'+entityIndex) 240 | else: 241 | print("转换成索引后,找不到了!!!!!") 242 | 243 | else: 244 | print("找不到了!!!!!") 245 | 246 | else: 247 | print(doc_id) 248 | print(currentText[-(len(currentQuery) + 2):]) 249 | print(currentQuery) 250 | print("转化后找不到了!!!!") 251 | 252 | 253 | return contentIndex,entityIndex,index_beg,index_end 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | def readIdNameFile(dbIdNamePath): 263 | global db_id_list 264 | global db_name_list 265 | IdNameMap=dict() 266 | tempId='' 267 | tempName='' 268 | count=0 269 | with open(dbIdNamePath,encoding='utf-8') as f: 270 | line=f.readline().strip('\n') 271 | while line: 272 | 273 | if(count%2==0): 274 | db_id_list.append(line) 275 | tempId=line 276 | # print(line) 277 | if(count%2==1): 278 | db_name_list.append(line) 279 | tempName=line 280 | IdNameMap[tempId]=str(int((count+1)/2))+'###'+tempName 281 | 282 | count=count+1 283 | line=f.readline().strip('\n') 284 | return IdNameMap 285 | 286 | pass 287 | 288 | def getWordIndex(docText): 289 | docIndex = '' 290 | for i in (preprocessor(docText).strip(' ').split(' ')): 291 | if (i in wordVecIndexMap): 292 | docIndex = docIndex + str(wordVecIndexMap[i]) + ' ' 293 | else: 294 | docIndex = docIndex + '0' + ' ' 295 | 296 | return docIndex.strip(' ') 297 | 298 | 299 | def readQueryFile(path): 300 | global query_id_list 301 | global query_name_list 302 | global doc_id_list 303 | global query_beg_list 304 | global query_end_list 305 | 306 | with open(path,encoding='utf-8') as f: 307 | line=f.readline() 308 | 309 | while line: 310 | if(line.find('') 313 | id=line[pos1+11:pos2] 314 | query_id_list.append(id) 315 | # print(id) 316 | if(line.find('')!=-1): 317 | pos1=line.find('') 318 | pos2=line.find('') 319 | name=line[pos1+6:pos2] 320 | query_name_list.append(name.strip(' ')) 321 | # print(name) 322 | if(line.find('')!=-1): 323 | pos1 = line.find('') 324 | pos2 = line.find('') 325 | doc = line[pos1+7:pos2] 326 | doc_id_list.append(doc) 327 | # print(doc) 328 | if(line.find('')!=-1): 329 | pos1 = line.find('') 330 | pos2 = line.find('') 331 | beg = line[pos1+5:pos2] 332 | query_beg_list.append(beg) 333 | # print(beg) 334 | if(line.find('')!=-1): 335 | pos1 = line.find('') 336 | pos2 = line.find('') 337 | end = line[pos1+5:pos2] 338 | query_end_list.append(end) 339 | # print(end) 340 | 341 | line=f.readline() 342 | 343 | pass 344 | 345 | 346 | def readAnswerFile(answerPath): 347 | global answer_query_id_list 348 | global answer_query_name_list 349 | global answer_db_id_list 350 | global answer_db_name_list 351 | 352 | with open(answerPath, encoding='utf-8') as ansf: 353 | ans_line = ansf.readline().strip('\n') 354 | while ans_line: 355 | pos1 = ans_line.find("query_id=") 356 | pos2 = ans_line.find("query_name=") 357 | pos3 = ans_line.find("db_id=") 358 | pos4 = ans_line.find("db_name=") 359 | query_id = ans_line[pos1 + 9:pos2].strip(' ') 360 | query_name = ans_line[pos2 + 11:pos3].strip(' ') 361 | db_id = ans_line[pos3 + 6:pos4].strip(' ') 362 | db_name = ans_line[pos4 + 8:len(ans_line)].strip(' ') 363 | 364 | # print(query_id) 365 | # print(query_name) 366 | # print(db_id) 367 | # print(db_name) 368 | answer_query_id_list.append(query_id) 369 | answer_query_name_list.append(query_name) 370 | answer_db_id_list.append(db_id) 371 | answer_db_name_list.append(db_name) 372 | 373 | ans_line = 
ansf.readline().strip('\n') 374 | pass 375 | 376 | pass 377 | 378 | 379 | 380 | 381 | 382 | def findDocQueryAndDbAnswer(docPath,year,testFalg): 383 | 384 | if(testFlag): 385 | docP=u'H:\yaojuan\QUERY\\'+year+'\eval\\source_documents' 386 | else: 387 | docP = u'H:\yaojuan\QUERY\\' + year + '\\training\\source_documents' 388 | # OOOMap={} 389 | # # 检查一下query的答案是不都有链接的实体 390 | # path = u'H:\\yaojuan\\EntityLinkingData\\DB_id_index.txt' 391 | # count = 0 392 | # all = 0 393 | # with open(path,encoding='utf-8') as f: 394 | # line = f.readline() 395 | # while line: 396 | # all = all + 1 397 | # if (len(line) < 12): 398 | # count = count + 1 399 | # OOOMap[line.split(' ')[0]]="糟糕!!!有答案实体与任何实体没有关系" 400 | # else: 401 | # OOOMap[line.split(' ')[0]]="OK!!!!没问题" 402 | # 403 | # line = f.readline() 404 | 405 | if(testFlag): 406 | queryIndexFile = open('H:\yaojuan\QUERY\\'+year+'\eval\word2vec\\test\\test_queryIndex.txt','w',encoding='utf-8') 407 | queryNounIndexFile = open('H:\yaojuan\QUERY\\' + year + '\eval\word2vec\\test\\test_queryNounIndex.txt', 'w', encoding='utf-8') 408 | queryTextIndexFile = open('H:\yaojuan\QUERY\\' + year + '\eval\word2vec\\test\\test_queryTextIndex.txt', 'w', encoding='utf-8') 409 | else: 410 | queryIndexFile = open('H:\yaojuan\QUERY\\' + year + '\\training\word2vec\\train\\train_queryIndex.txt', 'w', encoding='utf-8') 411 | queryNounIndexFile = open('H:\yaojuan\QUERY\\' + year + '\\training\word2vec\\train\\train_queryNounIndex.txt', 'w', encoding='utf-8') 412 | queryTextIndexFile = open('H:\yaojuan\QUERY\\' + year + '\\training\word2vec\\train\\train_queryTextIndex.txt', 'w', encoding='utf-8') 413 | 414 | with open(docPath,encoding='utf-8') as docF: 415 | doc_line=docF.readline() 416 | doc_count=0 417 | 418 | while doc_line: 419 | pos=doc_line.find(".txt") 420 | doc_id=doc_line[:pos] 421 | doc_count=doc_count+1 422 | # print(doc_id) 423 | if (doc_count > 0): 424 | if(testFlag): 425 | docfile= open('H:\yaojuan\QUERY\\'+year+'\eval\word2vec\\test_docQuery\doc_'+str(doc_count)+'.txt','w',encoding='utf-8') 426 | unidocfile = open('H:\yaojuan\QUERY\\'+year+'\eval\word2vec\\test_docQuery\doc_' + str(doc_count) + '_uni.txt', 'w',encoding='utf-8') 427 | docindexfile = open('H:\yaojuan\QUERY\\'+year+'\eval\word2vec\\test_docQuery\doc_' + str(doc_count) + '_Windex.txt', 'w', encoding='utf-8') 428 | textfile=open('H:\yaojuan\QUERY\\'+year+'\eval\word2vec\\test_docText\\text_'+str(doc_count)+'.txt','w',encoding='utf-8') 429 | else: 430 | docfile = open('H:\yaojuan\QUERY\\' + year + '\\training\word2vec\\train_docQuery\doc_' + str(doc_count) + '.txt', 'w', encoding='utf-8') 431 | unidocfile = open('H:\yaojuan\QUERY\\' + year + '\\training\word2vec\\train_docQuery\doc_' + str(doc_count) + '_uni.txt', 'w', encoding='utf-8') 432 | docindexfile = open('H:\yaojuan\QUERY\\' + year + '\\training\word2vec\\train_docQuery\doc_' + str(doc_count) + '_Windex.txt', 'w', encoding='utf-8') 433 | textfile = open('H:\yaojuan\QUERY\\' + year + '\\training\word2vec\\train_docText\\text_' + str(doc_count) + '.txt', 'w', encoding='utf-8') 434 | 435 | with open(docP + "\\" + doc_id+'.txt' , encoding='utf-8') as textF: 436 | docText = textF.read() 437 | DocText=preprocessor(docText.replace('\n',' '))###预处理后的文本 438 | docIndex=getWordIndex(DocText).split(' ')###预处理后的文本单词索引 439 | 440 | 441 | tempList = [] 442 | for i in range(len(doc_id_list)): 443 | if(doc_id_list[i]==doc_id): 444 | 445 | # print(doc_id_list[i]) 446 | # print(query_name_list[i]) 447 | # print(query_id_list[i]) 448 | # tempString = 
doc_id_list[i] + ' ' + query_id_list[i] + ' ' + query_name_list[i] + '\n' 449 | # docfile.write(tempString) 450 | 451 | for j in range(len(answer_query_id_list)): 452 | if(answer_query_id_list[j]==query_id_list[i]): 453 | # print(answer_query_id_list[j]) 454 | # print(query_id_list[i]) 455 | # print('query name:'+query_name_list[i]+' find query:'+DocText[int(query_beg_list[i]):int(query_end_list[i])+1].replace('\n',' ')) 456 | 457 | currentText = docText[:int(query_end_list[i]) + 1].replace('\n', ' ') 458 | hou_halfText = docText[int(query_end_list[i]) + 1:].replace('\n', ' ') 459 | currentQuery = query_name_list[i] 460 | entityIndex,contentIndex,index_beg,index_end=getContentIndex(currentText,currentQuery,docIndex,doc_id) 461 | queryIndexFile.write(query_id_list[i]+' '+contentIndex.strip(' ')+'\n') 462 | queryIndexFile.write(query_id_list[i]+' '+entityIndex.strip(' ')+'\n') 463 | contentNounIndex,_,_,_=getContentNounIndex(currentText,hou_halfText,currentQuery,docIndex,doc_id) 464 | queryNounIndexFile.write(query_id_list[i]+' '+contentNounIndex.strip(' ')+'\n') 465 | queryNounIndexFile.write(query_id_list[i]+' '+entityIndex.strip(' ')+'\n') 466 | 467 | 468 | 469 | 470 | 471 | tempString = doc_id_list[i] + ' ' + answer_query_id_list[j] + ' ' + answer_query_name_list[j] +' '+answer_db_id_list[j]+' '+answer_db_name_list[j]+' beg='+query_beg_list[i]+' end='+query_end_list[i] 472 | indexString = doc_id_list[i] + ' ' + answer_query_id_list[j] + ' ' + answer_query_name_list[j] + ' ' + answer_db_id_list[j] + ' ' + answer_db_name_list[j] + ' index_beg=' + \ 473 | str(index_beg) + ' index_end=' + str(index_end) 474 | tpString = doc_id_list[i] + ' ' + answer_query_name_list[j] +' '+answer_db_id_list[j]+' '+answer_db_name_list[j] 475 | 476 | 477 | 478 | docindexfile.write(indexString+'\n') 479 | docindexfile.write(entityIndex+'\n') 480 | docindexfile.write(contentIndex+'\n') 481 | 482 | 483 | 484 | docfile.write(tempString+'\n') 485 | tempList.append(tpString) 486 | # if(answer_db_id_list[j].find('NIL')==-1): 487 | # if(OOOMap[answer_db_id_list[j]].find("没问题")==-1): 488 | # print(OOOMap[answer_db_id_list[j]]) 489 | # print(tempString) 490 | 491 | 492 | with open(docP + "\\" + doc_id + '.txt', encoding='utf-8') as textF: 493 | docText = textF.read() 494 | soup = BeautifulSoup(docText, 'html.parser') 495 | words = nltk.word_tokenize(soup.get_text()) 496 | word_tag = nltk.pos_tag(words) 497 | newText = NewText(word_tag) 498 | queryTextIndexFile.write(query_id_list[i]+' '+getWordIndex(newText).strip(' ') + '\n') 499 | queryTextIndexFile.write(query_id_list[i]+' '+entityIndex.strip(' ')+'\n') 500 | print("textIdex:" + getWordIndex(newText)) 501 | 502 | 503 | 504 | tpList=set(tempList) 505 | for tp in tpList: 506 | unidocfile.write(tp+'\n') 507 | 508 | docfile.close() 509 | 510 | 511 | 512 | with open(docP + "\\" + doc_id +'.txt', encoding='utf-8') as textF: 513 | docText = textF.read() 514 | soup = BeautifulSoup(docText, 'html.parser') 515 | words = nltk.word_tokenize(soup.get_text()) 516 | word_tag = nltk.pos_tag(words) 517 | newText = NewText(word_tag) 518 | textfile.write(newText+'\n') 519 | fre=DocWordfrequency(newText) 520 | for w in fre: 521 | # wStr= 522 | textfile.write(w + '\n') 523 | # print(newText) 524 | # print(fre) 525 | 526 | 527 | doc_line = docF.readline() 528 | 529 | # queryIndexFile.close() 530 | 531 | 532 | pass 533 | 534 | 535 | 536 | if __name__=='__main__': 537 | # readWordVecIndex() 538 | # docPath=u'DocFileName.txt' 539 | # 
queryPath=u'H:\\yaojuan\\QUERY\\2014\\eval\\tac_kbp_2014_english_EDL_evaluation_queries.xml' 540 | # answerPath=u'newAnswer.txt' 541 | # dbIdNamePath=u'DBIdName.txt' 542 | # 543 | # readIdNameFile(dbIdNamePath) 544 | # readQueryFile(queryPath) 545 | # readAnswerFile(answerPath) 546 | # findDocQueryAndDbAnswer(docPath,queryPath) 547 | 548 | year = '2014' ####2012 2013 2014 trian2014 549 | testFlag = False 550 | readWordVecIndex() 551 | if(testFlag): 552 | docPath = u'H:\yaojuan\QUERY\\'+year+'\eval\\test\DocFileName.txt' 553 | queryPath = u'H:\yaojuan\QUERY\\'+year+'\eval\\tac_kbp_'+year+'_english_entity_linking_evaluation_queries.xml' 554 | answerPath = u'H:\yaojuan\QUERY\\'+year+'\eval\\test\\answer.txt' 555 | dbIdNamePath = u'DBIdName.txt' 556 | else: 557 | docPath = u'H:\yaojuan\QUERY\\' + year + '\\training\\train\DocFileName.txt' 558 | queryPath = u'H:\yaojuan\QUERY\\' + year + '\\training\\tac_kbp_' + year + '_english_EDL_training_queries.xml' 559 | answerPath = u'H:\yaojuan\QUERY\\' + year + '\\training\\train\\answer.txt' 560 | dbIdNamePath = u'DBIdName.txt' 561 | 562 | readIdNameFile(dbIdNamePath) 563 | readQueryFile(queryPath) 564 | readAnswerFile(answerPath) 565 | findDocQueryAndDbAnswer(docPath, year, testFlag) 566 | 567 | 568 | 569 | pass 570 | 571 | 572 | 573 | 574 | 575 | 576 | 577 | 578 | 579 | 580 | 581 | 582 | 583 | 584 | -------------------------------------------------------------------------------- /doc_query_no_beg_end.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding:utf8 -*- 3 | 4 | import os 5 | import sys 6 | import jieba 7 | import nltk 8 | from bs4 import BeautifulSoup 9 | import re 10 | query_id_list=[] 11 | query_name_list=[] 12 | doc_id_list=[] 13 | 14 | answer_query_id_list=[] 15 | answer_query_name_list=[] 16 | answer_query_name_list=[] 17 | answer_db_id_list=[] 18 | answer_db_name_list=[] 19 | 20 | db_id_list=[] 21 | db_name_list=[] 22 | wordVecIndexMap=dict() 23 | 24 | def readWordVecIndex(): 25 | global wordVecIndexMap 26 | # path=u'E:\mypython_Linking\CNN\glove.6B.100d.txt' 27 | path = u'E:\data_analysis\word2vec\\vector_100.txt' 28 | count=0 29 | with open(path,encoding='utf-8') as f: 30 | line=f.readline() 31 | while line: 32 | count=count+1 33 | wordVecIndexMap[line.split(' ')[0]]=count 34 | line=f.readline() 35 | 36 | return wordVecIndexMap 37 | 38 | 39 | def preprocessor(text): 40 | text = re.sub('<[^>]*>','',text) 41 | emoticons = re.findall('(?::|;|=)(?:\)|\(|D|P)',text) 42 | text = re.sub('[\W]+',' ',text.lower())+''.join(emoticons).replace('-','') 43 | return text.strip(' ') 44 | 45 | 46 | def WordTokener( sent): # 将单句字符串分割成词 47 | result = '' 48 | wordsInStr = nltk.word_tokenize(sent) 49 | return wordsInStr 50 | 51 | 52 | def RemoveStopWords(sent): 53 | stopwords = {}.fromkeys([line.rstrip() for line in open('stopwords.txt',encoding='utf-8')]) 54 | segs = jieba.cut(sent, cut_all=False) 55 | final = '' 56 | for seg in segs: 57 | if seg not in stopwords: 58 | final += seg 59 | return final 60 | 61 | 62 | def DocWordfrequency(doc): 63 | words = doc.strip('\n').split(' ') 64 | word_count = {} 65 | word_list=[] 66 | 67 | for w in words: 68 | if w in word_count: 69 | word_count[w] += 1 70 | else: 71 | word_count[w] = 1 72 | 73 | 74 | for w in sorted(zip(word_count.values(), word_count.keys()), reverse=True): # 安装词频排序 75 | # print(w) 76 | if(len(w[1].strip(' '))>0): 77 | wStr=w[1]+' '+str(w[0]) 78 | word_list.append(wStr) 79 | 80 | return word_list 81 | 82 | 83 | def 
NewText(word_tag): 84 | 85 | String='' 86 | for i in word_tag: 87 | 88 | word=i[0] 89 | tag=i[1] 90 | # if(tag.find('N')!=-1): 91 | # print(i) 92 | ###去掉一些消歧无意义的单词 93 | #去掉CD、PRP、VBD、CC、WDT,IN,RP,TO,DT 94 | hold=['NN','NNS','NNP'] 95 | if tag in hold: 96 | String=String+word+' ' 97 | 98 | return String 99 | 100 | 101 | def getContentIndex(currentText,currentQuery,docIndex,doc_id): 102 | print("currentText:"+currentText) 103 | print("currentQuery:"+currentQuery) 104 | 105 | if (currentText.find(currentQuery) != -1): 106 | pos=currentText.find(currentQuery) 107 | CurText = preprocessor(currentText[:pos+len(currentQuery)]) 108 | print('CurText='+CurText) 109 | CurQuery = preprocessor(currentQuery) 110 | print('****'+currentText) 111 | print('****'+currentQuery) 112 | 113 | 114 | if (CurText.find(CurQuery) != -1): 115 | pos_beg=CurText.rfind(CurQuery) 116 | 117 | print('####'+CurText[pos_beg:pos_beg+len(CurQuery)]) 118 | print('####'+CurQuery) 119 | 120 | CurText = CurText[:pos_beg + len(CurQuery)] 121 | textSplit = CurText.split(' ') 122 | querySplit = CurQuery.split(' ') 123 | index_beg = len(textSplit)-1 - len(querySplit) 124 | index_end = len(textSplit)-1 125 | print("index_beg="+str(index_beg)) 126 | print("index_end="+str(index_end)) 127 | print(textSplit[index_beg:index_end]) 128 | # print(CurQuery) 129 | # print(textSplit[index_beg:index_end]) 130 | 131 | curTextIndex = getWordIndex(CurText) 132 | curQueryIndex = getWordIndex(CurQuery) 133 | if (curTextIndex.find(curQueryIndex) != -1): 134 | 135 | contentIndex = '' 136 | for i in range(index_beg - 10, index_beg): 137 | if (i < 0): 138 | contentIndex = contentIndex + '0' + ' ' 139 | else: 140 | contentIndex = contentIndex + docIndex[i] + ' ' 141 | 142 | for i in range(index_end + 1, index_end + 11): 143 | if (i >= len(docIndex)): 144 | contentIndex = contentIndex + '0' + ' ' 145 | else: 146 | contentIndex = contentIndex + docIndex[i] + ' ' 147 | 148 | print(docIndex[index_beg-1:index_end]) 149 | entityIndex='' 150 | for i in range(index_beg,index_end): 151 | entityIndex=entityIndex+docIndex[i]+' ' 152 | print('entityIndex:'+entityIndex) 153 | print(doc_id) 154 | print('contentIndex:' + contentIndex) 155 | else: 156 | print("转换成索引后,找不到了!!!!!") 157 | 158 | else: 159 | print("找不到了!!!!!") 160 | 161 | else: 162 | print(doc_id) 163 | print(currentText[-(len(currentQuery) + 2):]) 164 | print(currentQuery) 165 | print("转化后找不到了!!!!") 166 | 167 | 168 | return entityIndex,contentIndex,index_beg,index_end 169 | 170 | 171 | def getContentNounIndex(currentText,currentQuery,docIndex,doc_id): 172 | print("currentText:"+currentText) 173 | print("currentQuery:"+currentQuery) 174 | 175 | if (currentText.find(currentQuery) != -1): 176 | pos=currentText.find(currentQuery) 177 | CurText = preprocessor(currentText[:pos+len(currentQuery)]) 178 | CurQuery = preprocessor(currentQuery) 179 | qian_currentText=currentText[:pos+len(currentQuery)] 180 | hou_halfText=currentText[pos+len(currentQuery):] 181 | print('****'+currentText) 182 | print('****'+currentQuery) 183 | 184 | 185 | if (CurText.find(CurQuery) != -1): 186 | pos_beg=CurText.rfind(CurQuery) 187 | 188 | print('####'+CurText[pos_beg:pos_beg+len(CurQuery)]) 189 | print('####'+CurQuery) 190 | 191 | CurText = CurText[:pos_beg + len(CurQuery)] 192 | textSplit = CurText.split(' ') 193 | querySplit = CurQuery.split(' ') 194 | index_beg = len(textSplit)-1 - len(querySplit) 195 | index_end = len(textSplit)-1 196 | print("index_beg="+str(index_beg)) 197 | print("index_end="+str(index_end)) 198 | 
print(textSplit[index_beg:index_end]) 199 | # print(CurQuery) 200 | # print(textSplit[index_beg:index_end]) 201 | 202 | curTextIndex = getWordIndex(CurText) 203 | curQueryIndex = getWordIndex(CurQuery) 204 | if (curTextIndex.find(curQueryIndex) != -1): 205 | 206 | contentIndex = '' 207 | 208 | soup1 = BeautifulSoup(qian_currentText, 'html.parser') 209 | words1 = nltk.word_tokenize(soup1.get_text()) 210 | word_tag1 = nltk.pos_tag(words1) 211 | newText1 = NewText(word_tag1) 212 | textIndex1 = getWordIndex(newText1).strip(' ') 213 | textIndex1 = '0 0 0 0 0 0 0 0 0 0 ' + textIndex1 214 | print('textIndex1:' + textIndex1) 215 | Index1 = textIndex1.split(' ')[-10:] 216 | print(Index1) 217 | 218 | soup2 = BeautifulSoup(hou_halfText, 'html.parser') 219 | words2 = nltk.word_tokenize(soup2.get_text()) 220 | word_tag2 = nltk.pos_tag(words2) 221 | newText2 = NewText(word_tag2) 222 | textIndex2 = getWordIndex(newText2).strip(' ') 223 | print('textIndex2:' + textIndex2) 224 | textIndex2 = textIndex2 + ' 0 0 0 0 0 0 0 0 0 0' 225 | Index2 = textIndex2.split(' ')[:10] 226 | print(Index2) 227 | 228 | for i in Index1: 229 | contentIndex = contentIndex + ' ' + i 230 | for j in Index2: 231 | contentIndex = contentIndex + ' ' + j 232 | 233 | contentIndex = contentIndex.strip(' ') 234 | print("find noun index:" + contentIndex) 235 | 236 | 237 | print(docIndex[index_beg-1:index_end]) 238 | entityIndex='' 239 | for i in range(index_beg,index_end): 240 | entityIndex=entityIndex+docIndex[i]+' ' 241 | print('entityIndex:'+entityIndex) 242 | print(doc_id) 243 | print('contentIndex:' + contentIndex) 244 | else: 245 | print("转换成索引后,找不到了!!!!!") 246 | 247 | else: 248 | print("找不到了!!!!!") 249 | 250 | else: 251 | print(doc_id) 252 | print(currentText[-(len(currentQuery) + 2):]) 253 | print(currentQuery) 254 | print("转化后找不到了!!!!") 255 | 256 | 257 | return contentIndex,entityIndex,index_beg,index_end 258 | 259 | 260 | 261 | 262 | 263 | 264 | 265 | 266 | 267 | def readIdNameFile(dbIdNamePath): 268 | global db_id_list 269 | global db_name_list 270 | IdNameMap=dict() 271 | tempId='' 272 | tempName='' 273 | count=0 274 | with open(dbIdNamePath,encoding='utf-8') as f: 275 | line=f.readline().strip('\n') 276 | while line: 277 | 278 | if(count%2==0): 279 | db_id_list.append(line) 280 | tempId=line 281 | # print(line) 282 | if(count%2==1): 283 | db_name_list.append(line) 284 | tempName=line 285 | IdNameMap[tempId]=str(int((count+1)/2))+'###'+tempName 286 | 287 | count=count+1 288 | line=f.readline().strip('\n') 289 | return IdNameMap 290 | 291 | pass 292 | 293 | def getWordIndex(docText): 294 | docIndex = '' 295 | for i in (preprocessor(docText).strip(' ').split(' ')): 296 | if (i in wordVecIndexMap): 297 | docIndex = docIndex + str(wordVecIndexMap[i]) + ' ' 298 | else: 299 | docIndex = docIndex + '0' + ' ' 300 | 301 | return docIndex.strip(' ') 302 | 303 | 304 | def readQueryFile(path): 305 | global query_id_list 306 | global query_name_list 307 | global doc_id_list 308 | 309 | 310 | with open(path,encoding='utf-8') as f: 311 | line=f.readline() 312 | 313 | while line: 314 | if(line.find('') 317 | id=line[pos1+11:pos2] 318 | query_id_list.append(id) 319 | # print(id) 320 | if(line.find('')!=-1): 321 | pos1=line.find('') 322 | pos2=line.find('') 323 | name=line[pos1+6:pos2] 324 | query_name_list.append(name.strip(' ')) 325 | # print(name) 326 | if(line.find('')!=-1): 327 | pos1 = line.find('') 328 | pos2 = line.find('') 329 | doc = line[pos1+7:pos2] 330 | doc_id_list.append(doc) 331 | # print(doc) 332 | 333 | line=f.readline() 334 | 
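    # --- added explanatory comment ---
    # readQueryFile() above pulls <query id="...">, <name> and <docid> out of the
    # TAC-KBP query file by substring search on each raw line, so it relies on every
    # tag sitting on its own line. A sketch of the same extraction with the standard
    # library XML parser, assuming the query file is well-formed XML (variable names
    # here are illustrative, not part of the original script):
    #
    #   import xml.etree.ElementTree as ET
    #   root = ET.parse(path).getroot()
    #   for q in root.iter('query'):
    #       query_id_list.append(q.get('id'))
    #       query_name_list.append(q.findtext('name', default='').strip())
    #       doc_id_list.append(q.findtext('docid', default=''))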
335 | pass 336 | 337 | 338 | def readAnswerFile(answerPath): 339 | global answer_query_id_list 340 | global answer_query_name_list 341 | global answer_db_id_list 342 | global answer_db_name_list 343 | 344 | with open(answerPath, encoding='utf-8') as ansf: 345 | ans_line = ansf.readline().strip('\n') 346 | while ans_line: 347 | pos1 = ans_line.find("query_id=") 348 | pos2 = ans_line.find("query_name=") 349 | pos3 = ans_line.find("db_id=") 350 | pos4 = ans_line.find("db_name=") 351 | query_id = ans_line[pos1 + 9:pos2].strip(' ') 352 | query_name = ans_line[pos2 + 11:pos3].strip(' ') 353 | db_id = ans_line[pos3 + 6:pos4].strip(' ') 354 | db_name = ans_line[pos4 + 8:len(ans_line)].strip(' ') 355 | 356 | # print(query_id) 357 | # print(query_name) 358 | # print(db_id) 359 | # print(db_name) 360 | answer_query_id_list.append(query_id) 361 | answer_query_name_list.append(query_name) 362 | answer_db_id_list.append(db_id) 363 | answer_db_name_list.append(db_name) 364 | 365 | ans_line = ansf.readline().strip('\n') 366 | pass 367 | 368 | pass 369 | 370 | 371 | 372 | 373 | 374 | def findDocQueryAndDbAnswer(docPath,year,testFlag): 375 | if(testFlag): 376 | docP=u'H:\yaojuan\QUERY\\'+year+'\\eval\\source_documents' 377 | else: 378 | docP=u'H:\yaojuan\QUERY\\'+year+'\\training\\source_documents' 379 | 380 | # OOOMap={} 381 | # # 检查一下query的答案是不都有链接的实体 382 | # path = u'H:\\yaojuan\\EntityLinkingData\\DB_id_index.txt' 383 | # count = 0 384 | # all = 0 385 | # with open(path,encoding='utf-8') as f: 386 | # line = f.readline() 387 | # while line: 388 | # all = all + 1 389 | # if (len(line) < 12): 390 | # count = count + 1 391 | # OOOMap[line.split(' ')[0]]="糟糕!!!有答案实体与任何实体没有关系" 392 | # else: 393 | # OOOMap[line.split(' ')[0]]="OK!!!!没问题" 394 | # 395 | # line = f.readline() 396 | if(testFlag): 397 | queryIndexFile = open('H:\yaojuan\QUERY\\'+year+'\\eval\word2vec\\test\\test_queryIndex.txt','w',encoding='utf-8') 398 | queryNounIndexFile = open('H:\yaojuan\QUERY\\' + year + '\eval\word2vec\\test\\test_queryNounIndex.txt', 'w', encoding='utf-8') 399 | queryTextIndexFile = open('H:\yaojuan\QUERY\\' + year + '\\eval\word2vec\\test\\test_queryTextIndex.txt', 'w', encoding='utf-8') 400 | else: 401 | queryIndexFile = open('H:\yaojuan\QUERY\\' + year + '\\training\word2vec\\train\\train_queryIndex.txt', 'w', encoding='utf-8') 402 | queryNounIndexFile = open('H:\yaojuan\QUERY\\' + year + '\\training\word2vec\\train\\train_queryNounIndex.txt', 'w', encoding='utf-8') 403 | queryTextIndexFile = open('H:\yaojuan\QUERY\\' + year + '\\training\word2vec\\train\\train_queryTextIndex.txt', 'w',encoding='utf-8') 404 | 405 | with open(docPath,encoding='utf-8') as docF: 406 | doc_line=docF.readline() 407 | doc_count=0 408 | 409 | while doc_line: 410 | 411 | pos=doc_line.find(".xml") 412 | doc_id=doc_line[:pos] 413 | doc_count=doc_count+1 414 | # print(doc_id) 415 | 416 | 417 | if(doc_count>0): 418 | if(testFlag): 419 | docfile= open('H:\yaojuan\QUERY\\'+year+'\\eval\word2vec\\test_docQuery\doc_'+str(doc_count)+'.txt','w',encoding='utf-8') 420 | unidocfile = open('H:\yaojuan\QUERY\\'+year+'\\eval\word2vec\\test_docQuery\doc_' + str(doc_count) + '_uni.txt', 'w',encoding='utf-8') 421 | docindexfile = open('H:\yaojuan\QUERY\\'+year+'\\eval\word2vec\\test_docQuery\doc_' + str(doc_count) + '_Windex.txt', 'w', encoding='utf-8') 422 | textfile=open('H:\yaojuan\QUERY\\'+year+'\\eval\word2vec\\test_docText\\text_'+str(doc_count)+'.txt','w',encoding='utf-8') 423 | else: 424 | docfile = open('H:\yaojuan\QUERY\\' + year + 
'\\training\word2vec\\train_docQuery\doc_' + str(doc_count) + '.txt', 'w', encoding='utf-8') 425 | unidocfile = open('H:\yaojuan\QUERY\\' + year + '\\training\word2vec\\train_docQuery\doc_' + str(doc_count) + '_uni.txt', 'w', encoding='utf-8') 426 | docindexfile = open('H:\yaojuan\QUERY\\' + year + '\\training\word2vec\\train_docQuery\doc_' + str(doc_count) + '_Windex.txt', 'w', encoding='utf-8') 427 | textfile = open('H:\yaojuan\QUERY\\' + year + '\\training\word2vec\\train_docText\\text_' + str(doc_count) + '.txt', 'w', encoding='utf-8') 428 | 429 | with open(docP + "\\" + doc_id+'.xml' , encoding='utf-8') as textF: 430 | docText = textF.read() 431 | DocText=preprocessor(docText.replace('\n',' '))###预处理后的文本 432 | docIndex=getWordIndex(DocText).split(' ')###预处理后的文本单词索引 433 | 434 | 435 | tempList = [] 436 | for i in range(len(doc_id_list)): 437 | if(doc_id_list[i]==doc_id): 438 | 439 | # print(doc_id_list[i]) 440 | # print(query_name_list[i]) 441 | # print(query_id_list[i]) 442 | tempString = doc_id_list[i] + ' ' + query_id_list[i] + ' ' + query_name_list[i] + '\n' 443 | docfile.write(tempString) 444 | 445 | for j in range(len(answer_query_id_list)): 446 | if(answer_query_id_list[j]==query_id_list[i]): 447 | # print(answer_query_id_list[j]) 448 | # print(query_id_list[i]) 449 | # print('query name:'+query_name_list[i]+' find query:'+DocText[int(query_beg_list[i]):int(query_end_list[i])+1].replace('\n',' ')) 450 | 451 | currentText = docText.replace('\n',' ') 452 | currentQuery = query_name_list[i] 453 | entityIndex,contentIndex,index_beg,index_end=getContentIndex(currentText,currentQuery,docIndex,doc_id) 454 | queryIndexFile.write(query_id_list[i]+' '+contentIndex.strip(' ')+'\n') 455 | queryIndexFile.write(query_id_list[i] + ' ' + entityIndex.strip(' ') + '\n') 456 | contentNounIndex, _, _, _ = getContentNounIndex(currentText, currentQuery, docIndex, doc_id) 457 | print('contentNounIndex:'+contentNounIndex) 458 | print('entityIndex:'+entityIndex) 459 | queryNounIndexFile.write(query_id_list[i] + ' ' + contentNounIndex.strip(' ') + '\n') 460 | queryNounIndexFile.write(query_id_list[i] + ' ' + entityIndex.strip(' ') + '\n') 461 | 462 | 463 | 464 | 465 | 466 | tempString = doc_id_list[i] + ' ' + answer_query_id_list[j] + ' ' + answer_query_name_list[j] +' '+answer_db_id_list[j]+' '+answer_db_name_list[j]+' beg=' 467 | indexString = doc_id_list[i] + ' ' + answer_query_id_list[j] + ' ' + answer_query_name_list[j] + ' ' + answer_db_id_list[j] + ' ' + answer_db_name_list[j] + ' index_beg=' + \ 468 | str(index_beg) + ' index_end=' + str(index_end) 469 | tpString = doc_id_list[i] + ' ' + answer_query_name_list[j] +' '+answer_db_id_list[j]+' '+answer_db_name_list[j] 470 | 471 | 472 | 473 | docindexfile.write(indexString+'\n') 474 | docindexfile.write(entityIndex+'\n') 475 | docindexfile.write(contentIndex+'\n') 476 | 477 | 478 | docfile.write(tempString+'\n') 479 | print("tempString tempString:"+tempString) 480 | tempList.append(tpString) 481 | # if(answer_db_id_list[j].find('NIL')==-1): 482 | # if(OOOMap[answer_db_id_list[j]].find("没问题")==-1): 483 | # print(OOOMap[answer_db_id_list[j]]) 484 | # print(tempString) 485 | 486 | with open(docP + "\\" + doc_id + '.xml', encoding='utf-8') as textF: 487 | docText = textF.read() 488 | soup = BeautifulSoup(docText, 'html.parser') 489 | words = nltk.word_tokenize(soup.get_text()) 490 | word_tag = nltk.pos_tag(words) 491 | newText = NewText(word_tag) 492 | queryTextIndexFile.write( 493 | query_id_list[i] + ' ' + getWordIndex(newText).strip(' ') + '\n') 494 | 
queryTextIndexFile.write(query_id_list[i] + ' ' + entityIndex.strip(' ') + '\n') 495 | print("textIdex:" + getWordIndex(newText)) 496 | 497 | tpList=set(tempList) 498 | for tp in tpList: 499 | unidocfile.write(tp+'\n') 500 | 501 | docfile.close() 502 | 503 | 504 | 505 | with open(docP + "\\" + doc_id +'.xml', encoding='utf-8') as textF: 506 | docText = textF.read() 507 | soup = BeautifulSoup(docText, 'html.parser') 508 | words = nltk.word_tokenize(soup.get_text()) 509 | word_tag = nltk.pos_tag(words) 510 | newText = NewText(word_tag) 511 | textfile.write(newText+'\n') 512 | fre=DocWordfrequency(newText) 513 | for w in fre: 514 | # wStr= 515 | textfile.write(w + '\n') 516 | # print(newText) 517 | # print(fre) 518 | 519 | 520 | doc_line = docF.readline() 521 | 522 | # queryIndexFile.close() 523 | 524 | 525 | pass 526 | 527 | 528 | 529 | if __name__=='__main__': 530 | 531 | year='2010' #2009 2010 2011 trian2010 532 | testFlag=True 533 | readWordVecIndex() 534 | if(testFlag): 535 | docPath = u'H:\yaojuan\QUERY\\'+year+'\\eval\\test\DocFileName.txt' 536 | queryPath = u'H:\yaojuan\QUERY\\'+year+'\\eval\\tac_kbp_'+year+'_english_entity_linking_evaluation_queries.xml' 537 | answerPath = u'H:\yaojuan\QUERY\\'+year+'\\eval\\test\\answer.txt' 538 | dbIdNamePath = u'DBIdName.txt' 539 | else: 540 | #####train train train 2010 2010 2010 541 | docPath = u'H:\yaojuan\QUERY\\' + year + '\\training\\train\DocFileName.txt' 542 | queryPath = u'H:\yaojuan\QUERY\\' + year + '\\training\\tac_kbp_' + year + '_english_entity_linking_training_queries.xml' 543 | answerPath = u'H:\yaojuan\QUERY\\' + year + '\\training\\train\\answer.txt' 544 | dbIdNamePath = u'DBIdName.txt' 545 | 546 | 547 | readIdNameFile(dbIdNamePath) 548 | readQueryFile(queryPath) 549 | readAnswerFile(answerPath) 550 | findDocQueryAndDbAnswer(docPath, year,testFlag) 551 | 552 | pass 553 | 554 | 555 | 556 | 557 | 558 | 559 | 560 | 561 | 562 | 563 | 564 | 565 | 566 | 567 | -------------------------------------------------------------------------------- /handleDBIdName.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding:utf8 -*- 3 | 4 | import os 5 | import sys 6 | 7 | def handleDBIdName(path): 8 | 9 | with open(path,encoding='utf-8') as f: 10 | line=f.readline().strip('\n') 11 | 12 | while line: 13 | print(line) 14 | if(line.find('(')): 15 | pass 16 | 17 | 18 | 19 | line = f.readline().strip('\n') 20 | 21 | 22 | pass 23 | 24 | 25 | 26 | 27 | if __name__=='__main__': 28 | path='DBIdName.txt' 29 | handleDBIdName(path) 30 | 31 | pass -------------------------------------------------------------------------------- /handle_db_entity.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding:utf8 -*- 3 | 4 | # 5 | # ansf=open('all.txt','w',encoding='utf-8') 6 | import random 7 | 8 | count1=0 9 | uniMap=dict() 10 | 11 | def handleTrainDBEntity(dbPath,queryPath,newf,allF): 12 | 13 | Map=dict() 14 | with open(dbPath,encoding='utf-8') as f: 15 | line=f.readline().strip('\n') 16 | lineCount=0 17 | while line: 18 | if(lineCount%2==0): 19 | subStr=line[len(line.split(' ')[0]):] 20 | # print(subStr) 21 | if(lineCount%2==1): 22 | ans=line 23 | Map[subStr]=line 24 | pass 25 | lineCount=lineCount+1 26 | line=f.readline().strip('\n') 27 | 28 | with open(queryPath,encoding='utf-8') as f: 29 | line=f.readline().strip('\n') 30 | while line: 31 | lineCount=lineCount+1 32 | subStr=line[len(line.split(' ')[0]+' '+line.split(' 
')[1]):line.find(' beg=')] 33 | # print(dbPath) 34 | # print(subStr) 35 | # print(line) 36 | # print(Map[subStr]) 37 | newf.write(line+'\n') 38 | newf.write(Map[subStr]+'\n') 39 | queryId=line.split(' ')[1] 40 | templine=Map[subStr].split(' ') 41 | ansId=templine[0] 42 | ansNum=templine[1] 43 | if(subStr.find('XXXXX')==-1): 44 | # if(len(templine)-2==1): 45 | # print(subStr) 46 | flag=True 47 | for i in range(len(templine)-2): 48 | houId=templine[i+2] 49 | if(len(houId.strip(' '))>0): 50 | if(ansId==houId): 51 | tempStr=queryId+' '+houId+' '+'1' 52 | allF.write(tempStr+'\n') 53 | elif(ansId!=houId and flag): 54 | tempStr=queryId+' '+houId+' '+'0' 55 | allF.write(tempStr+'\n') 56 | flag=False 57 | 58 | line=f.readline().strip('\n') 59 | newf.close() 60 | 61 | pass 62 | 63 | 64 | 65 | 66 | def handleDBEntity(dbPath,queryPath,newf,allF): 67 | global count1 68 | global uniMap 69 | map=dict() 70 | Map=dict() 71 | with open(dbPath,encoding='utf-8') as f: 72 | line=f.readline().strip('\n') 73 | lineCount=0 74 | while line: 75 | if(lineCount%2==0): 76 | subStr=line[len(line.split(' ')[0]):] 77 | # print(subStr) 78 | if(lineCount%2==1): 79 | ans=line 80 | Map[subStr]=line 81 | pass 82 | lineCount=lineCount+1 83 | line=f.readline().strip('\n') 84 | with open(queryPath,encoding='utf-8') as f: 85 | line=f.readline().strip('\n') 86 | 87 | while line: 88 | lineCount=lineCount+1 89 | subStr=line[len(line.split(' ')[0]+' '+line.split(' ')[1]):line.find(' beg=')] 90 | print(line) 91 | print(dbPath) 92 | print(subStr) 93 | print(line) 94 | print(Map[subStr]) 95 | newf.write(line+'\n') 96 | newf.write(Map[subStr]+'\n') 97 | queryId=line.split(' ')[1] 98 | templine=Map[subStr].split(' ') 99 | # ansId=templine[0] 100 | # ansNum=templine[1] 101 | # print(ansId) 102 | for i in range(len(templine)): 103 | if(i==0): 104 | ansId=templine[0] 105 | if(i==1): 106 | ansNum=int(templine[1]) 107 | if(i>1 and ansNum>0): 108 | houId=templine[i] 109 | if(len(houId.strip(' '))>0): 110 | if(ansId==houId): 111 | tempStr=queryId+' '+houId+' '+'1' 112 | if tempStr not in uniMap: 113 | uniMap[tempStr]=1 114 | else: 115 | uniMap[tempStr]=uniMap[tempStr]+1 116 | count1=count1+1 117 | allF.write(tempStr+'\n') 118 | else: 119 | if(ansId.find('E0')!=-1): 120 | if ansId not in map: 121 | map[ansId]=1 122 | print(ansId) 123 | tempStr=queryId+' '+houId+' '+'0' 124 | allF.write(tempStr+'\n') 125 | # if(ansId.find('NIL')==-1): 126 | # tempStr=queryId+' '+ansId+' '+'1' 127 | # allF.write(tempStr + '\n') 128 | # break 129 | 130 | line=f.readline().strip('\n') 131 | newf.close() 132 | 133 | 134 | pass 135 | 136 | 137 | 138 | if __name__=='__main__': 139 | 140 | # allF=open('H:\yaojuan\QUERY\\2010\\training\\train_all_data.txt','w',encoding='utf-8') 141 | # for i in range(1453): 142 | # 143 | # dbPath='H:\yaojuan\QUERY\\2010\\training\\train_dbEntity\db_'+str(i+1)+'.txt' 144 | # queryPath='H:\yaojuan\QUERY\\2010\\training\\train_docQuery\doc_'+str(i+1)+'.txt' 145 | # newdbPath='H:\yaojuan\QUERY\\2010\\training\\train_dbEntity\db_'+str(i+1)+'_new.txt' 146 | # newf=open(newdbPath,'w',encoding='utf-8') 147 | # handleDBEntity(dbPath,queryPath,newf,allF) 148 | # 149 | # allF.close() 150 | # print(len(uniMap)) 151 | # print(count1) 152 | 153 | year='2009'; fileNum=3695 154 | # year='2010';fileNum=2231 155 | # year='2011'; fileNum=2231 156 | # year='2012'; fileNum=2016 157 | # year='2013'; fileNum=1820 158 | # year='2014'; fileNum=138 159 | 160 | 161 | # allF = open('H:\yaojuan\QUERY\\'+year+'\\eval\\test_all_data.txt', 'w', encoding='utf-8') 162 | 
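# --- Illustrative sketch (not part of the original script) ---
# handleDBEntity above writes one "queryId candidateId label" line per candidate,
# with label 1 when the candidate equals the gold answer id and 0 otherwise. The
# core pairing step, written as a self-contained helper over hypothetical inputs
# (the example ids below are made up), is roughly:
def sketch_make_pairs(query_id, gold_id, candidate_ids):
    """Return (query_id, candidate_id, label) triples for one query."""
    return [(query_id, cand, 1 if cand == gold_id else 0)
            for cand in candidate_ids if cand.strip()]

# Example: sketch_make_pairs('EL0001', 'E0123456', ['E0123456', 'E0999999'])
# gives [('EL0001', 'E0123456', 1), ('EL0001', 'E0999999', 0)].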
allF = open("H:\yaojuan\QUERY\\"+year+"\eval\word2vec\\test_all_data.txt', 'w', encoding='utf-8") 163 | 164 | for i in range(fileNum): 165 | dbPath = 'H:\yaojuan\QUERY\\'+year+'\\eval\\test_dbEntity\db_' + str(i + 1) + '.txt' 166 | queryPath = 'H:\yaojuan\QUERY\\'+year+'\\eval\\test_docQuery\doc_' + str(i + 1) + '.txt' 167 | newdbPath = 'H:\yaojuan\QUERY\\'+year+'\\eval\\test_dbEntity\db_' + str(i + 1) + '_new.txt' 168 | newf = open(newdbPath, 'w', encoding='utf-8') 169 | handleDBEntity(dbPath, queryPath, newf, allF) 170 | 171 | allF.close() 172 | print(len(uniMap)) 173 | print(count1) 174 | 175 | 176 | 177 | 178 | # allF = open('train_all_data.txt', 'w', encoding='utf-8') 179 | # for i in range(138): 180 | # dbPath = 'E:\mypython_Linking\data_handle\\train_dbEntity\db_' + str(i + 1) + '.txt' 181 | # queryPath = 'E:\mypython_Linking\data_handle\\train_docQuery\doc_' + str(i + 1) + '.txt' 182 | # newdbPath = 'E:\mypython_linking\data_handle\\train_dbEntity\db_' + str(i + 1) + '_new.txt' 183 | # newf = open(newdbPath, 'w', encoding='utf-8') 184 | # handleTrainDBEntity(dbPath, queryPath, newf, allF) 185 | # 186 | # allF.close() 187 | # pass 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | -------------------------------------------------------------------------------- /handle_min_db.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding:utf8 -*- 3 | 4 | import jieba 5 | import nltk 6 | from bs4 import BeautifulSoup 7 | from data_handle import doc_query 8 | wordVecIndexMap=dict() 9 | 10 | def Score(): 11 | pass 12 | 13 | # def readWordVecIndex(): 14 | # global wordVecIndexMap 15 | # path=u'E:\mypython_Linking\CNN\glove.6B.100d.txt' 16 | # #path = u'E:\data_analysis\word2vec\\vector_100.txt' 17 | # count=0 18 | # with open(path,encoding='utf-8') as f: 19 | # line=f.readline() 20 | # while line: 21 | # count=count+1 22 | # wordVecIndexMap[line.split(' ')[0]]=count 23 | # line=f.readline() 24 | # 25 | # return wordVecIndexMap 26 | 27 | 28 | def NewText(word_tag): 29 | 30 | String='' 31 | for i in word_tag: 32 | 33 | word=i[0] 34 | tag=i[1] 35 | # if(tag.find('N')!=-1): 36 | # print(i) 37 | ###去掉一些消歧无意义的单词 38 | #去掉CD、PRP、VBD、CC、WDT,IN,RP,TO,DT 39 | not_hold=['CD'] 40 | if tag not in not_hold: 41 | if(len(word)>1): 42 | String=String+word+' ' 43 | 44 | return String 45 | 46 | 47 | 48 | def FindRelation(mindbPath,newMinDBTextPath,relation,mindbIndex,mindbAllIndex): 49 | 50 | newf=open(newMinDBTextPath,'w',encoding='utf-8') 51 | dbText='' 52 | attributeText='' 53 | entityName='' 54 | entityId='' 55 | with open(mindbPath, encoding='utf-8') as f: 56 | line=f.readline() 57 | while line: 58 | # print(line) 59 | if(line.find('2): 103 | line=linkMap[key].split(' ') 104 | for i in line: 105 | if(i not in IdSet): 106 | IdSet.add(i) 107 | tempStoreIdSet.add(i) 108 | 109 | 110 | 111 | 112 | 113 | def produceMyEntityVec(path,newPath,idPath,DB_id_index_Path,min_DB_id_index_Path): 114 | IdMap=dict() 115 | with open(path,encoding='utf-8') as f: 116 | line=f.readline().strip('\n') 117 | while line: 118 | id=line.split(' ')[0] 119 | IdMap[id]=line 120 | line=f.readline().strip('\n') 121 | 122 | linkMap=dict() 123 | with open(DB_id_index_Path,encoding='utf-8') as f: 124 | line=f.readline().strip('\n').strip(' ') 125 | while line: 126 | id = line.split(' ')[0] 127 | linkMap[id]=line 128 | line=f.readline().strip('\n').strip(' ') 129 | 130 | newf=open(newPath,'w',encoding='utf-8') 131 | 
idf = open(idPath,'w',encoding='utf-8') 132 | minf = open(min_DB_id_index_Path,'w',encoding='utf-8') 133 | IdCount=0 134 | for id in IdSet: 135 | newf.write(IdMap[id]+'\n') 136 | idf.write('/m/'+id+' '+str(IdCount)+'\n') 137 | IdCount=IdCount+1 138 | minf.write(linkMap[id]+'\n') 139 | 140 | 141 | if __name__=='__main__': 142 | 143 | ###### 144 | # readFile() 145 | # print(len(IdSet)) 146 | # if 'E0487663' in IdSet: 147 | # print("E0487663 E0487663 E0487663 E0487663") 148 | # else: 149 | # print('WAN WAN WAN WAN WAN WAN') 150 | # path = 'E:\mypython_Linking\CNN\entityvec.txt' 151 | # newPath = 'E:\mypython_Linking\CNN\my_entity_vecs.txt' 152 | # idPath = 'E:\mypython_Linking\CNN\entityId.txt' 153 | # DB_id_index_Path = 'H:\yaojuan\QUERY\juan\zuizhongkuochong\DB_id_index.txt' 154 | # min_DB_id_index_Path = 'E:\mypython_Linking\CNN\min_DB_id_index.txt' 155 | # produceMyEntityVec(path,newPath,idPath,DB_id_index_Path,min_DB_id_index_Path) 156 | 157 | 158 | #####################bu no link entity ######################## 159 | readFile() 160 | # IdSet.remove('E0753362') 161 | print(len(IdSet)) 162 | 163 | if 'E0487663' in IdSet: 164 | print("E0487663 E0487663 E0487663 E0487663") 165 | else: 166 | print('WAN WAN WAN WAN WAN WAN') 167 | # path = 'E:\mypython_Linking\CNN\entityvec.txt'#总的实体向量,有八十多万 ###读 168 | path = 'H:\yaojuan\QUERY\juan\zuizhongkuochong\entityvec.txt'#总的实体向量,有八十多万 ###读 169 | newPath = 'E:\mypython_Linking\CNN\my_entity_vecs_bu.txt'###生成的新文件 170 | idPath = 'E:\mypython_Linking\CNN\entityId_bu.txt'###生成的新文件 171 | DB_id_index_Path = 'H:\yaojuan\QUERY\juan\zuizhongkuochong\DB_id_index_bu.txt'###读 172 | min_DB_id_index_Path = 'E:\mypython_Linking\CNN\min_DB_id_index_bu.txt'###生成的新文件 173 | produceMyEntityVec(path, newPath, idPath, DB_id_index_Path, min_DB_id_index_Path) 174 | 175 | 176 | 177 | 178 | -------------------------------------------------------------------------------- /myCNN.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | import numpy 3 | import tensorflow as tf 4 | from keras.datasets import mnist 5 | from keras.models import Sequential 6 | import keras.backend as K 7 | from keras.layers import Dense 8 | from keras.layers import Dropout 9 | from keras.layers import Flatten 10 | from keras.layers.convolutional import Conv2D 11 | from keras.layers.convolutional import MaxPooling2D 12 | from keras.utils import np_utils 13 | import matplotlib.pyplot as plt 14 | from keras.constraints import maxnorm 15 | from keras.optimizers import SGD 16 | from keras.layers import * 17 | from keras.models import * 18 | import data_load 19 | import sys 20 | # from CNN import data_load 21 | from keras.utils.vis_utils import plot_model 22 | 23 | # import os 24 | # os.environ["PATH"] += os.pathsep + 'D:/Program Files (x86)/Graphviz2.38/bin/' 25 | year=sys.argv[1] 26 | test_query_path=sys.argv[2] 27 | test_db_path=sys.argv[3] 28 | 29 | #求余弦函数 30 | def cosVector(x,y): 31 | if(len(x)!=len(y)): 32 | print('error input,x and y is not in the same space') 33 | return; 34 | result1=0.0; 35 | result2=0.0; 36 | result3=0.0; 37 | for i in range(len(x)): 38 | result1+=x[i]*y[i] #sum(X*Y) 39 | result2+=x[i]**2 #sum(X*X) 40 | result3+=y[i]**2 #sum(Y*Y) 41 | #print("result is "+str(result1/((result2*result3)**0.5))) #结果显示 42 | return result1/((result2*result3)**0.5) 43 | 44 | 45 | 46 | # fix random seed for reproducibility 47 | seed = 7 48 | numpy.random.seed(seed) 49 | # load data 50 | (query_train,query_name_train,db_train,db_name_train,lab_train) = 
data_load.readTrainAllData(local=False) 51 | (query_test,query_name_test,db_test,db_name_test,lab_test) = data_load.readAllData(local=False,year=year,queryPath=test_query_path,dbPath=test_db_path) 52 | 53 | # query_train = query_train.reshape(query_train.shape[0],query_train.shape[1],query_train.shape[2]).astype('float32') 54 | # #query_name_train = query_name_train.reshape(query_name_train.shape[0],query_name_train.shape[1],query_name_train.shape[2]).astype('float32') 55 | # query_test = query_test.reshape(query_test.shape[0],query_test.shape[1],query_test.shape[2]).astype('float32') 56 | # db_train = db_train.reshape(db_train.shape[0],db_train.shape[1],db_train.shape[2]).astype('float32') 57 | # db_test = db_test.reshape(db_test.shape[0],db_test.shape[1],db_test.shape[2]).astype('float32') 58 | num_classes = 2 59 | 60 | 61 | # #定义记录位置信息的共现变量 62 | # pos=np.random.uniform(-0.01,0.01,size=(query_train.shape[1],query_train.shape[2])) 63 | # POS=[] 64 | # for i in range(query_train.shape[0]): 65 | # POS.append(pos) 66 | # Pos=np.asarray(POS) 67 | # print(Pos.shape) 68 | # query_pos_train=np.concatenate((query_train,Pos),axis=2) 69 | # 70 | # POS=[] 71 | # for i in range(query_test.shape[0]): 72 | # POS.append(pos) 73 | # Pos=np.asarray(POS) 74 | # print(Pos.shape) 75 | # query_pos_test=np.concatenate((query_test,Pos),axis=2) 76 | 77 | # 自定义query模型 78 | query_input=Input(shape=(query_train.shape[1], query_train.shape[2])) 79 | query_conv1=Conv1D(30, 5, padding='valid', activation='relu')(query_input) 80 | query_maxp1=MaxPooling1D(pool_size=2)(query_conv1) 81 | query_drop1=Dropout(0.4)(query_maxp1) 82 | query_conv2=Conv1D(15, 3, activation='relu')(query_drop1) 83 | query_maxp2=MaxPooling1D(pool_size=2)(query_conv2) 84 | query_drop2=Dropout(0.4)(query_maxp2) 85 | query_flat1=Flatten()(query_drop2) 86 | 87 | query_name_input=Input(shape=(query_name_train.shape[1],query_name_train.shape[2])) 88 | query_name_flat1=Flatten()(query_name_input) 89 | query_union=Concatenate()([query_flat1,query_name_flat1]) 90 | 91 | query_dens1=Dense(128, activation='relu')(query_union) 92 | query_drop3=Dropout(0.4)(query_dens1) 93 | query_dens2=Dense(50, activation='relu')(query_drop3) 94 | query_drop4=Dropout(0.4)(query_dens2) 95 | query_model=Dense(20, activation='softmax',name='query_model')(query_drop4) 96 | 97 | # 自定义db模型 98 | db_input=Input(shape=(db_train.shape[1], db_train.shape[2])) 99 | db_conv1=Conv1D(30, 5, padding='valid', activation='relu')(db_input) 100 | db_maxp1=MaxPooling1D(pool_size=2)(db_conv1) 101 | db_drop1=Dropout(0.4)(db_maxp1) 102 | db_conv2=Conv1D(15, 3, activation='relu')(db_drop1) 103 | db_maxp2=MaxPooling1D(pool_size=2)(db_conv2) 104 | db_drop2=Dropout(0.4)(db_maxp2) 105 | db_flat1=Flatten()(db_drop2) 106 | 107 | db_name_input=Input(shape=(db_name_train.shape[1],db_name_train.shape[2])) 108 | db_name_flat1=Flatten()(db_name_input) 109 | db_union=Concatenate()([db_flat1,db_name_flat1]) 110 | 111 | db_dens1=Dense(128, activation='relu')(db_union) 112 | db_drop3=Dropout(0.4)(db_dens1) 113 | db_dens2=Dense(50, activation='relu')(db_drop3) 114 | db_drop4=Dropout(0.4)(db_dens2) 115 | db_model=Dense(20, activation='softmax',name='db_model')(db_drop4) 116 | 117 | print(query_model.shape) 118 | print(db_model.shape) 119 | 120 | # x=Concatenate()([query_model,db_model]) 121 | # We stack a deep densely-connected network on top 122 | x = Multiply(name='Multiply')([query_model,db_model]) 123 | x = Dense(10, activation='relu')(x) 124 | # x = Dense(64, activation='relu')(x) 125 | # x = Dense(64, 
activation='relu')(x) 126 | 127 | # And finally we add the main logistic regression layer 128 | main_output = Dense(1, activation='sigmoid', name='main_output')(x) 129 | 130 | 131 | model = Model(inputs=[query_input,query_name_input,db_input,db_name_input],outputs=main_output) 132 | model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 133 | 134 | 135 | # new_model = Dot([query_model,db_model]) 136 | 137 | # 138 | # plot_model(model,to_file='myCNN_model_1.png', show_shapes=True) 139 | 140 | # Fit the model 141 | model.fit([query_train,query_name_train,db_train,db_name_train],lab_train, epochs=100, batch_size=200, verbose=2) 142 | # Final evaluation of the model 143 | scores = model.evaluate([query_test,query_name_test,db_test,db_name_test], lab_test, verbose=0) 144 | print("Large CNN Error: %.2f%%" % (100 - scores[1] * 100)) 145 | 146 | #save model 147 | model.save('myCNN_model_1.h5') 148 | 149 | # 已有的model在load权重过后 150 | # 取某一层的输出为输出新建为model,采用函数模型 151 | query_layer_model = Model(inputs=model.input, 152 | outputs=model.get_layer('query_model').output) 153 | db_layer_model = Model(inputs=model.input, 154 | outputs=model.get_layer('db_model').output) 155 | 156 | # 以这个model的预测值作为输出 157 | query_output = query_layer_model.predict([query_test,query_name_test,db_test,db_name_test]) 158 | db_output = db_layer_model.predict([query_test,query_name_test,db_test,db_name_test]) 159 | 160 | 161 | model_output = model.predict([query_test,query_name_test,db_test,db_name_test]) 162 | print(model_output.shape) 163 | 164 | predictFile=open(year+'predict.txt','w',encoding='utf-8') 165 | for i in range(model_output.shape[0]): 166 | x=model_output[i] 167 | pre=1 / float(1 + np.exp(- x)) 168 | predictFile.write(str(pre)+'\n') 169 | predictFile.close() 170 | 171 | 172 | #计算query_output和db_output的余弦值,用60*1的向量存储 173 | rows=query_output.shape[0] #行数 174 | cols=query_output.shape[1] #列数 175 | cosResult= [[0]*1 for i in range(rows)] 176 | 177 | 178 | for i in range(rows): 179 | cosResult[i][0]=cosVector(query_output[i], db_output[i]) 180 | 181 | #print(cosResult) 182 | 183 | file=open(year+'_cos.txt','w') 184 | for i in cosResult: 185 | file.write(str(i).replace('[','').replace(']','')+'\n') #\r\n为换行符 186 | 187 | file.close() 188 | -------------------------------------------------------------------------------- /process_wiki.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yaoyao2/Entity-Linking/18d02199ef37045c2642325cb6e3d6f73f76b4f0/process_wiki.py -------------------------------------------------------------------------------- /produce_entity_index.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | 3 | 4 | 5 | def produce_entity_index(entity2idPath,DB_id_index_Path,allDBIdexPath): 6 | 7 | idMap=dict() 8 | with open(entity2idPath,encoding='utf-8') as f: 9 | line=f.readline().strip('\n') 10 | while line: 11 | id=line.split(' ')[0][3:] 12 | index=int(line.split(' ')[1]) 13 | # print(id) 14 | # print(index+1) 15 | idMap[id]=index+1 16 | line=f.readline().strip('\n') 17 | 18 | dbf=open(allDBIdexPath,'w',encoding='utf-8') 19 | with open(DB_id_index_Path,encoding='utf-8') as f: 20 | line=f.readline().strip('\n').strip(' ') 21 | lineCount=0 22 | totalCount=0 23 | nolinkCount=0 24 | while line: 25 | lineCount=lineCount+1 26 | Ids=line.split(' ') 27 | tempStr=Ids[0] 28 | if(len(Ids)==1): 29 | tempStr=tempStr+' '+str(idMap[Ids[0]]) 30 | dbf.write(tempStr+'\n') 31 | 
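# --- Illustrative sketch (not part of the original script) ---
# produce_entity_index rewrites every "id id id ..." line of DB_id_index into the
# numeric indices taken from the entity2id file (each index is shifted by +1 when
# building idMap, presumably so that 0 stays free). A compact version of that
# per-line conversion, with the lookup table passed in directly (names and example
# ids are hypothetical), might be:
def sketch_ids_to_indices(line, id_map):
    """Turn 'E0000001 E0000002 ...' into 'E0000001 12 57 ...' using id_map."""
    ids = line.split(' ')
    return ids[0] + ' ' + ' '.join(str(id_map[i]) for i in ids if i in id_map)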
dbf.write(tempStr+'\n') 32 | nolinkCount=nolinkCount+1 33 | else: 34 | for i in range(len(Ids)): 35 | if(Ids[i]!='E0006472' and Ids[i]!='E0186505' and Ids[i]!='E0532473'): 36 | totalCount = totalCount + 1 37 | tempStr=tempStr+' '+str(idMap[Ids[i]]) 38 | dbf.write(tempStr.strip(' ') + '\n') 39 | dbf.write(Ids[0]+' '+str(idMap[Ids[0]])+'\n') 40 | print(tempStr) 41 | 42 | 43 | line=f.readline().strip('\n').strip(' ') 44 | print('average link='+str(int(totalCount/lineCount))) 45 | print(nolinkCount) 46 | 47 | 48 | pass 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | if __name__=='__main__': 59 | 60 | 61 | # entity2idPath='E:\mypython_Linking\CNN\entityId.txt' 62 | # DB_id_index_Path='E:\mypython_Linking\CNN\min_DB_id_index.txt' 63 | # allDBIdexPath = 'E:\mypython_Linking\CNN\min_DBIndex.txt' 64 | # produce_entity_index(entity2idPath,DB_id_index_Path,allDBIdexPath) 65 | 66 | 67 | ###############bu no link entity############## 68 | entity2idPath = 'E:\mypython_Linking\CNN\entityId_bu.txt' 69 | DB_id_index_Path = 'E:\mypython_Linking\CNN\min_DB_id_index_bu.txt' 70 | allDBIdexPath = 'E:\mypython_Linking\CNN\min_DBIndex_bu.txt'####生成新文件 71 | produce_entity_index(entity2idPath, DB_id_index_Path, allDBIdexPath) 72 | 73 | pass -------------------------------------------------------------------------------- /produce_min_db.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding:utf8 -*- 3 | 4 | from data_handle import doc_query 5 | 6 | Map=dict() 7 | AnsIdSet=set() 8 | StoreTempSet=set() 9 | 10 | def ReadDBIndex(indexPath): 11 | global Map 12 | with open(indexPath,encoding='utf-8') as f: 13 | line=f.readline().strip('\n').strip(' ') 14 | while line: 15 | L=line.split(' ') 16 | Map[L[0]]=line 17 | line=f.readline().strip('\n').strip(' ') 18 | return Map 19 | 20 | 21 | 22 | def ReadAnswer(answerPath): 23 | global AnsIdSet 24 | lineCount=0 25 | with open(answerPath,encoding='utf-8') as f: 26 | line=f.readline().strip('\n').strip(' ') 27 | while line: 28 | if(lineCount%2==0): 29 | pass 30 | if(lineCount%2==1): 31 | line=line.split(' ') 32 | for i in range(len(line)-2): 33 | if(len(line[i+2])>2): 34 | # print(line[i+2]) 35 | AnsIdSet.add(line[i+2]) 36 | lineCount=lineCount+1 37 | line=f.readline().strip('\n').strip(' ') 38 | return AnsIdSet 39 | 40 | 41 | 42 | def ProduceMinDB(dbTextPath,year): 43 | global StoreTempSet 44 | f=open(u'H:\yaojuan\QUERY\\'+year+'\\eval\\test_minDB.txt','w',encoding='utf-8') 45 | for key in Map: 46 | if(key in AnsIdSet): 47 | line=Map[key].split(' ') 48 | for i in line: 49 | if(i not in AnsIdSet): 50 | AnsIdSet.add(i) 51 | StoreTempSet.add(i) 52 | 53 | tempF=open('Temp.txt','w',encoding='utf-8') 54 | while len(StoreTempSet)!=1: 55 | ###E0006472没有 56 | print(len(StoreTempSet)) 57 | print(StoreTempSet) 58 | for key in Map: 59 | if(key in StoreTempSet): 60 | StoreTempSet.remove(key) 61 | line=Map[key].split(' ') 62 | for i in line: 63 | if(i not in AnsIdSet): 64 | 65 | tempF.write(i+'\n') 66 | 67 | AnsIdSet.add(i) 68 | StoreTempSet.add(i) 69 | 70 | IdNameMap = doc_query.readIdNameFile(dbIdNamePath=u'DBIdName.txt') 71 | tempCount=0 72 | AnsIdSet.remove('E0006472') 73 | 74 | for i in AnsIdSet: 75 | print(len(AnsIdSet)) 76 | if(i.find('E0')==-1): 77 | print(i) 78 | 79 | else: 80 | tempCount=tempCount+1 81 | 82 | f.write(i+'\n') 83 | f.write(IdNameMap[i]+'\n') 84 | num=IdNameMap[i].split('###')[0] 85 | dbPath = dbTextPath + num +'.txt' 86 | print(dbPath) 87 | textf = 
open(u'H:\yaojuan\QUERY\\'+year+'\\eval\\test_minDBText\dbText_' + str(tempCount) + '.txt', 'w', encoding='utf-8') 88 | with open(dbPath,encoding='utf-8') as dbf: 89 | line=dbf.readline() 90 | while line: 91 | textf.write(line) 92 | if(line.find('')!=-1): 93 | break 94 | line=dbf.readline() 95 | dbf.close() 96 | textf.close() 97 | 98 | 99 | return AnsIdSet 100 | 101 | 102 | 103 | 104 | if __name__=='__main__': 105 | # year='2009' 106 | # filesnum=3695 107 | 108 | year='2010' 109 | filesnum=2231 110 | 111 | # year='2011' 112 | # filesnum=2231 113 | 114 | # year = '2012' 115 | # filesnum = 2016 116 | 117 | # year='2013' 118 | # filesnum=1820 119 | 120 | # year='2014' 121 | # filesnum=138 122 | 123 | 124 | indexPath = u'DB_id_index.txt' 125 | ReadDBIndex(indexPath) 126 | for i in range(filesnum): # number of answer files for this year 127 | answerPath = u'H:\yaojuan\QUERY\\'+year+'\\eval\\test_dbEntity\db_' + str(i + 1) + '.txt' 128 | ReadAnswer(answerPath) 129 | dbTextPath = u'H:\\yaojuan\\EntityLinkingData\\data\\' 130 | ProduceMinDB(dbTextPath,year) 131 | 132 | 133 | 134 | pass 135 | 136 | 137 | 138 | 139 | 140 | 141 | -------------------------------------------------------------------------------- /word2vec.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from gensim.models import word2vec 3 | import logging 4 | # main script 5 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) 6 | sentences = word2vec.Text8Corpus(u"Text8.txt") # load the corpus 7 | # model initialisation 8 | model = word2vec.Word2Vec(sentences, size=50) # train a skip-gram model; default window=5 9 | model2 = word2vec.Word2Vec([u"hello world hello The training algorithms were originally ported from the C package".split()], size=50, window=5, min_count=1, workers=4) # Word2Vec expects tokenised sentences, not a raw string; min_count=1 keeps the toy vocabulary 10 | print("--------\n") 11 | print("--------\n") 12 | y1 = model.wv.similarity("woman", "man") 13 | print(u"similarity between 'woman' and 'man':", y1) 14 | print("--------\n") 15 | print("--------\n") 16 | y2 = model.wv.most_similar("good", topn=20) # the 20 most similar words 17 | print(u"words most similar to 'good':\n") 18 | for item in y2: 19 | print(item[0], item[1]) 20 | print("--------\n") 21 | print("--------\n") 22 | 23 | # print(' "boy" is to "father" as "girl" is to ...? \n') 24 | # y3 = model.most_similar(['girl', 'father'], ['boy'], topn=3) 25 | # for item in y3: 26 | # print(item[0], item[1]) 27 | # print("--------\n") 28 | # print("--------\n") 29 | # 30 | # y4 = model.doesnt_match("breakfast cereal dinner lunch".split()) 31 | # print(u"the word that does not belong:", y4) 32 | # print("--------\n") 33 | # print("--------\n") 34 | # 35 | # y5 = model.init_sims() 36 | # 37 | model.wv.save_word2vec_format('vector_50.txt') 38 | # #model = word2vec.Word2Vec.load_word2vec_format('/tmp/vectors.bin', binary=True) 39 | # print("--------\n") 40 | # print("--------\n") 41 | # 42 | # model.most_similar(['girl', 'father'], ['boy'], topn=3) 43 | # print("--------\n") 44 | # print("--------\n") 45 | # 46 | # more_examples = ["he his she", "big bigger bad", "going went being"] 47 | # for example in more_examples: 48 | # a, b, x = example.split() 49 | # predicted = model.most_similar([x, b], [a])[0][0] 50 | # print("'%s' is to '%s' as '%s' is to '%s'" % (a, b, x, predicted)) 51 | # print("--------\n") 52 | # print("--------\n") 53 | # 54 | # y6=model.wv['red'] # numpy vector of a word 55 | # print(y6) 56 | # 57 | # y7=model.wv['white'] # numpy vector of a word 58 | # print(y7) 59 | # 60 | # y8 = model.similarity("yes", "no") 61 | # print(y8) 62 | # 63 | # y9 = model.similarity("color", "white") 64 | # print(y9) 65 | # 66 | # y10 = model.similarity("red", "color") 67 | # print(y10) --------------------------------------------------------------------------------
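A small usage sketch (not from the repository): the text-format vectors that word2vec.py saves as 'vector_50.txt' can be loaded back with gensim's KeyedVectors; the file name and the probe words below are only examples.

from gensim.models import KeyedVectors

# load the vectors written by model.wv.save_word2vec_format('vector_50.txt')
kv = KeyedVectors.load_word2vec_format('vector_50.txt', binary=False)
print(kv.similarity('woman', 'man'))    # cosine similarity between two words
print(kv.most_similar('good', topn=5))  # five nearest neighbours of 'good'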