├── .gitattributes
├── check_doc_db.py
├── data_load.py
├── doc_chek_hou.py
├── doc_db.py
├── doc_find_hou.py
├── doc_query.py
├── doc_query_no_beg_end.py
├── handleDBIdName.py
├── handle_db_entity.py
├── handle_min_db.py
├── merge_all_data_db_id.py
├── myCNN.py
├── process_wiki.py
├── produce_entity_index.py
├── produce_min_db.py
└── word2vec.py


/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 | 


--------------------------------------------------------------------------------
/check_doc_db.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | # -*- coding:utf8 -*-
 3 | 
 4 | def check(docPath,dbPath):
 5 |     db_id_list=[]
 6 |     db_name_list=[]
 7 |     count = 0
 8 |     with open(dbPath,encoding='utf-8') as f:
 9 |         line = f.readline().strip('\n')
10 |         while line:
11 | 
12 |             if (count % 2 == 0):
13 |                 db_id_list.append(line)
14 |                 # print(line)
15 |             if (count % 2 == 1):
16 |                 db_name_list.append(line)
17 | 
18 |             count = count + 1
19 |             line = f.readline().strip('\n')
20 | 
21 |         pass
22 | 
23 | 
24 |     with open(docPath,encoding='utf-8') as docF:
25 |         doc_line=docF.readline().strip('\n')
26 |         while doc_line:
27 |             pos=doc_line.find('E0')
28 |             answerId=doc_line[pos:pos+8]
29 | 
30 |             findFlag=True
31 |             if(len(answerId)==8):
32 | 
33 |                 for i in range(len(db_id_list)):
34 |                     db_id=db_id_list[i]
35 |                     if(db_id==answerId):
36 |                         # print("find!!! "+answerId)
37 |                         # print(doc_line)
38 |                         # print(answerId)
39 |                         findFlag=False
40 |                         break
41 |                 if(findFlag):
42 |                     print(doc_line)
43 |                     print(answerId)
44 |                     print("not find!!! "+answerId)
45 | 
46 | 
47 | 
48 |             doc_line = docF.readline().strip('\n')
49 | 
50 | 
51 | 
52 |     pass
53 | 
54 | 
55 | 
56 | 
57 | if __name__=='__main__':
58 | 
59 |     docPath=u'E:\mypython_Linking\data_handle\docQuery\doc_'
60 |     dbPath=u'E:\mypython_Linking\data_handle\dbEntity\db_'
61 |     check(docPath+'1.txt',dbPath+'1.txt')
62 |     pass
63 | 


--------------------------------------------------------------------------------
/data_load.py:
--------------------------------------------------------------------------------
  1 | #coding=utf-8
  2 | #加载Google训练的词向量
  3 | import numpy as np
  4 | import tensorflow as tf
  5 | 
  6 | queryIndexMap=dict()
  7 | queryNameIndexMap=dict()
  8 | dbIndexMap=dict()
  9 | dbNameIndexMap=dict()
 10 | dbTrainIndexMap=dict()
 11 | dbNameTrainIndexMap=dict()
 12 | queryTrainIndexMap=dict()
 13 | queryNameTrainIndexMap=dict()
 14 | 
 15 | 
 16 | # 加载所有的词向量
 17 | def load_word_vectors(file_path):
 18 | 
 19 |     print ('loading word vectors...')
 20 |     f = open(file_path, 'r', encoding='utf-8')
 21 |     m =f.readlines()
 22 |     i = 0
 23 |     for li in m:
 24 |         m[i] = m[i].strip().split(' ')
 25 |         i = i + 1
 26 | 
 27 |     num_words=int(len(m))#词向量表的大小
 28 |     vec_len=int(len(m[0][1:]))#词向量的大小
 29 |     print (num_words,vec_len)
 30 |     words = []
 31 |     word_vecs = np.zeros((num_words + 1, vec_len))#词向量表初始化为全0，在0位置处，表示找到的词的向量为0
 32 | 
 33 |     for i in range(num_words):
 34 |         if i == 0:
 35 |             words.append(m[i][0])
 36 |         words.append(m[i][0])
 37 |         word_vecs[i + 1] = np.array(m[i][1:],dtype=np.float32)
 38 | 
 39 |     f.close()
 40 |     print ('done.')
 41 | 
 42 |     return words, word_vecs
 43 | 
 44 | def readQueryIndex(local,year,queryPath):
 45 |     global queryIndexMap
 46 |     global queryNameIndexMap
 47 |     if(local):
 48 |         path='H:\yaojuan\QUERY\\'+year+'\eval\\test\\test_queryIndex.txt'
 49 |     else:
 50 |         path = queryPath
 51 |     with open(path,encoding='utf-8') as f:
 52 |         line=f.readline().strip(' ').strip('\n')
 53 |         lineCount=0
 54 |         while line:
 55 |             if(lineCount%2==0):
 56 |                 ##第一行，上下文
 57 |                 queryId=line.split(' ')[0]
 58 |                 queryIndex=line[len(line.split(' ')[0])+1:].strip(' ')
 59 |                 queryIndexMap[queryId]=queryIndex
 60 |             if(lineCount%2==1):
 61 |                 ##第二行，name
 62 |                 queryId=line.split(' ')[0]
 63 |                 queryIndex=line[len(line.split(' ')[0])+1:].strip(' ')
 64 |                 queryNameIndexMap[queryId]=queryIndex
 65 |             lineCount=lineCount+1
 66 |             line=f.readline().strip(' ').strip('\n')
 67 |     pass
 68 | 
 69 | def readMinDBIndex(local,year,dbPath):
 70 |     global dbIndexMap
 71 |     global dbNameIndexMap
 72 |     if(local):
 73 |         path='H:\yaojuan\QUERY\\'+year+'\eval\\test_mindbIndex.txt'
 74 |     else:
 75 |         path = dbPath
 76 | 
 77 |     with open(path,encoding='utf-8') as f:
 78 |         line = f.readline().strip('\n').strip(' ')
 79 |         lineCount=0
 80 |         while line:
 81 |             if(lineCount%2==0):
 82 |                 dbId = line.split(' ')[0]
 83 |                 dbIndex = line[len(line.split(' ')[0]) + 1:]
 84 |                 dbIndexMap[dbId] = dbIndex
 85 |             if(lineCount%2==1):
 86 |                 dbId = line.split(' ')[0]
 87 |                 dbIndex = line[len(line.split(' ')[0]) + 1:]
 88 |                 dbNameIndexMap[dbId] = dbIndex
 89 | 
 90 |             lineCount=lineCount+1
 91 |             line = f.readline().strip('\n').strip(' ')
 92 | 
 93 |     pass
 94 | 
 95 | def readTrainMinDBIndex(local):
 96 |     global dbTrainIndexMap
 97 |     global dbNameTrainIndexMap
 98 |     if(local):
 99 |         ####将2014的训练数据当做模型的训练集
100 |         path='H:\yaojuan\QUERY\\2014\\training\\train_mindbIndex.txt'
101 |     else:
102 |         path = 'train_mindbIndex.txt'
103 | 
104 |     with open(path,encoding='utf-8') as f:
105 |         line = f.readline().strip(' ').strip('\n')
106 |         lineCount=0
107 |         while line:
108 |             if(lineCount%2==0):
109 |                 dbId = line.split(' ')[0]
110 |                 dbIndex = line[len(line.split(' ')[0]) + 1:]
111 |                 dbTrainIndexMap[dbId] = dbIndex
112 |             if (lineCount%2==1):
113 |                 dbId = line.split(' ')[0]
114 |                 dbIndex = line[len(line.split(' ')[0]) + 1:]
115 |                 dbNameTrainIndexMap[dbId] = dbIndex
116 | 
117 |             lineCount=lineCount+1
118 |             line = f.readline().strip(' ').strip('\n')
119 | 
120 |     pass
121 | 
122 | def readTrainQueryIndex(local):
123 |     global queryTrainIndexMap
124 |     global queryNameTrainIndexMap
125 |     if(local):
126 |         ####将2014的训练数据当做模型的训练集
127 |         path='H:\yaojuan\QUERY\\2014\\training\\train\\train_queryIndex.txt'
128 |     else:
129 |         path = 'train_queryNounIndex.txt'
130 |     with open(path,encoding='utf-8') as f:
131 |         line=f.readline().strip(' ').strip('\n')
132 |         lineCount=0
133 |         while line:
134 |             if(lineCount%2==0):
135 |                 queryId=line.split(' ')[0]
136 |                 queryIndex=line[len(line.split(' ')[0])+1:].strip(' ')
137 |                 queryTrainIndexMap[queryId]=queryIndex
138 |             if(lineCount%2==1):
139 |                 queryId = line.split(' ')[0]
140 |                 queryIndex = line[len(line.split(' ')[0]) + 1:].strip(' ')
141 |                 queryNameTrainIndexMap[queryId] = queryIndex
142 | 
143 | 
144 |             lineCount=lineCount+1
145 |             line=f.readline().strip(' ').strip('\n')
146 |     pass
147 | 
148 | def readTrainAllData(local):
149 |     words, word_vecs = load_word_vectors(u'glove.6B.100d.txt')
150 |     if(local):
151 |         #######这个query——entity对
152 |         alldataPath = 'E:\mypython_Linking\\data_handle\\train_all_data.txt'
153 |     else:
154 |         alldataPath = 'train_all_data.txt'
155 |     ####训练集不需要传year年份
156 |     readTrainQueryIndex(local)
157 |     readTrainMinDBIndex(local)
158 |     queryIdList=[]
159 |     queryIndexList=[]
160 |     queryNameIndexList=[]
161 |     dbIdList=[]
162 |     dbIndexList=[]
163 |     dbNameIndexList=[]
164 |     lableList=[]
165 |     maxLength=0
166 |     minLength=1000
167 |     lineCount=0
168 |     with open(alldataPath, encoding='utf-8') as f:
169 |         line = f.readline().strip('\n')
170 | 
171 |         while line:
172 |             lineCount=lineCount+1
173 |             dbId = line.split(' ')[1]
174 |             if (len(dbTrainIndexMap[dbId].strip(' ').split(' ')) > maxLength):
175 |                 maxLength = len(dbTrainIndexMap[dbId].strip(' ').split(' '))
176 |             if (len(dbTrainIndexMap[dbId].strip(' ').split(' ')) < minLength):
177 |                 minLength = len(dbTrainIndexMap[dbId].strip(' ').split(' '))
178 |             line = f.readline().strip('\n')
179 |     print('train_maxLength=' + str(maxLength))
180 |     print('train_minLength=' + str(minLength))
181 |     print('train_lineCount=' + str(lineCount))
182 |     # trainLen = lineCount // 3
183 | 
184 | 
185 |     with open(alldataPath,encoding='utf-8') as f:
186 |         line=f.readline().strip('\n')
187 |         lineCount=0
188 |         while line:
189 |             lineCount=lineCount+1
190 |             line=line.split(' ')
191 |             queryId=line[0]
192 |             dbId=line[1]
193 |             lable=line[2]
194 |             # print('queryId='+queryId)
195 |             # print('queryIndex='+queryIndexMap[queryId])
196 |             # print('dbId='+dbId)
197 |             # print('dbIndex='+dbIndexMap[dbId])
198 |             # print('lable='+lable)
199 |             queryIdList.append(queryId)
200 |             queryIndex=queryTrainIndexMap[queryId].strip(' ').split(' ')
201 |             queryNameIndex=queryNameTrainIndexMap[queryId].strip(' ').split(' ')
202 |             textVec=[]
203 |             for i in queryIndex:
204 |                 textVec.append(word_vecs[int(i)])
205 |             queryIndexList.append(np.asarray(textVec,dtype=np.float32))
206 | 
207 |             nameVec=[]
208 |             for i in queryNameIndex:
209 |                 nameVec.append(word_vecs[int(i)])
210 |             queryNameIndexList.append(np.asarray(nameVec,dtype=np.float32).mean(axis=0))
211 | 
212 | 
213 |             dbIdList.append(dbId)
214 |             dbIndex=dbTrainIndexMap[dbId]
215 |             dbNameIndex = dbTrainIndexMap[dbId].strip(' ').split(' ')
216 |             for i in range(len(dbTrainIndexMap[dbId].strip(' ').split(' ')),160):
217 |                 dbIndex=dbIndex.strip(' ')+' 0'
218 |             dbIndex=dbIndex.strip(' ').split(' ')
219 | 
220 |             dbVec=[]
221 |             for i in dbIndex:
222 |                 dbVec.append(word_vecs[int(i)])
223 |             dbIndexList.append(np.asarray(dbVec,dtype=np.float32))
224 | 
225 |             nameVec = []
226 |             for i in dbNameIndex:
227 |                 nameVec.append(word_vecs[int(i)])
228 |             dbNameIndexList.append(np.asarray(nameVec, dtype=np.float32).mean(axis=0))
229 | 
230 |             lableList.append(lable)
231 | 
232 |             # if(lineCount>=90):
233 |             #
234 |             #     break
235 | 
236 |             line=f.readline().strip('\n')
237 | 
238 |     query=np.asarray(queryIndexList,dtype=np.float32)
239 |     queryName=np.asarray(queryNameIndexList,dtype=np.float32)[:,np.newaxis,:]
240 |     db=np.asarray(dbIndexList,dtype=np.float32)
241 |     dbName=np.asarray(dbNameIndexList,dtype=np.float32)[:,np.newaxis,:]
242 |     lab=np.asarray(lableList,dtype=np.float32)
243 | 
244 |     print(query.shape)
245 |     print(queryName.shape)
246 |     print(db.shape)
247 |     print(dbName.shape)
248 |     print(lab.shape)
249 | 
250 | 
251 |     return (query,queryName,db,dbName,lab)
252 | 
253 | 
254 | 
255 | 
256 | 
257 | 
258 | 
259 | def readAllData(local,year,queryPath,dbPath):
260 |     words, word_vecs = load_word_vectors(u'glove.6B.100d.txt')
261 |     if(local):
262 |         alldataPath = 'H:\yaojuan\QUERY\\'+year+'\eval\\test_all_data.txt'
263 |     else:
264 |         alldataPath = year+'_test_all_data.txt'
265 |     readQueryIndex(local,year,queryPath)
266 |     readMinDBIndex(local,year,dbPath)
267 |     queryIdList=[]
268 |     queryIndexList=[]
269 |     queryNameIndexList=[]
270 |     dbIdList=[]
271 |     dbIndexList=[]
272 |     dbNameIndexList=[]
273 |     lableList=[]
274 |     maxLength=0
275 |     minLength=1000
276 |     lineCount=0
277 |     with open(alldataPath, encoding='utf-8') as f:
278 |         line = f.readline().strip('\n')
279 | 
280 |         while line:
281 |             lineCount=lineCount+1
282 |             dbId = line.split(' ')[1]
283 |             if (len(dbIndexMap[dbId].strip(' ').split(' ')) > maxLength):
284 |                 maxLength = len(dbIndexMap[dbId].strip(' ').split(' '))
285 |             if (len(dbIndexMap[dbId].strip(' ').split(' ')) < minLength):
286 |                 minLength = len(dbIndexMap[dbId].strip(' ').split(' '))
287 |             # if(lineCount>=90):
288 |             #     break
289 |             line = f.readline().strip('\n')
290 |     print('test_maxLength=' + str(maxLength))
291 |     print('test_minLength=' + str(minLength))
292 |     print('test_lineCount=' + str(lineCount))
293 | 
294 | 
295 |     with open(alldataPath,encoding='utf-8') as f:
296 |         line=f.readline().strip('\n')
297 |         lineCount=0
298 |         while line:
299 |             lineCount=lineCount+1
300 |             if(lineCount>0):
301 |                 line=line.split(' ')
302 |                 queryId=line[0]
303 |                 dbId=line[1]
304 |                 lable=line[2]
305 |                 # print('queryId='+queryId)
306 |                 # print('queryIndex='+queryIndexMap[queryId])
307 |                 # print('dbId='+dbId)
308 |                 # print('dbIndex='+dbIndexMap[dbId])
309 |                 # print('lable='+lable)
310 |                 queryIdList.append(queryId)
311 |                 queryIndex=queryIndexMap[queryId].strip(' ').split(' ')
312 |                 queryNameIndex=queryNameIndexMap[queryId].strip(' ').split(' ')
313 |                 textVec=[]
314 |                 for i in queryIndex:
315 |                     textVec.append(word_vecs[int(i)])
316 |                 queryIndexList.append(np.asarray(textVec,dtype=np.float32))
317 | 
318 |                 nameVec = []
319 |                 for i in queryNameIndex:
320 |                     nameVec.append(word_vecs[int(i)])
321 |                 queryNameIndexList.append(np.asarray(nameVec, dtype=np.float32).mean(axis=0))
322 | 
323 |                 dbIdList.append(dbId)
324 |                 dbIndex=dbIndexMap[dbId]
325 |                 dbNameIndex=dbNameIndexMap[dbId].strip(' ').split(' ')
326 |                 for i in range(len(dbIndexMap[dbId].strip(' ').split(' ')),160):
327 |                     dbIndex=dbIndex.strip(' ')+' 0'
328 |                 dbIndex=dbIndex.strip(' ').split(' ')
329 |                 dbVec=[]
330 |                 for i in dbIndex:
331 |                     dbVec.append(word_vecs[int(i)])
332 |                 dbIndexList.append(np.asarray(dbVec,dtype=np.float32))
333 | 
334 |                 nameVec = []
335 |                 for i in dbNameIndex:
336 |                     nameVec.append(word_vecs[int(i)])
337 |                 dbNameIndexList.append(np.asarray(nameVec, dtype=np.float32).mean(axis=0))
338 | 
339 |                 lableList.append(lable)
340 |                 # if(lineCount>=200000):
341 |                 #
342 |                 #     break
343 | 
344 |             line=f.readline().strip('\n')
345 | 
346 |     query=np.asarray(queryIndexList,dtype=np.float32)
347 |     queryName=np.asarray(queryNameIndexList,dtype=np.float32)[:,np.newaxis,:]
348 |     db=np.asarray(dbIndexList,dtype=np.float32)
349 |     dbName=np.asarray(dbNameIndexList,dtype=np.float32)[:,np.newaxis,:]
350 |     lab=np.asarray(lableList,dtype=np.float32)
351 |     print(query.shape)
352 |     print(queryName.shape)
353 |     print(db.shape)
354 |     print(dbName.shape)
355 |     print(lab.shape)
356 | 
357 | 
358 |     return (query,queryName,db,dbName,lab)
359 | 
360 | 
361 | 
362 | 
363 | 
364 | if __name__=='__main__':
365 |     # readQueryIndex(True) #输出显示test_query的上下文长度是20
366 |     # readMinDBIndex(True) #输出显示test_db的上下文长度是768
367 |     # readTrainQueryIndex(True) #输出显示train_query的上下文长度是20
368 |     # readTrainMinDBIndex(True) #输出显示train_db的上下文长度是1478
369 | 
370 | 
371 |     # readAllData(local=True,year='2014')
372 |     readTrainAllData(local=True)
373 |     # readTrainAllData(local=True)
374 |     pass
375 | 
376 | 
377 | 
378 | 
379 | 
380 | 
381 | 
382 | 
383 | 
384 | 
385 | 
386 | 
387 | ####################################################################################################
388 | 
389 |     # SentLength=10
390 |     #
391 |     # sentence1='I love you very much x_x'
392 |     # sentence2='I like you'
393 |     #
394 |     # s_vec1 = sentence2vec(sentence1,SentLength)
395 |     # s_vec2 = sentence2vec(sentence2,SentLength)
396 |     #
397 |     # # print(s_vec1)
398 |     # print(s_vec1.shape)
399 |     #
400 |     # # 准备已有数据
401 |     # x_data1 = tf.constant(s_vec1[np.newaxis,:,:,np.newaxis],dtype=tf.float32)
402 |     # x_data2 = tf.constant(s_vec2[np.newaxis,:,:,np.newaxis],dtype=tf.float32)
403 |     # y_data = [1]
404 |     # print(x_data1.shape)
405 |     #
406 |     # # 定义placeholder
407 |     # x1 = tf.placeholder(tf.float32, [None, 1])
408 |     # x2 = tf.placeholder(tf.float32, [None, 1])
409 |     # y = tf.placeholder(tf.float32, [None, 1])
410 |     #
411 |     #
412 |     # # [batch, in_height, in_width, in_channels] 1,20,50,1
413 |     # input_arg1 = tf.Variable(s_vec1)
414 |     # input_arg2 = tf.Variable(s_vec2)
415 |     # # [filter_height, filter_width, in_channels, out_channels]
416 |     # filter_arg1 = tf.Variable(tf.ones([3, 3, 1, 1]))
417 |     # filter_arg2 = tf.Variable(tf.ones([3, 3, 1, 1]))
418 |     # op1 = tf.nn.relu(tf.nn.conv2d(x_data1, filter_arg1, strides=[1, 1, 4, 1], use_cudnn_on_gpu=False, padding='SAME'))
419 |     # pool1=tf.nn.max_pool(op1, ksize=[1, 2, 4, 1],strides=[1, 2, 4, 1], padding='SAME')
420 |     # # softmax1=tf.nn.softmax(pool1)
421 |     #
422 |     # # connected=tf.nn.con
423 |     # # op2 = tf.nn.conv2d(input_arg2, filter_arg2, strides=[1, 2, 2, 1], use_cudnn_on_gpu=False, padding='SAME')
424 |     # # # 求模`
425 |     # # x1_norm = tf.sqrt(tf.reduce_sum(tf.square(op1), axis=2))
426 |     # #
427 |     # # x2_norm = tf.sqrt(tf.reduce_sum(tf.square(op2), axis=2))
428 |     # # x1_x2=tf.reduce_sum(tf.multiply(x1, x2), axis=2)
429 |     # #
430 |     # # cosin = x1_x2 / (x1_norm * x2_norm)
431 |     # #
432 |     # # cosin1 = tf.pide(x1_x2, tf.multiply(x1_norm, x2_norm))
433 |     # #
434 |     # #
435 |     #
436 |     #
437 |     #
438 | 
439 |     # with tf.Session() as a_sess:
440 |     #     a_sess.run(tf.global_variables_initializer())
441 |     #     # op1,op2,a, b, c, d, e = a_sess.run([op1,op2,x1_norm, x2_norm, x1_x2, cosin, cosin1])
442 |     #
443 |     #
444 |     #     print("----------{}---------".format("case1"))
445 |     #     a_op1=a_sess.run(pool1)
446 |     #     writer = tf.summary.FileWriter('tensorflow/', a_sess.graph)
447 |     #     print(a_op1)
448 |     #     print(a_op1.shape)
449 |     #     KKK=tf.reshape(a_op1,(1,20))
450 |     #     print(KKK)
451 |     #
452 |     #     print('---------------------\n\n')
453 |     #
454 |     # pass


--------------------------------------------------------------------------------
/doc_chek_hou.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | # -*- coding:utf8 -*-
 3 | 
 4 | 
 5 | T=0  #待链接实体总数（答案）
 6 | T1=0  #待链接实体NIL总数（答案）
 7 | T2=0  #待链接实体NOT_NIL总数（答案）
 8 | S=0  #待链接实体总数（自己）
 9 | S1=0  #待链接实体NIL总数（自己）
10 | S2=0  #待链接实体NOT_NIL总数（自己）
11 | 
12 | NIL_Dui=0  #自己判空，且判对
13 | NER_Dui=0  #自己判非空，且链接对
14 | 
15 | 
16 | def get_T_S(answerPath):
17 |     global T,T1,T2
18 |     global S,S1,S2
19 |     global NIL_Dui,NER_Dui
20 |     with open(answerPath,encoding='utf-8') as f:
21 |         line=f.readline().strip('\n')
22 |         lineCount=0
23 |         while line:
24 |             if(lineCount%2==0):
25 |                 TempLine=line
26 |             if(lineCount%2==1):
27 |                 T=T+1
28 |                 S=S+1
29 |                 ans = line.split(' ')[0]
30 |                 if(line.find('NIL')==-1):
31 |                     print(line)
32 |                 ansNum = int(line.split(' ')[1])
33 |                 if(ans.find('NIL')==-1):
34 |                     ####NOT_NIL
35 |                     T2=T2+1
36 |                 else:
37 |                     ####NIL
38 |                     T1=T1+1
39 | 
40 |                 if(ansNum==0):
41 |                     ####自己判空NIL
42 |                     S1=S1+1
43 |                     if(ans.find('NIL')!=-1):
44 |                         # 自己判空，且判对
45 |                         NIL_Dui=NIL_Dui+1
46 |                 else:
47 |                     ####自己判非空NOT_NIL
48 |                     S2=S2+1
49 |                     #if(ans==line.split(' ')[2]):
50 |                     if (int(line.split(' ')[1])<=5):
51 |                         # 自己判非空，且链接对
52 |                         NER_Dui=NER_Dui+1
53 | 
54 |             lineCount = lineCount + 1
55 |             line=f.readline().strip('\n')
56 | 
57 | 
58 | 
59 | if __name__=='__main__':
60 | 
61 |     year='2009'; fileNum=3695
62 |     # year='2010'; fileNum=2231
63 |     # year='2011'; fileNum=2231
64 |     # year='2012'; fileNum=2016
65 |     # year='2013'; fileNum=1820
66 |     # year='2014'; fileNum=138
67 |     for i in range(fileNum):
68 |         answerPath='H:\yaojuan\QUERY\\'+year+'\\eval\\test_dbEntity\db_new_'+str(i+1)+'.txt'
69 |         # answerPath = 'H:\yaojuan\QUERY\\' + year + '\\eval\\test_dbEntity\db_' + str(i + 1) + '.txt'
70 |         # answerPath = 'H:\yaojuan\QUERY\\2014\eval\\test_dbEntity\db_' + str(i + 1) + '.txt'
71 |         get_T_S(answerPath)
72 |     print(' 总   空   非空')
73 |     print(T,T1,T2)
74 |     print(S,S1,S2)
75 |     print(NIL_Dui+NER_Dui,NIL_Dui,NER_Dui)
76 |     print(NER_Dui/T2)
77 |     # print(NIL_Dui/T1)
78 |     Micro_accuracy_avrage=(NIL_Dui+NER_Dui)/T*100
79 |     Precision=NER_Dui/S2*100
80 |     Recall=NER_Dui/T2*100
81 |     F1=2*Precision*Recall/(Precision+Recall)
82 |     print('Micro_accuracy_avrage:'+str(Micro_accuracy_avrage)+'%')
83 |     print('Precision:'+str(Precision)+'%')
84 |     print('Recall:'+str(Recall)+'%')
85 |     print('F1:'+str(F1)+'%')
86 | 
87 | 
88 | 
89 |     pass


--------------------------------------------------------------------------------
/doc_db.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | # -*- coding:utf8 -*-
 3 | 
 4 | import os
 5 | import sys
 6 | allFileNum = 0
 7 | 
 8 | 
 9 | def printPath(level, path):
10 |     global allFileNum
11 |     '''''
12 |     打印一个目录下的所有文件夹和文件
13 |     '''
14 |     # 所有文件夹，第一个字段是次目录的级别
15 |     dirList = []
16 |     # 所有文件
17 |     fileList = []
18 |     # 返回一个列表，其中包含在目录条目的名称(google翻译)
19 |     files = os.listdir(path)
20 |     # 先添加目录级别
21 |     dirList.append(str(level))
22 |     of = open('DocFileName.txt', 'w')
23 | 
24 | 
25 |     for f in files:
26 |         #判断是不是文件夹
27 |         if (os.path.isdir(path + '/' + f)):
28 |             # 排除隐藏文件夹。因为隐藏文件夹过多
29 |             if (f[0] == '.'):
30 |                 pass
31 |             else:
32 |                 # 添加非隐藏文件夹
33 |                 dirList.append(f)
34 |         #判断是不是文件
35 |         if (os.path.isfile(path + '/' + f)):
36 |             # 添加文件
37 |             fileList.append(f)
38 |             doc_id=f
39 |             print(f)
40 |             #将DB的文件名写入‘DBFileName.txt’中
41 |             of.write(f+'\n')
42 | 
43 | 
44 | 
45 | 
46 | 
47 | 
48 |     of.close( )
49 |             # 当一个标志使用，文件夹列表第一个级别不打印
50 |     i_dl = 0
51 |     for dl in dirList:
52 |         if (i_dl == 0):
53 |             i_dl = i_dl + 1
54 |         else:
55 |             # 打印至控制台，不是第一个的目录
56 |             print('#########' * (int(dirList[0])), dl)
57 | 
58 |             # 打印目录下的所有文件夹和文件，目录级别+1
59 |             printPath((int(dirList[0]) + 1), path + '/' + dl)
60 | 
61 |     for fl in fileList:
62 |         # 打印文件
63 |        # print '-------' * (int(dirList[0])), fl
64 |        # 顺便计算一下有多少个文件
65 |         allFileNum = allFileNum + 1
66 | 
67 | 
68 | if __name__=='__main__':
69 |     path=u'H:\\yaojuan\\QUERY\\2014\\eval\\source_documents'
70 |     printPath(1,path)
71 | 
72 |     pass


--------------------------------------------------------------------------------
/doc_find_hou.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | # -*- coding:utf8 -*-
  3 | 
  4 | import os
  5 | import sys
  6 | 
  7 | 
  8 | NotFindQID=[]
  9 | NotFindQNAME=[]
 10 | DBIdNameText=''
 11 | 
 12 | def edit(str1, str2):
 13 |     matrix = [[i + j for j in range(len(str2) + 1)] for i in range(len(str1) + 1)]
 14 | 
 15 |     for i in range(1, len(str1) + 1):
 16 |         for j in range(1, len(str2) + 1):
 17 |             if str1[i - 1] == str2[j - 1]:
 18 |                 d = 0
 19 |             else:
 20 |                 d = 1
 21 |             matrix[i][j] = min(matrix[i - 1][j] + 1, matrix[i][j - 1] + 1, matrix[i - 1][j - 1] + d)
 22 | 
 23 |     return matrix[len(str1)][len(str2)]
 24 | 
 25 | 
 26 | 
 27 | def DBIdName2Text():
 28 | 
 29 |     path='DBIdName.txt'
 30 |     count=0
 31 |     T=open('DBNameText.txt','w',encoding='utf-8')
 32 |     with open(path,encoding='utf-8') as f:
 33 |         line=f.readline().strip('\n')
 34 |         tempText=''
 35 |         while line:
 36 | 
 37 |             bath=int(count/20000)
 38 |             if(bath<=(count/20000)<bath+1):
 39 |                 if(count%2==0):
 40 |                     DBId=line
 41 |                 if(count%2==1):
 42 |                     tempText = tempText +' '+DBId+' '+line.lower()
 43 |                     line=line.lower()
 44 |                     if(line.find(',')!=-1):
 45 |                         name = line.split(',')
 46 |                         for n in name:
 47 |                             tempText = tempText + ' ' + DBId+' '+n.strip(' ')
 48 | 
 49 |             if((20000*(bath+1))==(count+1)):
 50 |                 T.write(tempText+'\n')
 51 |                 tempText=''
 52 |             if(count==1637481):
 53 |                 T.write(tempText + '\n')
 54 |                 tempText = ''
 55 | 
 56 |             count = count + 1
 57 |             # if(count>2000):
 58 |             #     break
 59 |             print(count)
 60 |             line=f.readline().strip('\n')
 61 |     T.close()
 62 | 
 63 |     pass
 64 | 
 65 | 
 66 | 
 67 | def ReadDBNameText():
 68 |     global DBIdNameText
 69 | 
 70 |     with open('DBNameText.txt',encoding='utf-8') as f:
 71 |         line=f.readline().strip('\n')
 72 |         while line:
 73 |             DBIdNameText=DBIdNameText+' '+line
 74 |             line = f.readline().strip('\n')
 75 | 
 76 |         pass
 77 | 
 78 | 
 79 | 
 80 | 
 81 | 
 82 | def WanQuanYingPiPei(docQueryPath,dbEntityPath):
 83 |     ansf=open(dbEntityPath,'w',encoding='utf-8')
 84 |     with open(docQueryPath,encoding='utf-8') as f:
 85 |         line=f.readline().strip('\n')
 86 |         while line:
 87 |             if(line.find(' NIL')==-1):
 88 |                 pos=line.find(' E0')
 89 |                 docId=line.split(' ')[0]
 90 |                 query=line[len(docId)+1:pos]
 91 |                 ansId=line[pos+1:pos+9]
 92 |                 entity=line[pos+10:]
 93 |             else:
 94 |                 pos=line.find(' NIL')
 95 |                 docId=line.split(' ')[0]
 96 |                 query=line[len(docId)+1:pos+1]
 97 |                 end=line.find(' XXX')
 98 |                 ansId=line[pos+1:end]
 99 |                 entity=line[pos+ 9:]
100 |             print(docId)
101 |             # print(query)######  mention
102 |             # print(ansId)######  正确答案的id
103 |             # print(entity)#######  answer
104 |             ###############我现在是想non-NIL的候选集找30个，NIL的可以不管
105 |             houxuanId=''
106 |             querypos=DBIdNameText.find(query.lower())
107 |             houCount=0
108 |             while querypos!=-1:
109 | 
110 |                 begpos=DBIdNameText.find('E0',querypos-10)
111 |                 endpos=DBIdNameText.find('E0',querypos)
112 |                 if(begpos!=-1):
113 |                     findStr=DBIdNameText[begpos:endpos]
114 |                     kuohaoPos=findStr.find('(')
115 |                     # print(findStr)
116 |                     if(kuohaoPos!=-1 and findStr.find(query.lower())<kuohaoPos):
117 |                         #有括号
118 |                         findEntity=findStr[9:kuohaoPos]
119 |                         if(findEntity.strip(' ')==query.strip(' ').lower()):
120 |                             # lower一模一样
121 |                             houxuanId = houxuanId + ' ' + DBIdNameText[begpos:begpos + 8]
122 |                             houCount = houCount + 1
123 |                             print(query+' MMMMM '+findEntity)
124 | 
125 | 
126 |                     if(kuohaoPos==-1):
127 |                         #没有括号
128 |                         findEntity=findStr[9:]
129 |                         if (findEntity.strip(' ') == query.strip(' ').lower()):
130 |                             # lower一模一样
131 |                             houxuanId = houxuanId + ' ' + DBIdNameText[begpos:begpos + 8]
132 |                             houCount = houCount + 1
133 |                             print(query + ' MMMMM ' + findEntity)
134 | 
135 |                         pass
136 | 
137 | 
138 |                 querypos = DBIdNameText.find(query.lower(),querypos+1)
139 | 
140 | 
141 |             if(ansId.find('NIL')==-1):
142 |                 if(houxuanId.find(ansId)==-1):
143 |                     #有答案的实体，没有找到答案，就用答案去找答案
144 | 
145 |                     entitypos = DBIdNameText.find(entity.lower())
146 |                     print("kkkk"+entity.lower())
147 |                     while entitypos != -1:
148 | 
149 |                         begpos = DBIdNameText.find('E0', entitypos - 10)
150 |                         endpos = DBIdNameText.find('E0', entitypos)
151 |                         if (begpos != -1):
152 |                             findStr = DBIdNameText[begpos:endpos]
153 |                             # print(findStr)
154 |                             # 没有括号
155 |                             findEntity = findStr[9:]
156 |                             if (findEntity.strip(' ') == entity.strip(' ').lower()):
157 |                                 # lower一模一样
158 |                                 houxuanId = houxuanId + ' ' + DBIdNameText[begpos:begpos + 8]
159 |                                 houCount = houCount + 1
160 |                                 print(query + ' MMMMM ' + findEntity)
161 | 
162 |                             pass
163 | 
164 |                         entitypos = DBIdNameText.find(entity.lower(), entitypos + 1)
165 | 
166 | 
167 | 
168 | 
169 |             ansf.write(line+'\n')
170 |             ansf.write(ansId.strip(' ')+' '+str(houCount)+' '+houxuanId.strip(' ')+'\n')
171 |             line=f.readline().strip('\n')
172 | 
173 |     pass
174 | 
175 | 
176 | 
177 | 
178 | if __name__=='__main__':
179 |     # DBIdName2Text()
180 |     # ReadDBNameText()
181 |     # for i in range(138):
182 |     #     docQueryPath="./docQuery/doc_"+str(i+1)+"_uni.txt"
183 |     #     dbEntityPath="./dbEntity/db_"+str(i+1)+".txt"
184 |     #     WanQuanYingPiPei(docQueryPath,dbEntityPath)
185 | 
186 | 
187 |     year='2009'; fileNum=3695
188 |     # year='2010'; fileNum=2231
189 |     # year='2011'; fileNum=2231
190 |     # year='2012'; fileNum=2016
191 |     # year='2013'; fileNum=1820
192 |     # year='2014'; fileNum=138
193 | 
194 |     ReadDBNameText()
195 |     for i in range(fileNum):
196 |         docQueryPath = u"H:\yaojuan\QUERY\\"+year+"\\eval\\test_docQuery\doc_" + str(i + 1) + "_uni.txt"
197 |         dbEntityPath = u"H:\yaojuan\QUERY\\"+year+"\\eval\\test_dbEntity\db_" + str(i + 1) + ".txt"
198 |         WanQuanYingPiPei(docQueryPath, dbEntityPath)
199 | 
200 | 
201 | 


--------------------------------------------------------------------------------
/doc_query.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | # -*- coding:utf8 -*-
  3 | 
  4 | import os
  5 | import sys
  6 | import jieba
  7 | import nltk
  8 | from bs4 import BeautifulSoup
  9 | import re
 10 | query_id_list=[]
 11 | query_name_list=[]
 12 | doc_id_list=[]
 13 | query_beg_list=[]
 14 | query_end_list=[]
 15 | 
 16 | answer_query_id_list=[]
 17 | answer_query_name_list=[]
 18 | answer_query_name_list=[]
 19 | answer_db_id_list=[]
 20 | answer_db_name_list=[]
 21 | 
 22 | db_id_list=[]
 23 | db_name_list=[]
 24 | wordVecIndexMap=dict()
 25 | 
 26 | def readWordVecIndex():
 27 |     global wordVecIndexMap
 28 |     # path=u'E:\mypython_Linking\CNN\glove.6B.100d.txt'
 29 |     path = u'E:\data_analysis\word2vec\\vector_100.txt'
 30 |     count=0
 31 |     with open(path,encoding='utf-8') as f:
 32 |         line=f.readline()
 33 |         while line:
 34 |             count=count+1
 35 |             wordVecIndexMap[line.split(' ')[0]]=count
 36 |             line=f.readline()
 37 | 
 38 |     return wordVecIndexMap
 39 | 
 40 | 
 41 | def preprocessor(text):
 42 |     text = re.sub('<[^>]*>','',text)
 43 |     emoticons = re.findall('(?::|;|=)(?:\)|\(|D|P)',text)
 44 |     text = re.sub('[\W]+',' ',text.lower())+''.join(emoticons).replace('-','')
 45 |     return text.strip(' ')
 46 | 
 47 | 
 48 | def WordTokener( sent):  # 将单句字符串分割成词
 49 |     result = ''
 50 |     wordsInStr = nltk.word_tokenize(sent)
 51 |     return wordsInStr
 52 | 
 53 | 
 54 | def RemoveStopWords(sent):
 55 |     stopwords = {}.fromkeys([line.rstrip() for line in open('stopwords.txt',encoding='utf-8')])
 56 |     segs = jieba.cut(sent, cut_all=False)
 57 |     final = ''
 58 |     for seg in segs:
 59 |         if seg not in stopwords:
 60 |             final += seg
 61 |     return final
 62 | 
 63 | 
 64 | def DocWordfrequency(doc):
 65 |     words = doc.strip('\n').split(' ')
 66 |     word_count = {}
 67 |     word_list=[]
 68 | 
 69 |     for w in words:
 70 |         if w in word_count:
 71 |             word_count[w] += 1
 72 |         else:
 73 |             word_count[w] = 1
 74 | 
 75 | 
 76 |     for w in sorted(zip(word_count.values(), word_count.keys()), reverse=True):  # 安装词频排序
 77 |         # print(w)
 78 |         if(len(w[1].strip(' '))>0):
 79 |             wStr=w[1]+' '+str(w[0])
 80 |             word_list.append(wStr)
 81 | 
 82 |     return word_list
 83 | 
 84 | 
 85 | def NewText(word_tag):
 86 | 
 87 |     String=''
 88 |     for i in word_tag:
 89 | 
 90 |         word=i[0]
 91 |         tag=i[1]
 92 |         # if(tag.find('N')!=-1):
 93 |         #     print(i)
 94 |         ###去掉一些消歧无意义的单词
 95 |         #去掉CD、PRP、VBD、CC、WDT,IN,RP,TO,DT
 96 |         hold=['NN','NNS','NNP']
 97 |         if tag in hold:
 98 |             String=String+word+' '
 99 | 
100 |     return String
101 | 
102 | 
103 | def getContentIndex(currentText,currentQuery,docIndex,doc_id):
104 |     # print("currentText:"+currentText)
105 |     # print("currentQuery:"+currentQuery)
106 |     # print(docIndex)
107 |     # print(doc_id)
108 |     # print(len(currentText))
109 | 
110 |     if (currentText[-(len(currentQuery)+2):].find(currentQuery) != -1):
111 |         CurText = preprocessor(currentText)
112 |         CurQuery = preprocessor(currentQuery)
113 |         print('****'+currentText[-(len(currentQuery)):])
114 |         print('****'+currentQuery)
115 | 
116 | 
117 |         if (CurText.find(CurQuery) != -1):
118 |             print('####'+CurText[-(len(CurQuery)):])
119 |             print('####'+CurQuery)
120 | 
121 |             textSplit = CurText.split(' ')
122 |             querySplit = CurQuery.split(' ')
123 |             index_beg = len(textSplit)-1 - len(querySplit)
124 |             index_end = len(textSplit)-1
125 |             print(CurQuery)
126 |             print(textSplit[index_beg:index_end])
127 | 
128 |             curTextIndex = getWordIndex(CurText)
129 |             curQueryIndex = getWordIndex(CurQuery)
130 |             if (curTextIndex.find(curQueryIndex) != -1):
131 | 
132 |                 contentIndex = ''
133 |                 for i in range(index_beg - 10, index_beg):
134 |                     if (i < 0):
135 |                         print(i)
136 |                         print(len(docIndex))
137 |                         contentIndex = contentIndex+'0' + ' '
138 |                     else:
139 |                         print(i)
140 |                         print(len(docIndex))
141 |                         contentIndex = contentIndex + docIndex[i] + ' '
142 | 
143 | 
144 |                 for i in range(index_end + 1, index_end + 11):
145 |                     if (i >= len(docIndex)):
146 |                         contentIndex = contentIndex + '0' + ' '
147 |                     else:
148 |                         contentIndex = contentIndex + docIndex[i] + ' '
149 | 
150 |                 print(docIndex[index_beg-1:index_end])
151 |                 entityIndex=''
152 |                 for i in range(index_beg,index_end):
153 |                     entityIndex=entityIndex+docIndex[i]+' '
154 |                 print('entityIndex:'+entityIndex)
155 |                 print(doc_id)
156 |                 print('contentIndex:' + contentIndex)
157 |             else:
158 |                 print("转换成索引后，找不到了！！！！！")
159 | 
160 |         else:
161 |             print("找不到了！！！！！")
162 | 
163 |     else:
164 |         print(doc_id)
165 |         print(currentText[-(len(currentQuery) + 2):])
166 |         print(currentQuery)
167 |         print("转化后找不到了！！！！")
168 | 
169 | 
170 |     return entityIndex,contentIndex,index_beg,index_end
171 | 
172 | 
173 | 
174 | def getContentNounIndex(currentText,hou_halfText,currentQuery,docIndex,doc_id):
175 |     print("currentText:"+currentText)
176 |     print("hou_halfText:"+hou_halfText)
177 |     print("currentQuery:"+currentQuery)
178 |     print(docIndex)
179 |     print(doc_id)
180 |     print(len(currentText))
181 | 
182 |     if (currentText[-(len(currentQuery)+2):].find(currentQuery) != -1):
183 |         CurText = preprocessor(currentText)
184 |         CurQuery = preprocessor(currentQuery)
185 |         print('****'+currentText[-(len(currentQuery)):])
186 |         print('****'+currentQuery)
187 | 
188 | 
189 |         if (CurText.find(CurQuery) != -1):
190 |             print('####'+CurText[-(len(CurQuery)):])
191 |             print('####'+CurQuery)
192 | 
193 |             textSplit = CurText.split(' ')
194 |             querySplit = CurQuery.split(' ')
195 |             index_beg = len(textSplit)-1 - len(querySplit)
196 |             index_end = len(textSplit)-1
197 |             print(CurQuery)
198 |             print(textSplit[index_beg:index_end])
199 | 
200 |             curTextIndex = getWordIndex(CurText)
201 |             curQueryIndex = getWordIndex(CurQuery)
202 |             if (curTextIndex.find(curQueryIndex) != -1):
203 | 
204 |                 contentIndex = ''
205 | 
206 |                 soup1 = BeautifulSoup(currentText, 'html.parser')
207 |                 words1 = nltk.word_tokenize(soup1.get_text())
208 |                 word_tag1 = nltk.pos_tag(words1)
209 |                 newText1 = NewText(word_tag1)
210 |                 textIndex1 = getWordIndex(newText1).strip(' ')
211 |                 textIndex1 = '0 0 0 0 0 0 0 0 0 0 '+textIndex1
212 |                 print('textIndex1:'+textIndex1)
213 |                 Index1 = textIndex1.split(' ')[-10:]
214 |                 print(Index1)
215 | 
216 | 
217 |                 soup2 = BeautifulSoup(hou_halfText, 'html.parser')
218 |                 words2 = nltk.word_tokenize(soup2.get_text())
219 |                 word_tag2 = nltk.pos_tag(words2)
220 |                 newText2 = NewText(word_tag2)
221 |                 textIndex2 = getWordIndex(newText2).strip(' ')
222 |                 print('textIndex2:'+textIndex2)
223 |                 textIndex2 = textIndex2+' 0 0 0 0 0 0 0 0 0 0'
224 |                 Index2 = textIndex2.split(' ')[:10]
225 |                 print(Index2)
226 | 
227 |                 for i in Index1:
228 |                     contentIndex=contentIndex+' '+i
229 |                 for j in Index2:
230 |                     contentIndex=contentIndex+' '+j
231 | 
232 |                 contentIndex=contentIndex.strip(' ')
233 |                 print("find noun index:"+contentIndex)
234 | 
235 |                 print(docIndex[index_beg-1:index_end])
236 |                 entityIndex=''
237 |                 for i in range(index_beg,index_end):
238 |                     entityIndex=entityIndex+docIndex[i]+' '
239 |                 print('entityIndex:'+entityIndex)
240 |             else:
241 |                 print("转换成索引后，找不到了！！！！！")
242 | 
243 |         else:
244 |             print("找不到了！！！！！")
245 | 
246 |     else:
247 |         print(doc_id)
248 |         print(currentText[-(len(currentQuery) + 2):])
249 |         print(currentQuery)
250 |         print("转化后找不到了！！！！")
251 | 
252 | 
253 |     return contentIndex,entityIndex,index_beg,index_end
254 | 
255 | 
256 | 
257 | 
258 | 
259 | 
260 | 
261 | 
262 | def readIdNameFile(dbIdNamePath):
263 |     global db_id_list
264 |     global db_name_list
265 |     IdNameMap=dict()
266 |     tempId=''
267 |     tempName=''
268 |     count=0
269 |     with open(dbIdNamePath,encoding='utf-8') as f:
270 |         line=f.readline().strip('\n')
271 |         while line:
272 | 
273 |             if(count%2==0):
274 |                 db_id_list.append(line)
275 |                 tempId=line
276 |                 # print(line)
277 |             if(count%2==1):
278 |                 db_name_list.append(line)
279 |                 tempName=line
280 |                 IdNameMap[tempId]=str(int((count+1)/2))+'###'+tempName
281 | 
282 |             count=count+1
283 |             line=f.readline().strip('\n')
284 |     return IdNameMap
285 | 
286 |     pass
287 | 
288 | def getWordIndex(docText):
289 |     docIndex = ''
290 |     for i in (preprocessor(docText).strip(' ').split(' ')):
291 |         if (i in wordVecIndexMap):
292 |             docIndex = docIndex + str(wordVecIndexMap[i]) + ' '
293 |         else:
294 |             docIndex = docIndex + '0' + ' '
295 | 
296 |     return docIndex.strip(' ')
297 | 
298 | 
299 | def readQueryFile(path):
300 |     global query_id_list
301 |     global query_name_list
302 |     global doc_id_list
303 |     global query_beg_list
304 |     global query_end_list
305 | 
306 |     with open(path,encoding='utf-8') as f:
307 |         line=f.readline()
308 | 
309 |         while line:
310 |             if(line.find('<query id="')!=-1):
311 |                 pos1=line.find('<query id=')
312 |                 pos2=line.find('">')
313 |                 id=line[pos1+11:pos2]
314 |                 query_id_list.append(id)
315 |                 # print(id)
316 |             if(line.find('<name>')!=-1):
317 |                 pos1=line.find('<name>')
318 |                 pos2=line.find('</name>')
319 |                 name=line[pos1+6:pos2]
320 |                 query_name_list.append(name.strip(' '))
321 |                 # print(name)
322 |             if(line.find('<docid>')!=-1):
323 |                 pos1 = line.find('<docid>')
324 |                 pos2 = line.find('</docid>')
325 |                 doc = line[pos1+7:pos2]
326 |                 doc_id_list.append(doc)
327 |                 # print(doc)
328 |             if(line.find('<beg>')!=-1):
329 |                 pos1 = line.find('<beg>')
330 |                 pos2 = line.find('</beg>')
331 |                 beg = line[pos1+5:pos2]
332 |                 query_beg_list.append(beg)
333 |                 # print(beg)
334 |             if(line.find('<end>')!=-1):
335 |                 pos1 = line.find('<end>')
336 |                 pos2 = line.find('</end>')
337 |                 end = line[pos1+5:pos2]
338 |                 query_end_list.append(end)
339 |                 # print(end)
340 | 
341 |             line=f.readline()
342 | 
343 |     pass
344 | 
345 | 
346 | def readAnswerFile(answerPath):
347 |     global answer_query_id_list
348 |     global answer_query_name_list
349 |     global answer_db_id_list
350 |     global answer_db_name_list
351 | 
352 |     with open(answerPath, encoding='utf-8') as ansf:
353 |         ans_line = ansf.readline().strip('\n')
354 |         while ans_line:
355 |             pos1 = ans_line.find("query_id=")
356 |             pos2 = ans_line.find("query_name=")
357 |             pos3 = ans_line.find("db_id=")
358 |             pos4 = ans_line.find("db_name=")
359 |             query_id = ans_line[pos1 + 9:pos2].strip(' ')
360 |             query_name = ans_line[pos2 + 11:pos3].strip(' ')
361 |             db_id = ans_line[pos3 + 6:pos4].strip(' ')
362 |             db_name = ans_line[pos4 + 8:len(ans_line)].strip(' ')
363 | 
364 |             # print(query_id)
365 |             # print(query_name)
366 |             # print(db_id)
367 |             # print(db_name)
368 |             answer_query_id_list.append(query_id)
369 |             answer_query_name_list.append(query_name)
370 |             answer_db_id_list.append(db_id)
371 |             answer_db_name_list.append(db_name)
372 | 
373 |             ans_line = ansf.readline().strip('\n')
374 |             pass
375 | 
376 |     pass
377 | 
378 | 
379 | 
380 | 
381 | 
382 | def findDocQueryAndDbAnswer(docPath,year,testFalg):
383 | 
384 |     if(testFlag):
385 |         docP=u'H:\yaojuan\QUERY\\'+year+'\eval\\source_documents'
386 |     else:
387 |         docP = u'H:\yaojuan\QUERY\\' + year + '\\training\\source_documents'
388 |     # OOOMap={}
389 |     # # 检查一下query的答案是不都有链接的实体
390 |     # path = u'H:\\yaojuan\\EntityLinkingData\\DB_id_index.txt'
391 |     # count = 0
392 |     # all = 0
393 |     # with open(path,encoding='utf-8') as f:
394 |     #     line = f.readline()
395 |     #     while line:
396 |     #         all = all + 1
397 |     #         if (len(line) < 12):
398 |     #             count = count + 1
399 |     #             OOOMap[line.split(' ')[0]]="糟糕！！！有答案实体与任何实体没有关系"
400 |     #         else:
401 |     #             OOOMap[line.split(' ')[0]]="OK!!!!没问题"
402 |     #
403 |     #         line = f.readline()
404 | 
405 |     if(testFlag):
406 |         queryIndexFile = open('H:\yaojuan\QUERY\\'+year+'\eval\word2vec\\test\\test_queryIndex.txt','w',encoding='utf-8')
407 |         queryNounIndexFile = open('H:\yaojuan\QUERY\\' + year + '\eval\word2vec\\test\\test_queryNounIndex.txt', 'w', encoding='utf-8')
408 |         queryTextIndexFile = open('H:\yaojuan\QUERY\\' + year + '\eval\word2vec\\test\\test_queryTextIndex.txt', 'w', encoding='utf-8')
409 |     else:
410 |         queryIndexFile = open('H:\yaojuan\QUERY\\' + year + '\\training\word2vec\\train\\train_queryIndex.txt', 'w', encoding='utf-8')
411 |         queryNounIndexFile = open('H:\yaojuan\QUERY\\' + year + '\\training\word2vec\\train\\train_queryNounIndex.txt', 'w', encoding='utf-8')
412 |         queryTextIndexFile = open('H:\yaojuan\QUERY\\' + year + '\\training\word2vec\\train\\train_queryTextIndex.txt', 'w', encoding='utf-8')
413 | 
414 |     with open(docPath,encoding='utf-8') as docF:
415 |         doc_line=docF.readline()
416 |         doc_count=0
417 | 
418 |         while doc_line:
419 |             pos=doc_line.find(".txt")
420 |             doc_id=doc_line[:pos]
421 |             doc_count=doc_count+1
422 |             # print(doc_id)
423 |             if (doc_count > 0):
424 |                 if(testFlag):
425 |                     docfile= open('H:\yaojuan\QUERY\\'+year+'\eval\word2vec\\test_docQuery\doc_'+str(doc_count)+'.txt','w',encoding='utf-8')
426 |                     unidocfile = open('H:\yaojuan\QUERY\\'+year+'\eval\word2vec\\test_docQuery\doc_' + str(doc_count) + '_uni.txt', 'w',encoding='utf-8')
427 |                     docindexfile = open('H:\yaojuan\QUERY\\'+year+'\eval\word2vec\\test_docQuery\doc_' + str(doc_count) + '_Windex.txt', 'w', encoding='utf-8')
428 |                     textfile=open('H:\yaojuan\QUERY\\'+year+'\eval\word2vec\\test_docText\\text_'+str(doc_count)+'.txt','w',encoding='utf-8')
429 |                 else:
430 |                     docfile = open('H:\yaojuan\QUERY\\' + year + '\\training\word2vec\\train_docQuery\doc_' + str(doc_count) + '.txt', 'w', encoding='utf-8')
431 |                     unidocfile = open('H:\yaojuan\QUERY\\' + year + '\\training\word2vec\\train_docQuery\doc_' + str(doc_count) + '_uni.txt', 'w', encoding='utf-8')
432 |                     docindexfile = open('H:\yaojuan\QUERY\\' + year + '\\training\word2vec\\train_docQuery\doc_' + str(doc_count) + '_Windex.txt', 'w', encoding='utf-8')
433 |                     textfile = open('H:\yaojuan\QUERY\\' + year + '\\training\word2vec\\train_docText\\text_' + str(doc_count) + '.txt', 'w', encoding='utf-8')
434 | 
435 |                 with open(docP + "\\" + doc_id+'.txt' , encoding='utf-8') as textF:
436 |                     docText = textF.read()
437 |                 DocText=preprocessor(docText.replace('\n',' '))###预处理后的文本
438 |                 docIndex=getWordIndex(DocText).split(' ')###预处理后的文本单词索引
439 | 
440 | 
441 |                 tempList = []
442 |                 for i in range(len(doc_id_list)):
443 |                     if(doc_id_list[i]==doc_id):
444 | 
445 |                         # print(doc_id_list[i])
446 |                         # print(query_name_list[i])
447 |                         # print(query_id_list[i])
448 |                         # tempString = doc_id_list[i] + ' ' + query_id_list[i] + ' ' + query_name_list[i] + '\n'
449 |                         # docfile.write(tempString)
450 | 
451 |                         for j in range(len(answer_query_id_list)):
452 |                             if(answer_query_id_list[j]==query_id_list[i]):
453 |                                 # print(answer_query_id_list[j])
454 |                                 # print(query_id_list[i])
455 |                                 # print('query name:'+query_name_list[i]+' find query:'+DocText[int(query_beg_list[i]):int(query_end_list[i])+1].replace('\n',' '))
456 | 
457 |                                 currentText = docText[:int(query_end_list[i]) + 1].replace('\n', ' ')
458 |                                 hou_halfText = docText[int(query_end_list[i]) + 1:].replace('\n', ' ')
459 |                                 currentQuery = query_name_list[i]
460 |                                 entityIndex,contentIndex,index_beg,index_end=getContentIndex(currentText,currentQuery,docIndex,doc_id)
461 |                                 queryIndexFile.write(query_id_list[i]+' '+contentIndex.strip(' ')+'\n')
462 |                                 queryIndexFile.write(query_id_list[i]+' '+entityIndex.strip(' ')+'\n')
463 |                                 contentNounIndex,_,_,_=getContentNounIndex(currentText,hou_halfText,currentQuery,docIndex,doc_id)
464 |                                 queryNounIndexFile.write(query_id_list[i]+' '+contentNounIndex.strip(' ')+'\n')
465 |                                 queryNounIndexFile.write(query_id_list[i]+' '+entityIndex.strip(' ')+'\n')
466 | 
467 | 
468 | 
469 | 
470 | 
471 |                                 tempString = doc_id_list[i] + ' ' + answer_query_id_list[j] + ' ' + answer_query_name_list[j] +' '+answer_db_id_list[j]+' '+answer_db_name_list[j]+' beg='+query_beg_list[i]+' end='+query_end_list[i]
472 |                                 indexString = doc_id_list[i] + ' ' + answer_query_id_list[j] + ' ' + answer_query_name_list[j] + ' ' + answer_db_id_list[j] + ' ' + answer_db_name_list[j] + ' index_beg=' + \
473 |                                              str(index_beg) + ' index_end=' + str(index_end)
474 |                                 tpString = doc_id_list[i] + ' ' + answer_query_name_list[j] +' '+answer_db_id_list[j]+' '+answer_db_name_list[j]
475 | 
476 | 
477 | 
478 |                                 docindexfile.write(indexString+'\n')
479 |                                 docindexfile.write(entityIndex+'\n')
480 |                                 docindexfile.write(contentIndex+'\n')
481 | 
482 | 
483 | 
484 |                                 docfile.write(tempString+'\n')
485 |                                 tempList.append(tpString)
486 |                                 # if(answer_db_id_list[j].find('NIL')==-1):
487 |                                 #     if(OOOMap[answer_db_id_list[j]].find("没问题")==-1):
488 |                                 #         print(OOOMap[answer_db_id_list[j]])
489 |                                 #         print(tempString)
490 | 
491 | 
492 |                                 with open(docP + "\\" + doc_id + '.txt', encoding='utf-8') as textF:
493 |                                     docText = textF.read()
494 |                                 soup = BeautifulSoup(docText, 'html.parser')
495 |                                 words = nltk.word_tokenize(soup.get_text())
496 |                                 word_tag = nltk.pos_tag(words)
497 |                                 newText = NewText(word_tag)
498 |                                 queryTextIndexFile.write(query_id_list[i]+' '+getWordIndex(newText).strip(' ') + '\n')
499 |                                 queryTextIndexFile.write(query_id_list[i]+' '+entityIndex.strip(' ')+'\n')
500 |                                 print("textIdex:" + getWordIndex(newText))
501 | 
502 | 
503 | 
504 |                 tpList=set(tempList)
505 |                 for tp in tpList:
506 |                     unidocfile.write(tp+'\n')
507 | 
508 |                 docfile.close()
509 | 
510 | 
511 | 
512 |                 with open(docP + "\\" + doc_id +'.txt', encoding='utf-8') as textF:
513 |                     docText = textF.read()
514 |                 soup = BeautifulSoup(docText, 'html.parser')
515 |                 words = nltk.word_tokenize(soup.get_text())
516 |                 word_tag = nltk.pos_tag(words)
517 |                 newText = NewText(word_tag)
518 |                 textfile.write(newText+'\n')
519 |                 fre=DocWordfrequency(newText)
520 |                 for w in fre:
521 |                     # wStr=
522 |                     textfile.write(w + '\n')
523 |                 # print(newText)
524 |                 # print(fre)
525 | 
526 | 
527 |             doc_line = docF.readline()
528 | 
529 |     # queryIndexFile.close()
530 | 
531 | 
532 |     pass
533 | 
534 | 
535 | 
536 | if __name__=='__main__':
537 |     # readWordVecIndex()
538 |     # docPath=u'DocFileName.txt'
539 |     # queryPath=u'H:\\yaojuan\\QUERY\\2014\\eval\\tac_kbp_2014_english_EDL_evaluation_queries.xml'
540 |     # answerPath=u'newAnswer.txt'
541 |     # dbIdNamePath=u'DBIdName.txt'
542 |     #
543 |     # readIdNameFile(dbIdNamePath)
544 |     # readQueryFile(queryPath)
545 |     # readAnswerFile(answerPath)
546 |     # findDocQueryAndDbAnswer(docPath,queryPath)
547 | 
548 |     year = '2014'  ####2012 2013 2014  trian2014
549 |     testFlag = False
550 |     readWordVecIndex()
551 |     if(testFlag):
552 |         docPath = u'H:\yaojuan\QUERY\\'+year+'\eval\\test\DocFileName.txt'
553 |         queryPath = u'H:\yaojuan\QUERY\\'+year+'\eval\\tac_kbp_'+year+'_english_entity_linking_evaluation_queries.xml'
554 |         answerPath = u'H:\yaojuan\QUERY\\'+year+'\eval\\test\\answer.txt'
555 |         dbIdNamePath = u'DBIdName.txt'
556 |     else:
557 |         docPath = u'H:\yaojuan\QUERY\\' + year + '\\training\\train\DocFileName.txt'
558 |         queryPath = u'H:\yaojuan\QUERY\\' + year + '\\training\\tac_kbp_' + year + '_english_EDL_training_queries.xml'
559 |         answerPath = u'H:\yaojuan\QUERY\\' + year + '\\training\\train\\answer.txt'
560 |         dbIdNamePath = u'DBIdName.txt'
561 | 
562 |     readIdNameFile(dbIdNamePath)
563 |     readQueryFile(queryPath)
564 |     readAnswerFile(answerPath)
565 |     findDocQueryAndDbAnswer(docPath, year, testFlag)
566 | 
567 | 
568 | 
569 |     pass
570 | 
571 | 
572 | 
573 | 
574 | 
575 | 
576 | 
577 | 
578 | 
579 | 
580 | 
581 | 
582 | 
583 | 
584 | 


--------------------------------------------------------------------------------
/doc_query_no_beg_end.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | # -*- coding:utf8 -*-
  3 | 
  4 | import os
  5 | import sys
  6 | import jieba
  7 | import nltk
  8 | from bs4 import BeautifulSoup
  9 | import re
 10 | query_id_list=[]
 11 | query_name_list=[]
 12 | doc_id_list=[]
 13 | 
 14 | answer_query_id_list=[]
 15 | answer_query_name_list=[]
 16 | answer_query_name_list=[]
 17 | answer_db_id_list=[]
 18 | answer_db_name_list=[]
 19 | 
 20 | db_id_list=[]
 21 | db_name_list=[]
 22 | wordVecIndexMap=dict()
 23 | 
 24 | def readWordVecIndex():
 25 |     global wordVecIndexMap
 26 |     # path=u'E:\mypython_Linking\CNN\glove.6B.100d.txt'
 27 |     path = u'E:\data_analysis\word2vec\\vector_100.txt'
 28 |     count=0
 29 |     with open(path,encoding='utf-8') as f:
 30 |         line=f.readline()
 31 |         while line:
 32 |             count=count+1
 33 |             wordVecIndexMap[line.split(' ')[0]]=count
 34 |             line=f.readline()
 35 | 
 36 |     return wordVecIndexMap
 37 | 
 38 | 
 39 | def preprocessor(text):
 40 |     text = re.sub('<[^>]*>','',text)
 41 |     emoticons = re.findall('(?::|;|=)(?:\)|\(|D|P)',text)
 42 |     text = re.sub('[\W]+',' ',text.lower())+''.join(emoticons).replace('-','')
 43 |     return text.strip(' ')
 44 | 
 45 | 
 46 | def WordTokener( sent):  # 将单句字符串分割成词
 47 |     result = ''
 48 |     wordsInStr = nltk.word_tokenize(sent)
 49 |     return wordsInStr
 50 | 
 51 | 
 52 | def RemoveStopWords(sent):
 53 |     stopwords = {}.fromkeys([line.rstrip() for line in open('stopwords.txt',encoding='utf-8')])
 54 |     segs = jieba.cut(sent, cut_all=False)
 55 |     final = ''
 56 |     for seg in segs:
 57 |         if seg not in stopwords:
 58 |             final += seg
 59 |     return final
 60 | 
 61 | 
 62 | def DocWordfrequency(doc):
 63 |     words = doc.strip('\n').split(' ')
 64 |     word_count = {}
 65 |     word_list=[]
 66 | 
 67 |     for w in words:
 68 |         if w in word_count:
 69 |             word_count[w] += 1
 70 |         else:
 71 |             word_count[w] = 1
 72 | 
 73 | 
 74 |     for w in sorted(zip(word_count.values(), word_count.keys()), reverse=True):  # 安装词频排序
 75 |         # print(w)
 76 |         if(len(w[1].strip(' '))>0):
 77 |             wStr=w[1]+' '+str(w[0])
 78 |             word_list.append(wStr)
 79 | 
 80 |     return word_list
 81 | 
 82 | 
 83 | def NewText(word_tag):
 84 | 
 85 |     String=''
 86 |     for i in word_tag:
 87 | 
 88 |         word=i[0]
 89 |         tag=i[1]
 90 |         # if(tag.find('N')!=-1):
 91 |         #     print(i)
 92 |         ###去掉一些消歧无意义的单词
 93 |         #去掉CD、PRP、VBD、CC、WDT,IN,RP,TO,DT
 94 |         hold=['NN','NNS','NNP']
 95 |         if tag in hold:
 96 |             String=String+word+' '
 97 | 
 98 |     return String
 99 | 
100 | 
101 | def getContentIndex(currentText,currentQuery,docIndex,doc_id):
102 |     print("currentText:"+currentText)
103 |     print("currentQuery:"+currentQuery)
104 | 
105 |     if (currentText.find(currentQuery) != -1):
106 |         pos=currentText.find(currentQuery)
107 |         CurText = preprocessor(currentText[:pos+len(currentQuery)])
108 |         print('CurText='+CurText)
109 |         CurQuery = preprocessor(currentQuery)
110 |         print('****'+currentText)
111 |         print('****'+currentQuery)
112 | 
113 | 
114 |         if (CurText.find(CurQuery) != -1):
115 |             pos_beg=CurText.rfind(CurQuery)
116 | 
117 |             print('####'+CurText[pos_beg:pos_beg+len(CurQuery)])
118 |             print('####'+CurQuery)
119 | 
120 |             CurText = CurText[:pos_beg + len(CurQuery)]
121 |             textSplit = CurText.split(' ')
122 |             querySplit = CurQuery.split(' ')
123 |             index_beg = len(textSplit)-1 - len(querySplit)
124 |             index_end = len(textSplit)-1
125 |             print("index_beg="+str(index_beg))
126 |             print("index_end="+str(index_end))
127 |             print(textSplit[index_beg:index_end])
128 |             # print(CurQuery)
129 |             # print(textSplit[index_beg:index_end])
130 | 
131 |             curTextIndex = getWordIndex(CurText)
132 |             curQueryIndex = getWordIndex(CurQuery)
133 |             if (curTextIndex.find(curQueryIndex) != -1):
134 | 
135 |                 contentIndex = ''
136 |                 for i in range(index_beg - 10, index_beg):
137 |                     if (i < 0):
138 |                         contentIndex = contentIndex + '0' + ' '
139 |                     else:
140 |                         contentIndex = contentIndex + docIndex[i] + ' '
141 | 
142 |                 for i in range(index_end + 1, index_end + 11):
143 |                     if (i >= len(docIndex)):
144 |                         contentIndex = contentIndex + '0' + ' '
145 |                     else:
146 |                         contentIndex = contentIndex + docIndex[i] + ' '
147 | 
148 |                 print(docIndex[index_beg-1:index_end])
149 |                 entityIndex=''
150 |                 for i in range(index_beg,index_end):
151 |                     entityIndex=entityIndex+docIndex[i]+' '
152 |                 print('entityIndex:'+entityIndex)
153 |                 print(doc_id)
154 |                 print('contentIndex:' + contentIndex)
155 |             else:
156 |                 print("转换成索引后，找不到了！！！！！")
157 | 
158 |         else:
159 |             print("找不到了！！！！！")
160 | 
161 |     else:
162 |         print(doc_id)
163 |         print(currentText[-(len(currentQuery) + 2):])
164 |         print(currentQuery)
165 |         print("转化后找不到了！！！！")
166 | 
167 | 
168 |     return entityIndex,contentIndex,index_beg,index_end
169 | 
170 | 
171 | def getContentNounIndex(currentText,currentQuery,docIndex,doc_id):
172 |     print("currentText:"+currentText)
173 |     print("currentQuery:"+currentQuery)
174 | 
175 |     if (currentText.find(currentQuery) != -1):
176 |         pos=currentText.find(currentQuery)
177 |         CurText = preprocessor(currentText[:pos+len(currentQuery)])
178 |         CurQuery = preprocessor(currentQuery)
179 |         qian_currentText=currentText[:pos+len(currentQuery)]
180 |         hou_halfText=currentText[pos+len(currentQuery):]
181 |         print('****'+currentText)
182 |         print('****'+currentQuery)
183 | 
184 | 
185 |         if (CurText.find(CurQuery) != -1):
186 |             pos_beg=CurText.rfind(CurQuery)
187 | 
188 |             print('####'+CurText[pos_beg:pos_beg+len(CurQuery)])
189 |             print('####'+CurQuery)
190 | 
191 |             CurText = CurText[:pos_beg + len(CurQuery)]
192 |             textSplit = CurText.split(' ')
193 |             querySplit = CurQuery.split(' ')
194 |             index_beg = len(textSplit)-1 - len(querySplit)
195 |             index_end = len(textSplit)-1
196 |             print("index_beg="+str(index_beg))
197 |             print("index_end="+str(index_end))
198 |             print(textSplit[index_beg:index_end])
199 |             # print(CurQuery)
200 |             # print(textSplit[index_beg:index_end])
201 | 
202 |             curTextIndex = getWordIndex(CurText)
203 |             curQueryIndex = getWordIndex(CurQuery)
204 |             if (curTextIndex.find(curQueryIndex) != -1):
205 | 
206 |                 contentIndex = ''
207 | 
208 |                 soup1 = BeautifulSoup(qian_currentText, 'html.parser')
209 |                 words1 = nltk.word_tokenize(soup1.get_text())
210 |                 word_tag1 = nltk.pos_tag(words1)
211 |                 newText1 = NewText(word_tag1)
212 |                 textIndex1 = getWordIndex(newText1).strip(' ')
213 |                 textIndex1 = '0 0 0 0 0 0 0 0 0 0 ' + textIndex1
214 |                 print('textIndex1:' + textIndex1)
215 |                 Index1 = textIndex1.split(' ')[-10:]
216 |                 print(Index1)
217 | 
218 |                 soup2 = BeautifulSoup(hou_halfText, 'html.parser')
219 |                 words2 = nltk.word_tokenize(soup2.get_text())
220 |                 word_tag2 = nltk.pos_tag(words2)
221 |                 newText2 = NewText(word_tag2)
222 |                 textIndex2 = getWordIndex(newText2).strip(' ')
223 |                 print('textIndex2:' + textIndex2)
224 |                 textIndex2 = textIndex2 + ' 0 0 0 0 0 0 0 0 0 0'
225 |                 Index2 = textIndex2.split(' ')[:10]
226 |                 print(Index2)
227 | 
228 |                 for i in Index1:
229 |                     contentIndex = contentIndex + ' ' + i
230 |                 for j in Index2:
231 |                     contentIndex = contentIndex + ' ' + j
232 | 
233 |                 contentIndex = contentIndex.strip(' ')
234 |                 print("find noun index:" + contentIndex)
235 | 
236 | 
237 |                 print(docIndex[index_beg-1:index_end])
238 |                 entityIndex=''
239 |                 for i in range(index_beg,index_end):
240 |                     entityIndex=entityIndex+docIndex[i]+' '
241 |                 print('entityIndex:'+entityIndex)
242 |                 print(doc_id)
243 |                 print('contentIndex:' + contentIndex)
244 |             else:
245 |                 print("转换成索引后，找不到了！！！！！")
246 | 
247 |         else:
248 |             print("找不到了！！！！！")
249 | 
250 |     else:
251 |         print(doc_id)
252 |         print(currentText[-(len(currentQuery) + 2):])
253 |         print(currentQuery)
254 |         print("转化后找不到了！！！！")
255 | 
256 | 
257 |     return contentIndex,entityIndex,index_beg,index_end
258 | 
259 | 
260 | 
261 | 
262 | 
263 | 
264 | 
265 | 
266 | 
267 | def readIdNameFile(dbIdNamePath):
268 |     global db_id_list
269 |     global db_name_list
270 |     IdNameMap=dict()
271 |     tempId=''
272 |     tempName=''
273 |     count=0
274 |     with open(dbIdNamePath,encoding='utf-8') as f:
275 |         line=f.readline().strip('\n')
276 |         while line:
277 | 
278 |             if(count%2==0):
279 |                 db_id_list.append(line)
280 |                 tempId=line
281 |                 # print(line)
282 |             if(count%2==1):
283 |                 db_name_list.append(line)
284 |                 tempName=line
285 |                 IdNameMap[tempId]=str(int((count+1)/2))+'###'+tempName
286 | 
287 |             count=count+1
288 |             line=f.readline().strip('\n')
289 |     return IdNameMap
290 | 
291 |     pass
292 | 
293 | def getWordIndex(docText):
294 |     docIndex = ''
295 |     for i in (preprocessor(docText).strip(' ').split(' ')):
296 |         if (i in wordVecIndexMap):
297 |             docIndex = docIndex + str(wordVecIndexMap[i]) + ' '
298 |         else:
299 |             docIndex = docIndex + '0' + ' '
300 | 
301 |     return docIndex.strip(' ')
302 | 
303 | 
304 | def readQueryFile(path):
305 |     global query_id_list
306 |     global query_name_list
307 |     global doc_id_list
308 | 
309 | 
310 |     with open(path,encoding='utf-8') as f:
311 |         line=f.readline()
312 | 
313 |         while line:
314 |             if(line.find('<query id="')!=-1):
315 |                 pos1=line.find('<query id=')
316 |                 pos2=line.find('">')
317 |                 id=line[pos1+11:pos2]
318 |                 query_id_list.append(id)
319 |                 # print(id)
320 |             if(line.find('<name>')!=-1):
321 |                 pos1=line.find('<name>')
322 |                 pos2=line.find('</name>')
323 |                 name=line[pos1+6:pos2]
324 |                 query_name_list.append(name.strip(' '))
325 |                 # print(name)
326 |             if(line.find('<docid>')!=-1):
327 |                 pos1 = line.find('<docid>')
328 |                 pos2 = line.find('</docid>')
329 |                 doc = line[pos1+7:pos2]
330 |                 doc_id_list.append(doc)
331 |                 # print(doc)
332 | 
333 |             line=f.readline()
334 | 
335 |     pass
336 | 
337 | 
338 | def readAnswerFile(answerPath):
339 |     global answer_query_id_list
340 |     global answer_query_name_list
341 |     global answer_db_id_list
342 |     global answer_db_name_list
343 | 
344 |     with open(answerPath, encoding='utf-8') as ansf:
345 |         ans_line = ansf.readline().strip('\n')
346 |         while ans_line:
347 |             pos1 = ans_line.find("query_id=")
348 |             pos2 = ans_line.find("query_name=")
349 |             pos3 = ans_line.find("db_id=")
350 |             pos4 = ans_line.find("db_name=")
351 |             query_id = ans_line[pos1 + 9:pos2].strip(' ')
352 |             query_name = ans_line[pos2 + 11:pos3].strip(' ')
353 |             db_id = ans_line[pos3 + 6:pos4].strip(' ')
354 |             db_name = ans_line[pos4 + 8:len(ans_line)].strip(' ')
355 | 
356 |             # print(query_id)
357 |             # print(query_name)
358 |             # print(db_id)
359 |             # print(db_name)
360 |             answer_query_id_list.append(query_id)
361 |             answer_query_name_list.append(query_name)
362 |             answer_db_id_list.append(db_id)
363 |             answer_db_name_list.append(db_name)
364 | 
365 |             ans_line = ansf.readline().strip('\n')
366 |             pass
367 | 
368 |     pass
369 | 
370 | 
371 | 
372 | 
373 | 
374 | def findDocQueryAndDbAnswer(docPath,year,testFlag):
375 |     if(testFlag):
376 |         docP=u'H:\yaojuan\QUERY\\'+year+'\\eval\\source_documents'
377 |     else:
378 |         docP=u'H:\yaojuan\QUERY\\'+year+'\\training\\source_documents'
379 | 
380 |     # OOOMap={}
381 |     # # 检查一下query的答案是不都有链接的实体
382 |     # path = u'H:\\yaojuan\\EntityLinkingData\\DB_id_index.txt'
383 |     # count = 0
384 |     # all = 0
385 |     # with open(path,encoding='utf-8') as f:
386 |     #     line = f.readline()
387 |     #     while line:
388 |     #         all = all + 1
389 |     #         if (len(line) < 12):
390 |     #             count = count + 1
391 |     #             OOOMap[line.split(' ')[0]]="糟糕！！！有答案实体与任何实体没有关系"
392 |     #         else:
393 |     #             OOOMap[line.split(' ')[0]]="OK!!!!没问题"
394 |     #
395 |     #         line = f.readline()
396 |     if(testFlag):
397 |         queryIndexFile = open('H:\yaojuan\QUERY\\'+year+'\\eval\word2vec\\test\\test_queryIndex.txt','w',encoding='utf-8')
398 |         queryNounIndexFile = open('H:\yaojuan\QUERY\\' + year + '\eval\word2vec\\test\\test_queryNounIndex.txt', 'w', encoding='utf-8')
399 |         queryTextIndexFile = open('H:\yaojuan\QUERY\\' + year + '\\eval\word2vec\\test\\test_queryTextIndex.txt', 'w', encoding='utf-8')
400 |     else:
401 |         queryIndexFile = open('H:\yaojuan\QUERY\\' + year + '\\training\word2vec\\train\\train_queryIndex.txt', 'w', encoding='utf-8')
402 |         queryNounIndexFile = open('H:\yaojuan\QUERY\\' + year + '\\training\word2vec\\train\\train_queryNounIndex.txt', 'w', encoding='utf-8')
403 |         queryTextIndexFile = open('H:\yaojuan\QUERY\\' + year + '\\training\word2vec\\train\\train_queryTextIndex.txt', 'w',encoding='utf-8')
404 | 
405 |     with open(docPath,encoding='utf-8') as docF:
406 |         doc_line=docF.readline()
407 |         doc_count=0
408 | 
409 |         while doc_line:
410 | 
411 |             pos=doc_line.find(".xml")
412 |             doc_id=doc_line[:pos]
413 |             doc_count=doc_count+1
414 |             # print(doc_id)
415 | 
416 | 
417 |             if(doc_count>0):
418 |                 if(testFlag):
419 |                     docfile= open('H:\yaojuan\QUERY\\'+year+'\\eval\word2vec\\test_docQuery\doc_'+str(doc_count)+'.txt','w',encoding='utf-8')
420 |                     unidocfile = open('H:\yaojuan\QUERY\\'+year+'\\eval\word2vec\\test_docQuery\doc_' + str(doc_count) + '_uni.txt', 'w',encoding='utf-8')
421 |                     docindexfile = open('H:\yaojuan\QUERY\\'+year+'\\eval\word2vec\\test_docQuery\doc_' + str(doc_count) + '_Windex.txt', 'w', encoding='utf-8')
422 |                     textfile=open('H:\yaojuan\QUERY\\'+year+'\\eval\word2vec\\test_docText\\text_'+str(doc_count)+'.txt','w',encoding='utf-8')
423 |                 else:
424 |                     docfile = open('H:\yaojuan\QUERY\\' + year + '\\training\word2vec\\train_docQuery\doc_' + str(doc_count) + '.txt', 'w', encoding='utf-8')
425 |                     unidocfile = open('H:\yaojuan\QUERY\\' + year + '\\training\word2vec\\train_docQuery\doc_' + str(doc_count) + '_uni.txt', 'w', encoding='utf-8')
426 |                     docindexfile = open('H:\yaojuan\QUERY\\' + year + '\\training\word2vec\\train_docQuery\doc_' + str(doc_count) + '_Windex.txt', 'w', encoding='utf-8')
427 |                     textfile = open('H:\yaojuan\QUERY\\' + year + '\\training\word2vec\\train_docText\\text_' + str(doc_count) + '.txt', 'w', encoding='utf-8')
428 | 
429 |                 with open(docP + "\\" + doc_id+'.xml' , encoding='utf-8') as textF:
430 |                     docText = textF.read()
431 |                 DocText=preprocessor(docText.replace('\n',' '))###预处理后的文本
432 |                 docIndex=getWordIndex(DocText).split(' ')###预处理后的文本单词索引
433 | 
434 | 
435 |                 tempList = []
436 |                 for i in range(len(doc_id_list)):
437 |                     if(doc_id_list[i]==doc_id):
438 | 
439 |                         # print(doc_id_list[i])
440 |                         # print(query_name_list[i])
441 |                         # print(query_id_list[i])
442 |                         tempString = doc_id_list[i] + ' ' + query_id_list[i] + ' ' + query_name_list[i] + '\n'
443 |                         docfile.write(tempString)
444 | 
445 |                         for j in range(len(answer_query_id_list)):
446 |                             if(answer_query_id_list[j]==query_id_list[i]):
447 |                                 # print(answer_query_id_list[j])
448 |                                 # print(query_id_list[i])
449 |                                 # print('query name:'+query_name_list[i]+' find query:'+DocText[int(query_beg_list[i]):int(query_end_list[i])+1].replace('\n',' '))
450 | 
451 |                                 currentText = docText.replace('\n',' ')
452 |                                 currentQuery = query_name_list[i]
453 |                                 entityIndex,contentIndex,index_beg,index_end=getContentIndex(currentText,currentQuery,docIndex,doc_id)
454 |                                 queryIndexFile.write(query_id_list[i]+' '+contentIndex.strip(' ')+'\n')
455 |                                 queryIndexFile.write(query_id_list[i] + ' ' + entityIndex.strip(' ') + '\n')
456 |                                 contentNounIndex, _, _, _ = getContentNounIndex(currentText, currentQuery, docIndex, doc_id)
457 |                                 print('contentNounIndex:'+contentNounIndex)
458 |                                 print('entityIndex:'+entityIndex)
459 |                                 queryNounIndexFile.write(query_id_list[i] + ' ' + contentNounIndex.strip(' ') + '\n')
460 |                                 queryNounIndexFile.write(query_id_list[i] + ' ' + entityIndex.strip(' ') + '\n')
461 | 
462 | 
463 | 
464 | 
465 | 
466 |                                 tempString = doc_id_list[i] + ' ' + answer_query_id_list[j] + ' ' + answer_query_name_list[j] +' '+answer_db_id_list[j]+' '+answer_db_name_list[j]+' beg='
467 |                                 indexString = doc_id_list[i] + ' ' + answer_query_id_list[j] + ' ' + answer_query_name_list[j] + ' ' + answer_db_id_list[j] + ' ' + answer_db_name_list[j] + ' index_beg=' + \
468 |                                              str(index_beg) + ' index_end=' + str(index_end)
469 |                                 tpString = doc_id_list[i] + ' ' + answer_query_name_list[j] +' '+answer_db_id_list[j]+' '+answer_db_name_list[j]
470 | 
471 | 
472 | 
473 |                                 docindexfile.write(indexString+'\n')
474 |                                 docindexfile.write(entityIndex+'\n')
475 |                                 docindexfile.write(contentIndex+'\n')
476 | 
477 | 
478 |                                 docfile.write(tempString+'\n')
479 |                                 print("tempString tempString:"+tempString)
480 |                                 tempList.append(tpString)
481 |                                 # if(answer_db_id_list[j].find('NIL')==-1):
482 |                                 #     if(OOOMap[answer_db_id_list[j]].find("没问题")==-1):
483 |                                 #         print(OOOMap[answer_db_id_list[j]])
484 |                                 #         print(tempString)
485 | 
486 |                                 with open(docP + "\\" + doc_id + '.xml', encoding='utf-8') as textF:
487 |                                     docText = textF.read()
488 |                                 soup = BeautifulSoup(docText, 'html.parser')
489 |                                 words = nltk.word_tokenize(soup.get_text())
490 |                                 word_tag = nltk.pos_tag(words)
491 |                                 newText = NewText(word_tag)
492 |                                 queryTextIndexFile.write(
493 |                                     query_id_list[i] + ' ' + getWordIndex(newText).strip(' ') + '\n')
494 |                                 queryTextIndexFile.write(query_id_list[i] + ' ' + entityIndex.strip(' ') + '\n')
495 |                                 print("textIdex:" + getWordIndex(newText))
496 | 
497 |                 tpList=set(tempList)
498 |                 for tp in tpList:
499 |                     unidocfile.write(tp+'\n')
500 | 
501 |                 docfile.close()
502 | 
503 | 
504 | 
505 |                 with open(docP + "\\" + doc_id +'.xml', encoding='utf-8') as textF:
506 |                     docText = textF.read()
507 |                 soup = BeautifulSoup(docText, 'html.parser')
508 |                 words = nltk.word_tokenize(soup.get_text())
509 |                 word_tag = nltk.pos_tag(words)
510 |                 newText = NewText(word_tag)
511 |                 textfile.write(newText+'\n')
512 |                 fre=DocWordfrequency(newText)
513 |                 for w in fre:
514 |                     # wStr=
515 |                     textfile.write(w + '\n')
516 |                 # print(newText)
517 |                 # print(fre)
518 | 
519 | 
520 |             doc_line = docF.readline()
521 | 
522 |     # queryIndexFile.close()
523 | 
524 | 
525 |     pass
526 | 
527 | 
528 | 
529 | if __name__=='__main__':
530 | 
531 |     year='2010'  #2009 2010 2011  trian2010
532 |     testFlag=True
533 |     readWordVecIndex()
534 |     if(testFlag):
535 |         docPath = u'H:\yaojuan\QUERY\\'+year+'\\eval\\test\DocFileName.txt'
536 |         queryPath = u'H:\yaojuan\QUERY\\'+year+'\\eval\\tac_kbp_'+year+'_english_entity_linking_evaluation_queries.xml'
537 |         answerPath = u'H:\yaojuan\QUERY\\'+year+'\\eval\\test\\answer.txt'
538 |         dbIdNamePath = u'DBIdName.txt'
539 |     else:
540 |         #####train train train 2010 2010 2010
541 |         docPath = u'H:\yaojuan\QUERY\\' + year + '\\training\\train\DocFileName.txt'
542 |         queryPath = u'H:\yaojuan\QUERY\\' + year + '\\training\\tac_kbp_' + year + '_english_entity_linking_training_queries.xml'
543 |         answerPath = u'H:\yaojuan\QUERY\\' + year + '\\training\\train\\answer.txt'
544 |         dbIdNamePath = u'DBIdName.txt'
545 | 
546 | 
547 |     readIdNameFile(dbIdNamePath)
548 |     readQueryFile(queryPath)
549 |     readAnswerFile(answerPath)
550 |     findDocQueryAndDbAnswer(docPath, year,testFlag)
551 | 
552 |     pass
553 | 
554 | 
555 | 
556 | 
557 | 
558 | 
559 | 
560 | 
561 | 
562 | 
563 | 
564 | 
565 | 
566 | 
567 | 


--------------------------------------------------------------------------------
/handleDBIdName.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | # -*- coding:utf8 -*-
 3 | 
 4 | import os
 5 | import sys
 6 | 
 7 | def handleDBIdName(path):
 8 | 
 9 |     with open(path,encoding='utf-8') as f:
10 |         line=f.readline().strip('\n')
11 | 
12 |         while line:
13 |             print(line)
14 |             if(line.find('(')):
15 |                 pass
16 | 
17 | 
18 | 
19 |             line = f.readline().strip('\n')
20 | 
21 | 
22 |     pass
23 | 
24 | 
25 | 
26 | 
27 | if __name__=='__main__':
28 |     path='DBIdName.txt'
29 |     handleDBIdName(path)
30 | 
31 |     pass


--------------------------------------------------------------------------------
/handle_db_entity.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | # -*- coding:utf8 -*-
  3 | 
  4 | #
  5 | # ansf=open('all.txt','w',encoding='utf-8')
  6 | import random
  7 | 
  8 | count1=0
  9 | uniMap=dict()
 10 | 
 11 | def handleTrainDBEntity(dbPath,queryPath,newf,allF):
 12 | 
 13 |     Map=dict()
 14 |     with open(dbPath,encoding='utf-8') as f:
 15 |         line=f.readline().strip('\n')
 16 |         lineCount=0
 17 |         while line:
 18 |             if(lineCount%2==0):
 19 |                 subStr=line[len(line.split(' ')[0]):]
 20 |                 # print(subStr)
 21 |             if(lineCount%2==1):
 22 |                 ans=line
 23 |                 Map[subStr]=line
 24 |                 pass
 25 |             lineCount=lineCount+1
 26 |             line=f.readline().strip('\n')
 27 | 
 28 |     with open(queryPath,encoding='utf-8') as f:
 29 |         line=f.readline().strip('\n')
 30 |         while line:
 31 |             lineCount=lineCount+1
 32 |             subStr=line[len(line.split(' ')[0]+' '+line.split(' ')[1]):line.find(' beg=')]
 33 |             # print(dbPath)
 34 |             # print(subStr)
 35 |             # print(line)
 36 |             # print(Map[subStr])
 37 |             newf.write(line+'\n')
 38 |             newf.write(Map[subStr]+'\n')
 39 |             queryId=line.split(' ')[1]
 40 |             templine=Map[subStr].split(' ')
 41 |             ansId=templine[0]
 42 |             ansNum=templine[1]
 43 |             if(subStr.find('XXXXX')==-1):
 44 |                 # if(len(templine)-2==1):
 45 |                 #     print(subStr)
 46 |                 flag=True
 47 |                 for i in range(len(templine)-2):
 48 |                     houId=templine[i+2]
 49 |                     if(len(houId.strip(' '))>0):
 50 |                         if(ansId==houId):
 51 |                             tempStr=queryId+' '+houId+' '+'1'
 52 |                             allF.write(tempStr+'\n')
 53 |                         elif(ansId!=houId and flag):
 54 |                             tempStr=queryId+' '+houId+' '+'0'
 55 |                             allF.write(tempStr+'\n')
 56 |                             flag=False
 57 | 
 58 |             line=f.readline().strip('\n')
 59 |     newf.close()
 60 | 
 61 |     pass
 62 | 
 63 | 
 64 | 
 65 | 
 66 | def handleDBEntity(dbPath,queryPath,newf,allF):
 67 |     global count1
 68 |     global uniMap
 69 |     map=dict()
 70 |     Map=dict()
 71 |     with open(dbPath,encoding='utf-8') as f:
 72 |         line=f.readline().strip('\n')
 73 |         lineCount=0
 74 |         while line:
 75 |             if(lineCount%2==0):
 76 |                 subStr=line[len(line.split(' ')[0]):]
 77 |                 # print(subStr)
 78 |             if(lineCount%2==1):
 79 |                 ans=line
 80 |                 Map[subStr]=line
 81 |                 pass
 82 |             lineCount=lineCount+1
 83 |             line=f.readline().strip('\n')
 84 |     with open(queryPath,encoding='utf-8') as f:
 85 |         line=f.readline().strip('\n')
 86 | 
 87 |         while line:
 88 |             lineCount=lineCount+1
 89 |             subStr=line[len(line.split(' ')[0]+' '+line.split(' ')[1]):line.find(' beg=')]
 90 |             print(line)
 91 |             print(dbPath)
 92 |             print(subStr)
 93 |             print(line)
 94 |             print(Map[subStr])
 95 |             newf.write(line+'\n')
 96 |             newf.write(Map[subStr]+'\n')
 97 |             queryId=line.split(' ')[1]
 98 |             templine=Map[subStr].split(' ')
 99 |             # ansId=templine[0]
100 |             # ansNum=templine[1]
101 |             # print(ansId)
102 |             for i in range(len(templine)):
103 |                 if(i==0):
104 |                     ansId=templine[0]
105 |                 if(i==1):
106 |                     ansNum=int(templine[1])
107 |                 if(i>1 and ansNum>0):
108 |                     houId=templine[i]
109 |                     if(len(houId.strip(' '))>0):
110 |                         if(ansId==houId):
111 |                             tempStr=queryId+' '+houId+' '+'1'
112 |                             if tempStr not in uniMap:
113 |                                 uniMap[tempStr]=1
114 |                             else:
115 |                                 uniMap[tempStr]=uniMap[tempStr]+1
116 |                             count1=count1+1
117 |                             allF.write(tempStr+'\n')
118 |                         else:
119 |                             if(ansId.find('E0')!=-1):
120 |                                 if ansId not in map:
121 |                                     map[ansId]=1
122 |                                     print(ansId)
123 |                             tempStr=queryId+' '+houId+' '+'0'
124 |                             allF.write(tempStr+'\n')
125 |                             # if(ansId.find('NIL')==-1):
126 |                             #     tempStr=queryId+' '+ansId+' '+'1'
127 |                             #     allF.write(tempStr + '\n')
128 |                             # break
129 | 
130 |             line=f.readline().strip('\n')
131 |     newf.close()
132 | 
133 | 
134 |     pass
135 | 
136 | 
137 | 
138 | if __name__=='__main__':
139 | 
140 |     # allF=open('H:\yaojuan\QUERY\\2010\\training\\train_all_data.txt','w',encoding='utf-8')
141 |     # for i in range(1453):
142 |     #
143 |     #     dbPath='H:\yaojuan\QUERY\\2010\\training\\train_dbEntity\db_'+str(i+1)+'.txt'
144 |     #     queryPath='H:\yaojuan\QUERY\\2010\\training\\train_docQuery\doc_'+str(i+1)+'.txt'
145 |     #     newdbPath='H:\yaojuan\QUERY\\2010\\training\\train_dbEntity\db_'+str(i+1)+'_new.txt'
146 |     #     newf=open(newdbPath,'w',encoding='utf-8')
147 |     #     handleDBEntity(dbPath,queryPath,newf,allF)
148 |     #
149 |     # allF.close()
150 |     # print(len(uniMap))
151 |     # print(count1)
152 | 
153 |     year='2009'; fileNum=3695
154 |     # year='2010';fileNum=2231
155 |     # year='2011'; fileNum=2231
156 |     # year='2012'; fileNum=2016
157 |     # year='2013'; fileNum=1820
158 |     # year='2014'; fileNum=138
159 | 
160 | 
161 |     # allF = open('H:\yaojuan\QUERY\\'+year+'\\eval\\test_all_data.txt', 'w', encoding='utf-8')
162 |     allF = open("H:\yaojuan\QUERY\\"+year+"\eval\word2vec\\test_all_data.txt', 'w', encoding='utf-8")
163 | 
164 |     for i in range(fileNum):
165 |         dbPath = 'H:\yaojuan\QUERY\\'+year+'\\eval\\test_dbEntity\db_' + str(i + 1) + '.txt'
166 |         queryPath = 'H:\yaojuan\QUERY\\'+year+'\\eval\\test_docQuery\doc_' + str(i + 1) + '.txt'
167 |         newdbPath = 'H:\yaojuan\QUERY\\'+year+'\\eval\\test_dbEntity\db_' + str(i + 1) + '_new.txt'
168 |         newf = open(newdbPath, 'w', encoding='utf-8')
169 |         handleDBEntity(dbPath, queryPath, newf, allF)
170 | 
171 |     allF.close()
172 |     print(len(uniMap))
173 |     print(count1)
174 | 
175 | 
176 | 
177 | 
178 |     # allF = open('train_all_data.txt', 'w', encoding='utf-8')
179 |     # for i in range(138):
180 |     #     dbPath = 'E:\mypython_Linking\data_handle\\train_dbEntity\db_' + str(i + 1) + '.txt'
181 |     #     queryPath = 'E:\mypython_Linking\data_handle\\train_docQuery\doc_' + str(i + 1) + '.txt'
182 |     #     newdbPath = 'E:\mypython_linking\data_handle\\train_dbEntity\db_' + str(i + 1) + '_new.txt'
183 |     #     newf = open(newdbPath, 'w', encoding='utf-8')
184 |     #     handleTrainDBEntity(dbPath, queryPath, newf, allF)
185 |     #
186 |     # allF.close()
187 |     # pass
188 | 
189 | 
190 | 
191 | 
192 | 
193 | 
194 | 
195 | 
196 | 
197 | 
198 | 
199 | 
200 | 
201 | 
202 | 
203 | 
204 | 
205 | 
206 | 


--------------------------------------------------------------------------------
/handle_min_db.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | # -*- coding:utf8 -*-
  3 | 
  4 | import jieba
  5 | import nltk
  6 | from bs4 import BeautifulSoup
  7 | from data_handle import doc_query
  8 | wordVecIndexMap=dict()
  9 | 
 10 | def Score():
 11 |     pass
 12 | 
 13 | # def readWordVecIndex():
 14 | #     global wordVecIndexMap
 15 | #     path=u'E:\mypython_Linking\CNN\glove.6B.100d.txt'
 16 | #     #path = u'E:\data_analysis\word2vec\\vector_100.txt'
 17 | #     count=0
 18 | #     with open(path,encoding='utf-8') as f:
 19 | #         line=f.readline()
 20 | #         while line:
 21 | #             count=count+1
 22 | #             wordVecIndexMap[line.split(' ')[0]]=count
 23 | #             line=f.readline()
 24 | #
 25 | #     return wordVecIndexMap
 26 | 
 27 | 
 28 | def NewText(word_tag):
 29 | 
 30 |     String=''
 31 |     for i in word_tag:
 32 | 
 33 |         word=i[0]
 34 |         tag=i[1]
 35 |         # if(tag.find('N')!=-1):
 36 |         #     print(i)
 37 |         ###去掉一些消歧无意义的单词
 38 |         #去掉CD、PRP、VBD、CC、WDT,IN,RP,TO,DT
 39 |         not_hold=['CD']
 40 |         if tag not in not_hold:
 41 |             if(len(word)>1):
 42 |                 String=String+word+' '
 43 | 
 44 |     return String
 45 | 
 46 | 
 47 | 
 48 | def FindRelation(mindbPath,newMinDBTextPath,relation,mindbIndex,mindbAllIndex):
 49 | 
 50 |     newf=open(newMinDBTextPath,'w',encoding='utf-8')
 51 |     dbText=''
 52 |     attributeText=''
 53 |     entityName=''
 54 |     entityId=''
 55 |     with open(mindbPath, encoding='utf-8') as f:
 56 |         line=f.readline()
 57 |         while line:
 58 |             # print(line)
 59 |             if(line.find('<entity id=')!=-1):
 60 |                 pos1=line.find('name="')
 61 |                 pos2=line.find('"',pos1+7)
 62 |                 entityName=line[pos1+6:pos2]
 63 | 
 64 |                 pos3=line.find('id="')
 65 |                 pos4=line.find('"',pos3+5)
 66 |                 entityId=line[pos3+4:pos4]
 67 | 
 68 |                 # print("entityName:"+entityName)
 69 |                 # print("entityId:"+entityId)
 70 | 
 71 |             if(line.find('<fact ')!=-1):
 72 |                 pos1=line.find('name="')
 73 |                 pos2=line.find('"',pos1+7)
 74 |                 rel=line[pos1+6:pos2]
 75 |                 # print("rel:"+rel)
 76 |                 # relation.write(rel+'\n')
 77 | 
 78 |                 pass
 79 | 
 80 |             dbText=dbText+line
 81 |             if(line.find('id="E')==-1):
 82 |                 attributeText=attributeText+line
 83 |             line=f.readline()
 84 |             pass
 85 |     soup = BeautifulSoup(dbText, 'html.parser')
 86 |     newText = doc_query.preprocessor(soup.get_text())
 87 |     words = nltk.word_tokenize(newText)
 88 |     word_tag = nltk.pos_tag(words)
 89 |     newText = NewText(word_tag)
 90 |     # print(newText)
 91 |     newf.write(newText+'\n')
 92 | 
 93 | 
 94 |     entity = doc_query.preprocessor(entityName)
 95 |     entityIndex = doc_query.getWordIndex(entity)
 96 |     textIndex = doc_query.getWordIndex(newText)
 97 |     newf.write(entityIndex+'\n')
 98 |     newf.write(textIndex+'\n')
 99 |     ####想要的文件1 db所有上下文
100 |     mindbAllIndex.write(entityId+' '+entityIndex+' '+textIndex+'\n')
101 |     mindbAllIndex.write(entityId+' '+entityIndex+'\n')
102 | 
103 |     fre = doc_query.DocWordfrequency(newText)
104 |     for w in fre:
105 |         newf.write(w.strip(' ') + '\n')
106 | 
107 |     newf.close()
108 | 
109 |     #########################################################
110 |     soup = BeautifulSoup(attributeText, 'html.parser')
111 |     newText = doc_query.preprocessor(soup.get_text())
112 |     words = nltk.word_tokenize(newText)
113 |     word_tag = nltk.pos_tag(words)
114 |     newText = NewText(word_tag)
115 |     textIndex = doc_query.getWordIndex(newText)
116 |     ####想要的文件2 db所有属性
117 |     mindbIndex.write(entityId + ' ' + entityIndex+' '+textIndex + '\n')
118 |     mindbIndex.write(entityId + ' ' + entityIndex + '\n')
119 |     print(textIndex)
120 |     # print(len(textIndex.split(' ')))
121 |     # print(len(textIndex.split(' ')))
122 |     # print(len(textIndex.split(' ')))
123 |     # print(len(textIndex.split(' ')))
124 |     # if(len(textIndex.split(' '))==0):
125 |     #     print("KKKKKKKKKK!!!")
126 | 
127 | 
128 | 
129 | 
130 |     pass
131 | 
132 | 
133 | 
134 | 
135 | if __name__=='__main__':
136 | 
137 |     # year='2009'; dbnum=106783; testFlage=True
138 | 
139 |     # year='2009'; dbnum=123710; testFlage=True
140 |     year='2010'; dbnum=140097; testFlage=True   #126840
141 | 
142 |     # year='2010'; dbnum=121050; testFlage=True     ##train dbnum=123129
143 | 
144 |     # year='2010'; dbnum=123129; testFlage=False  ##train train train
145 | 
146 |     # year='2011'; dbnum=118890; testFlage=True
147 | 
148 |     # year='2012'; dbnum = 113069; testFlage=True
149 | 
150 |     # year='2013'; dbnum=140190; testFlage=True
151 | 
152 |     # year='2014'; dbnum=142416; testFlage=True     ##train dbnum=144346
153 | 
154 |     # year='2014'; dbnum=144346; testFlage=False  ##train train train
155 | 
156 |     if(testFlage):
157 |         doc_query.readWordVecIndex()
158 |         relation = open("relation.txt", 'w', encoding='utf-8')
159 |         mindbIndex = open(u'H:\yaojuan\QUERY\\'+year+'\\eval\word2vec\\test_mindbIndex.txt','w',encoding='utf-8')
160 |         mindbAllIndex = open(u'H:\yaojuan\QUERY\\'+year+'\\eval\word2vec\\test_mindbAllIndex.txt','w',encoding='utf-8')
161 | 
162 |         for i in range(dbnum):#113069
163 |             mindbPath=u'H:\yaojuan\QUERY\\'+year+'\\eval\\test_minDBText\dbText_'+str(i+1)+'.txt'
164 |             newMinDBTextPath=u'H:\yaojuan\QUERY\\'+year+'\\eval\word2vec\\test_minDBTextOnly\dbText_'+str(i+1)+'.txt'
165 |             FindRelation(mindbPath,newMinDBTextPath,relation,mindbIndex,mindbAllIndex)
166 | 
167 |         # relation.close()
168 |         mindbIndex.close()
169 |         mindbAllIndex.close()
170 |     else:
171 |         doc_query.readWordVecIndex()
172 |         relation = open("relation.txt", 'w', encoding='utf-8')
173 |         mindbIndex = open(u'H:\yaojuan\QUERY\\' + year + '\\training\word2vec\\train_mindbIndex.txt', 'w', encoding='utf-8')
174 |         mindbAllIndex = open(u'H:\yaojuan\QUERY\\' + year + '\\training\word2vec\\train_mindbAllIndex.txt', 'w', encoding='utf-8')
175 | 
176 |         for i in range(dbnum):  # 113069
177 |             mindbPath = u'H:\yaojuan\QUERY\\' + year + '\\training\\train_minDBText\dbText_' + str(i + 1) + '.txt'
178 |             newMinDBTextPath = u'H:\yaojuan\QUERY\\' + year + '\\training\word2vec\\train_minDBTextOnly\dbText_' + str(i + 1) + '.txt'
179 |             FindRelation(mindbPath, newMinDBTextPath, relation, mindbIndex, mindbAllIndex)
180 | 
181 |         # relation.close()
182 |         mindbIndex.close()
183 |         mindbAllIndex.close()
184 |     pass
185 | 
186 | 
187 | 
188 | 
189 | 
190 | 
191 | 
192 | 
193 | 
194 | 
195 | 
196 | 
197 | 
198 | 
199 | 
200 | 
201 | 
202 | 
203 | 
204 | 
205 | 
206 | 
207 | 
208 | 
209 | 


--------------------------------------------------------------------------------
/merge_all_data_db_id.py:
--------------------------------------------------------------------------------
  1 | #coding=utf-8
  2 | #加载Google训练的词向量
  3 | IdSet=set()
  4 | 
  5 | def readFile():
  6 |     global IdSet
  7 |     with open(u'H:\yaojuan\QUERY\\2014\\training\\train_mindbIndex.txt',encoding='utf-8') as f:
  8 |         line=f.readline().strip('\n')
  9 |         while line:
 10 |             dbId=line.split(' ')[0]
 11 |             print(dbId)
 12 |             IdSet.add(dbId)
 13 |             line=f.readline().strip('\n')
 14 |     with open(u'H:\yaojuan\QUERY\\2014\\eval\\test_mindbIndex.txt',encoding='utf-8') as f:
 15 |         line=f.readline().strip('\n')
 16 |         while line:
 17 |             dbId=line.split(' ')[0]
 18 |             print(dbId)
 19 |             IdSet.add(dbId)
 20 |             line=f.readline().strip('\n')
 21 |     with open(u'H:\yaojuan\QUERY\\2013\\eval\\test_mindbIndex.txt', encoding='utf-8') as f:
 22 |         line = f.readline().strip('\n')
 23 |         while line:
 24 |             dbId = line.split(' ')[0]
 25 |             print(dbId)
 26 |             IdSet.add(dbId)
 27 |             line = f.readline().strip('\n')
 28 |     with open(u'H:\yaojuan\QUERY\\2012\\eval\\test_mindbIndex.txt', encoding='utf-8') as f:
 29 |         line = f.readline().strip('\n')
 30 |         while line:
 31 |             dbId = line.split(' ')[0]
 32 |             print(dbId)
 33 |             IdSet.add(dbId)
 34 |             line = f.readline().strip('\n')
 35 |     with open(u'H:\yaojuan\QUERY\\2011\\eval\\test_mindbIndex.txt', encoding='utf-8') as f:
 36 |         line = f.readline().strip('\n')
 37 |         while line:
 38 |             dbId = line.split(' ')[0]
 39 |             print(dbId)
 40 |             IdSet.add(dbId)
 41 |             line = f.readline().strip('\n')
 42 |     with open(u'H:\yaojuan\QUERY\\2010\\eval\\test_mindbIndex.txt', encoding='utf-8') as f:
 43 |         line = f.readline().strip('\n')
 44 |         while line:
 45 |             dbId = line.split(' ')[0]
 46 |             print(dbId)
 47 |             IdSet.add(dbId)
 48 |             line = f.readline().strip('\n')
 49 |     with open(u'H:\yaojuan\QUERY\\2009\\eval\\test_mindbIndex.txt', encoding='utf-8') as f:
 50 |         line = f.readline().strip('\n')
 51 |         while line:
 52 |             dbId = line.split(' ')[0]
 53 |             print(dbId)
 54 |             IdSet.add(dbId)
 55 |             line = f.readline().strip('\n')
 56 |     with open(u'H:\yaojuan\QUERY\\2010\\training\\train_mindbIndex.txt', encoding='utf-8') as f:
 57 |         line = f.readline().strip('\n')
 58 |         while line:
 59 |             dbId = line.split(' ')[0]
 60 |             print(dbId)
 61 |             IdSet.add(dbId)
 62 |             line = f.readline().strip('\n')
 63 | 
 64 | 
 65 | 
 66 |     ########bu no link entity#########
 67 |     tempStoreIdSet=set()
 68 |     with open(u'E:\mypython_Linking\CNN\\bu_link0.8.txt',encoding='utf-8') as f:
 69 |         line=f.readline().strip('\n')
 70 |         while line:
 71 |             dbIds = line.split(' ')
 72 |             for id in dbIds:
 73 |                 if(id not in IdSet):
 74 |                     IdSet.add(id)
 75 |                     tempStoreIdSet.add(id)
 76 |             line=f.readline().strip('\n')
 77 | 
 78 |     ALLPath = 'H:\yaojuan\QUERY\juan\zuizhongkuochong\DB_id_index.txt'
 79 |     linkMap=dict()
 80 |     with open(ALLPath,encoding='utf-8') as f:
 81 |         line=f.readline().strip('\n')
 82 |         while line:
 83 |             id=line.split(' ')[0]
 84 |             linkMap[id]=line[len(id)+1:].strip(' ')
 85 |             # print("YYYYYYYYYYYYYYY")
 86 |             # print(line[len(id)+1:])
 87 |             line=f.readline().strip('\n')
 88 | 
 89 |     while len(tempStoreIdSet)!=1:
 90 |         ###E0006472没有
 91 |         print(len(tempStoreIdSet))
 92 |         print(tempStoreIdSet)
 93 |         # tempStoreIdSet.remove('')
 94 |         IdList=[]
 95 |         for key in tempStoreIdSet:
 96 |             IdList.append(key)
 97 | 
 98 |         for key in IdList:
 99 |             tempStoreIdSet.remove(key)
100 |             if(key!='E0753362'):
101 |                 line=linkMap[key]
102 |                 if(len(line)>2):
103 |                     line=linkMap[key].split(' ')
104 |                     for i in line:
105 |                         if(i not in IdSet):
106 |                             IdSet.add(i)
107 |                             tempStoreIdSet.add(i)
108 | 
109 | 
110 | 
111 | 
112 | 
113 | def produceMyEntityVec(path,newPath,idPath,DB_id_index_Path,min_DB_id_index_Path):
114 |     IdMap=dict()
115 |     with open(path,encoding='utf-8') as f:
116 |         line=f.readline().strip('\n')
117 |         while line:
118 |             id=line.split(' ')[0]
119 |             IdMap[id]=line
120 |             line=f.readline().strip('\n')
121 | 
122 |     linkMap=dict()
123 |     with open(DB_id_index_Path,encoding='utf-8') as f:
124 |         line=f.readline().strip('\n').strip(' ')
125 |         while line:
126 |             id = line.split(' ')[0]
127 |             linkMap[id]=line
128 |             line=f.readline().strip('\n').strip(' ')
129 | 
130 |     newf=open(newPath,'w',encoding='utf-8')
131 |     idf = open(idPath,'w',encoding='utf-8')
132 |     minf = open(min_DB_id_index_Path,'w',encoding='utf-8')
133 |     IdCount=0
134 |     for id in IdSet:
135 |         newf.write(IdMap[id]+'\n')
136 |         idf.write('/m/'+id+' '+str(IdCount)+'\n')
137 |         IdCount=IdCount+1
138 |         minf.write(linkMap[id]+'\n')
139 | 
140 | 
141 | if __name__=='__main__':
142 | 
143 |     ######
144 |     # readFile()
145 |     # print(len(IdSet))
146 |     # if 'E0487663' in IdSet:
147 |     #     print("E0487663 E0487663 E0487663 E0487663")
148 |     # else:
149 |     #     print('WAN WAN WAN WAN WAN WAN')
150 |     # path = 'E:\mypython_Linking\CNN\entityvec.txt'
151 |     # newPath = 'E:\mypython_Linking\CNN\my_entity_vecs.txt'
152 |     # idPath = 'E:\mypython_Linking\CNN\entityId.txt'
153 |     # DB_id_index_Path = 'H:\yaojuan\QUERY\juan\zuizhongkuochong\DB_id_index.txt'
154 |     # min_DB_id_index_Path = 'E:\mypython_Linking\CNN\min_DB_id_index.txt'
155 |     # produceMyEntityVec(path,newPath,idPath,DB_id_index_Path,min_DB_id_index_Path)
156 | 
157 | 
158 |     #####################bu no link entity ########################
159 |     readFile()
160 |     # IdSet.remove('E0753362')
161 |     print(len(IdSet))
162 | 
163 |     if 'E0487663' in IdSet:
164 |         print("E0487663 E0487663 E0487663 E0487663")
165 |     else:
166 |         print('WAN WAN WAN WAN WAN WAN')
167 |     # path = 'E:\mypython_Linking\CNN\entityvec.txt'#总的实体向量，有八十多万 ###读
168 |     path = 'H:\yaojuan\QUERY\juan\zuizhongkuochong\entityvec.txt'#总的实体向量，有八十多万 ###读
169 |     newPath = 'E:\mypython_Linking\CNN\my_entity_vecs_bu.txt'###生成的新文件
170 |     idPath = 'E:\mypython_Linking\CNN\entityId_bu.txt'###生成的新文件
171 |     DB_id_index_Path = 'H:\yaojuan\QUERY\juan\zuizhongkuochong\DB_id_index_bu.txt'###读
172 |     min_DB_id_index_Path = 'E:\mypython_Linking\CNN\min_DB_id_index_bu.txt'###生成的新文件
173 |     produceMyEntityVec(path, newPath, idPath, DB_id_index_Path, min_DB_id_index_Path)
174 | 
175 | 
176 | 
177 | 
178 | 


--------------------------------------------------------------------------------
/myCNN.py:
--------------------------------------------------------------------------------
  1 | #coding=utf-8
  2 | import numpy
  3 | import tensorflow as tf
  4 | from keras.datasets import mnist
  5 | from keras.models import Sequential
  6 | import keras.backend as K
  7 | from keras.layers import Dense
  8 | from keras.layers import Dropout
  9 | from keras.layers import Flatten
 10 | from keras.layers.convolutional import Conv2D
 11 | from keras.layers.convolutional import MaxPooling2D
 12 | from keras.utils import np_utils
 13 | import matplotlib.pyplot as plt
 14 | from keras.constraints import maxnorm
 15 | from keras.optimizers import SGD
 16 | from keras.layers import *
 17 | from keras.models import *
 18 | import data_load
 19 | import sys
 20 | # from CNN import data_load
 21 | from keras.utils.vis_utils import plot_model
 22 | 
 23 | # import os
 24 | # os.environ["PATH"] += os.pathsep + 'D:/Program Files (x86)/Graphviz2.38/bin/'
 25 | year=sys.argv[1]
 26 | test_query_path=sys.argv[2]
 27 | test_db_path=sys.argv[3]
 28 | 
 29 | #求余弦函数
 30 | def cosVector(x,y):
 31 |     if(len(x)!=len(y)):
 32 |         print('error input,x and y is not in the same space')
 33 |         return;
 34 |     result1=0.0;
 35 |     result2=0.0;
 36 |     result3=0.0;
 37 |     for i in range(len(x)):
 38 |         result1+=x[i]*y[i]   #sum(X*Y)
 39 |         result2+=x[i]**2     #sum(X*X)
 40 |         result3+=y[i]**2     #sum(Y*Y)
 41 |     #print("result is "+str(result1/((result2*result3)**0.5))) #结果显示
 42 |     return result1/((result2*result3)**0.5)
 43 | 
 44 | 
 45 | 
 46 | # fix random seed for reproducibility
 47 | seed = 7
 48 | numpy.random.seed(seed)
 49 | # load data
 50 | (query_train,query_name_train,db_train,db_name_train,lab_train) = data_load.readTrainAllData(local=False)
 51 | (query_test,query_name_test,db_test,db_name_test,lab_test) = data_load.readAllData(local=False,year=year,queryPath=test_query_path,dbPath=test_db_path)
 52 | 
 53 | # query_train = query_train.reshape(query_train.shape[0],query_train.shape[1],query_train.shape[2]).astype('float32')
 54 | # #query_name_train = query_name_train.reshape(query_name_train.shape[0],query_name_train.shape[1],query_name_train.shape[2]).astype('float32')
 55 | # query_test = query_test.reshape(query_test.shape[0],query_test.shape[1],query_test.shape[2]).astype('float32')
 56 | # db_train = db_train.reshape(db_train.shape[0],db_train.shape[1],db_train.shape[2]).astype('float32')
 57 | # db_test = db_test.reshape(db_test.shape[0],db_test.shape[1],db_test.shape[2]).astype('float32')
 58 | num_classes = 2
 59 | 
 60 | 
 61 | # #定义记录位置信息的共现变量
 62 | # pos=np.random.uniform(-0.01,0.01,size=(query_train.shape[1],query_train.shape[2]))
 63 | # POS=[]
 64 | # for i in range(query_train.shape[0]):
 65 | #     POS.append(pos)
 66 | # Pos=np.asarray(POS)
 67 | # print(Pos.shape)
 68 | # query_pos_train=np.concatenate((query_train,Pos),axis=2)
 69 | #
 70 | # POS=[]
 71 | # for i in range(query_test.shape[0]):
 72 | #     POS.append(pos)
 73 | # Pos=np.asarray(POS)
 74 | # print(Pos.shape)
 75 | # query_pos_test=np.concatenate((query_test,Pos),axis=2)
 76 | 
 77 | # 自定义query模型
 78 | query_input=Input(shape=(query_train.shape[1], query_train.shape[2]))
 79 | query_conv1=Conv1D(30, 5, padding='valid', activation='relu')(query_input)
 80 | query_maxp1=MaxPooling1D(pool_size=2)(query_conv1)
 81 | query_drop1=Dropout(0.4)(query_maxp1)
 82 | query_conv2=Conv1D(15, 3, activation='relu')(query_drop1)
 83 | query_maxp2=MaxPooling1D(pool_size=2)(query_conv2)
 84 | query_drop2=Dropout(0.4)(query_maxp2)
 85 | query_flat1=Flatten()(query_drop2)
 86 | 
 87 | query_name_input=Input(shape=(query_name_train.shape[1],query_name_train.shape[2]))
 88 | query_name_flat1=Flatten()(query_name_input)
 89 | query_union=Concatenate()([query_flat1,query_name_flat1])
 90 | 
 91 | query_dens1=Dense(128, activation='relu')(query_union)
 92 | query_drop3=Dropout(0.4)(query_dens1)
 93 | query_dens2=Dense(50, activation='relu')(query_drop3)
 94 | query_drop4=Dropout(0.4)(query_dens2)
 95 | query_model=Dense(20, activation='softmax',name='query_model')(query_drop4)
 96 | 
 97 | # 自定义db模型
 98 | db_input=Input(shape=(db_train.shape[1], db_train.shape[2]))
 99 | db_conv1=Conv1D(30, 5, padding='valid', activation='relu')(db_input)
100 | db_maxp1=MaxPooling1D(pool_size=2)(db_conv1)
101 | db_drop1=Dropout(0.4)(db_maxp1)
102 | db_conv2=Conv1D(15, 3, activation='relu')(db_drop1)
103 | db_maxp2=MaxPooling1D(pool_size=2)(db_conv2)
104 | db_drop2=Dropout(0.4)(db_maxp2)
105 | db_flat1=Flatten()(db_drop2)
106 | 
107 | db_name_input=Input(shape=(db_name_train.shape[1],db_name_train.shape[2]))
108 | db_name_flat1=Flatten()(db_name_input)
109 | db_union=Concatenate()([db_flat1,db_name_flat1])
110 | 
111 | db_dens1=Dense(128, activation='relu')(db_union)
112 | db_drop3=Dropout(0.4)(db_dens1)
113 | db_dens2=Dense(50, activation='relu')(db_drop3)
114 | db_drop4=Dropout(0.4)(db_dens2)
115 | db_model=Dense(20, activation='softmax',name='db_model')(db_drop4)
116 | 
117 | print(query_model.shape)
118 | print(db_model.shape)
119 | 
120 | # x=Concatenate()([query_model,db_model])
121 | # We stack a deep densely-connected network on top
122 | x = Multiply(name='Multiply')([query_model,db_model])
123 | x = Dense(10, activation='relu')(x)
124 | # x = Dense(64, activation='relu')(x)
125 | # x = Dense(64, activation='relu')(x)
126 | 
127 | # And finally we add the main logistic regression layer
128 | main_output = Dense(1, activation='sigmoid', name='main_output')(x)
129 | 
130 | 
131 | model = Model(inputs=[query_input,query_name_input,db_input,db_name_input],outputs=main_output)
132 | model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
133 | 
134 | 
135 | # new_model = Dot([query_model,db_model])
136 | 
137 | #
138 | # plot_model(model,to_file='myCNN_model_1.png', show_shapes=True)
139 | 
140 | # Fit the model
141 | model.fit([query_train,query_name_train,db_train,db_name_train],lab_train,  epochs=100, batch_size=200, verbose=2)
142 | # Final evaluation of the model
143 | scores = model.evaluate([query_test,query_name_test,db_test,db_name_test], lab_test, verbose=0)
144 | print("Large CNN Error: %.2f%%" % (100 - scores[1] * 100))
145 | 
146 | #save model
147 | model.save('myCNN_model_1.h5')
148 | 
149 | # 已有的model在load权重过后
150 | # 取某一层的输出为输出新建为model，采用函数模型
151 | query_layer_model = Model(inputs=model.input,
152 |                            outputs=model.get_layer('query_model').output)
153 | db_layer_model = Model(inputs=model.input,
154 |                         outputs=model.get_layer('db_model').output)
155 | 
156 | # 以这个model的预测值作为输出
157 | query_output = query_layer_model.predict([query_test,query_name_test,db_test,db_name_test])
158 | db_output = db_layer_model.predict([query_test,query_name_test,db_test,db_name_test])
159 | 
160 | 
161 | model_output = model.predict([query_test,query_name_test,db_test,db_name_test])
162 | print(model_output.shape)
163 | 
164 | predictFile=open(year+'predict.txt','w',encoding='utf-8')
165 | for i in range(model_output.shape[0]):
166 |     x=model_output[i]
167 |     pre=1 / float(1 + np.exp(- x))
168 |     predictFile.write(str(pre)+'\n')
169 | predictFile.close()
170 | 
171 | 
172 | #计算query_output和db_output的余弦值，用60*1的向量存储
173 | rows=query_output.shape[0] #行数
174 | cols=query_output.shape[1] #列数
175 | cosResult= [[0]*1 for i in range(rows)]
176 | 
177 | 
178 | for i in range(rows):
179 |     cosResult[i][0]=cosVector(query_output[i], db_output[i])
180 | 
181 | #print(cosResult)
182 | 
183 | file=open(year+'_cos.txt','w')
184 | for i in cosResult:
185 |   file.write(str(i).replace('[','').replace(']','')+'\n')  #\r\n为换行符
186 | 
187 | file.close()
188 | 


--------------------------------------------------------------------------------
/process_wiki.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yaoyao2/Entity-Linking/18d02199ef37045c2642325cb6e3d6f73f76b4f0/process_wiki.py


--------------------------------------------------------------------------------
/produce_entity_index.py:
--------------------------------------------------------------------------------
 1 | #coding=utf-8
 2 | 
 3 | 
 4 | 
 5 | def produce_entity_index(entity2idPath,DB_id_index_Path,allDBIdexPath):
 6 | 
 7 |     idMap=dict()
 8 |     with open(entity2idPath,encoding='utf-8') as f:
 9 |         line=f.readline().strip('\n')
10 |         while line:
11 |             id=line.split(' ')[0][3:]
12 |             index=int(line.split(' ')[1])
13 |             # print(id)
14 |             # print(index+1)
15 |             idMap[id]=index+1
16 |             line=f.readline().strip('\n')
17 | 
18 |     dbf=open(allDBIdexPath,'w',encoding='utf-8')
19 |     with open(DB_id_index_Path,encoding='utf-8') as f:
20 |         line=f.readline().strip('\n').strip(' ')
21 |         lineCount=0
22 |         totalCount=0
23 |         nolinkCount=0
24 |         while line:
25 |             lineCount=lineCount+1
26 |             Ids=line.split(' ')
27 |             tempStr=Ids[0]
28 |             if(len(Ids)==1):
29 |                 tempStr=tempStr+' '+str(idMap[Ids[0]])
30 |                 dbf.write(tempStr+'\n')
31 |                 dbf.write(tempStr+'\n')
32 |                 nolinkCount=nolinkCount+1
33 |             else:
34 |                 for i in range(len(Ids)):
35 |                     if(Ids[i]!='E0006472' and Ids[i]!='E0186505' and Ids[i]!='E0532473'):
36 |                         totalCount = totalCount + 1
37 |                         tempStr=tempStr+' '+str(idMap[Ids[i]])
38 |                 dbf.write(tempStr.strip(' ') + '\n')
39 |                 dbf.write(Ids[0]+' '+str(idMap[Ids[0]])+'\n')
40 |             print(tempStr)
41 | 
42 | 
43 |             line=f.readline().strip('\n').strip(' ')
44 |     print('average link='+str(int(totalCount/lineCount)))
45 |     print(nolinkCount)
46 | 
47 | 
48 |     pass
49 | 
50 | 
51 | 
52 | 
53 | 
54 | 
55 | 
56 | 
57 | 
58 | if __name__=='__main__':
59 | 
60 | 
61 |     # entity2idPath='E:\mypython_Linking\CNN\entityId.txt'
62 |     # DB_id_index_Path='E:\mypython_Linking\CNN\min_DB_id_index.txt'
63 |     # allDBIdexPath = 'E:\mypython_Linking\CNN\min_DBIndex.txt'
64 |     # produce_entity_index(entity2idPath,DB_id_index_Path,allDBIdexPath)
65 | 
66 | 
67 |     ###############bu no link entity##############
68 |     entity2idPath = 'E:\mypython_Linking\CNN\entityId_bu.txt'
69 |     DB_id_index_Path = 'E:\mypython_Linking\CNN\min_DB_id_index_bu.txt'
70 |     allDBIdexPath = 'E:\mypython_Linking\CNN\min_DBIndex_bu.txt'####生成新文件
71 |     produce_entity_index(entity2idPath, DB_id_index_Path, allDBIdexPath)
72 | 
73 |     pass


--------------------------------------------------------------------------------
/produce_min_db.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | # -*- coding:utf8 -*-
  3 | 
  4 | from data_handle import doc_query
  5 | 
  6 | Map=dict()
  7 | AnsIdSet=set()
  8 | StoreTempSet=set()
  9 | 
 10 | def ReadDBIndex(indexPath):
 11 |     global Map
 12 |     with open(indexPath,encoding='utf-8') as f:
 13 |         line=f.readline().strip('\n').strip(' ')
 14 |         while line:
 15 |             L=line.split(' ')
 16 |             Map[L[0]]=line
 17 |             line=f.readline().strip('\n').strip(' ')
 18 |     return Map
 19 | 
 20 | 
 21 | 
 22 | def ReadAnswer(answerPath):
 23 |     global AnsIdSet
 24 |     lineCount=0
 25 |     with open(answerPath,encoding='utf-8') as f:
 26 |         line=f.readline().strip('\n').strip(' ')
 27 |         while line:
 28 |             if(lineCount%2==0):
 29 |                 pass
 30 |             if(lineCount%2==1):
 31 |                 line=line.split(' ')
 32 |                 for i in range(len(line)-2):
 33 |                     if(len(line[i+2])>2):
 34 |                         # print(line[i+2])
 35 |                         AnsIdSet.add(line[i+2])
 36 |             lineCount=lineCount+1
 37 |             line=f.readline().strip('\n').strip(' ')
 38 |     return AnsIdSet
 39 | 
 40 | 
 41 | 
 42 | def ProduceMinDB(dbTextPath,year):
 43 |     global StoreTempSet
 44 |     f=open(u'H:\yaojuan\QUERY\\'+year+'\\eval\\test_minDB.txt','w',encoding='utf-8')
 45 |     for key in Map:
 46 |         if(key in AnsIdSet):
 47 |             line=Map[key].split(' ')
 48 |             for i in line:
 49 |                 if(i not in AnsIdSet):
 50 |                     AnsIdSet.add(i)
 51 |                     StoreTempSet.add(i)
 52 | 
 53 |     tempF=open('Temp.txt','w',encoding='utf-8')
 54 |     while len(StoreTempSet)!=1:
 55 |         ###E0006472没有
 56 |         print(len(StoreTempSet))
 57 |         print(StoreTempSet)
 58 |         for key in Map:
 59 |             if(key in StoreTempSet):
 60 |                 StoreTempSet.remove(key)
 61 |                 line=Map[key].split(' ')
 62 |                 for i in line:
 63 |                     if(i not in AnsIdSet):
 64 | 
 65 |                         tempF.write(i+'\n')
 66 | 
 67 |                         AnsIdSet.add(i)
 68 |                         StoreTempSet.add(i)
 69 | 
 70 |     IdNameMap = doc_query.readIdNameFile(dbIdNamePath=u'DBIdName.txt')
 71 |     tempCount=0
 72 |     AnsIdSet.remove('E0006472')
 73 | 
 74 |     for i in AnsIdSet:
 75 |         print(len(AnsIdSet))
 76 |         if(i.find('E0')==-1):
 77 |             print(i)
 78 | 
 79 |         else:
 80 |             tempCount=tempCount+1
 81 | 
 82 |             f.write(i+'\n')
 83 |             f.write(IdNameMap[i]+'\n')
 84 |             num=IdNameMap[i].split('###')[0]
 85 |             dbPath = dbTextPath + num +'.txt'
 86 |             print(dbPath)
 87 |             textf = open(u'H:\yaojuan\QUERY\\'+year+'\\eval\\test_minDBText\dbText_' + str(tempCount) + '.txt', 'w', encoding='utf-8')
 88 |             with open(dbPath,encoding='utf-8') as dbf:
 89 |                 line=dbf.readline()
 90 |                 while line:
 91 |                     textf.write(line)
 92 |                     if(line.find('</facts>')!=-1):
 93 |                         break
 94 |                     line=dbf.readline()
 95 |             dbf.close()
 96 |             textf.close()
 97 | 
 98 | 
 99 |     return AnsIdSet
100 | 
101 | 
102 | 
103 | 
104 | if __name__=='__main__':
105 |     # year='2009'
106 |     # filesnum=3695
107 | 
108 |     year='2010'
109 |     filesnum=2231
110 | 
111 |     # year='2011'
112 |     # filesnum=2231
113 | 
114 |     # year = '2012'
115 |     # filesnum = 2016
116 | 
117 |     # year='2013'
118 |     # filesnum=1820
119 | 
120 |     # year='2014'
121 |     # filesnum=138
122 | 
123 | 
124 |     indexPath = u'DB_id_index.txt'
125 |     ReadDBIndex(indexPath)
126 |     for i in range(filesnum):#文件个数
127 |         answerPath = u'H:\yaojuan\QUERY\\'+year+'\\eval\\test_dbEntity\db_' + str(i + 1) + '.txt'
128 |         ReadAnswer(answerPath)
129 |     dbTextPath = u'H:\\yaojuan\\EntityLinkingData\\data\\'
130 |     ProduceMinDB(dbTextPath,year)
131 | 
132 | 
133 | 
134 |     pass
135 | 
136 | 
137 | 
138 | 
139 | 
140 | 
141 | 


--------------------------------------------------------------------------------
/word2vec.py:
--------------------------------------------------------------------------------
 1 | #! -*- coding=utf-8 -*-
 2 | from gensim.models import word2vec
 3 | import logging
 4 | # 主程序
 5 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
 6 | sentences = word2vec.Text8Corpus(u"Text8.txt") #加载语料
 7 | #模型初始化S
 8 | model = word2vec.Word2Vec(sentences, size=50) #训练skip-gram模型; 默认window=5
 9 | model2 = word2vec.Word2Vec("hello wrold! hello The training algorithms were originally ported from the C package", size=50, window=5, min_count=5, workers=4)
10 | print "--------/n"
11 | print "--------/n"
12 | y1 = model.similarity("woman", "man")
13 | print u"woman和man的相似度为：", y1
14 | print "--------/n"
15 | print "--------/n"
16 | y2 = model.most_similar("good", topn=20)  # 20个最相关的
17 | print u"和good最相关的词有：/n"
18 | for item in y2:
19 |     print item[0], item[1]
20 | print "--------/n"
21 | print "--------/n"
22 | 
23 | # print ' "boy" is to "father" as "girl" is to ...? /n'
24 | # y3 = model.most_similar(['girl', 'father'], ['boy'], topn=3)
25 | # for item in y3:
26 | #     print item[0], item[1]
27 | # print "--------/n"
28 | # print "--------/n"
29 | #
30 | # y4 = model.doesnt_match("breakfast cereal dinner lunch".split())
31 | # print u"不合群的词：", y4
32 | # print "--------/n"
33 | # print "--------/n"
34 | #
35 | # y5 = model.init_sims()
36 | #
37 | model.wv.save_word2vec_format('vector_50.txt')
38 | # #model = word2vec.Word2Vec.load_word2vec_format('/tmp/vectors.bin', binary=True)
39 | # print "--------/n"
40 | # print "--------/n"
41 | #
42 | # model.most_similar(['girl', 'father'], ['boy'], topn=3)
43 | # print "--------/n"
44 | # print "--------/n"
45 | #
46 | # more_examples = ["he his she", "big bigger bad", "going went being"]
47 | # for example in more_examples:
48 | #     a, b, x = example.split()
49 | #     predicted = model.most_similar([x, b], [a])[0][0]
50 | #     print "'%s' is to '%s' as '%s' is to '%s'" % (a, b, x, predicted)
51 | # print "--------/n"
52 | # print "--------/n"
53 | #
54 | # y6=model.wv['red']  # numpy vector of a word
55 | # print y6
56 | #
57 | # y7=model.wv['white']  # numpy vector of a word
58 | # print y7
59 | #
60 | # y8 = model.similarity("yes", "no")
61 | # print y8
62 | #
63 | # y9 = model.similarity("color", "white")
64 | # print y9
65 | #
66 | # y10 = model.similarity("red", "color")
67 | # print y10


--------------------------------------------------------------------------------