├── README.md
├── convert.py
├── data
│   └── raw
│       └── README.md
├── data_processing.py
├── models
│   ├── bilstm_cbow.py
│   ├── bilstm_cwe.py
│   ├── crf.py
│   ├── draw.py
│   ├── gbdt1.py
│   ├── gbdt2.py
│   ├── lf.py
│   ├── rf.py
│   └── xgb.py
├── parameter.py
└── util.py

/README.md:
--------------------------------------------------------------------------------
1 | **Paper: _Improving Prosodic Boundaries Prediction for Mandarin Speech Synthesis by Using Enhanced Embedding Feature and Model Fusion Approach_**
2 | 
3 | ## **Requirements**
4 | >**python3.5+**
5 | 
6 | >**tensorflow>=1.6**
7 | 
8 | >**numpy**
9 | 
10 | >**pandas**
11 | 
12 | >**scikit-learn**
13 | 
14 | >**gensim**
15 | 
16 | 
17 | ## Steps
18 | ### **----------------------data processing-----------------------**
19 | #### 1. run `python convert.py`
20 | >convert `.utf-8` raw files to prosody tagged files
21 | 
22 | #### 2. run `python data_processing.py`
23 | >convert prosody tagged files to the dataset
24 | 
25 | ### **-------------------use models for prediction-----------------**
26 | #### `cd models`
27 | >enter the models directory
28 | 
29 | #### run `python bilstm_cbow.py`
30 | >use bilstm_cbow for prosody prediction
31 | 
32 | 
33 | #### run `python alignment.py`
34 | >use alignment for prosody prediction
--------------------------------------------------------------------------------
/convert.py:
--------------------------------------------------------------------------------
1 | '''
2 | convert .utf8 raw files to prosody tagged files
3 | '''
4 | 
5 | import os
6 | import pandas as pd
7 | import numpy as np
8 | import re
9 | 
10 | # convert to prosody tagged format
11 | def toProsody(inFile,outFile):
12 |     f_in=open(file=inFile,encoding="utf-8")
13 |     doc=""
14 |     lines=f_in.readlines()
15 |     for line in lines:
16 |         line=line.strip()
17 |         line_list=line.split(sep="\t")
18 |         if(line_list[0]!=""):
19 |             doc+=(line_list[0]+"#"+line_list[7])
20 |         else:
21 |             doc+="\n"
22 |     #print(doc)
23 |     f_out=open(file=outFile,mode="w",encoding="utf-8")
24 |     f_out.write(doc)
25 |     f_out.close()
26 | 
27 | 
28 | def merge(file1,file2,file3,outFile):
29 |     doc=""
30 |     f1=open(file=file1,encoding="utf-8")
31 |     lines_f1=f1.readlines()
32 |     for line_f1 in lines_f1:
33 |         doc+=line_f1
34 |     f2=open(file=file2,encoding="utf-8")
35 |     lines_f2 = f2.readlines()
36 |     for line_f2 in lines_f2:
37 |         doc += line_f2
38 |     f3 = open(file=file3, encoding="utf-8")
39 |     lines_f3 = f3.readlines()
40 |     for line_f3 in lines_f3:
41 |         doc += line_f3
42 |     f4=open(file=outFile,mode="w",encoding="utf-8")
43 |     f4.write(doc)
44 |     f4.close()
45 | 
46 | 
47 | 
48 | 
49 | # word POS info
50 | def toPos(inFile_train,inFile_valid,inFile_test):
51 |     #--------------------------------------- build the POS list --------------------------------------------#
52 |     f_train_in = open(file=inFile_train, encoding="utf-8")
53 |     f_valid_in = open(file=inFile_valid, encoding="utf-8")
54 |     f_test_in = open(file=inFile_test, encoding="utf-8")
55 | 
56 |     pos=[]
57 |     # collect all POS tags
58 |     #train
59 |     lines_train = f_train_in.readlines()
60 |     for line_train in lines_train:
61 |         line_train = line_train.strip()
62 |         line_train_list = line_train.split(sep="\t")
63 |         if (line_train_list[0] != ""):
64 |             pos.append(line_train_list[1])
65 |     f_train_in.close()
66 |     #valid
67 |     lines_valid = f_valid_in.readlines()
68 |     for line_valid in lines_valid:
69 |         line_valid = line_valid.strip()
70 |         line_valid_list = line_valid.split(sep="\t")
71 |         if (line_valid_list[0] != ""):
72 |             pos.append(line_valid_list[1])
73 |     f_valid_in.close()
74 |     #test
75 |     lines_test = f_test_in.readlines()
76 |     for line_test in 
lines_test: 77 | line_test = line_test.strip() 78 | line_test_list = line_test.split(sep="\t") 79 | if (line_test_list[0] != ""): 80 | pos.append(line_test_list[1]) 81 | f_test_in.close() 82 | 83 | #print(pos) 84 | print("origin len of pos:",len(pos)) 85 | sr_all_pos = pd.Series(data=pos) # 列表做成pandas的Series 86 | pos = (sr_all_pos.value_counts()).index # pos列表.统计每个pos类型出现的频率,同时相当于去重复,得到字的集合(这里还是Serieas的index对象) 87 | print(pos) 88 | print("len of cleaned:",pos.shape) 89 | pos_id = range(1, len(pos) + 1) # 字的id列表,从1开始,因为准备把0作为填充值 90 | 91 | # words以及对应的id组件 92 | df_pos_ids=pd.DataFrame(data={"pos": pos, "id": pos_id}) 93 | df_pos_ids. to_csv(path_or_buf="./data/dataset/pos_ids.csv", index=False, encoding="utf_8") 94 | 95 | pos2id = pd.Series(data=df_pos_ids["id"].values, index=df_pos_ids["pos"].values) 96 | id2pos = pd.Series(data=df_pos_ids["pos"].values, index=df_pos_ids["id"].values) 97 | 98 | #print("pos2id:\n",pos2id.head(10)) 99 | #print("shape of pos2id:",pos2id.shape) 100 | #print("id2pos:\n",id2pos.head(10)) 101 | #print("shape of id2pos:",id2pos.shape) 102 | 103 | 104 | #---------------------------------------生成pos标注文件-----------------------------------------# 105 | #training corpus 106 | f_train_in = open(file=inFile_train, encoding="utf-8") 107 | doc_pos = "" 108 | doc_ids="" 109 | lines_train = f_train_in.readlines() 110 | for line_train in lines_train: 111 | line_train = line_train.strip() 112 | line_train_list = line_train.split(sep="\t") 113 | #print("line_train_list:",line_train_list) 114 | if (line_train_list[0] != ""): 115 | id=pos2id[line_train_list[1]] 116 | doc_pos+=(line_train_list[0]+"/"+str(id)) 117 | doc_ids+=(str(id)+" ") 118 | else: 119 | doc_pos += "\n" 120 | doc_ids+="\n" 121 | #save 2 files 122 | #f_train_out = open(file="./data/dataset/pos_train.txt", mode="w", encoding="utf-8") 123 | #f_train_out.write(doc_pos) 124 | #f_train_out.close() 125 | 126 | f_train_out = open(file="./data/dataset/pos_train_tag.txt", mode="w", encoding="utf-8") 127 | f_train_out.write(doc_ids) 128 | f_train_out.close() 129 | 130 | # validing corpus 131 | f_valid_in = open(file=inFile_valid, encoding="utf-8") 132 | doc_pos = "" 133 | doc_ids = "" 134 | lines_valid = f_valid_in.readlines() 135 | for line_valid in lines_valid: 136 | line_valid = line_valid.strip() 137 | line_valid_list = line_valid.split(sep="\t") 138 | # print("line_valid_list:",line_valid_list) 139 | if (line_valid_list[0] != ""): 140 | id = pos2id[line_valid_list[1]] 141 | doc_pos += (line_valid_list[0] + "/" + str(id)) 142 | doc_ids += (str(id) + " ") 143 | else: 144 | doc_pos += "\n" 145 | doc_ids += "\n" 146 | # save 2 files 147 | #f_valid_out = open(file="./data/dataset/pos_valid.txt", mode="w", encoding="utf-8") 148 | #f_valid_out.write(doc_pos) 149 | #f_valid_out.close() 150 | 151 | f_valid_out = open(file="./data/dataset/pos_valid_tag.txt", mode="w", encoding="utf-8") 152 | f_valid_out.write(doc_ids) 153 | f_valid_out.close() 154 | 155 | #test corpus 156 | f_test_in = open(file=inFile_test, encoding="utf-8") 157 | doc_pos = "" 158 | doc_ids = "" 159 | lines_test = f_test_in.readlines() 160 | for line_test in lines_test: 161 | line_test = line_test.strip() 162 | line_test_list = line_test.split(sep="\t") 163 | if (line_test_list[0] != ""): 164 | id = pos2id[line_test_list[1]] 165 | doc_pos += (line_test_list[0] + "/" + str(id)) 166 | doc_ids += (str(id) + " ") 167 | else: 168 | doc_pos += "\n" 169 | doc_ids += "\n" 170 | #f_test_out = open(file="./data/dataset/pos_test.txt", mode="w", encoding="utf-8") 171 | 
#f_test_out.write(doc_pos) 172 | #f_test_out.close() 173 | 174 | f_test_out = open(file="./data/dataset/pos_test_tag.txt", mode="w", encoding="utf-8") 175 | f_test_out.write(doc_ids) 176 | f_test_out.close() 177 | 178 | 179 | #word length info(每个词的长度) 180 | def toWordLength(inFile_train,inFile_valid,inFile_test): 181 | # ---------------------------------------生成length标注文件-----------------------------------------# 182 | # training corpus 183 | f_train_in = open(file=inFile_train, encoding="utf-8") 184 | doc_length = "" 185 | doc_ids = "" 186 | lines_train = f_train_in.readlines() 187 | for line_train in lines_train: 188 | line_train = line_train.strip() 189 | line_train_list = line_train.split(sep="\t") 190 | # print("line_train_list:",line_train_list) 191 | if (line_train_list[0] != ""): 192 | doc_length += (line_train_list[0] + "/" +line_train_list[2]) 193 | doc_ids += (line_train_list[2] + " ") 194 | else: 195 | doc_length += "\n" 196 | doc_ids += "\n" 197 | # save 2 files 198 | #f_train_out = open(file="./data/dataset/length_train.txt", mode="w", encoding="utf-8") 199 | #f_train_out.write(doc_length) 200 | #f_train_out.close() 201 | 202 | f_train_out = open(file="./data/dataset/length_train_tag.txt", mode="w", encoding="utf-8") 203 | f_train_out.write(doc_ids) 204 | f_train_out.close() 205 | 206 | # validing corpus 207 | f_valid_in = open(file=inFile_valid, encoding="utf-8") 208 | doc_length = "" 209 | doc_ids = "" 210 | lines_valid = f_valid_in.readlines() 211 | for line_valid in lines_valid: 212 | line_valid = line_valid.strip() 213 | line_valid_list = line_valid.split(sep="\t") 214 | # print("line_valid_list:",line_valid_list) 215 | if (line_valid_list[0] != ""): 216 | doc_length += (line_valid_list[0] + "/" + line_valid_list[2]) 217 | doc_ids += (line_valid_list[2] + " ") 218 | else: 219 | doc_length += "\n" 220 | doc_ids += "\n" 221 | # save 2 files 222 | #f_valid_out = open(file="./data/dataset/length_valid.txt", mode="w", encoding="utf-8") 223 | #f_valid_out.write(doc_length) 224 | #f_valid_out.close() 225 | 226 | f_valid_out = open(file="./data/dataset/length_valid_tag.txt", mode="w", encoding="utf-8") 227 | f_valid_out.write(doc_ids) 228 | f_valid_out.close() 229 | 230 | # test corpus 231 | f_test_in = open(file=inFile_test, encoding="utf-8") 232 | doc_length = "" 233 | doc_ids = "" 234 | lines_test = f_test_in.readlines() 235 | for line_test in lines_test: 236 | line_test = line_test.strip() 237 | line_test_list = line_test.split(sep="\t") 238 | if (line_test_list[0] != ""): 239 | doc_length += (line_test_list[0] + "/" + line_test_list[2]) 240 | doc_ids += (line_test_list[2] + " ") 241 | else: 242 | doc_length += "\n" 243 | doc_ids += "\n" 244 | #f_test_out = open(file="./data/dataset/length_test.txt", mode="w", encoding="utf-8") 245 | #f_test_out.write(doc_length) 246 | #f_test_out.close() 247 | 248 | f_test_out = open(file="./data/dataset/length_test_tag.txt", mode="w", encoding="utf-8") 249 | f_test_out.write(doc_ids) 250 | f_test_out.close() 251 | 252 | #word position info 253 | def toWordAccum(inFile_train,inFile_valid,inFile_test): 254 | # ---------------------------------------生成accum标注文件-----------------------------------------# 255 | # training corpus 256 | f_train_in = open(file=inFile_train, encoding="utf-8") 257 | doc_position = "" 258 | doc_ids = "" 259 | lines_train = f_train_in.readlines() 260 | for line_train in lines_train: 261 | line_train = line_train.strip() 262 | line_train_list = line_train.split(sep="\t") 263 | # print("line_train_list:",line_train_list) 
264 | if (line_train_list[0] != ""): 265 | doc_position += (line_train_list[0] + "/" + line_train_list[4]) 266 | doc_ids += (line_train_list[4] + " ") 267 | else: 268 | doc_position += "\n" 269 | doc_ids += "\n" 270 | # save 2 files 271 | f_train_out = open(file="./data/dataset/accum_train.txt", mode="w", encoding="utf-8") 272 | f_train_out.write(doc_position) 273 | f_train_out.close() 274 | 275 | f_train_out = open(file="./data/dataset/accum_train_tag.txt", mode="w", encoding="utf-8") 276 | f_train_out.write(doc_ids) 277 | f_train_out.close() 278 | 279 | # validing corpus 280 | f_valid_in = open(file=inFile_valid, encoding="utf-8") 281 | doc_position = "" 282 | doc_ids = "" 283 | lines_valid = f_valid_in.readlines() 284 | for line_valid in lines_valid: 285 | line_valid = line_valid.strip() 286 | line_valid_list = line_valid.split(sep="\t") 287 | # print("line_valid_list:",line_valid_list) 288 | if (line_valid_list[0] != ""): 289 | doc_position += (line_valid_list[0] + "/" + line_valid_list[4]) 290 | doc_ids += (line_valid_list[4] + " ") 291 | else: 292 | doc_position += "\n" 293 | doc_ids += "\n" 294 | # save 2 files 295 | f_valid_out = open(file="./data/dataset/accum_valid.txt", mode="w", encoding="utf-8") 296 | f_valid_out.write(doc_position) 297 | f_valid_out.close() 298 | 299 | f_valid_out = open(file="./data/dataset/accum_valid_tag.txt", mode="w", encoding="utf-8") 300 | f_valid_out.write(doc_ids) 301 | f_valid_out.close() 302 | 303 | # test corpus 304 | f_test_in = open(file=inFile_test, encoding="utf-8") 305 | doc_position = "" 306 | doc_ids = "" 307 | lines_test = f_test_in.readlines() 308 | for line_test in lines_test: 309 | line_test = line_test.strip() 310 | line_test_list = line_test.split(sep="\t") 311 | if (line_test_list[0] != ""): 312 | doc_position += (line_test_list[0] + "/" + line_test_list[4]) 313 | doc_ids += (line_test_list[4] + " ") 314 | else: 315 | doc_position += "\n" 316 | doc_ids += "\n" 317 | f_test_out = open(file="./data/dataset/accum_test.txt", mode="w", encoding="utf-8") 318 | f_test_out.write(doc_position) 319 | f_test_out.close() 320 | 321 | f_test_out = open(file="./data/dataset/accum_test_tag.txt", mode="w", encoding="utf-8") 322 | f_test_out.write(doc_ids) 323 | f_test_out.close() 324 | 325 | def toWordAccumReverse(inFile_train,inFile_valid,inFile_test): 326 | # ---------------------------------------生成accum标注文件-----------------------------------------# 327 | # training corpus 328 | f_train_in = open(file=inFile_train, encoding="utf-8") 329 | doc_position = "" 330 | doc_ids = "" 331 | lines_train = f_train_in.readlines() 332 | for line_train in lines_train: 333 | line_train = line_train.strip() 334 | line_train_list = line_train.split(sep="\t") 335 | # print("line_train_list:",line_train_list) 336 | if (line_train_list[0] != ""): 337 | doc_position += (line_train_list[0] + "/" + line_train_list[5]) 338 | doc_ids += (line_train_list[5] + " ") 339 | else: 340 | doc_position += "\n" 341 | doc_ids += "\n" 342 | # save 2 files 343 | #f_train_out = open(file="./data/dataset/accum_reverse_train.txt", mode="w", encoding="utf-8") 344 | #f_train_out.write(doc_position) 345 | #f_train_out.close() 346 | 347 | f_train_out = open(file="./data/dataset/accum_reverse_train_tag.txt", mode="w", encoding="utf-8") 348 | f_train_out.write(doc_ids) 349 | f_train_out.close() 350 | 351 | # validing corpus 352 | f_valid_in = open(file=inFile_valid, encoding="utf-8") 353 | doc_position = "" 354 | doc_ids = "" 355 | lines_valid = f_valid_in.readlines() 356 | for line_valid in 
lines_valid: 357 | line_valid = line_valid.strip() 358 | line_valid_list = line_valid.split(sep="\t") 359 | # print("line_valid_list:",line_valid_list) 360 | if (line_valid_list[0] != ""): 361 | doc_position += (line_valid_list[0] + "/" + line_valid_list[5]) 362 | doc_ids += (line_valid_list[5] + " ") 363 | else: 364 | doc_position += "\n" 365 | doc_ids += "\n" 366 | # save 2 files 367 | #f_valid_out = open(file="./data/dataset/accum_reverse_valid.txt", mode="w", encoding="utf-8") 368 | #f_valid_out.write(doc_position) 369 | #f_valid_out.close() 370 | 371 | f_valid_out = open(file="./data/dataset/accum_reverse_valid_tag.txt", mode="w", encoding="utf-8") 372 | f_valid_out.write(doc_ids) 373 | f_valid_out.close() 374 | 375 | # test corpus 376 | f_test_in = open(file=inFile_test, encoding="utf-8") 377 | doc_position = "" 378 | doc_ids = "" 379 | lines_test = f_test_in.readlines() 380 | for line_test in lines_test: 381 | line_test = line_test.strip() 382 | line_test_list = line_test.split(sep="\t") 383 | if (line_test_list[0] != ""): 384 | doc_position += (line_test_list[0] + "/" + line_test_list[5]) 385 | doc_ids += (line_test_list[5] + " ") 386 | else: 387 | doc_position += "\n" 388 | doc_ids += "\n" 389 | #f_test_out = open(file="./data/dataset/accum_reverse_test.txt", mode="w", encoding="utf-8") 390 | #f_test_out.write(doc_position) 391 | #f_test_out.close() 392 | 393 | f_test_out = open(file="./data/dataset/accum_reverse_test_tag.txt", mode="w", encoding="utf-8") 394 | f_test_out.write(doc_ids) 395 | f_test_out.close() 396 | 397 | #word position info 398 | def toWordPosition(inFile_train,inFile_valid,inFile_test): 399 | # ---------------------------------------生成position标注文件-----------------------------------------# 400 | # training corpus 401 | f_train_in = open(file=inFile_train, encoding="utf-8") 402 | doc_position = "" 403 | doc_ids = "" 404 | lines_train = f_train_in.readlines() 405 | i=1 406 | for line_train in lines_train: 407 | line_train = line_train.strip() 408 | line_train_list = line_train.split(sep="\t") 409 | # print("line_train_list:",line_train_list) 410 | if (line_train_list[0] != ""): 411 | doc_position += (line_train_list[0] + "/" + str(i)) 412 | doc_ids += (str(i) + " ") 413 | i+=1 414 | else: 415 | doc_position += "\n" 416 | doc_ids += "\n" 417 | i=1 418 | # save 2 files 419 | #f_train_out = open(file="./data/dataset/position_train.txt", mode="w", encoding="utf-8") 420 | #f_train_out.write(doc_position) 421 | #f_train_out.close() 422 | 423 | f_train_out = open(file="./data/dataset/position_train_tag.txt", mode="w", encoding="utf-8") 424 | f_train_out.write(doc_ids) 425 | f_train_out.close() 426 | 427 | # validing corpus 428 | f_valid_in = open(file=inFile_valid, encoding="utf-8") 429 | doc_position = "" 430 | doc_ids = "" 431 | lines_valid = f_valid_in.readlines() 432 | i = 1 433 | for line_valid in lines_valid: 434 | line_valid = line_valid.strip() 435 | line_valid_list = line_valid.split(sep="\t") 436 | # print("line_valid_list:",line_valid_list) 437 | if (line_valid_list[0] != ""): 438 | doc_position += (line_valid_list[0] + "/" + str(i)) 439 | doc_ids += (str(i) + " ") 440 | i += 1 441 | else: 442 | doc_position += "\n" 443 | doc_ids += "\n" 444 | i = 1 445 | # save 2 files 446 | #f_valid_out = open(file="./data/dataset/position_valid.txt", mode="w", encoding="utf-8") 447 | #f_valid_out.write(doc_position) 448 | #f_valid_out.close() 449 | 450 | f_valid_out = open(file="./data/dataset/position_valid_tag.txt", mode="w", encoding="utf-8") 451 | 
f_valid_out.write(doc_ids) 452 | f_valid_out.close() 453 | 454 | # test corpus 455 | f_test_in = open(file=inFile_test, encoding="utf-8") 456 | doc_position = "" 457 | doc_ids = "" 458 | lines_test = f_test_in.readlines() 459 | i=1 460 | for line_test in lines_test: 461 | line_test = line_test.strip() 462 | line_test_list = line_test.split(sep="\t") 463 | if (line_test_list[0] != ""): 464 | doc_position += (line_test_list[0] + "/" + str(i)) 465 | doc_ids += (str(i) + " ") 466 | i+=1 467 | else: 468 | doc_position += "\n" 469 | doc_ids += "\n" 470 | i=1 471 | #f_test_out = open(file="./data/dataset/position_test.txt", mode="w", encoding="utf-8") 472 | #f_test_out.write(doc_position) 473 | #f_test_out.close() 474 | 475 | f_test_out = open(file="./data/dataset/position_test_tag.txt", mode="w", encoding="utf-8") 476 | f_test_out.write(doc_ids) 477 | f_test_out.close() 478 | 479 | 480 | 481 | 482 | if __name__ =="__main__": 483 | if not os.path.exists("./data/corpus"): 484 | os.mkdir("./data/corpus/") 485 | if not os.path.exists("./data/dataset"): 486 | os.mkdir("./data/dataset/") 487 | if not os.path.exists("./result"): 488 | os.mkdir("./result") 489 | 490 | print("[1]-> Conver raw .utf-8 files to prosody tagged files") 491 | toProsody(inFile="./data/raw/prosody_test_tag.utf8",outFile="./data/corpus/prosody_test.txt") 492 | toProsody(inFile="./data/raw/prosody_train_tag.utf8", outFile="./data/corpus/prosody_train.txt") 493 | toProsody(inFile="./data/raw/prosody_valid_tag.utf8", outFile="./data/corpus/prosody_valid.txt") 494 | 495 | 496 | print("[2]->merge prosody_train and prosody_valid and prosody_test files") 497 | merge( 498 | file1="./data/corpus/prosody_train.txt", 499 | file2="data/corpus/prosody_valid.txt", 500 | file3="data/corpus/prosody_test.txt", 501 | outFile="data/corpus/prosody.txt" 502 | ) 503 | 504 | 505 | print("[3]->generate pos files") 506 | toPos(inFile_train="./data/raw/prosody_train_tag.utf8", 507 | inFile_valid="./data/raw/prosody_valid_tag.utf8", 508 | inFile_test="./data/raw/prosody_test_tag.utf8" 509 | ) 510 | 511 | print("[4]->generate length files") 512 | toWordLength(inFile_train="./data/raw/prosody_train_tag.utf8", 513 | inFile_valid="./data/raw/prosody_valid_tag.utf8", 514 | inFile_test="./data/raw/prosody_test_tag.utf8" 515 | ) 516 | 517 | print("[5]->generate accmulate files") 518 | toWordAccum(inFile_train="./data/raw/prosody_train_tag.utf8", 519 | inFile_valid="./data/raw/prosody_valid_tag.utf8", 520 | inFile_test="./data/raw/prosody_test_tag.utf8" 521 | ) 522 | 523 | print("[6]->generate accmulate reverse files") 524 | toWordAccumReverse(inFile_train="./data/raw/prosody_train_tag.utf8", 525 | inFile_valid="./data/raw/prosody_valid_tag.utf8", 526 | inFile_test="./data/raw/prosody_test_tag.utf8" 527 | ) 528 | 529 | print("[7]->generate position files") 530 | toWordPosition(inFile_train="./data/raw/prosody_train_tag.utf8", 531 | inFile_valid="./data/raw/prosody_valid_tag.utf8", 532 | inFile_test="./data/raw/prosody_test_tag.utf8" 533 | ) 534 | 535 | -------------------------------------------------------------------------------- /data/raw/README.md: -------------------------------------------------------------------------------- 1 | you should put corpus in this folder. 
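(Judging from `convert.py`, each `.utf8` file is expected to be tab-separated with one word per line: column 0 holds the word, column 1 its POS tag, column 2 the word length, columns 4 and 5 the forward and reverse accumulated word positions, and column 7 the prosodic-break label; sentences are separated by blank lines.)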
2 | 3 | ---`prosody_train_tag.utf8` 4 | 5 | ---`prosody_valid_tag.utf8` 6 | 7 | ---`prosody_test_tag.utf8` -------------------------------------------------------------------------------- /data_processing.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 清洗数据,转换语料格式,得到词嵌入 3 | author:xierhacker 4 | time:2018.1.22 5 | ''' 6 | import re 7 | import os 8 | import time 9 | import pandas as pd 10 | import numpy as np 11 | from itertools import chain 12 | #from gensim.models import word2vec 13 | from parameter import MAX_SENTENCE_SIZE 14 | from parameter import WORD_EMBEDDING_SIZE 15 | from parameter import CHAR_EMBEDDING_SIZE 16 | 17 | #原始语料转换为不带任何标记的语料,可以训练字向量 18 | def toCharCorpus(inFile,outFile): 19 | doc = "" 20 | file = open(file=inFile, encoding="utf-8") 21 | lines = file.readlines() 22 | # 匹配#标记 23 | pattern1 = re.compile(r"#[0,1,2,3,4]", flags=re.U) 24 | # 每个字匹配一次 25 | pattern2 =re.compile(r"[^\s]") 26 | for line in lines: 27 | string = re.sub(pattern=pattern1, repl="", string=line) #去掉# 28 | string=" ".join(re.findall(pattern=pattern2,string=string)) #每个字加上空格 29 | string+="\n" 30 | doc += string 31 | # write to file 32 | f = open(file=outFile, mode="w", encoding="utf-8") 33 | f.write(doc) 34 | f.close() 35 | 36 | 37 | #训练字向量并且存储 38 | def toCharEmbeddings(inFile): 39 | sentences = word2vec.Text8Corpus(inFile) 40 | model = word2vec.Word2Vec( 41 | sentences=sentences, 42 | size=CHAR_EMBEDDING_SIZE, #词向量维度 43 | window=5, #window大小 44 | min_count=0, #频率小于这个值被忽略 45 | sg=0, #sg==0->cbow; sg==1->skip-gram 46 | hs=1, #use hierarchical softmax 47 | negative=5, #use negative sampling 48 | sorted_vocab=1, #按照词频率从高到低排序 49 | ) 50 | # save embeddings file 51 | if not os.path.exists("./data/embeddings"): 52 | os.mkdir(path="./data/embeddings") 53 | model.wv.save_word2vec_format("./data/embeddings/char_vec.txt", binary=False) 54 | #生成char和id相互索引的.csv文件 55 | if os.path.exists("./data/embeddings/char_vec.txt"): 56 | f=open(file="./data/embeddings/char_vec.txt",encoding="utf-8") 57 | lines = f.readlines() 58 | # first row is info 59 | info = lines[0].strip() 60 | info_list = info.split(sep=" ") 61 | vocab_size = int(info_list[0]) 62 | embedding_dims = int(info_list[1]) 63 | chars=[] 64 | ids=[] 65 | for i in range(1,vocab_size+1): 66 | embed=lines[i].strip() 67 | embed_list=embed.split(sep=" ") 68 | chars.append(embed_list[0]) 69 | ids.append(i) 70 | pd.DataFrame(data={"chars": chars, "id": ids}). 
\ 71 | to_csv(path_or_buf="./data/dataset/chars_ids.csv", index=False, encoding="utf_8") 72 | else: 73 | print("there is no embedings files") 74 | 75 | 76 | 77 | #原始语料转换为不带任何标记的语料,可以训练词向量 78 | def toWordCorpus(inFile,outFile): 79 | doc = "" 80 | file = open(file=inFile, encoding="utf-8") 81 | lines = file.readlines() 82 | # 匹配#标记 83 | pattern1 = re.compile(r"#[0,1,2,3,4]", flags=re.U) 84 | # 每个字匹配一次 85 | pattern2 =re.compile(r"[^\s]") 86 | for line in lines: 87 | string = re.sub(pattern=pattern1, repl=" ", string=line) #去掉# 88 | #string=" ".join(re.findall(pattern=pattern2,string=string)) #每个字加上空格 89 | string+="\n" 90 | doc += string 91 | # write to file 92 | f = open(file=outFile, mode="w", encoding="utf-8") 93 | f.write(doc) 94 | f.close() 95 | 96 | 97 | #训练词向量并且存储 98 | def toWordEmbeddings(inFile): 99 | #--------------------------------train word embeddings--------------------------------- 100 | sentences = word2vec.Text8Corpus(inFile) 101 | model = word2vec.Word2Vec( 102 | sentences=sentences, 103 | size=WORD_EMBEDDING_SIZE, # 词向量维度 104 | window=5, # window大小 105 | min_count=0, # 频率小于这个值被忽略 106 | sg=0, # sg==0->cbow; sg==1->skip-gram 107 | hs=1, # use hierarchical softmax 108 | negative=5, # use negative sampling 109 | sorted_vocab=1, # 按照词频率从高到低排序 110 | ) 111 | # save embeddings file 112 | if not os.path.exists("./data/embeddings"): 113 | os.mkdir(path="./data/embeddings") 114 | model.wv.save_word2vec_format("./data/embeddings/word_vec.txt", binary=False) 115 | 116 | # ----------------------------------生成word和id相互索引的.csv文件------------------------- 117 | if os.path.exists("./data/embeddings/word_vec.txt"): 118 | f = open(file="./data/embeddings/word_vec.txt", encoding="utf-8") 119 | lines = f.readlines() 120 | # first row is info 121 | info = lines[0].strip() 122 | info_list = info.split(sep=" ") 123 | vocab_size = int(info_list[0]) 124 | embedding_dims = int(info_list[1]) 125 | words = [] 126 | ids = [] 127 | for i in range(1, vocab_size + 1): 128 | embed = lines[i].strip() 129 | embed_list = embed.split(sep=" ") 130 | words.append(embed_list[0]) 131 | ids.append(i) 132 | pd.DataFrame(data={"words": words, "id": ids}). 
\ 133 | to_csv(path_or_buf="./data/dataset/words_ids.csv", index=False, encoding="utf_8") 134 | else: 135 | print("there is no embedings files") 136 | 137 | 138 | #转换原始corpus为韵律词(PW)格式标记 139 | def toPW(inFile,outFile): 140 | doc="" 141 | file = open(file=inFile, encoding="utf-8") 142 | lines = file.readlines() 143 | # 匹配#0标记,替换为/n 144 | pattern1 = re.compile(r"#0", flags=re.U) 145 | # 匹配#1 #2标记,替换为/b 146 | pattern2 = re.compile(r"#[1,2]", flags=re.U) 147 | for line in lines: 148 | line=line.strip() 149 | string = re.sub(pattern=pattern1, repl="/n", string=line) # #0替换为/n 150 | string = re.sub(pattern=pattern2, repl="/b", string=string)+"\n" # #1替换为/b 151 | doc += string 152 | # write to file 153 | f = open(file=outFile, mode="w", encoding="utf-8") 154 | f.write(doc) 155 | f.close() 156 | 157 | 158 | #转换原始corpus为韵律短语(PPH)格式标记 159 | def toPPH(inFile,outFile): 160 | doc="" 161 | file = open(file=inFile, encoding="utf-8") 162 | lines = file.readlines() 163 | # 匹配#0,#1标记,替换为/n 164 | pattern1 = re.compile(r"#[0,1]", flags=re.U) 165 | # 不是/或者b 166 | pattern2 = re.compile(r"#2", flags=re.U) 167 | for line in lines: 168 | line=line.strip() #去掉一些影响的空格和换行 169 | string = re.sub(pattern=pattern1, repl="/n", string=line) # #0和#1替换为/n 170 | string = re.sub(pattern=pattern2, repl="/b", string=string)+"\n" # #2替换为/b 171 | doc += string 172 | # write to file 173 | f = open(file=outFile, mode="w", encoding="utf-8") 174 | f.write(doc) 175 | f.close() 176 | 177 | 178 | #转换原始corpus为语调短语(IPH)格式标记 179 | def toIPH(filename): 180 | doc = "" 181 | file = open(file=filename, encoding="utf-8") 182 | lines = file.readlines() 183 | # 匹配#1和#2(因为要先去掉#1和#2) 184 | pattern = re.compile(r"#[1,2]") 185 | # 匹配#标记 186 | pattern1 = re.compile(r"#[3,4]", flags=re.U) 187 | # 不是/或者b 188 | pattern2 = re.compile(r"(?![/b])") 189 | # 去掉b后面的/n 190 | pattern3 = re.compile(r"b/n") 191 | # 去掉开头的/n 192 | pattern4 = re.compile(r"^/n") 193 | for line in lines: 194 | line = line.strip() # 去掉一些影响的空格和换行 195 | string = re.sub(pattern=pattern, repl="", string=line) # 去掉#1 196 | string = re.sub(pattern=pattern1, repl="/b", string=string) # 去掉# 197 | string = re.sub(pattern=pattern2, repl="/n", string=string) 198 | string = re.sub(pattern=pattern3, repl="b", string=string) 199 | string = re.sub(pattern=pattern4, repl="", string=string) + "\n" 200 | doc += string 201 | # write to file 202 | f = open(file="./data/corpus/prosody_iph.txt", mode="w+", encoding="utf-8") 203 | f.write(doc) 204 | f.close() 205 | 206 | 207 | #清洗 208 | def clean(s): 209 | if u'“/s' not in s: # 句子中间的引号不应去掉 210 | return s.replace(u' ”/s', '') 211 | elif u'”/s' not in s: 212 | return s.replace(u'“/s ', '') 213 | elif u'‘/s' not in s: 214 | return s.replace(u' ’/s', '') 215 | elif u'’/s' not in s: 216 | return s.replace(u'‘/s ', '') 217 | else: 218 | return s 219 | 220 | def file2corpus(filename): 221 | ''' 222 | :param filename: 223 | :return: 语料文件文件转换为一个原始语料句子的list 224 | ''' 225 | with open(filename, 'rb') as inp: 226 | corpus = inp.read().decode('UTF-8') #原始语料 str对象 227 | corpus = corpus.split('\r') #换行切分,得到一个简陋列表 228 | corpus = u''.join(map(clean, corpus)) # 把所有处理的句子连接起来,这里中间连接不用其他字符 str对象 229 | corpus = re.split(u"\n", corpus) # 以换行为分割,把语料划分为一个"句子"列表 230 | #corpus = re.split(u'[,。!?、‘’“”]/[bems]', corpus) # 以换行为分割,把语料划分为一个"句子"列表 231 | return corpus #[人/b 们/e 常/s 说/s 生/b 活/e 是/s 一/s 部/s 教/b 科/m 书/e ,xxx,....] 
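# A worked example of the tag conventions produced by toPW()/toPPH() above and consumed by
# make_component() below (illustrative sentence, not taken from the corpus): for the merged
# prosody line "充分#1发挥#2重要#1作用#2",
#   toPW()  gives "充分/b发挥/b重要/b作用/b"   (#0 -> /n, #1 and #2 -> /b)
#   toPPH() gives "充分/n发挥/b重要/n作用/b"   (#0 and #1 -> /n, #2 -> /b)
# make_component() then splits every "token/tag" pair with the regex ([^/]*)/(.),
# so the PPH line yields sentences = ['充分','发挥','重要','作用'] and tags = ['n','b','n','b'].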
232 | 233 | 234 | def make_component(corpus): 235 | ''' 236 | :param corpus: 传入原始语料句子corpus列表得到的字数据datas和对应的labels数据都放到dataframe里面存储,方便后面的处理 237 | :return: df_data 238 | ''' 239 | sentences= [] 240 | tags = [] 241 | for s in corpus: #corpus列表得到每句corpus想应的sentence以及对应的labels 242 | sentence_tags = re.findall('([^/]*)/(.)', s) # sentence_tags:[('人', 'b'), ('们', 'e'), ('常', 's'), ('说', 's')] 243 | #print("sentence_tags:",sentence_tags) 244 | if sentence_tags: # 顺便去除了一些空样本 245 | sentence_tags = np.array(sentence_tags) 246 | sentences.append(sentence_tags[:, 0]) #sentences每一个元素表示一个sentence['人' '们' '常' '说' '生' '活' '是' '一' '部' '教' '科' '书'] 247 | tags.append(sentence_tags[:, 1]) #tags每一个元素表示的是一个句子对应的标签['b' 'e' 's' 's' 'b' 'e' 's' 's' 's' 'b' 'm' 'e'] 248 | 249 | #使用pandas处理,简化流程 250 | df_data = pd.DataFrame({'sentences': sentences, 'tags': tags}, index=range(len(sentences))) 251 | df_data['sentence_len'] = df_data['sentences'].apply(lambda sentences: len(sentences)) # 每句话长度 252 | print("max sentence length:",df_data["sentence_len"].max()) 253 | 254 | tags = ['n', 'b'] #tag列表 255 | tags_id = range(len(tags)) #tag的id列表 256 | 257 | # tags以及对应的id组件 258 | pd.DataFrame(data={"tags":tags,"id":tags_id}).\ 259 | to_csv(path_or_buf="./data/dataset/tags_ids.csv",index=False,encoding="utf_8") 260 | #存储df_data 261 | df_data.to_csv(path_or_buf="./data/dataset/df_data.csv",index=False,encoding="utf-8") 262 | return df_data #暂时不保存,返回 263 | 264 | 265 | #read basic component from .csv files 266 | def read_component(): 267 | #读取words和ids的dataframe 268 | df_words_ids=pd.read_csv(filepath_or_buffer="./data/dataset/words_ids.csv",encoding="utf-8") 269 | #读取tags和ids的dataframe 270 | df_tags_ids=pd.read_csv(filepath_or_buffer="./data/dataset/tags_ids.csv",encoding="utf-8") 271 | 272 | #转换为words2id, id2words, tags2id, id2tags 273 | #df_data=pd.DataFrame(data={}) 274 | words2id=pd.Series(data=df_words_ids["id"].values,index=df_words_ids["words"].values) 275 | id2words=pd.Series(data=df_words_ids["words"].values,index=df_words_ids["id"].values) 276 | tags2id = pd.Series(data=df_tags_ids["id"].values, index=df_tags_ids["tags"].values) 277 | id2tags = pd.Series(data=df_tags_ids["tags"].values, index=df_tags_ids["id"].values) 278 | return words2id, id2words, tags2id, id2tags 279 | 280 | #转换为最后模型适合的数据集,name表示转换后的数据集存储在哪个文件下面./data/dataset/ 281 | def make_dataset(inFile,outFile): 282 | corpus = file2corpus(inFile) 283 | #print("----corpus contains ", len(corpus), " sentences.") 284 | #保存基本组件,并且返回df_data 285 | print("----saving component ") 286 | df_data=make_component(corpus) 287 | 288 | #读取组件,并且装换为合适的格式 289 | words2id, id2words, tags2id, id2tags =read_component() 290 | #print("words2id.shape:",words2id.shape) 291 | print("----dataset contains ",df_data.shape[0]," sentences.") 292 | 293 | #padding 294 | def X_padding(sentence): 295 | ids = list(words2id[sentence]) 296 | if len(ids) > MAX_SENTENCE_SIZE: # 超过就截断 297 | return ids[:MAX_SENTENCE_SIZE] 298 | if len(ids) < MAX_SENTENCE_SIZE: # 短了就补齐 299 | ids.extend([0] * (MAX_SENTENCE_SIZE - len(ids))) 300 | return ids 301 | 302 | def y_padding(tags): 303 | ids = list(tags2id[tags]) 304 | if len(ids) > MAX_SENTENCE_SIZE: # 超过就截断 305 | return ids[:MAX_SENTENCE_SIZE] 306 | if len(ids) < MAX_SENTENCE_SIZE: # 短了就补齐 307 | ids.extend([0] * (MAX_SENTENCE_SIZE - len(ids))) 308 | return ids 309 | 310 | #把数据转换为ids表示的的形式 311 | print("----convert data and label to 'ids' represented") 312 | df_data['X'] = df_data['sentences'].apply(X_padding) 313 | df_data['y'] = df_data['tags'].apply(y_padding) 314 
| #print(df_data["X"].head(5)) 315 | #print(df_data["y"].head(5)) 316 | 317 | #数据集切分 318 | df_data_train=df_data[:50000] 319 | df_data_valid=df_data[50000:60000] 320 | df_data_test = df_data[60000:] 321 | 322 | 323 | #保存最终数据到pkl文件 324 | print("----saving final dataset <"+outFile+"_summary_train.pkl>") 325 | df_data_train.to_pickle(path="./data/dataset/"+"/"+outFile+"_summary_train.pkl") 326 | df_data_train.to_csv(path_or_buf="./data/dataset/" + outFile + "_df_data_train_final.csv", index=False, encoding="utf-8") 327 | 328 | print("----saving final dataset <"+outFile+"_summary_valida.pkl>") 329 | df_data_valid.to_pickle(path="./data/dataset/"+outFile+"_summary_valid.pkl") 330 | df_data_valid.to_csv(path_or_buf="./data/dataset/" + outFile + "_df_data_valid_final.csv", index=False, 331 | encoding="utf-8") 332 | 333 | print("----saving final dataset <" + outFile + "_summary_test.pkl>") 334 | df_data_test.to_pickle(path="./data/dataset/" + "/" + outFile + "_summary_test.pkl") 335 | df_data_test.to_csv(path_or_buf="./data/dataset/" + outFile + "_df_data_test_final.csv", index=False, 336 | encoding="utf-8") 337 | 338 | df_data.to_csv(path_or_buf="./data/dataset/" + outFile + "_df_data_final.csv", index=False, encoding="utf-8") 339 | 340 | 341 | #summary_train.pkl 342 | if __name__=="__main__": 343 | start_time = time.time() 344 | print("[1]-->trans corpus to char corpus and char embeddings...") 345 | toCharCorpus(inFile="./data/corpus/prosody.txt",outFile="./data/corpus/prosody_char.txt") 346 | #toCharEmbeddings(inFile="./data/corpus/prosody_char.txt") 347 | 348 | print("[2]-->trans corpus to word corpus and word embeddings...") 349 | toWordCorpus(inFile="./data/corpus/prosody.txt", outFile="./data/corpus/prosody_word.txt") 350 | #toWordEmbeddings(inFile="./data/corpus/prosody_word.txt") 351 | 352 | print("[3]-->trans corpus to PW format......") 353 | toPW(inFile="./data/corpus/prosody.txt",outFile="./data/corpus/prosody_pw.txt") 354 | 355 | print("[4]-->trans corpus to PPH format......") 356 | toPPH(inFile="./data/corpus/prosody.txt", outFile="./data/corpus/prosody_pph.txt") 357 | 358 | #print("[5]-->trans corpus to IPH format......") 359 | #toIPH("./data/corpus/prosody.txt") 360 | 361 | print("[6]-->trans corpus_pw to dataset......") 362 | make_dataset(inFile="./data/corpus/prosody_pw.txt",outFile="pw") 363 | 364 | print("[7]-->trans corpus_pph to dataset......") 365 | make_dataset(inFile="./data/corpus/prosody_pph.txt", outFile="pph") 366 | 367 | #print("[8]-->trans corpus_iph to dataset......") 368 | #make_dataset(in_filename="./data/corpus/prosody_iph.txt", out_filename="iph") 369 | duration = time.time() - start_time; 370 | print("END! 
this operation spends ", round(duration / 60, 2), " mins") -------------------------------------------------------------------------------- /models/bilstm_cbow.py: -------------------------------------------------------------------------------- 1 | ''' 2 | model with CWS and pos information 3 | ''' 4 | import sys 5 | sys.path.append("..") 6 | import numpy as np 7 | import pandas as pd 8 | import tensorflow as tf 9 | import tensorflow.contrib.rnn as rnn 10 | import tensorflow.contrib.seq2seq as seq2seq 11 | import time 12 | import os 13 | import parameter 14 | import util 15 | 16 | #指定显卡 17 | os.environ['CUDA_VISIBLE_DEVICES']='2' 18 | config=tf.ConfigProto() 19 | config.gpu_options.allow_growth=True 20 | 21 | 22 | class BiLSTM(): 23 | def __init__(self): 24 | # basic environment 25 | self.graph = tf.Graph() 26 | self.session = tf.Session(graph=self.graph,config=config) 27 | 28 | # basic parameters 29 | self.learning_rate = parameter.LEARNING_RATE 30 | self.max_epoch = parameter.MAX_EPOCH 31 | 32 | self.class_num = parameter.CLASS_NUM 33 | self.pos_num = parameter.POS_NUM 34 | self.length_num = parameter.LENGTH_NUM 35 | self.hidden_units_num = parameter.HIDDEN_UNITS_NUM 36 | self.hidden_units_num2 = parameter.HIDDEN_UNITS_NUM2 37 | self.layer_num = parameter.LAYER_NUM 38 | self.max_sentence_size = parameter.MAX_SENTENCE_SIZE 39 | 40 | # self.vocab_size = parameter.VOCAB_SIZE 41 | self.word_vocab_size = parameter.WORD_VOCAB_SIZE 42 | self.embedding_size = parameter.CHAR_EMBEDDING_SIZE 43 | self.word_embedding_size = parameter.WORD_EMBEDDING_SIZE 44 | 45 | self.batch_size = parameter.BATCH_SIZE 46 | self.lambda_pw = parameter.LAMBDA_PW 47 | self.lambda_pph = parameter.LAMBDA_PPH 48 | self.lambda_iph = parameter.LAMBDA_IPH 49 | 50 | self.keep_prob = parameter.KEEP_PROB 51 | self.input_keep_prob = parameter.INPUT_KEEP_PROB 52 | self.output_keep_prob = parameter.OUTPUT_KEEP_PROB 53 | 54 | self.decay_rate = parameter.DECAY 55 | 56 | 57 | #full inference process of each hierachy 58 | def hierarchy(self,inputs,y_masked,seq_length,scope_name,reuse=False): 59 | if scope_name=="pw": 60 | encoder_scope_name="en_lstm_pw" 61 | decoder_scope_name = "de_lstm_pw" 62 | elif scope_name=="pph": 63 | encoder_scope_name = "en_lstm_pph" 64 | decoder_scope_name = "de_lstm_pph" 65 | else: 66 | encoder_scope_name = "en_lstm_iph" 67 | decoder_scope_name = "de_lstm_iph" 68 | 69 | with tf.variable_scope(name_or_scope=scope_name,reuse=reuse): 70 | #forward part 71 | lstm_forward1=rnn.BasicLSTMCell(num_units=self.hidden_units_num) 72 | # 加attention(这里的attention和encoder-decoder架构的attention稍有不同) 73 | lstm_forward1 = rnn.AttentionCellWrapper(cell=lstm_forward1, attn_length=5) 74 | 75 | lstm_forward2=rnn.BasicLSTMCell(num_units=self.hidden_units_num) 76 | #加attention 77 | lstm_forward2 = rnn.AttentionCellWrapper(cell=lstm_forward2, attn_length=5) 78 | 79 | lstm_forward=rnn.MultiRNNCell(cells=[lstm_forward1,lstm_forward2]) 80 | # dropout 81 | lstm_forward = rnn.DropoutWrapper( 82 | cell=lstm_forward, 83 | input_keep_prob=self.input_keep_prob_p, 84 | output_keep_prob=self.output_keep_prob_p 85 | ) 86 | 87 | #backward part 88 | lstm_backward1 = rnn.BasicLSTMCell(num_units=self.hidden_units_num) 89 | # 加attention 90 | lstm_backward1 = rnn.AttentionCellWrapper(cell=lstm_backward1, attn_length=5) 91 | 92 | lstm_backward2 = rnn.BasicLSTMCell(num_units=self.hidden_units_num) 93 | # 加attention 94 | lstm_backward2 = rnn.AttentionCellWrapper(cell=lstm_backward2, attn_length=5) 95 | 96 | lstm_backward = 
rnn.MultiRNNCell(cells=[lstm_backward1, lstm_backward2]) 97 | #drop out 98 | lstm_backward = rnn.DropoutWrapper( 99 | cell=lstm_backward, 100 | input_keep_prob=self.input_keep_prob_p, 101 | output_keep_prob=self.output_keep_prob_p 102 | ) 103 | 104 | outputs, states = tf.nn.bidirectional_dynamic_rnn( 105 | cell_fw=lstm_forward, 106 | cell_bw=lstm_backward, 107 | inputs=inputs, 108 | sequence_length=seq_length, 109 | dtype=tf.float32, 110 | scope=decoder_scope_name 111 | ) 112 | outputs_forward = outputs[0] # shape of h is [batch_size, max_time, cell_fw.output_size] 113 | outputs_backward = outputs[1] # shape of h is [batch_size, max_time, cell_bw.output_size] 114 | # concat final outputs [batch_size, max_time, cell_fw.output_size*2] 115 | final_outputs = tf.concat(values=[outputs_forward, outputs_backward], axis=2) 116 | #shape of h: [batch * time_steps, hidden_units * 2] 117 | h = tf.reshape(tensor=final_outputs, shape=(-1, self.hidden_units_num * 2)) 118 | 119 | # 全连接dropout 120 | h = tf.nn.dropout(x=h, keep_prob=self.keep_prob_p) 121 | 122 | # fully connect layer(projection) 123 | weight=tf.get_variable( 124 | name="Weight", 125 | shape=(self.hidden_units_num * 2, self.class_num), 126 | dtype=tf.float32, 127 | initializer=tf.contrib.layers.xavier_initializer() 128 | ) 129 | bias=tf.get_variable( 130 | name="Bias", 131 | shape=(self.class_num,), 132 | dtype=tf.float32, 133 | initializer=tf.contrib.layers.xavier_initializer() 134 | ) 135 | # logits:[batch_size*max_time, 2] 136 | #logits =tf.nn.elu(features=tf.matmul(h, weight) + bias) 137 | logits= tf.matmul(h, weight) + bias 138 | 139 | # logits in an normal way:[batch_size,max_time_stpes,2] 140 | logits_normal = tf.reshape( 141 | tensor=logits, 142 | shape=(-1, self.max_sentence_size, self.class_num), 143 | name="logits_normal" 144 | ) 145 | # logits_pw_masked [seq_len1+seq_len2+..+seq_lenn, 2] 146 | logits_masked = tf.boolean_mask( 147 | tensor=logits_normal, 148 | mask=self.mask, 149 | name="logits_masked" 150 | ) 151 | #print("logits_masked.shape", logits_masked.shape) 152 | 153 | # softmax 154 | prob_masked = tf.nn.softmax(logits=logits_masked, axis=-1, name="prob_pw_masked") 155 | #print("prob_masked.shape", prob_masked.shape) 156 | 157 | # prediction 158 | # pred:[batch_size*max_time,] 159 | pred = tf.cast(tf.argmax(logits, 1), tf.int32, name="pred") 160 | # pred in an normal way,[batch_size, max_time] 161 | pred_normal = tf.reshape( 162 | tensor=pred, 163 | shape=(-1, self.max_sentence_size), 164 | name="pred_normal" 165 | ) 166 | # one-hot the pred_normal:[batch_size, max_time,class_num] 167 | pred_normal_one_hot = tf.one_hot( 168 | indices=pred_normal, 169 | depth=self.class_num, 170 | name="pred_normal_one_hot" 171 | ) 172 | # pred_masked [seq_len1+seq_len2+....+,] 173 | pred_masked = tf.boolean_mask( 174 | tensor=pred_normal, 175 | mask=self.mask, 176 | name="pred_masked" 177 | ) 178 | 179 | # loss 180 | loss = tf.losses.sparse_softmax_cross_entropy( 181 | labels=y_masked, 182 | logits=logits_masked 183 | ) + tf.contrib.layers.l2_regularizer(self.lambda_pw)(weight) 184 | 185 | return loss,prob_masked,pred,pred_masked,pred_normal_one_hot 186 | 187 | 188 | # forward process and training process 189 | def fit(self, X_train, y_train, len_train, pos_train, length_train, position_train, 190 | X_valid, y_valid, len_valid, pos_valid, length_valid, position_valid, 191 | X_test, y_test, len_test, pos_test, length_test, position_test, name, print_log=True): 192 | # handle data 193 | y_train_pw = y_train[0] 194 | y_train_pph = y_train[1] 
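        # Expected input shapes (an assumption inferred from the placeholders defined below,
        # since util.loadData() itself is not shown in this listing):
        #   X_train / pos_train / length_train / position_train : [num_samples, MAX_SENTENCE_SIZE] int ids, zero-padded
        #   y_train_pw / y_train_pph                             : [num_samples, MAX_SENTENCE_SIZE] labels (0 = 'n', 1 = 'b')
        #   len_train                                            : [num_samples] true (unpadded) sentence lengths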
195 | # y_train_iph = y_train[2] 196 | 197 | y_valid_pw = y_valid[0] 198 | y_valid_pph = y_valid[1] 199 | # y_valid_iph = y_valid[2] 200 | 201 | y_test_pw = y_test[0] 202 | y_test_pph = y_test[1] 203 | # y_valid_iph = y_valid[2] 204 | 205 | 206 | # ------------------------------------------define graph---------------------------------------------# 207 | with self.graph.as_default(): 208 | #***********************Dataset API**************************** 209 | # create dataset_train object 210 | dataset_train = tf.data.Dataset.from_tensor_slices( 211 | tensors=(X_train, y_train_pw, y_train_pph, len_train, pos_train, length_train, position_train) 212 | ).repeat().batch(batch_size=self.batch_size).shuffle(buffer_size=2) 213 | 214 | # create iterator_train object 215 | iterator_train = dataset_train.make_one_shot_iterator() 216 | 217 | # get batch 218 | batch_train = iterator_train.get_next() 219 | #print("batch_train:", batch_train) 220 | 221 | # dataset_valid= 222 | # dataset_test= 223 | #*************************************************************** 224 | 225 | #****************** data place holder*************************** 226 | self.X_p = tf.placeholder(dtype=tf.int32,shape=(None, self.max_sentence_size),name="input_p") 227 | self.y_p_pw = tf.placeholder(dtype=tf.int32,shape=(None, self.max_sentence_size),name="label_p_pw") 228 | self.y_p_pph = tf.placeholder(dtype=tf.int32,shape=(None, self.max_sentence_size),name="label_p_pph") 229 | #self.y_p_iph = tf.placeholder(dtype=tf.int32,shape=(None, self.max_sentence_size),name="label_p_iph") 230 | 231 | # 相应序列的长度占位 232 | self.seq_len_p = tf.placeholder(dtype=tf.int32, shape=(None,), name="seq_len") 233 | 234 | # 用来去掉padding的mask 235 | self.mask = tf.sequence_mask(lengths=self.seq_len_p,maxlen=self.max_sentence_size,name="mask") 236 | 237 | # 去掉padding之后的labels,shape[seq_len1+seq_len2+....+,] 238 | y_p_pw_masked = tf.boolean_mask(tensor=self.y_p_pw,mask=self.mask,name="y_p_pw_masked") 239 | y_p_pph_masked = tf.boolean_mask(tensor=self.y_p_pph,mask=self.mask,name="y_p_pph_masked") 240 | # y_p_iph_masked = tf.boolean_mask(tensor=self.y_p_iph,mask=self.mask,name="y_p_iph_masked") 241 | 242 | # pos info placeholder 243 | self.pos_p = tf.placeholder(dtype=tf.int32,shape=(None, self.max_sentence_size),name="pos_p") 244 | self.pos_one_hot = tf.one_hot(indices=self.pos_p, depth=self.pos_num, name="pos_one_hot") 245 | #print("shape of pos_one_hot:", self.pos_one_hot.shape) 246 | 247 | # length info placeholder 248 | self.length_p = tf.placeholder(dtype=tf.int32,shape=(None, self.max_sentence_size),name="length_p") 249 | self.length_one_hot = tf.one_hot(indices=self.length_p, depth=self.length_num, name="pos_one_hot") 250 | #print("shape of length_one_hot:", self.length_one_hot.shape) 251 | 252 | # position info placeholder 253 | self.position_p = tf.placeholder(dtype=tf.int32,shape=(None, self.max_sentence_size),name="position_p") 254 | self.position_one_hot = tf.one_hot(indices=self.position_p, depth=self.max_sentence_size,name="pos_one_hot") 255 | #print("shape of position_one_hot:", self.position_one_hot.shape) 256 | 257 | # dropout 占位 258 | self.keep_prob_p = tf.placeholder(dtype=tf.float32, shape=[], name="keep_prob_p") 259 | self.input_keep_prob_p = tf.placeholder(dtype=tf.float32, shape=[], name="input_keep_prob_p") 260 | self.output_keep_prob_p = tf.placeholder(dtype=tf.float32, shape=[], name="output_keep_prob_p") 261 | 262 | # word embeddings 263 | self.word_embeddings = tf.Variable( 264 | 
initial_value=util.readEmbeddings(file="../data/embeddings/word_vec.txt"), 265 | trainable=False, 266 | name="word_embeddings" 267 | ) 268 | print("wordembedding.shape", self.word_embeddings.shape) 269 | 270 | # -------------------------------------PW----------------------------------------------------- 271 | # embeded inputs:[batch_size,MAX_TIME_STPES,embedding_size] 272 | inputs_pw = tf.nn.embedding_lookup(params=self.word_embeddings, ids=self.X_p, name="embeded_input_pw") 273 | print("shape of inputs_pw:", inputs_pw.shape) 274 | inputs_pw = tf.concat( 275 | values=[inputs_pw, self.pos_one_hot, self.length_one_hot, self.position_one_hot], 276 | axis=2, 277 | name="input_pw" 278 | ) 279 | print("shape of cancated inputs_pw:", inputs_pw.shape) 280 | self.loss_pw,prob_pw_masked,pred_pw,pred_pw_masked,pred_normal_one_hot_pw=self.hierarchy( 281 | inputs=inputs_pw, 282 | y_masked=y_p_pw_masked, 283 | seq_length=self.seq_len_p, 284 | scope_name="pw" 285 | ) 286 | 287 | # ----------------------------------PPH-------------------------------------------------- 288 | # embeded inputs:[batch_size,MAX_TIME_STPES,embedding_size] 289 | inputs_pph = tf.nn.embedding_lookup(params=self.word_embeddings, ids=self.X_p, name="embeded_input_pph") 290 | print("input_pph.shape", inputs_pph.shape) 291 | # concat all information 292 | inputs_pph = tf.concat( 293 | values=[inputs_pph, self.pos_one_hot, self.length_one_hot, self.position_one_hot,pred_normal_one_hot_pw], 294 | axis=2, 295 | name="inputs_pph" 296 | ) 297 | print("shape of input_pph:", inputs_pph.shape) 298 | 299 | self.loss_pph, prob_pph_masked,pred_pph, pred_pph_masked, pred_normal_one_hot_pph = self.hierarchy( 300 | inputs=inputs_pph, 301 | y_masked=y_p_pph_masked, 302 | seq_length=self.seq_len_p, 303 | scope_name="pph" 304 | ) 305 | 306 | # adjust learning rate 307 | global_step = tf.Variable(initial_value=1, trainable=False) 308 | start_learning_rate = self.learning_rate 309 | learning_rate = tf.train.exponential_decay( 310 | learning_rate=start_learning_rate, 311 | global_step=global_step, 312 | decay_steps=(X_train.shape[0] // self.batch_size) + 1, 313 | decay_rate=self.decay_rate, 314 | staircase=True, 315 | name="decay_learning_rate" 316 | ) 317 | 318 | # loss 319 | self.loss = self.loss_pw + self.loss_pph 320 | 321 | # optimizer 322 | self.optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(self.loss,global_step=global_step) 323 | self.init_op = tf.global_variables_initializer() 324 | self.init_local_op = tf.local_variables_initializer() 325 | 326 | # --------------------------------------------Session------------------------------------------------- 327 | with self.session as sess: 328 | print("Training Start") 329 | sess.run(self.init_op) # initialize all variables 330 | sess.run(self.init_local_op) 331 | 332 | train_Size = X_train.shape[0]; 333 | validation_Size = X_valid.shape[0] 334 | test_Size = X_test.shape[0] 335 | 336 | self.best_validation_loss = 1000 # best validation accuracy in training process 337 | # store result 338 | if not os.path.exists("../result/bilstm/"): 339 | os.mkdir("../result/bilstm/") 340 | 341 | # epoch 342 | for epoch in range(1, self.max_epoch + 1): 343 | print("Epoch:", epoch) 344 | start_time = time.time() # time evaluation 345 | # training loss/accuracy in every mini-batch 346 | self.train_losses = [] 347 | self.train_accus_pw = [] 348 | self.train_accus_pph = [] 349 | # self.train_accus_iph = [] 350 | 351 | self.c1_f_pw = []; 352 | self.c2_f_pw = [] # each class's f1 score 353 | 
self.c1_f_pph = []; 354 | self.c2_f_pph = [] 355 | # self.c1_f_iph = []; 356 | # self.c2_f_iph = [] 357 | lrs = [] 358 | 359 | # mini batch 360 | for i in range(0, (train_Size // self.batch_size)): 361 | elements=sess.run(batch_train) 362 | # 注意:这里获取的都是mask之后的值 363 | _, train_loss, lr,y_train_pw_masked, y_train_pph_masked, \ 364 | train_pred_pw, train_pred_pph, \ 365 | train_prob_pw_masked, train_prob_pph_masked = sess.run( 366 | fetches=[self.optimizer, self.loss,learning_rate,y_p_pw_masked, y_p_pph_masked, 367 | pred_pw_masked, pred_pph_masked, prob_pw_masked, prob_pph_masked ], 368 | feed_dict={ 369 | self.X_p: elements[0], 370 | self.y_p_pw: elements[1], 371 | self.y_p_pph: elements[2], 372 | self.seq_len_p: elements[3], 373 | self.pos_p: elements[4], 374 | self.length_p: elements[5], 375 | self.position_p: elements[6], 376 | self.keep_prob_p: self.keep_prob, 377 | self.input_keep_prob_p: self.input_keep_prob, 378 | self.output_keep_prob_p: self.output_keep_prob 379 | } 380 | ) 381 | 382 | # write the prob to files 383 | util.writeProb( 384 | prob_pw=train_prob_pw_masked, 385 | prob_pph=train_prob_pph_masked, 386 | outFile="../result/bilstm/bilstm_prob_train_epoch" + str(epoch) + ".txt" 387 | ) 388 | 389 | lrs.append(lr) 390 | # loss 391 | self.train_losses.append(train_loss) 392 | # metrics 393 | accuracy_pw, f1_pw = util.eval(y_true=y_train_pw_masked, y_pred=train_pred_pw) # pw 394 | accuracy_pph, f1_pph = util.eval(y_true=y_train_pph_masked, y_pred=train_pred_pph) # pph 395 | # accuracy_iph, f1_1_iph, f1_2_iph = util.eval(y_true=y_train_iph_masked,y_pred=train_pred_iph) # iph 396 | 397 | self.train_accus_pw.append(accuracy_pw) 398 | self.train_accus_pph.append(accuracy_pph) 399 | # self.train_accus_iph.append(accuracy_iph) 400 | # F1-score 401 | self.c1_f_pw.append(f1_pw[0]); 402 | self.c2_f_pw.append(f1_pw[1]) 403 | self.c1_f_pph.append(f1_pph[0]); 404 | self.c2_f_pph.append(f1_pph[1]) 405 | # self.c1_f_iph.append(f1_1_iph); 406 | # self.c2_f_iph.append(f1_2_iph) 407 | 408 | # ----------------------------------validation in every epoch---------------------------------- 409 | self.valid_loss, y_valid_pw_masked, y_valid_pph_masked, \ 410 | valid_pred_pw_masked, valid_pred_pph_masked, valid_pred_pw, valid_pred_pph, \ 411 | valid_prob_pw_masked, valid_prob_pph_masked = sess.run( 412 | fetches=[self.loss, y_p_pw_masked, y_p_pph_masked, 413 | pred_pw_masked, pred_pph_masked, pred_pw, pred_pph, 414 | prob_pw_masked, prob_pph_masked 415 | ], 416 | feed_dict={ 417 | self.X_p: X_valid, 418 | self.y_p_pw: y_valid_pw, 419 | self.y_p_pph: y_valid_pph, 420 | self.seq_len_p: len_valid, 421 | self.pos_p: pos_valid, 422 | self.length_p: length_valid, 423 | self.position_p: position_valid, 424 | self.keep_prob_p: 1.0, 425 | self.input_keep_prob_p: 1.0, 426 | self.output_keep_prob_p: 1.0 427 | } 428 | ) 429 | #write the prob to files 430 | util.writeProb( 431 | prob_pw=valid_prob_pw_masked, 432 | prob_pph=valid_prob_pph_masked, 433 | outFile="../result/bilstm/bilstm_prob_valid_epoch" + str(epoch) + ".txt" 434 | ) 435 | 436 | # metrics 437 | self.valid_accuracy_pw, self.valid_f1_pw = util.eval( 438 | y_true=y_valid_pw_masked, 439 | y_pred=valid_pred_pw_masked 440 | ) 441 | self.valid_accuracy_pph, self.valid_f1_pph = util.eval( 442 | y_true=y_valid_pph_masked, 443 | y_pred=valid_pred_pph_masked 444 | ) 445 | # recover to original corpus txt 446 | # shape of valid_pred_pw,valid_pred_pw,valid_pred_pw:[corpus_size*time_stpes] 447 | util.recover2( 448 | X=X_valid, 449 | preds_pw=valid_pred_pw, 450 | 
preds_pph=valid_pred_pph, 451 | filename="../result/bilstm/valid_recover_epoch_" + str(epoch) + ".txt" 452 | ) 453 | # ---------------------------------------------------------------------------------------- 454 | 455 | # ----------------------------------test in every epoch---------------------------------- 456 | self.test_loss, y_test_pw_masked, y_test_pph_masked, \ 457 | test_pred_pw_masked, test_pred_pph_masked, test_pred_pw, test_pred_pph, \ 458 | test_prob_pw_masked, test_prob_pph_masked = sess.run( 459 | fetches=[self.loss, y_p_pw_masked, y_p_pph_masked, 460 | pred_pw_masked, pred_pph_masked, pred_pw, pred_pph, 461 | prob_pw_masked, prob_pph_masked 462 | ], 463 | feed_dict={ 464 | self.X_p: X_test, 465 | self.y_p_pw: y_test_pw, 466 | self.y_p_pph: y_test_pph, 467 | self.seq_len_p: len_test, 468 | self.pos_p: pos_test, 469 | self.length_p: length_test, 470 | self.position_p: position_test, 471 | self.keep_prob_p: 1.0, 472 | self.input_keep_prob_p: 1.0, 473 | self.output_keep_prob_p: 1.0 474 | } 475 | ) 476 | # write the prob to files 477 | util.writeProb( 478 | prob_pw=test_prob_pw_masked, 479 | prob_pph=test_prob_pph_masked, 480 | outFile="../result/bilstm/bilstm_prob_test_epoch" + str(epoch) + ".txt" 481 | ) 482 | 483 | # metrics 484 | self.test_accuracy_pw, self.test_f1_pw = util.eval( 485 | y_true=y_test_pw_masked, 486 | y_pred=test_pred_pw_masked 487 | ) 488 | self.test_accuracy_pph, self.test_f1_pph = util.eval( 489 | y_true=y_test_pph_masked, 490 | y_pred=test_pred_pph_masked 491 | ) 492 | # recover to original corpus txt 493 | # shape of test_pred_pw,test_pred_pw,test_pred_pw:[corpus_size*time_stpes] 494 | util.recover2( 495 | X=X_test, 496 | preds_pw=test_pred_pw, 497 | preds_pph=test_pred_pph, 498 | filename="../result/bilstm/test_recover_epoch_" + str(epoch) + ".txt" 499 | ) 500 | # ----------------------------------------------------------------------------------- 501 | 502 | # self.valid_accuracy_iph, self.valid_f1_1_iph, self.valid_f1_2_iph = util.eval(y_true=y_valid_iph_masked,y_pred=valid_pred_iph) 503 | 504 | # show information 505 | print("Epoch ", epoch, " finished.", "spend ", round((time.time() - start_time) / 60, 2), " mins") 506 | print("learning rate:", sum(lrs) / len(lrs)) 507 | self.showInfo(type="training") 508 | self.showInfo(type="validation") 509 | self.showInfo(type="test") 510 | 511 | # when we get a new best validation accuracy,we store the model 512 | if self.best_validation_loss < self.valid_loss: 513 | self.best_validation_loss = self.valid_loss 514 | print("New Best loss ", self.best_validation_loss, " On Validation set! ") 515 | print("Saving Models......\n\n") 516 | # exist ./models folder? 517 | if not os.path.exists("./models/"): 518 | os.mkdir(path="./models/") 519 | if not os.path.exists("./models/" + name): 520 | os.mkdir(path="./models/" + name) 521 | if not os.path.exists("./models/" + name + "/bilstm"): 522 | os.mkdir(path="./models/" + name + "/bilstm") 523 | # create saver 524 | saver = tf.train.Saver() 525 | saver.save(sess, "./models/" + name + "/bilstm/my-model-10000") 526 | # Generates MetaGraphDef. 
527 | saver.export_meta_graph("./models/" + name + "/bilstm/my-model-10000.meta") 528 | print("\n\n") 529 | 530 | 531 | # 返回预测的结果或者准确率,y not None的时候返回准确率,y ==None的时候返回预测值 532 | def pred(self, name, X, y=None, ): 533 | start_time = time.time() # compute time 534 | if y is None: 535 | with self.session as sess: 536 | # restore model 537 | new_saver = tf.train.import_meta_graph( 538 | meta_graph_or_file="./models/" + name + "/bilstm/my-model-10000.meta", 539 | clear_devices=True 540 | ) 541 | new_saver.restore(sess, "./models/" + name + "/bilstm/my-model-10000") 542 | # get default graph 543 | graph = tf.get_default_graph() 544 | # get opration from the graph 545 | pred_normal = graph.get_operation_by_name("pred_normal").outputs[0] 546 | X_p = graph.get_operation_by_name("input_placeholder").outputs[0] 547 | pred = sess.run(fetches=pred_normal, feed_dict={X_p: X}) 548 | print("this operation spends ", round((time.time() - start_time) / 60, 2), " mins") 549 | return pred 550 | else: 551 | with self.session as sess: 552 | # restore model 553 | new_saver = tf.train.import_meta_graph( 554 | meta_graph_or_file="./models/" + name + "/bilstm/my-model-10000.meta", 555 | clear_devices=True 556 | ) 557 | new_saver.restore(sess, "./models/" + name + "/bilstm/my-model-10000") 558 | graph = tf.get_default_graph() 559 | # get opration from the graph 560 | accuracy = graph.get_operation_by_name("accuracy").outputs[0] 561 | X_p = graph.get_operation_by_name("input_placeholder").outputs[0] 562 | y_p = graph.get_operation_by_name("label_placeholder").outputs[0] 563 | # forward and get the results 564 | accu = sess.run(fetches=accuracy, feed_dict={X_p: X, y_p: y}) 565 | print("this operation spends ", round((time.time() - start_time) / 60, 2), " mins") 566 | return accu 567 | 568 | def showInfo(self, type): 569 | if type == "training": 570 | # training information 571 | print(" /**Training info**/") 572 | print("----avarage training loss:", sum(self.train_losses) / len(self.train_losses)) 573 | print("PW:") 574 | print("----avarage accuracy:", sum(self.train_accus_pw) / len(self.train_accus_pw)) 575 | # print("----avarage f1-Score of N:", sum(self.c1_f_pw) / len(self.c1_f_pw)) 576 | print("----avarage f1-Score of B:", sum(self.c2_f_pw) / len(self.c2_f_pw)) 577 | print("PPH:") 578 | print("----avarage accuracy :", sum(self.train_accus_pph) / len(self.train_accus_pph)) 579 | # print("----avarage f1-Score of N:", sum(self.c1_f_pph) / len(self.c1_f_pph)) 580 | print("----avarage f1-Score of B:", sum(self.c2_f_pph) / len(self.c2_f_pph)) 581 | # print("IPH:") 582 | # print("----avarage accuracy:", sum(self.train_accus_iph) / len(self.train_accus_iph)) 583 | # print("----avarage f1-Score of N:", sum(self.c1_f_iph) / len(self.c1_f_iph)) 584 | # print("----avarage f1-Score of B:", sum(self.c2_f_iph) / len(self.c2_f_iph)) 585 | elif type == "validation": 586 | print(" /**Validation info**/") 587 | print("----avarage validation loss:", self.valid_loss) 588 | print("PW:") 589 | print("----avarage accuracy:", self.valid_accuracy_pw) 590 | # print("----avarage f1-Score of N:", self.valid_f1_pw[0]) 591 | print("----avarage f1-Score of B:", self.valid_f1_pw[1]) 592 | print("PPH:") 593 | print("----avarage accuracy :", self.valid_accuracy_pph) 594 | # print("----avarage f1-Score of N:", self.valid_f1_pph[0]) 595 | print("----avarage f1-Score of B:", self.valid_f1_pph[1]) 596 | # print("IPH:") 597 | # print("----avarage accuracy:", self.valid_accuracy_iph) 598 | # print("----avarage f1-Score of N:", self.valid_f1_1_iph) 599 | # 
print("----avarage f1-Score of B:", self.valid_f1_2_iph) 600 | else: 601 | print(" /**testation info**/") 602 | print("----avarage test loss:", self.test_loss) 603 | print("PW:") 604 | print("----avarage accuracy:", self.test_accuracy_pw) 605 | # print("----avarage f1-Score of N:", self.test_f1_pw[0]) 606 | print("----avarage f1-Score of B:", self.test_f1_pw[1]) 607 | print("PPH:") 608 | print("----avarage accuracy :", self.test_accuracy_pph) 609 | # print("----avarage f1-Score of N:", self.test_f1_pph[0]) 610 | print("----avarage f1-Score of B:", self.test_f1_pph[1]) 611 | # print("IPH:") 612 | # print("----avarage accuracy:", self.test_accuracy_iph) 613 | # print("----avarage f1-Score of N:", self.test_f1_1_iph) 614 | # print("----avarage f1-Score of B:", self.test_f1_2_iph) 615 | 616 | 617 | # train && test 618 | if __name__ == "__main__": 619 | # 读数据 620 | print("Loading Data...") 621 | X_train, y_train, len_train, pos_train, length_train, position_train, \ 622 | X_valid, y_valid, len_valid, pos_valid, length_valid, position_valid, \ 623 | X_test, y_test, len_test, pos_test, length_test, position_test=util.loadData() 624 | 625 | # print("Run Model...\n\n\n") 626 | model = BiLSTM() 627 | model.fit( 628 | X_train, y_train, len_train, pos_train, length_train, position_train, 629 | X_valid, y_valid, len_valid, pos_valid, length_valid, position_valid, 630 | X_test, y_test, len_test, pos_test, length_test, position_test, "test", False) -------------------------------------------------------------------------------- /models/bilstm_cwe.py: -------------------------------------------------------------------------------- 1 | ''' 2 | BILSTM+CBOW 3 | ''' 4 | 5 | import sys 6 | sys.path.append("..") 7 | import numpy as np 8 | import pandas as pd 9 | import tensorflow as tf 10 | import tensorflow.contrib.rnn as rnn 11 | import time 12 | import os 13 | import parameter 14 | import util 15 | 16 | class BiLSTM_CWE(): 17 | def __init__(self): 18 | # basic environment 19 | self.graph = tf.Graph() 20 | self.session = tf.Session(graph=self.graph) 21 | 22 | # basic parameters 23 | self.learning_rate = parameter.LEARNING_RATE 24 | self.max_epoch = parameter.MAX_EPOCH 25 | 26 | self.class_num = parameter.CLASS_NUM 27 | self.pos_num=parameter.POS_NUM 28 | self.length_num=parameter.LENGTH_NUM 29 | self.hidden_units_num = parameter.HIDDEN_UNITS_NUM 30 | self.hidden_units_num2 = parameter.HIDDEN_UNITS_NUM2 31 | self.layer_num = parameter.LAYER_NUM 32 | self.max_sentence_size = parameter.MAX_SENTENCE_SIZE 33 | 34 | #self.vocab_size = parameter.VOCAB_SIZE 35 | self.word_vocab_size=parameter.WORD_VOCAB_SIZE 36 | self.char_embedding_size = parameter.CHAR_EMBEDDING_SIZE 37 | self.word_embedding_size=parameter.WORD_EMBEDDING_SIZE 38 | 39 | self.batch_size = parameter.BATCH_SIZE 40 | self.lambda_pw=parameter.LAMBDA_PW 41 | self.lambda_pph=parameter.LAMBDA_PPH 42 | self.lambda_iph=parameter.LAMBDA_IPH 43 | 44 | self.keep_prob = parameter.KEEP_PROB 45 | self.input_keep_prob=parameter.INPUT_KEEP_PROB 46 | self.output_keep_prob=parameter.OUTPUT_KEEP_PROB 47 | 48 | self.decay_rate=parameter.DECAY 49 | 50 | 51 | # forward process and training process 52 | def fit(self, X_train, y_train, len_train,pos_train,length_train,position_train, 53 | X_validation, y_validation, len_validation, pos_validation,length_validation,position_validation, 54 | name, print_log=True): 55 | # ---------------------------------------forward computation--------------------------------------------# 56 | y_train_pw = y_train[0] 57 | y_train_pph = 
y_train[1] 58 | #y_train_iph = y_train[2] 59 | 60 | y_validation_pw = y_validation[0] 61 | y_validation_pph = y_validation[1] 62 | #y_validation_iph = y_validation[2] 63 | # ---------------------------------------define graph---------------------------------------------# 64 | with self.graph.as_default(): 65 | # data place holder 66 | self.X_p = tf.placeholder( 67 | dtype=tf.int32, 68 | shape=(None, self.max_sentence_size), 69 | name="input_placeholder" 70 | ) 71 | 72 | # pos info placeholder 73 | self.pos_p = tf.placeholder( 74 | dtype=tf.int32, 75 | shape=(None, self.max_sentence_size), 76 | name="pos_placeholder" 77 | ) 78 | 79 | # length info placeholder 80 | self.length_p = tf.placeholder( 81 | dtype=tf.int32, 82 | shape=(None, self.max_sentence_size), 83 | name="length_placeholder" 84 | ) 85 | 86 | # position info placeholder 87 | self.position_p = tf.placeholder( 88 | dtype=tf.int32, 89 | shape=(None, self.max_sentence_size), 90 | name="length_placeholder" 91 | ) 92 | 93 | self.y_p_pw = tf.placeholder( 94 | dtype=tf.int32, 95 | shape=(None, self.max_sentence_size), 96 | name="label_placeholder_pw" 97 | ) 98 | self.y_p_pph = tf.placeholder( 99 | dtype=tf.int32, 100 | shape=(None, self.max_sentence_size), 101 | name="label_placeholder_pph" 102 | ) 103 | 104 | #self.y_p_iph = tf.placeholder( 105 | # dtype=tf.int32, 106 | # shape=(None, self.max_sentence_size), 107 | # name="label_placeholder_iph" 108 | #) 109 | # dropout 占位 110 | self.keep_prob_p = tf.placeholder(dtype=tf.float32, shape=[], name="keep_prob_p") 111 | self.input_keep_prob_p = tf.placeholder(dtype=tf.float32, shape=[], name="input_keep_prob_p") 112 | self.output_keep_prob_p=tf.placeholder(dtype=tf.float32, shape=[], name="output_keep_prob_p") 113 | 114 | # 相应序列的长度占位 115 | self.seq_len_p = tf.placeholder( 116 | dtype=tf.int32, 117 | shape=(None,), 118 | name="seq_len" 119 | ) 120 | 121 | #用来去掉padding的mask 122 | self.mask = tf.sequence_mask( 123 | lengths=self.seq_len_p, 124 | maxlen=self.max_sentence_size, 125 | name="mask" 126 | ) 127 | 128 | #去掉padding之后的labels 129 | y_p_pw_masked = tf.boolean_mask( #shape[seq_len1+seq_len2+....+,] 130 | tensor=self.y_p_pw, 131 | mask=self.mask, 132 | name="y_p_pw_masked" 133 | ) 134 | 135 | y_p_pph_masked = tf.boolean_mask( # shape[seq_len1+seq_len2+....+,] 136 | tensor=self.y_p_pph, 137 | mask=self.mask, 138 | name="y_p_pph_masked" 139 | ) 140 | 141 | #y_p_iph_masked = tf.boolean_mask( # shape[seq_len1+seq_len2+....+,] 142 | # tensor=self.y_p_iph, 143 | # mask=self.mask, 144 | # name="y_p_iph_masked" 145 | #) 146 | 147 | # embeddings 148 | #self.embeddings = tf.Variable( 149 | # initial_value=tf.zeros(shape=(self.vocab_size, self.embedding_size), dtype=tf.float32), 150 | # name="embeddings" 151 | #) 152 | 153 | self.word_embeddings=tf.Variable( 154 | initial_value=util.getCWE( 155 | word_embed_file="../data/embeddings/word_vec.txt", 156 | char_embed_file="../data/embeddings/char_vec.txt" 157 | ), 158 | name="word_embeddings" 159 | ) 160 | 161 | print("word_embeddings.shape",self.word_embeddings.shape) 162 | 163 | # pos one-hot 164 | self.pos_one_hot = tf.one_hot( 165 | indices=self.pos_p, 166 | depth=self.pos_num, 167 | name="pos_one_hot" 168 | ) 169 | print("shape of pos_one_hot:", self.pos_one_hot.shape) 170 | 171 | # length one-hot 172 | self.length_one_hot = tf.one_hot( 173 | indices=self.length_p, 174 | depth=self.length_num, 175 | name="pos_one_hot" 176 | ) 177 | print("shape of length_one_hot:", self.length_one_hot.shape) 178 | 179 | # position one-hot 180 | 
self.position_one_hot = tf.one_hot( 181 | indices=self.position_p, 182 | depth=self.max_sentence_size, 183 | name="pos_one_hot" 184 | ) 185 | print("shape of position_one_hot:", self.position_one_hot.shape) 186 | 187 | # -------------------------------------PW----------------------------------------------------- 188 | # embeded inputs:[batch_size,MAX_TIME_STPES,embedding_size] 189 | inputs_pw = tf.nn.embedding_lookup(params=self.word_embeddings, ids=self.X_p, name="embeded_input_pw") 190 | print("shape of inputs_pw:",inputs_pw.shape) 191 | #concat all information 192 | inputs_pw = tf.concat( 193 | values=[inputs_pw, self.pos_one_hot, self.length_one_hot, self.position_one_hot], 194 | axis=2, 195 | name="input_pw" 196 | ) 197 | print("shape of cancated inputs_pw:", inputs_pw.shape) 198 | 199 | # forward part 200 | en_lstm_forward1_pw = rnn.BasicLSTMCell(num_units=self.hidden_units_num) 201 | en_lstm_forward2_pw=rnn.BasicLSTMCell(num_units=self.hidden_units_num2) 202 | en_lstm_forward_pw=rnn.MultiRNNCell(cells=[en_lstm_forward1_pw,en_lstm_forward2_pw]) 203 | #dropout 204 | en_lstm_forward_pw=rnn.DropoutWrapper( 205 | cell=en_lstm_forward_pw, 206 | input_keep_prob=self.input_keep_prob_p, 207 | output_keep_prob=self.output_keep_prob_p 208 | ) 209 | 210 | # backward part 211 | en_lstm_backward1_pw = rnn.BasicLSTMCell(num_units=self.hidden_units_num) 212 | en_lstm_backward2_pw=rnn.BasicLSTMCell(num_units=self.hidden_units_num2) 213 | en_lstm_backward_pw=rnn.MultiRNNCell(cells=[en_lstm_backward1_pw,en_lstm_backward2_pw]) 214 | #dropout 215 | en_lstm_backward_pw=rnn.DropoutWrapper( 216 | cell=en_lstm_backward_pw, 217 | input_keep_prob=self.input_keep_prob_p, 218 | output_keep_prob=self.output_keep_prob_p 219 | ) 220 | 221 | outputs, states = tf.nn.bidirectional_dynamic_rnn( 222 | cell_fw=en_lstm_forward_pw, 223 | cell_bw=en_lstm_backward_pw, 224 | inputs=inputs_pw, 225 | sequence_length=self.seq_len_p, 226 | dtype=tf.float32, 227 | scope="pw" 228 | ) 229 | 230 | outputs_forward_pw = outputs[0] # shape [batch_size, max_time, cell_fw.output_size] 231 | outputs_backward_pw = outputs[1] # shape [batch_size, max_time, cell_bw.output_size] 232 | # concat final outputs [batch_size, max_time, cell_fw.output_size*2] 233 | h_pw = tf.concat(values=[outputs_forward_pw, outputs_backward_pw], axis=2) 234 | h_pw=tf.reshape(tensor=h_pw,shape=(-1,self.hidden_units_num*2),name="h_pw") 235 | print("h_pw.shape",h_pw.shape) 236 | 237 | # 全连接dropout 238 | h_pw = tf.nn.dropout(x=h_pw, keep_prob=self.keep_prob_p, name="dropout_h_pw") 239 | 240 | # fully connect layer(projection) 241 | w_pw = tf.Variable( 242 | initial_value=tf.random_normal(shape=(self.hidden_units_num*2, self.class_num)), 243 | name="weights_pw" 244 | ) 245 | b_pw = tf.Variable( 246 | initial_value=tf.random_normal(shape=(self.class_num,)), 247 | name="bias_pw" 248 | ) 249 | #logits 250 | logits_pw = tf.matmul(h_pw, w_pw) + b_pw #logits_pw:[batch_size*max_time, 2] 251 | logits_normal_pw=tf.reshape( #logits in an normal way:[batch_size,max_time_stpes,2] 252 | tensor=logits_pw, 253 | shape=(-1,self.max_sentence_size,self.class_num), 254 | name="logits_normal_pw" 255 | ) 256 | logits_pw_masked = tf.boolean_mask( # logits_pw_masked [seq_len1+seq_len2+....+,3] 257 | tensor=logits_normal_pw, 258 | mask=self.mask, 259 | name="logits_pw_masked" 260 | ) 261 | 262 | # prediction 263 | pred_pw = tf.cast(tf.argmax(logits_pw, 1), tf.int32, name="pred_pw") # pred_pw:[batch_size*max_time,] 264 | pred_normal_pw = tf.reshape( # pred in an normal way,[batch_size, 
max_time] 265 | tensor=pred_pw, 266 | shape=(-1, self.max_sentence_size), 267 | name="pred_normal_pw" 268 | ) 269 | 270 | pred_pw_masked = tf.boolean_mask( # logits_pw_masked [seq_len1+seq_len2+....+,] 271 | tensor=pred_normal_pw, 272 | mask=self.mask, 273 | name="pred_pw_masked" 274 | ) 275 | 276 | pred_normal_one_hot_pw = tf.one_hot( # one-hot the pred_normal:[batch_size, max_time,class_num] 277 | indices=pred_normal_pw, 278 | depth=self.class_num, 279 | name="pred_normal_one_hot_pw" 280 | ) 281 | 282 | # loss 283 | self.loss_pw = tf.losses.sparse_softmax_cross_entropy( 284 | labels=y_p_pw_masked, 285 | logits=logits_pw_masked 286 | )+tf.contrib.layers.l2_regularizer(self.lambda_pw)(w_pw) 287 | # --------------------------------------------------------------------------------------- 288 | 289 | # ----------------------------------PPH-------------------------------------------------- 290 | # embeded inputs:[batch_size,MAX_TIME_STPES,embedding_size] 291 | inputs_pph = tf.nn.embedding_lookup(params=self.word_embeddings, ids=self.X_p, name="embeded_input_pph") 292 | print("shape of input_pph:", inputs_pph.shape) 293 | # concat all information 294 | inputs_pph = tf.concat( 295 | values=[inputs_pph, self.pos_one_hot, self.length_one_hot, self.position_one_hot, 296 | pred_normal_one_hot_pw], 297 | axis=2, 298 | name="inputs_pph" 299 | ) 300 | print("shape of input_pph:", inputs_pph.shape) 301 | 302 | # forward part 303 | en_lstm_forward1_pph = rnn.BasicLSTMCell(num_units=self.hidden_units_num) 304 | en_lstm_forward2_pph = rnn.BasicLSTMCell(num_units=self.hidden_units_num2) 305 | en_lstm_forward_pph = rnn.MultiRNNCell(cells=[en_lstm_forward1_pph, en_lstm_forward2_pph]) 306 | #dropout 307 | en_lstm_forward_pph=rnn.DropoutWrapper( 308 | cell=en_lstm_forward_pph, 309 | input_keep_prob=self.input_keep_prob_p, 310 | output_keep_prob=self.output_keep_prob_p 311 | ) 312 | 313 | # backward part 314 | en_lstm_backward1_pph = rnn.BasicLSTMCell(num_units=self.hidden_units_num) 315 | en_lstm_backward2_pph = rnn.BasicLSTMCell(num_units=self.hidden_units_num2) 316 | en_lstm_backward_pph = rnn.MultiRNNCell(cells=[en_lstm_backward1_pph, en_lstm_backward2_pph]) 317 | #dropout 318 | en_lstm_backward_pph=rnn.DropoutWrapper( 319 | cell=en_lstm_backward_pph, 320 | input_keep_prob=self.input_keep_prob_p, 321 | output_keep_prob=self.output_keep_prob_p 322 | ) 323 | 324 | outputs, states = tf.nn.bidirectional_dynamic_rnn( 325 | cell_fw=en_lstm_forward_pph, 326 | cell_bw=en_lstm_backward_pph, 327 | inputs=inputs_pph, 328 | sequence_length=self.seq_len_p, 329 | dtype=tf.float32, 330 | scope="pph" 331 | ) 332 | 333 | outputs_forward_pph = outputs[0] # shape [batch_size, max_time, cell_fw.output_size] 334 | outputs_backward_pph = outputs[1] # shape [batch_size, max_time, cell_bw.output_size] 335 | # concat final outputs [batch_size, max_time, cell_fw.output_size*2] 336 | h_pph = tf.concat(values=[outputs_forward_pph, outputs_backward_pph], axis=2) 337 | h_pph = tf.reshape(tensor=h_pph, shape=(-1, self.hidden_units_num * 2), name="h_pph") 338 | 339 | # 全连接dropout 340 | h_pph = tf.nn.dropout(x=h_pph, keep_prob=self.keep_prob_p, name="dropout_h_pph") 341 | 342 | # fully connect layer(projection) 343 | w_pph = tf.Variable( 344 | initial_value=tf.random_normal(shape=(self.hidden_units_num*2, self.class_num)), 345 | name="weights_pph" 346 | ) 347 | b_pph = tf.Variable( 348 | initial_value=tf.random_normal(shape=(self.class_num,)), 349 | name="bias_pph" 350 | ) 351 | # logits 352 | logits_pph = tf.matmul(h_pph, w_pph) + b_pph # 
shape of logits:[batch_size*max_time, 2] 353 | logits_normal_pph = tf.reshape( # logits in an normal way:[batch_size,max_time_stpes,2] 354 | tensor=logits_pph, 355 | shape=(-1, self.max_sentence_size, self.class_num), 356 | name="logits_normal_pph" 357 | ) 358 | logits_pph_masked = tf.boolean_mask( # [seq_len1+seq_len2+....+,3] 359 | tensor=logits_normal_pph, 360 | mask=self.mask, 361 | name="logits_pph_masked" 362 | ) 363 | 364 | # prediction 365 | pred_pph = tf.cast(tf.argmax(logits_pph, 1), tf.int32, name="pred_pph") # pred_pph:[batch_size*max_time,] 366 | pred_normal_pph = tf.reshape( # pred in an normal way,[batch_size, max_time] 367 | tensor=pred_pph, 368 | shape=(-1, self.max_sentence_size), 369 | name="pred_normal_pph" 370 | ) 371 | pred_pph_masked = tf.boolean_mask( # logits_pph_masked [seq_len1+seq_len2+....+,] 372 | tensor=pred_normal_pph, 373 | mask=self.mask, 374 | name="pred_pph_masked" 375 | ) 376 | pred_normal_one_hot_pph = tf.one_hot( # one-hot the pred_normal:[batch_size, max_time,class_num] 377 | indices=pred_normal_pph, 378 | depth=self.class_num, 379 | name="pred_normal_one_hot_pph" 380 | ) 381 | 382 | # loss 383 | self.loss_pph = tf.losses.sparse_softmax_cross_entropy( 384 | labels=y_p_pph_masked, 385 | logits=logits_pph_masked 386 | )+tf.contrib.layers.l2_regularizer(self.lambda_pph)(w_pph) 387 | # ------------------------------------------------------------------------------------ 388 | 389 | ''' 390 | # ---------------------------------------IPH------------------------------------------ 391 | # embeded inputs:[batch_size,MAX_TIME_STPES,embedding_size] 392 | inputs_iph = tf.nn.embedding_lookup(params=self.embeddings, ids=self.X_p, name="embeded_input_iph") 393 | # shape of inputs[batch_size,max_time_stpes,embeddings_dims+class_num] 394 | inputs_iph = tf.concat(values=[inputs_iph, pred_normal_one_hot_pph], axis=2, name="inputs_pph") 395 | # print("shape of input_pph:", inputs_pph.shape) 396 | # encoder cells 397 | # forward part 398 | en_lstm_forward1_iph = rnn.BasicLSTMCell(num_units=self.hidden_units_num) 399 | # en_lstm_forward2=rnn.BasicLSTMCell(num_units=self.hidden_units_num2) 400 | # en_lstm_forward=rnn.MultiRNNCell(cells=[en_lstm_forward1,en_lstm_forward2]) 401 | 402 | # backward part 403 | en_lstm_backward1_iph = rnn.BasicLSTMCell(num_units=self.hidden_units_num) 404 | # en_lstm_backward2=rnn.BasicLSTMCell(num_units=self.hidden_units_num2) 405 | # en_lstm_backward=rnn.MultiRNNCell(cells=[en_lstm_backward1,en_lstm_backward2]) 406 | 407 | # decoder cells 408 | de_lstm_iph = rnn.BasicLSTMCell(num_units=self.hidden_units_num*2) 409 | 410 | # encode 411 | encoder_outputs_iph, encoder_states_iph = self.encoder( 412 | cell_forward=en_lstm_forward1_iph, 413 | cell_backward=en_lstm_backward1_iph, 414 | inputs=inputs_iph, 415 | seq_length=self.seq_len_p, 416 | scope_name="en_lstm_iph" 417 | ) 418 | # shape of h is [batch*time_steps,hidden_units*2] 419 | h_iph = self.decoder( 420 | cell=de_lstm_iph, 421 | initial_state=encoder_states_iph, 422 | inputs=encoder_outputs_iph, 423 | scope_name="de_lstm_iph" 424 | ) 425 | 426 | # fully connect layer(projection) 427 | w_iph = tf.Variable( 428 | initial_value=tf.random_normal(shape=(self.hidden_units_num*2, self.class_num)), 429 | name="weights_iph" 430 | ) 431 | b_iph = tf.Variable( 432 | initial_value=tf.random_normal(shape=(self.class_num,)), 433 | name="bias_iph" 434 | ) 435 | # logits 436 | logits_iph = tf.matmul(h_iph, w_iph) + b_iph # shape of logits:[batch_size*max_time, 3] 437 | logits_normal_iph = tf.reshape( # 
logits in an normal way:[batch_size,max_time_stpes,3] 438 | tensor=logits_iph, 439 | shape=(-1, self.max_sentence_size, 3), 440 | name="logits_normal_iph" 441 | ) 442 | logits_iph_masked = tf.boolean_mask( # [seq_len1+seq_len2+....+,3] 443 | tensor=logits_normal_iph, 444 | mask=self.mask, 445 | name="logits_iph_masked" 446 | ) 447 | 448 | # prediction 449 | pred_iph = tf.cast(tf.argmax(logits_iph, 1), tf.int32, name="pred_iph") # pred_iph:[batch_size*max_time,] 450 | pred_normal_iph = tf.reshape( # pred in an normal way,[batch_size, max_time] 451 | tensor=pred_iph, 452 | shape=(-1, self.max_sentence_size), 453 | name="pred_normal_iph" 454 | ) 455 | pred_iph_masked = tf.boolean_mask( # logits_iph_masked [seq_len1+seq_len2+....+,] 456 | tensor=pred_normal_iph, 457 | mask=self.mask, 458 | name="pred_iph_masked" 459 | ) 460 | pred_normal_one_hot_iph = tf.one_hot( # one-hot the pred_normal:[batch_size, max_time,class_num] 461 | indices=pred_normal_iph, 462 | depth=self.class_num, 463 | name="pred_normal_one_hot_iph" 464 | ) 465 | # loss 466 | self.loss_iph = tf.losses.sparse_softmax_cross_entropy( 467 | labels=y_p_iph_masked, 468 | logits=logits_iph_masked 469 | )+tf.contrib.layers.l2_regularizer(self.lambda_iph)(w_iph) 470 | 471 | # --------------------------------------------------------------------------------------- 472 | ''' 473 | # adjust learning rate 474 | global_step = tf.Variable(initial_value=1, trainable=False) 475 | start_learning_rate = self.learning_rate 476 | learning_rate = tf.train.exponential_decay( 477 | learning_rate=start_learning_rate, 478 | global_step=global_step, 479 | decay_steps=(X_train.shape[0] // self.batch_size) + 1, 480 | decay_rate=self.decay_rate, 481 | staircase=True, 482 | name="decay_learning_rate" 483 | ) 484 | 485 | # loss 486 | self.loss = self.loss_pw + self.loss_pph 487 | 488 | 489 | # optimizer 490 | self.optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(self.loss,global_step) 491 | self.init_op = tf.global_variables_initializer() 492 | self.init_local_op = tf.local_variables_initializer() 493 | 494 | # ------------------------------------Session----------------------------------------- 495 | with self.session as sess: 496 | print("Training Start") 497 | sess.run(self.init_op) # initialize all variables 498 | sess.run(self.init_local_op) 499 | 500 | train_Size = X_train.shape[0]; 501 | validation_Size = X_validation.shape[0] 502 | self.best_validation_loss = 1000 # best validation accuracy in training process 503 | 504 | # epoch 505 | for epoch in range(1, self.max_epoch + 1): 506 | print("Epoch:", epoch) 507 | start_time = time.time() # time evaluation 508 | # training loss/accuracy in every mini-batch 509 | self.train_losses = [] 510 | self.train_accus_pw = [] 511 | self.train_accus_pph = [] 512 | #self.train_accus_iph = [] 513 | 514 | self.c1_f_pw = []; 515 | self.c2_f_pw = [] # each class's f1 score 516 | self.c1_f_pph = []; 517 | self.c2_f_pph = [] 518 | #self.c1_f_iph = []; 519 | #self.c2_f_iph = [] 520 | 521 | lrs = [] 522 | 523 | # mini batch 524 | for i in range(0, (train_Size // self.batch_size)): 525 | #注意:这里获取的都是mask之后的值 526 | _, train_loss, y_train_pw_masked,y_train_pph_masked,\ 527 | train_pred_pw, train_pred_pph,lr = sess.run( 528 | fetches=[self.optimizer, self.loss, 529 | y_p_pw_masked,y_p_pph_masked, 530 | pred_pw_masked, pred_pph_masked,learning_rate], 531 | feed_dict={ 532 | self.X_p: X_train[i * self.batch_size:(i + 1) * self.batch_size], 533 | self.y_p_pw: y_train_pw[i * self.batch_size:(i + 1) * 
self.batch_size], 534 | self.y_p_pph: y_train_pph[i * self.batch_size:(i + 1) * self.batch_size], 535 | self.seq_len_p: len_train[i * self.batch_size:(i + 1) * self.batch_size], 536 | self.pos_p: pos_train[i * self.batch_size:(i + 1) * self.batch_size], 537 | self.length_p: length_train[i * self.batch_size:(i + 1) * self.batch_size], 538 | self.position_p: position_train[i * self.batch_size:(i + 1) * self.batch_size], 539 | self.keep_prob_p: self.keep_prob, 540 | self.input_keep_prob_p:self.input_keep_prob, 541 | self.output_keep_prob_p:self.output_keep_prob 542 | } 543 | ) 544 | lrs.append(lr) 545 | 546 | # loss 547 | self.train_losses.append(train_loss) 548 | 549 | # metrics 550 | accuracy_pw, f1_pw= util.eval(y_true=y_train_pw_masked,y_pred=train_pred_pw) # pw 551 | accuracy_pph, f1_pph= util.eval(y_true=y_train_pph_masked,y_pred=train_pred_pph) # pph 552 | #accuracy_iph, f1_1_iph, f1_2_iph = util.eval(y_true=y_train_iph_masked,y_pred=train_pred_iph) # iph 553 | 554 | self.train_accus_pw.append(accuracy_pw) 555 | self.train_accus_pph.append(accuracy_pph) 556 | #self.train_accus_iph.append(accuracy_iph) 557 | # F1-score 558 | self.c1_f_pw.append(f1_pw[0]); 559 | self.c2_f_pw.append(f1_pw[1]) 560 | self.c1_f_pph.append(f1_pph[0]); 561 | self.c2_f_pph.append(f1_pph[1]) 562 | #self.c1_f_iph.append(f1_1_iph); 563 | #self.c2_f_iph.append(f1_2_iph) 564 | 565 | print("learning rate:", sum(lrs) / len(lrs)) 566 | # validation in every epoch 567 | self.validation_loss, y_valid_pw_masked,y_valid_pph_masked,\ 568 | valid_pred_pw, valid_pred_pph = sess.run( 569 | fetches=[self.loss, y_p_pw_masked,y_p_pph_masked, 570 | pred_pw_masked, pred_pph_masked], 571 | feed_dict={ 572 | self.X_p: X_validation, 573 | self.y_p_pw: y_validation_pw, 574 | self.y_p_pph: y_validation_pph, 575 | self.seq_len_p: len_validation, 576 | self.pos_p: pos_validation, 577 | self.length_p: length_validation, 578 | self.position_p: position_validation, 579 | self.keep_prob_p: 1.0, 580 | self.input_keep_prob_p:1.0, 581 | self.output_keep_prob_p:1.0 582 | } 583 | ) 584 | # print("valid_pred_pw.shape:",valid_pred_pw.shape) 585 | # print("valid_pred_pph.shape:",valid_pred_pph.shape) 586 | # print("valid_pred_iph.shape:",valid_pred_iph.shape) 587 | 588 | # metrics 589 | self.valid_accuracy_pw, self.valid_f1_pw = util.eval(y_true=y_valid_pw_masked,y_pred=valid_pred_pw) 590 | self.valid_accuracy_pph, self.valid_f1_pph = util.eval(y_true=y_valid_pph_masked,y_pred=valid_pred_pph) 591 | 592 | #self.valid_accuracy_iph, self.valid_f1_1_iph, self.valid_f1_2_iph = util.eval(y_true=y_valid_iph_masked,y_pred=valid_pred_iph) 593 | print("Epoch ", epoch, " finished.", "spent ", round((time.time() - start_time) / 60, 2), " mins") 594 | self.showInfo(type="training") 595 | self.showInfo(type="validation") 596 | 597 | 598 | # when we get a new best (lowest) validation loss, we store the model 599 | if self.validation_loss < self.best_validation_loss: 600 | self.best_validation_loss = self.validation_loss 601 | print("New Best loss ", self.best_validation_loss, " On Validation set! ") 602 | print("Saving Models......\n\n") 603 | # exist ./models folder? 
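# A more compact alternative (just a sketch, not the code used in this repo): on Python 3,
# os.makedirs can create the whole nested checkpoint directory in a single call, e.g.
#     os.makedirs("./models/" + name + "/bilstm", exist_ok=True)
# The chained os.path.exists / os.mkdir checks below build the same path step by step.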
604 | if not os.path.exists("./models/"): 605 | os.mkdir(path="./models/") 606 | if not os.path.exists("./models/" + name): 607 | os.mkdir(path="./models/" + name) 608 | if not os.path.exists("./models/" + name + "/bilstm"): 609 | os.mkdir(path="./models/" + name + "/bilstm") 610 | # create saver 611 | saver = tf.train.Saver() 612 | saver.save(sess, "./models/" + name + "/bilstm/my-model-10000") 613 | # Generates MetaGraphDef. 614 | saver.export_meta_graph("./models/" + name + "/bilstm/my-model-10000.meta") 615 | print("\n\n") 616 | 617 | # test:using X_validation_pw 618 | test_pred_pw, test_pred_pph = sess.run( 619 | fetches=[pred_pw, pred_pph], 620 | feed_dict={ 621 | self.X_p: X_validation, 622 | self.seq_len_p: len_validation, 623 | self.pos_p: pos_validation, 624 | self.length_p: length_validation, 625 | self.position_p: position_validation, 626 | self.keep_prob_p: 1.0, 627 | self.input_keep_prob_p:1.0, 628 | self.output_keep_prob_p:1.0 629 | } 630 | ) 631 | 632 | # recover to original corpus txt 633 | # shape of valid_pred_pw,valid_pred_pw,valid_pred_pw:[corpus_size*time_stpes] 634 | util.recover2( 635 | X=X_validation, 636 | preds_pw=test_pred_pw, 637 | preds_pph=test_pred_pph, 638 | filename="../result/bilstm_cwe/recover_epoch_" + str(epoch) + ".txt" 639 | ) 640 | 641 | # 返回预测的结果或者准确率,y not None的时候返回准确率,y ==None的时候返回预测值 642 | def pred(self, name, X, y=None, ): 643 | start_time = time.time() # compute time 644 | if y is None: 645 | with self.session as sess: 646 | # restore model 647 | new_saver = tf.train.import_meta_graph( 648 | meta_graph_or_file="./models/" + name + "/bilstm/my-model-10000.meta", 649 | clear_devices=True 650 | ) 651 | new_saver.restore(sess, "./models/" + name + "/bilstm/my-model-10000") 652 | # get default graph 653 | graph = tf.get_default_graph() 654 | # get opration from the graph 655 | pred_normal = graph.get_operation_by_name("pred_normal").outputs[0] 656 | X_p = graph.get_operation_by_name("input_placeholder").outputs[0] 657 | pred = sess.run(fetches=pred_normal, feed_dict={X_p: X}) 658 | print("this operation spends ", round((time.time() - start_time) / 60, 2), " mins") 659 | return pred 660 | else: 661 | with self.session as sess: 662 | # restore model 663 | new_saver = tf.train.import_meta_graph( 664 | meta_graph_or_file="./models/" + name + "/bilstm/my-model-10000.meta", 665 | clear_devices=True 666 | ) 667 | new_saver.restore(sess, "./models/" + name + "/bilstm/my-model-10000") 668 | graph = tf.get_default_graph() 669 | # get opration from the graph 670 | accuracy = graph.get_operation_by_name("accuracy").outputs[0] 671 | X_p = graph.get_operation_by_name("input_placeholder").outputs[0] 672 | y_p = graph.get_operation_by_name("label_placeholder").outputs[0] 673 | # forward and get the results 674 | accu = sess.run(fetches=accuracy, feed_dict={X_p: X, y_p: y}) 675 | print("this operation spends ", round((time.time() - start_time) / 60, 2), " mins") 676 | return accu 677 | 678 | 679 | def showInfo(self, type): 680 | if type == "training": 681 | # training information 682 | print(" /**Training info**/") 683 | print("----avarage training loss:", sum(self.train_losses) / len(self.train_losses)) 684 | print("PW:") 685 | print("----avarage accuracy:", sum(self.train_accus_pw) / len(self.train_accus_pw)) 686 | #print("----avarage f1-Score of N:", sum(self.c1_f_pw) / len(self.c1_f_pw)) 687 | print("----avarage f1-Score of B:", sum(self.c2_f_pw) / len(self.c2_f_pw)) 688 | print("PPH:") 689 | print("----avarage accuracy :", sum(self.train_accus_pph) / 
len(self.train_accus_pph)) 690 | #print("----avarage f1-Score of N:", sum(self.c1_f_pph) / len(self.c1_f_pph)) 691 | print("----avarage f1-Score of B:", sum(self.c2_f_pph) / len(self.c2_f_pph)) 692 | #print("IPH:") 693 | #print("----avarage accuracy:", sum(self.train_accus_iph) / len(self.train_accus_iph)) 694 | #print("----avarage f1-Score of N:", sum(self.c1_f_iph) / len(self.c1_f_iph)) 695 | #print("----avarage f1-Score of B:", sum(self.c2_f_iph) / len(self.c2_f_iph)) 696 | else: 697 | print(" /**Validation info**/") 698 | print("----avarage validation loss:", self.validation_loss) 699 | print("PW:") 700 | print("----avarage accuracy:", self.valid_accuracy_pw) 701 | #print("----avarage precision of N:", self.valid_precision_1_pw) 702 | #print("----avarage recall of N:", self.valid_recall_1_pw) 703 | #print("----avarage f1-Score of N:", self.valid_f1_1_pw) 704 | #print("----avarage precision of B:", self.valid_precision_2_pw) 705 | #print("----avarage recall of B:", self.valid_recall_2_pw) 706 | print("----avarage f1-Score of B:", self.valid_f1_pw[0]) 707 | print("PPH:") 708 | print("----avarage accuracy :", self.valid_accuracy_pph) 709 | #print("----avarage precision of N:", self.valid_precision_1_pph) 710 | #print("----avarage recall of N:", self.valid_recall_1_pph) 711 | #print("----avarage f1-Score of N:", self.valid_f1_1_pph) 712 | #print("----avarage precision of B:", self.valid_precision_2_pph) 713 | #print("----avarage recall of B:", self.valid_recall_2_pph) 714 | print("----avarage f1-Score of B:", self.valid_f1_pph[1]) 715 | #print("----avarage f1-Score of N:", self.valid_f1_1_pph) 716 | #print("----avarage f1-Score of B:", self.valid_f1_2_pph) 717 | #print("IPH:") 718 | #print("----avarage accuracy:", self.valid_accuracy_iph) 719 | #print("----avarage f1-Score of N:", self.valid_f1_1_iph) 720 | #print("----avarage f1-Score of B:", self.valid_f1_2_iph) 721 | 722 | 723 | # train && test 724 | if __name__ == "__main__": 725 | # 读数据 726 | # pw 727 | df_train_pw = pd.read_pickle(path="../data/dataset/pw_summary_train.pkl") 728 | df_validation_pw = pd.read_pickle(path="../data/dataset/pw_summary_validation.pkl") 729 | # pph 730 | df_train_pph = pd.read_pickle(path="../data/dataset/pph_summary_train.pkl") 731 | df_validation_pph = pd.read_pickle(path="../data/dataset/pph_summary_validation.pkl") 732 | 733 | # iph 734 | #df_train_iph = pd.read_pickle(path="./dataset/temptest/iph_summary_train.pkl") 735 | #df_validation_iph = pd.read_pickle(path="./dataset/temptest/iph_summary_validation.pkl") 736 | 737 | # 实际上,X里面的内容都是一样的,所以这里统一使用pw的X来作为所有的X 738 | # 但是标签是不一样的,所以需要每个都要具体定义 739 | X_train = np.asarray(list(df_train_pw['X'].values)) 740 | X_validation = np.asarray(list(df_validation_pw['X'].values)) 741 | #print("X_train:\n",X_train) 742 | #print("X_train.shape\n",X_train.shape) 743 | #print("X_validation:\n",X_validation) 744 | #print("X_validation.shape:\n",X_validation.shape) 745 | 746 | # tags 747 | y_train_pw = np.asarray(list(df_train_pw['y'].values)) 748 | y_validation_pw = np.asarray(list(df_validation_pw['y'].values)) 749 | 750 | #print("y_train_pw:",y_train_pw) 751 | #print("y_validation_pw:",y_validation_pw) 752 | 753 | y_train_pph = np.asarray(list(df_train_pph['y'].values)) 754 | y_validation_pph = np.asarray(list(df_validation_pph['y'].values)) 755 | 756 | #y_train_iph = np.asarray(list(df_train_iph['y'].values)) 757 | #y_validation_iph = np.asarray(list(df_validation_iph['y'].values)) 758 | 759 | # length每一行序列的长度 760 | # 因为都一样,所以统一使用pw的 761 | len_train = 
np.asarray(list(df_train_pw['sentence_len'].values)) 762 | len_validation = np.asarray(list(df_validation_pw['sentence_len'].values)) 763 | #print("len_train:", len_train.shape) 764 | #print("len_validation:", len_validation.shape) 765 | 766 | y_train = [y_train_pw, y_train_pph] 767 | y_validation = [y_validation_pw, y_validation_pph] 768 | #print("y_train_pw:\n", y_train_pw); 769 | #print(y_train_pw.shape) 770 | #print("y_train_pph:\n", y_train_pph); 771 | #print(y_train_pph.shape) 772 | # print("y_train_iph:\n", y_train_iph); 773 | # print(y_train_iph.shape) 774 | 775 | #-----------------------------------Extra Info--------------------------------------------- 776 | #pos 777 | pos_train = util.readExtraInfo(file="../data/dataset/pos_train_tag.txt") 778 | pos_validation = util.readExtraInfo(file="../data/dataset/pos_test_tag.txt") 779 | 780 | # length 781 | length_train = util.readExtraInfo(file="../data/dataset/length_train_tag.txt") 782 | length_validation = util.readExtraInfo(file="../data/dataset/length_test_tag.txt") 783 | # print("shape of length_train:",length_train.shape) 784 | # print("shape of length_test:",length_validation.shape) 785 | 786 | # position 787 | position_train = util.readExtraInfo(file="../data/dataset/position_train_tag.txt") 788 | position_validation = util.readExtraInfo(file="../data/dataset/position_test_tag.txt") 789 | #print("shape of position_train:", position_train.shape) 790 | #print("shape of positon_test:", position_validation.shape) 791 | # accum 792 | accum_train = util.readExtraInfo(file="../data/dataset/accum_train_tag.txt") 793 | accum_validation = util.readExtraInfo(file="../data/dataset/accum_test_tag.txt") 794 | #print("shape of accum_train:", accum_train.shape) 795 | #print("shape of accum_test:", accum_validation.shape) 796 | 797 | # accum reverse 798 | accumR_train = util.readExtraInfo(file="../data/dataset/accum_reverse_train_tag.txt") 799 | accumR_validation = util.readExtraInfo(file="../data/dataset/accum_reverse_test_tag.txt") 800 | #print("shape of accumR_train:", accumR_train.shape) 801 | #print("shape of accumR_test:", accumR_validation.shape) 802 | 803 | model = BiLSTM_CWE() 804 | model.fit(X_train, y_train, len_train,pos_train,length_train,position_train, 805 | X_validation, y_validation, len_validation, pos_validation,length_validation,position_validation, 806 | "test", False) -------------------------------------------------------------------------------- /models/crf.py: -------------------------------------------------------------------------------- 1 | ''' 2 | use CRF++ tools 3 | ''' 4 | -------------------------------------------------------------------------------- /models/draw.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | 5 | def readData(file): 6 | list=[] 7 | f=open(file=file) 8 | lines=f.readlines() 9 | for line in lines: 10 | line=line.strip() 11 | list.append(float(line)) 12 | return list 13 | 14 | 15 | if __name__ =="__main__": 16 | #plt.xlabel("Mini-Batch") 17 | #plt.ylabel("Accuracy") 18 | #list=readData(file="train_accuracy_epoch1.txt") 19 | #plt.plot(list,"r") 20 | list2=readData(file="train_loss_epoch1.txt") 21 | plt.xlabel("Mini-Batch") 22 | plt.ylabel("Loss") 23 | plt.plot(list2,"r") 24 | plt.show() -------------------------------------------------------------------------------- /models/gbdt1.py: -------------------------------------------------------------------------------- 1 | import 
numpy as np 2 | import pandas as pd 3 | import tensorflow as tf 4 | import alignment 5 | import alignment_cwe 6 | import crf 7 | import bilstm_cbow 8 | import bilstm_cwe 9 | import util 10 | from sklearn.ensemble import GradientBoostingClassifier 11 | from sklearn.metrics import accuracy_score 12 | from sklearn.metrics import f1_score 13 | 14 | class GBDT1(): 15 | def __init__(self): 16 | self.n_estimators=30 17 | self.learning_rate=0.08 18 | self.sub_sample=0.8 19 | self.loss_type="deviance" 20 | 21 | self.gbdt=GradientBoostingClassifier( 22 | loss=self.loss_type, 23 | learning_rate=self.learning_rate, 24 | n_estimators=self.n_estimators, 25 | subsample=self.sub_sample 26 | ) 27 | 28 | def fit(self,X_train,y_train,X_test,y_test): 29 | self.gbdt.fit(X=X_train,y=y_train) 30 | pred=self.gbdt.predict(X=X_test) 31 | print(pred.shape) 32 | print("accracy:",accuracy_score(y_true=y_test,y_pred=pred)) 33 | print("f1-score:",f1_score(y_true=y_test,y_pred=pred,average=None)) 34 | 35 | def deMask(self): 36 | pass 37 | 38 | def pred(self,X): 39 | pass 40 | 41 | #depadding and will reduce dimension 42 | def mask(length,X): 43 | list = [] 44 | for i in range(length.shape[0]): 45 | sentenece_len = length[i] 46 | for j in range(sentenece_len): 47 | list.append(X[i, j]) 48 | return np.array(list,dtype=np.int32) 49 | 50 | def onehot(array): 51 | a=np.zeros(shape=(array.shape[0],37),dtype=np.int32) 52 | for i in range(array.shape[0]): 53 | a[i,array[i]-1]=1 54 | return a 55 | 56 | if __name__=="__main__": 57 | 58 | print("loading data....") 59 | #training data 60 | # pw 为了获取长度信息 61 | #df_train_pw = pd.read_pickle(path="../data/dataset/pw_summary_train.pkl") 62 | #len_train = np.asarray(list(df_train_pw['sentence_len'].values)) 63 | 64 | #X_train_crf,labels_train,preds_train_crf=util.extractProb(file="../result/crf/crf_prob_train.txt") 65 | #X_train_alignment=util.extractProb2(file="../result/alignment/alignment_prob_train.txt") 66 | #X_train_cnn = util.extractProb2(file="../result/cnn/cnn_prob_train.txt") 67 | #print("X_train_cnn.shape",X_train_cnn.shape) 68 | 69 | #pos_train=util.readExtraInfo(file="../data/dataset/pos_train_tag.txt") 70 | #pos_train_masked=mask(length=len_train,X=pos_train) 71 | #print(pos_train_masked.shape) 72 | #pos_train_onehot=onehot(pos_train_masked) 73 | #print(pos_train_onehot.shape) 74 | #X_train=np.concatenate((X_train_cnn,X_train_alignment,pos_train_onehot),axis=1) 75 | 76 | #valid data 77 | df_valid_pw = pd.read_pickle(path="../data/dataset/pw_summary_valid.pkl") 78 | len_valid = np.asarray(list(df_valid_pw['sentence_len'].values)) 79 | X_valid_crf, labels_valid, preds_valid_crf = util.extractProb(file="../result/crf/crf_prob_valid.txt") 80 | X_valid_alignment = util.extractProb2(file="../result/alignment/alignment_prob_valid_epoch5.txt") 81 | X_valid_cnn = util.extractProb2(file="../result/cnn/cnn_prob_valid_epoch5.txt") 82 | X_valid_attention=util.extractProb2(file="../result/attention/attention_prob_valid_epoch4.txt") 83 | X_valid_bilstm=util.extractProb2(file="../result/bilstm/bilstm_prob_valid_epoch3.txt") 84 | 85 | pos_valid = util.readExtraInfo(file="../data/dataset/pos_valid_tag.txt") 86 | pos_valid_masked = mask(length=len_valid, X=pos_valid) 87 | print(pos_valid_masked.shape) 88 | pos_valid_onehot = onehot(pos_valid_masked) 89 | print(pos_valid_onehot.shape) 90 | X_valid = np.concatenate( 91 | (X_valid_crf,X_valid_cnn, X_valid_alignment,X_valid_attention, X_valid_bilstm,pos_valid_onehot), 92 | axis=1 93 | ) 94 | 95 | # test data 96 | df_test_pw = 
pd.read_pickle(path="../data/dataset/pw_summary_test.pkl") 97 | len_test = np.asarray(list(df_test_pw['sentence_len'].values)) 98 | X_test_crf, labels_test, preds_test_crf = util.extractProb(file="../result/crf/crf_prob_test.txt") 99 | X_test_alignment = util.extractProb2(file="../result/alignment/alignment_prob_test_epoch5.txt") 100 | X_test_cnn = util.extractProb2(file="../result/cnn/cnn_prob_test_epoch5.txt") 101 | X_test_attention = util.extractProb2(file="../result/attention/attention_prob_test_epoch4.txt") 102 | X_test_bilstm=util.extractProb2(file="../result/bilstm/bilstm_prob_test_epoch3.txt") 103 | 104 | pos_test = util.readExtraInfo(file="../data/dataset/pos_test_tag.txt") 105 | pos_test_masked = mask(length=len_test, X=pos_test) 106 | print(pos_test_masked.shape) 107 | pos_test_onehot = onehot(pos_test_masked) 108 | print(pos_test_onehot.shape) 109 | X_test = np.concatenate( 110 | (X_test_crf,X_test_cnn, X_test_alignment, X_test_attention,X_test_bilstm,pos_test_onehot), 111 | axis=1 112 | ) 113 | 114 | print("run model....") 115 | model=GBDT1() 116 | #model.fit(X_train=X_train, y_train=labels_train, X_test=X_valid, y_test=labels_valid) 117 | model.fit(X_train=X_valid,y_train=labels_valid,X_test=X_test,y_test=labels_test) 118 | 119 | 120 | -------------------------------------------------------------------------------- /models/gbdt2.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import alignment 4 | import alignment_cwe 5 | import crf 6 | import bilstm_cbow 7 | import bilstm_cwe 8 | import util 9 | from sklearn.ensemble import GradientBoostingClassifier 10 | from sklearn.metrics import accuracy_score 11 | from sklearn.metrics import f1_score 12 | 13 | 14 | class GBDT2(): 15 | def __init__(self): 16 | self.n_estimators=50 17 | self.learning_rate=0.05 18 | self.sub_sample=0.8 19 | self.loss_type="deviance" 20 | 21 | self.gbdt=GradientBoostingClassifier( 22 | loss=self.loss_type, 23 | learning_rate=self.learning_rate, 24 | n_estimators=self.n_estimators, 25 | subsample=self.sub_sample 26 | ) 27 | 28 | def fit(self,X_train,y_train,X_test,y_test): 29 | self.gbdt.fit(X=X_train,y=y_train) 30 | pred=self.gbdt.predict(X=X_test) 31 | print(pred.shape) 32 | print("accracy:",accuracy_score(y_true=y_test,y_pred=pred)) 33 | print("f1-score:",f1_score(y_true=y_test,y_pred=pred,average=None)) 34 | 35 | 36 | def pred(self,X): 37 | pass 38 | 39 | 40 | if __name__=="__main__": 41 | print("loading data....") 42 | #training data 43 | X_train_crf,labels_train,preds_train_crf=util.extractProb(file="../result/crf/crf_prob_train.txt") 44 | X_train_alignment=util.extractProb2(file="../result/alignment/alignment_prob_train.txt") 45 | X_train_cnn = util.extractProb2(file="../result/cnn/cnn_prob_train.txt") 46 | X_train=np.concatenate((X_train_cnn,X_train_alignment,X_train_crf),axis=1) 47 | 48 | #valid data 49 | X_valid_crf, labels_valid, preds_valid_crf = util.extractProb(file="../result/crf/crf_prob_valid.txt") 50 | X_valid_alignment = util.extractProb2(file="../result/alignment/alignment_prob_valid.txt") 51 | X_valid_cnn = util.extractProb2(file="../result/cnn/cnn_prob_valid.txt") 52 | X_valid = np.concatenate((X_valid_cnn, X_valid_alignment,X_valid_crf), axis=1) 53 | 54 | # test data 55 | X_test_crf, labels_test, preds_test_crf = util.extractProb(file="../result/crf/crf_prob_test.txt") 56 | X_test_alignment = util.extractProb2(file="../result/alignment/alignment_prob_test.txt") 57 | X_test_cnn = 
util.extractProb2(file="../result/cnn/cnn_prob_test.txt") 58 | X_test = np.concatenate((X_test_cnn, X_test_alignment,X_test_crf), axis=1) 59 | 60 | 61 | print("run model....") 62 | model=GBDT2() 63 | model.fit(X_train=X_train, y_train=labels_train, X_test=X_valid, y_test=labels_valid) 64 | model.fit(X_train=X_train,y_train=labels_train,X_test=X_test,y_test=labels_test) -------------------------------------------------------------------------------- /models/lf.py: -------------------------------------------------------------------------------- 1 | def get_dataset(images_paths,labels,batch_size,shuffle=True,last_batch=True): 2 | def _decode_images(file_path, label): 3 | image_string = tf.read_file(file_path) 4 | image_decoded = tf.image.decode_png(image_string) 5 | image = tf.cast(image_decoded, tf.float32) / 255. 6 | return image, label 7 | 8 | dataset = tf.data.Dataset.from_tensor_slices((tf.constant(images_paths), tf.constant(labels))) 9 | dataset = dataset.map(_decode_images) 10 | # buffer_size = 10 * batch_size 11 | buffer_size = 50000 12 | if shuffle: 13 | dataset = dataset.shuffle(buffer_size) 14 | if not last_batch: 15 | dataset.filter(lambda x, y: tf.equal(tf.shape(x)[0], batch_size)) 16 | dataset = dataset.batch(batch_size) 17 | return dataset -------------------------------------------------------------------------------- /models/rf.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import data_utils 3 | import os 4 | import math 5 | import sys 6 | import time 7 | 8 | class Seq2SeqModel(object): 9 | def __init__(self, learning_rate, learning_rate_decay_factor, source_vocab_size=40000, target_vocab_size=40000, num_steps=100, num_epochs=10, 10 | is_training=True): 11 | self.min_loss = float(sys.maxint) 12 | self.batch_size = 100 13 | self.dropout_rate = 0.5 14 | self.max_gradient_norm = 5 15 | self.learning_rate = tf.Variable(float(learning_rate), trainable=False) 16 | self.learning_rate_decay_op = self.learning_rate.assign(self.learning_rate * learning_rate_decay_factor) 17 | 18 | self.num_layers = 1 19 | self.emb_dim = 100 20 | self.hidden_dim = 100 21 | self.attention_hidden_dim = 100 22 | self.num_epochs = num_epochs 23 | self.num_steps = num_steps 24 | self.source_vocab_size = source_vocab_size 25 | self.target_vocab_size = target_vocab_size 26 | self.global_step = tf.Variable(0, trainable=False) 27 | 28 | # placeholder of encoder_inputs, decoder_inputs, y_outputs 29 | self.encoder_inputs, self.decoder_inputs, self.y_outputs, self.target_weights = self.create_placeholder() 30 | 31 | # source and target word embedding 32 | self.source_embedding = tf.Variable(tf.random_uniform([self.source_vocab_size, self.emb_dim], 0.0, 1.0), name="source_emb") 33 | self.target_embedding = tf.Variable(tf.random_uniform([self.target_vocab_size, self.emb_dim], 0.0, 1.0), name="target_emb") 34 | 35 | self.softmax_w = tf.Variable(tf.random_uniform([self.hidden_dim * 2, self.target_vocab_size], 0.0, 1.0), name="softmax_w", dtype=tf.float32) 36 | self.softmax_b = tf.Variable(tf.random_uniform([self.target_vocab_size], 0.0, 1.0), name="softmax_b", dtype=tf.float32) 37 | 38 | self.attention_W = tf.Variable(tf.random_uniform([self.hidden_dim * 4, self.attention_hidden_dim], 0.0, 1.0), name="attention_W") 39 | self.attention_U = tf.Variable(tf.random_uniform([self.hidden_dim * 2, self.attention_hidden_dim], 0.0, 1.0), name="attention_U") 40 | self.attention_V = tf.Variable(tf.random_uniform([self.attention_hidden_dim, 1], 0.0, 1.0), 
name="attention_V") 41 | 42 | self.encoder_inputs_emb = tf.nn.embedding_lookup(self.source_embedding, self.encoder_inputs) 43 | self.encoder_inputs_emb = tf.transpose(self.encoder_inputs_emb, [1, 0, 2]) 44 | # self.encoder_inputs_emb = tf.reshape(self.encoder_inputs_emb, [-1, self.emb_dim]) 45 | # self.encoder_inputs_emb = tf.split(0, self.num_steps, self.encoder_inputs_emb) 46 | 47 | self.decoder_inputs_emb = tf.nn.embedding_lookup(self.target_embedding, self.decoder_inputs) 48 | self.decoder_inputs_emb = tf.transpose(self.decoder_inputs_emb, [1, 0, 2]) 49 | self.decoder_inputs_emb = tf.reshape(self.decoder_inputs_emb, [-1, self.emb_dim]) 50 | self.decoder_inputs_emb = tf.split(self.decoder_inputs_emb, self.num_steps, 0) 51 | 52 | # lstm cell 53 | self.enc_lstm_cell_fw = tf.contrib.rnn.BasicLSTMCell(self.hidden_dim, state_is_tuple=False) 54 | self.enc_lstm_cell_bw = tf.contrib.rnn.BasicLSTMCell(self.hidden_dim, state_is_tuple=False) 55 | self.dec_lstm_cell = tf.contrib.rnn.BasicLSTMCell(self.hidden_dim * 2, state_is_tuple=False) 56 | 57 | # dropout 58 | if is_training: 59 | # self.enc_lstm_cell_fw = tf.nn.rnn_cell.DropoutWrapper(self.enc_lstm_cell_fw, output_keep_prob=(1 - self.dropout_rate)) 60 | # self.enc_lstm_cell_bw = tf.nn.rnn_cell.DropoutWrapper(self.enc_lstm_cell_bw, output_keep_prob=(1 - self.dropout_rate)) 61 | self.dec_lstm_cell = tf.contrib.rnn.DropoutWrapper(self.dec_lstm_cell, output_keep_prob=(1 - self.dropout_rate)) 62 | 63 | # get the length of each sample 64 | self.source_length = tf.reduce_sum(tf.sign(self.encoder_inputs), reduction_indices=1) 65 | self.source_length = tf.cast(self.source_length, tf.int32) 66 | self.target_length = tf.reduce_sum(tf.sign(self.decoder_inputs), reduction_indices=1) 67 | self.target_length = tf.cast(self.target_length, tf.int32) 68 | 69 | # encode and decode 70 | enc_outputs, enc_state = self.encode(self.enc_lstm_cell_fw, self.enc_lstm_cell_bw) 71 | if is_training: 72 | self.dec_outputs = self.decode(self.dec_lstm_cell, enc_state, enc_outputs) 73 | else: 74 | self.dec_outputs = self.decode(self.dec_lstm_cell, enc_state, enc_outputs, self.loop_function) 75 | # softmax 76 | self.outputs = tf.reshape(tf.concat(self.dec_outputs, axis=1), [-1, self.hidden_dim * 2]) 77 | self.logits = tf.add(tf.matmul(self.outputs, self.softmax_w), self.softmax_b) 78 | self.prediction = tf.nn.softmax(self.logits) 79 | 80 | self.y_output = tf.reshape(self.y_outputs, [-1]) 81 | self.y_output = tf.one_hot(self.y_output, depth=self.target_vocab_size, on_value=1.0, off_value=0.0) 82 | 83 | self.target_weight = tf.reshape(self.target_weights, [-1]) 84 | 85 | cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=self.y_output) 86 | self.cross_entropy_loss = tf.reduce_mean(tf.multiply(self.target_weight, cross_entropy)) 87 | 88 | # Gradients and SGD update operation for training the model. 
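# The update below is plain SGD with global-norm gradient clipping:
# tf.gradients computes d(loss)/d(param) for every trainable variable,
# tf.clip_by_global_norm rescales each gradient by max_gradient_norm / max(global_norm, max_gradient_norm),
# so the joint L2 norm of all gradients never exceeds max_gradient_norm (5 here),
# and apply_gradients then performs the update while incrementing global_step.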
89 | params = tf.trainable_variables() 90 | self.optimizer = tf.train.GradientDescentOptimizer(self.learning_rate) 91 | 92 | gradients = tf.gradients(self.cross_entropy_loss, params) 93 | clipped_gradients, _ = tf.clip_by_global_norm(gradients, self.max_gradient_norm) 94 | self.updates = self.optimizer.apply_gradients(zip(clipped_gradients, params), global_step=self.global_step) 95 | 96 | self.saver = tf.train.Saver(tf.global_variables()) 97 | 98 | def create_placeholder(self): 99 | encoder_input_pl = tf.placeholder(tf.int64, [None, self.num_steps]) 100 | decoder_input_pl = tf.placeholder(tf.int64, [None, self.num_steps]) 101 | y_output_pl = tf.placeholder(tf.int64, [None, self.num_steps]) 102 | target_weight = tf.placeholder(tf.float32, [None, self.num_steps]) 103 | return encoder_input_pl, decoder_input_pl, y_output_pl, target_weight 104 | 105 | def encode(self, cell_fw, cell_bw): 106 | enc_outputs, (output_state_fw, output_state_bw) = tf.nn.bidirectional_dynamic_rnn( 107 | cell_fw, 108 | cell_bw, 109 | self.encoder_inputs_emb, 110 | dtype=tf.float32, 111 | sequence_length=self.source_length, 112 | time_major=True 113 | ) 114 | enc_state = tf.concat([output_state_fw, output_state_bw], axis=1) 115 | enc_outputs = tf.concat(enc_outputs, axis=2) 116 | enc_outputs = tf.reshape(enc_outputs, [-1, self.emb_dim * 2]) 117 | enc_outputs = tf.split(enc_outputs, self.num_steps, 0) 118 | return enc_outputs, enc_state 119 | 120 | def attention(self, prev_state, enc_outputs): 121 | """ 122 | Attention model for Neural Machine Translation 123 | :param prev_state: the decoder hidden state at time i-1 124 | :param enc_outputs: the encoder outputs, a length 'T' list. 125 | """ 126 | e_i = [] 127 | c_i = [] 128 | for output in enc_outputs: 129 | atten_hidden = tf.tanh(tf.add(tf.matmul(prev_state, self.attention_W), tf.matmul(output, self.attention_U))) 130 | e_i_j = tf.matmul(atten_hidden, self.attention_V) 131 | e_i.append(e_i_j) 132 | e_i = tf.concat(e_i, axis=1) 133 | # e_i = tf.exp(e_i) 134 | alpha_i = tf.nn.softmax(e_i) 135 | alpha_i = tf.split(alpha_i, self.num_steps, 1) 136 | for alpha_i_j, output in zip(alpha_i, enc_outputs): 137 | c_i_j = tf.multiply(alpha_i_j, output) 138 | c_i.append(c_i_j) 139 | c_i = tf.reshape(tf.concat(c_i, axis=1), [-1, self.num_steps, self.hidden_dim * 2]) 140 | c_i = tf.reduce_sum(c_i, 1) 141 | return c_i 142 | 143 | def decode(self, cell, init_state, enc_outputs, loop_function=None): 144 | outputs = [] 145 | prev = None 146 | state = init_state 147 | for i, inp in enumerate(self.decoder_inputs_emb): 148 | 149 | if loop_function is not None and prev is not None: 150 | with tf.variable_scope("loop_function", reuse=True): 151 | inp = loop_function(prev, i) 152 | if i > 0: 153 | tf.get_variable_scope().reuse_variables() 154 | c_i = self.attention(state, enc_outputs) 155 | inp = tf.concat([inp, c_i], axis=1) 156 | output, state = cell(inp, state) 157 | # print output.eval() 158 | outputs.append(output) 159 | if loop_function is not None: 160 | prev = output 161 | return outputs 162 | 163 | def loop_function(self, prev, _): 164 | """ 165 | :param prev: the output of t-1 time 166 | :param _: 167 | :return: the embedding of t-1 output 168 | """ 169 | prev = tf.add(tf.matmul(prev, self.softmax_w), self.softmax_b) 170 | prev_sympol = tf.arg_max(prev, 1) 171 | 172 | emb_prev = tf.nn.embedding_lookup(self.target_embedding, prev_sympol) 173 | return emb_prev 174 | 175 | def train(self, sess, save_path, train_set, val_set, steps_per_checkpoint, train_log): 176 | num_iterations = 
int(math.ceil(1.0 * len(train_set) / self.batch_size)) 177 | print("Number of iterations: %d" % num_iterations) 178 | 179 | step_time, loss = 0.0, 0.0 180 | current_step = 0 181 | previous_losses = [] 182 | while True: 183 | log_file = open(train_log, 'a') 184 | start_time = time.time() 185 | batch_encoder_inputs, batch_decoder_inputs, batch_y_outputs, batch_target_weights = \ 186 | data_utils.nextRandomBatch(train_set, batch_size=self.batch_size) 187 | _, step_loss = \ 188 | sess.run( 189 | [ 190 | self.updates, 191 | self.cross_entropy_loss, 192 | ], 193 | feed_dict={ 194 | self.encoder_inputs: batch_encoder_inputs, 195 | self.decoder_inputs: batch_decoder_inputs, 196 | self.y_outputs: batch_y_outputs 197 | }) 198 | step_time += (time.time() - start_time) / steps_per_checkpoint 199 | loss += step_loss / steps_per_checkpoint 200 | current_step += 1 201 | 202 | # Once in a while, we save checkpoint, print statistics, and run evals. 203 | if current_step % steps_per_checkpoint == 0: 204 | perplexity = math.exp(float(loss)) if loss < 300 else float("inf") 205 | log_file.write("global step %d learning rate %.4f step-time %.2f perplexity " 206 | "%.2f" % (self.global_step.eval(), self.learning_rate.eval(), 207 | step_time, perplexity)) 208 | log_file.write("\n") 209 | if len(previous_losses) > 2 and loss > max(previous_losses[-3:]): 210 | sess.run(self.learning_rate_decay_op) 211 | previous_losses.append(loss) 212 | checkpoint_path = os.path.join(save_path, "translate.ckpt") 213 | self.saver.save(sess, checkpoint_path, global_step=self.global_step) 214 | step_time, loss = 0.0, 0.0 215 | 216 | if current_step % 1000 == 0: 217 | batch_encoder_val, batch_decoder_val, batch_y_val, batch_target_weights_val = \ 218 | data_utils.nextRandomBatch(val_set, batch_size=self.batch_size) 219 | loss_val = \ 220 | sess.run( 221 | self.cross_entropy_loss, 222 | feed_dict={ 223 | self.encoder_inputs: batch_encoder_val, 224 | self.decoder_inputs: batch_decoder_val, 225 | self.y_outputs: batch_y_val, 226 | self.target_weights: batch_target_weights_val 227 | }) 228 | eval_ppl = math.exp(float(loss_val)) if loss_val < 300 else float("inf") 229 | log_file.write("global step %d eval: perplexity %.2f" % (self.global_step.eval(), eval_ppl)) 230 | log_file.write("\n") 231 | sys.stdout.flush() 232 | log_file.close() 233 | 234 | def test(self, sess, token_ids): 235 | # We decode one sentence at a time. 236 | token_ids = data_utils.padding(token_ids) 237 | target_ids = data_utils.padding([data_utils.GO_ID]) 238 | y_ids = data_utils.padding([data_utils.EOS_ID]) 239 | encoder_inputs, decoder_inputs, _ = data_utils.nextRandomBatch([(token_ids, target_ids, y_ids)], batch_size=1) 240 | prediction = sess.run(self.prediction, feed_dict={ 241 | self.encoder_inputs: encoder_inputs, 242 | self.decoder_inputs: decoder_inputs 243 | }) 244 | pred_max = tf.arg_max(prediction, 1) 245 | # prediction = tf.split(0, self.num_steps, prediction) 246 | # # This is a greedy decoder - outputs are just argmaxes of output_logits. 247 | # outputs = [int(np.argmax(predict)) for predict in prediction] 248 | # # If there is an EOS symbol in outputs, cut them at that point. 
249 | # if data_utils.EOS_ID in outputs: 250 | # outputs = outputs[:outputs.index(data_utils.EOS_ID)] 251 | return pred_max.eval() -------------------------------------------------------------------------------- /models/xgb.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XierHacker/Model_Fusion_Based_Prosody_Prediction/ef174fe63eded966c61880ffce041242fdc0b1ff/models/xgb.py -------------------------------------------------------------------------------- /parameter.py: -------------------------------------------------------------------------------- 1 | #basic architecture 2 | CHAR_EMBEDDING_SIZE=128 #字嵌入维度 3 | WORD_EMBEDDING_SIZE=128 #词嵌入维度 4 | INPUT_SIZE=WORD_EMBEDDING_SIZE #词嵌入维度 5 | 6 | MAX_EPOCH=5 #最大迭代次数 7 | LAYER_NUM=2 #lstm层数2 8 | HIDDEN_UNITS_NUM=256 #隐藏层结点数量 9 | HIDDEN_UNITS_NUM2=256 #隐藏层2结点数量 10 | BATCH_SIZE=20 #batch大小 11 | 12 | #learning rate 13 | LEARNING_RATE=0.003 #学习率 14 | DECAY=0.2 #衰减系数 15 | 16 | #Weaken Overfitting 17 | KEEP_PROB=0.5 #全连接 dropout 比率 18 | INPUT_KEEP_PROB=1.0 #rnn input dropout比率 19 | OUTPUT_KEEP_PROB=0.5 #rnn output dropout 比率 20 | LAMBDA_PW=0.001 #PW层级正则化系数 21 | LAMBDA_PPH=0.001 #PW层级正则化系数 22 | LAMBDA_IPH=0.005 #PW层级正则化系数 23 | 24 | 25 | #can't modify 26 | CLASS_NUM=2 #类别数量2(N,B) 27 | POS_NUM=37 #词性信息数量 28 | LENGTH_NUM=8 #长度信息数量 29 | MAX_SENTENCE_SIZE=28 #固定句子长度为28 (从整个数据集得来) 30 | TIMESTEP_SIZE=MAX_SENTENCE_SIZE #LSTM的time_step应该和句子长度一致 31 | WORD_VOCAB_SIZE=393256 # 样本中不同字的个数+1(padding 0),根据处理数据的时候得到 32 | -------------------------------------------------------------------------------- /util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import tensorflow as tf 4 | import parameter 5 | from sklearn.metrics import precision_score 6 | from sklearn.metrics import recall_score 7 | from sklearn.metrics import accuracy_score 8 | from sklearn.metrics import f1_score 9 | from sklearn.preprocessing import OneHotEncoder 10 | 11 | #load data 12 | def loadData(): 13 | # pw 14 | df_train_pw = pd.read_pickle(path="../data/dataset/pw_summary_train.pkl") 15 | df_valid_pw = pd.read_pickle(path="../data/dataset/pw_summary_valid.pkl") 16 | df_test_pw = pd.read_pickle(path="../data/dataset/pw_summary_test.pkl") 17 | 18 | # pph 19 | df_train_pph = pd.read_pickle(path="../data/dataset/pph_summary_train.pkl") 20 | df_valid_pph = pd.read_pickle(path="../data/dataset/pph_summary_valid.pkl") 21 | df_test_pph = pd.read_pickle(path="../data/dataset/pph_summary_test.pkl") 22 | 23 | # iph 24 | # df_train_iph = pd.read_pickle(path="./dataset/temptest/iph_summary_train.pkl") 25 | # df_validation_iph = pd.read_pickle(path="./dataset/temptest/iph_summary_validation.pkl") 26 | 27 | # 实际上,X里面的内容都是一样的,所以这里统一使用pw的X来作为所有的X 28 | # 但是标签是不一样的,所以需要每个都要具体定义 29 | X_train = np.asarray(list(df_train_pw['X'].values)) 30 | X_valid = np.asarray(list(df_valid_pw['X'].values)) 31 | X_test = np.asarray(list(df_test_pw['X'].values)) 32 | 33 | # print("X_train:\n",X_train) 34 | # print("X_train.shape",X_train.shape) 35 | # print("X_valid:\n",X_valid) 36 | # print("X_valid.shape:",X_valid.shape) 37 | # print("X_test:\n", X_test) 38 | # print("X_test.shape", X_test.shape) 39 | 40 | # tags 41 | y_train_pw = np.asarray(list(df_train_pw['y'].values)) 42 | y_valid_pw = np.asarray(list(df_valid_pw['y'].values)) 43 | y_test_pw = np.asarray(list(df_test_pw['y'].values)) 44 | 45 | y_train_pph = np.asarray(list(df_train_pph['y'].values)) 46 | y_valid_pph = 
np.asarray(list(df_valid_pph['y'].values)) 47 | y_test_pph = np.asarray(list(df_test_pph['y'].values)) 48 | 49 | # y_train_iph = np.asarray(list(df_train_iph['y'].values)) 50 | # y_validation_iph = np.asarray(list(df_validation_iph['y'].values)) 51 | 52 | # length每一行序列的长度,因为都一样,所以统一使用pw的 53 | len_train = np.asarray(list(df_train_pw['sentence_len'].values)) 54 | len_valid = np.asarray(list(df_valid_pw['sentence_len'].values)) 55 | len_test = np.asarray(list(df_test_pw['sentence_len'].values)) 56 | # print("len_train:", len_train.shape) 57 | # print("len_valid:", len_valid.shape) 58 | # print("len_test:", len_test.shape) 59 | 60 | # ----------------------------------------Extra Info-------------------------------- 61 | # pos 62 | pos_train = readExtraInfo(file="../data/dataset/pos_train_tag.txt") 63 | pos_valid = readExtraInfo(file="../data/dataset/pos_valid_tag.txt") 64 | pos_test = readExtraInfo(file="../data/dataset/pos_test_tag.txt") 65 | # print("pos_train.shape",pos_train.shape) 66 | # print("pos_valid.shape",pos_valid.shape) 67 | # print("pos_test.shape", pos_test.shape) 68 | 69 | # length 70 | length_train = readExtraInfo(file="../data/dataset/length_train_tag.txt") 71 | length_valid = readExtraInfo(file="../data/dataset/length_valid_tag.txt") 72 | length_test = readExtraInfo(file="../data/dataset/length_test_tag.txt") 73 | # print("shape of length_train:",length_train.shape) 74 | # print("shape of length_valid:",length_valid.shape) 75 | # print("shape of length_test:", length_test.shape) 76 | 77 | # position 78 | position_train = readExtraInfo(file="../data/dataset/position_train_tag.txt") 79 | position_valid = readExtraInfo(file="../data/dataset/position_valid_tag.txt") 80 | position_test = readExtraInfo(file="../data/dataset/position_test_tag.txt") 81 | # print("shape of position_train:",position_train.shape) 82 | # print("shape of positon_valid:",position_valid.shape) 83 | # print("shape of positon_test:", position_test.shape) 84 | 85 | # accum 86 | accum_train = readExtraInfo(file="../data/dataset/accum_train_tag.txt") 87 | accum_valid = readExtraInfo(file="../data/dataset/accum_valid_tag.txt") 88 | accum_test = readExtraInfo(file="../data/dataset/accum_test_tag.txt") 89 | # print("shape of accum_train:", accum_train.shape) 90 | # print("shape of accum_valid:", accum_valid.shape) 91 | # print("shape of accum_test:", accum_test.shape) 92 | 93 | # accum reverse 94 | accumR_train = readExtraInfo(file="../data/dataset/accum_reverse_train_tag.txt") 95 | accumR_valid = readExtraInfo(file="../data/dataset/accum_reverse_valid_tag.txt") 96 | accumR_test = readExtraInfo(file="../data/dataset/accum_reverse_test_tag.txt") 97 | # print("shape of accumR_train:", accumR_train.shape) 98 | # print("shape of accumR_valid:", accumR_valid.shape) 99 | # print("shape of accumR_test:", accumR_test.shape) 100 | 101 | y_train = [y_train_pw, y_train_pph] 102 | y_valid = [y_valid_pw, y_valid_pph] 103 | y_test = [y_test_pw, y_test_pph] 104 | 105 | return X_train, y_train, len_train, pos_train, length_train, position_train,\ 106 | X_valid, y_valid, len_valid, pos_valid, length_valid, position_valid,\ 107 | X_test, y_test, len_test, pos_test, length_test, position_test 108 | 109 | # print("Run Model...\n\n\n") 110 | model = Alignment() 111 | model.fit( 112 | X_train, y_train, len_train, pos_train, length_train, position_train, 113 | X_valid, y_valid, len_valid, pos_valid, length_valid, position_valid, 114 | X_test, y_test, len_test, pos_test, length_test, position_test, "test", False) 115 | 116 | 117 | 
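# The `model = Alignment()` / `model.fit(...)` block above appears to be a leftover
# driver snippet from one of the model scripts: `Alignment` is not imported in
# util.py, and X_train, y_train, etc. only exist inside loadData(), so these lines
# are either unreachable (if still indented under loadData's return) or raise a
# NameError when the module is loaded. They presumably belong in the model file,
# guarded by `if __name__ == "__main__":`.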
#compute accuracy and per-class f1-score 118 | def eval(y_true,y_pred): 119 | #accuracy 120 | accuracy=accuracy_score(y_true=y_true,y_pred=y_pred) 121 | #f1-score 122 | f_1=f1_score(y_true=y_true,y_pred=y_pred,average=None) 123 | return accuracy,f_1 124 | 125 | #combine prob_pw and prob_pph into the overall 3-class prob and save it 126 | def writeProb(prob_pw,prob_pph,outFile): 127 | f=open(file=outFile,mode="a+",encoding="utf-8") 128 | for i in range(prob_pw.shape[0]): 129 | prob_0=prob_pw[i,0]*prob_pph[i,0] 130 | prob_1=prob_pw[i,1]*prob_pph[i,0] 131 | prob_2=prob_pw[i,0]*prob_pph[i,1]+prob_pw[i,1]*prob_pph[i,1] 132 | s=str(prob_0)+" "+str(prob_1)+" "+str(prob_2)+"\n" 133 | f.write(s) 134 | f.close() 135 | 136 | def getTag2(preds_pw,preds_pph): 137 | # get complex "#" index 138 | length = preds_pw.shape[0] 139 | complex = np.array([preds_pph, preds_pw]) 140 | arg = np.argmax(complex, axis=0) 141 | # print("arg:\n", arg) 142 | for i in range(length): 143 | if arg[i] == 0: 144 | if complex[0, i] == 1: 145 | arg[i] = 4 146 | else: 147 | arg[i] = 0 148 | if arg[i] == 1: 149 | if complex[1, i] == 1: 150 | arg[i] = 2 151 | else: 152 | arg[i] = 0 153 | arg = (arg / 2).astype(dtype=np.int32) 154 | return arg 155 | 156 | #recover to .txt format 157 | def recover2(X,preds_pw,preds_pph,filename): 158 | arg=getTag2(preds_pw,preds_pph) 159 | arg=np.reshape(arg,newshape=(-1,parameter.MAX_SENTENCE_SIZE)) #[test_size,max_sentence_size] 160 | #print("arg.shape",arg.shape) 161 | #print("arg:\n", arg) 162 | #get id2words 163 | df_words_ids = pd.read_csv(filepath_or_buffer="../data/dataset/words_ids.csv", encoding="utf-8") 164 | #print(df_words_ids.head(5)) 165 | id2words = pd.Series(data=df_words_ids["words"].values, index=df_words_ids["id"].values) 166 | #print(id2words[2]) 167 | doc="" 168 | for i in range(X.shape[0]): 169 | sentence="" 170 | for j in range(X.shape[1]): 171 | if(X[i][j])==0: 172 | break 173 | else: 174 | sentence+=id2words[X[i][j]] 175 | if(arg[i][j]!=0): 176 | sentence+=("#"+str(arg[i][j])) 177 | sentence+="\n" 178 | doc+=sentence 179 | f=open(filename,mode="w",encoding="utf-8") 180 | f.write(doc) 181 | f.close() 182 | 183 | #read extra information from file, like pos info of a word, or position info etc.
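# writeProb folds the two binary distributions into one 3-class distribution over
# {no boundary, "#1" (pw), "#2" (pph)}, taking column 0 as N and column 1 as B,
# consistent with getTag2 above:
#   p(#0) = P_pw(N) * P_pph(N)
#   p(#1) = P_pw(B) * P_pph(N)
#   p(#2) = P_pw(N) * P_pph(B) + P_pw(B) * P_pph(B) = P_pph(B)
# so a predicted pph boundary overrides the pw decision, and the three values still
# sum to 1. Worked example with P_pw = (0.3, 0.7) and P_pph = (0.8, 0.2):
#   p(#0) = 0.3 * 0.8 = 0.24,  p(#1) = 0.7 * 0.8 = 0.56,  p(#2) = 0.2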
184 | def readExtraInfo(file): 185 | f = open(file=file, encoding="utf-8") 186 | lines = f.readlines() 187 | #print("lines numbers:",len(lines)) 188 | X=np.zeros(shape=(len(lines),parameter.MAX_SENTENCE_SIZE),dtype=np.int32) 189 | i = 0 190 | for line in lines: 191 | # print(line) 192 | line = line.strip() 193 | line_list = line.split(sep=" ") 194 | # print(line_list) 195 | j = 0 196 | for id in line_list: 197 | X[i, j] = id 198 | j += 1 199 | i += 1 200 | return X 201 | 202 | 203 | #读取预训练的embeddings 204 | def readEmbeddings(file): 205 | f=open(file=file,encoding="utf-8") 206 | lines=f.readlines() 207 | #first row is info 208 | info=lines[0].strip() 209 | info_list=info.split(sep=" ") 210 | vocab_size=int(info_list[0]) 211 | embedding_dims=int(info_list[1]) 212 | embeddings=np.zeros(shape=(vocab_size+1,embedding_dims),dtype=np.float32) 213 | for i in range(1,vocab_size+1): 214 | embed=lines[i].strip() 215 | embed_list=embed.split(sep=" ") 216 | for j in range(1,embedding_dims+1): 217 | embeddings[i][j-1]=embed_list[j] 218 | #print(embeddings.shape) 219 | return embeddings 220 | 221 | #返回字增强之后的word-embeddings 222 | def getCWE(word_embed_file,char_embed_file): 223 | word_embeddings=readEmbeddings(file=word_embed_file) 224 | print("shape of word_embeddings:",word_embeddings.shape) 225 | char_embeddings=readEmbeddings(file=char_embed_file) 226 | print("shape of char_embeddings:",char_embeddings.shape) 227 | 228 | #load id-word df 229 | df_words_ids = pd.read_csv(filepath_or_buffer="../data/dataset/words_ids.csv", encoding="utf-8") 230 | id2words = pd.Series(data=df_words_ids["words"].values, index=df_words_ids["id"].values) 231 | 232 | #load id-char df 233 | df_chars_ids = pd.read_csv(filepath_or_buffer="../data/dataset/chars_ids.csv", encoding="utf-8") 234 | chars2id = pd.Series(data=df_chars_ids["id"].values, index=df_chars_ids["chars"].values) 235 | 236 | for i in range(1,word_embeddings.shape[0]): 237 | #print(id2words[i]) 238 | word=id2words[i] 239 | sum_char_embeddings=np.zeros(shape=(128,),dtype=np.float32) 240 | for char in word: 241 | char_id=chars2id[char] 242 | sum_char_embeddings+=char_embeddings[char_id] 243 | sum_char_embeddings/=len(word) 244 | word_embeddings[i]+=sum_char_embeddings 245 | cwe=word_embeddings/2 246 | return cwe 247 | 248 | 249 | #从crf结果文件中抽取概率,并且返回ndarray类型 250 | def extractProb(file): 251 | probs=[] 252 | labels=[] 253 | preds=[] 254 | f=open(file=file,encoding="utf-8") 255 | lines=f.readlines() 256 | for line in lines: 257 | line=line.strip() 258 | if line!="": 259 | if line[0]!="#": 260 | prob = [] 261 | #print(line) 262 | #print(line[0]) 263 | line_list = line.split(sep="\t") 264 | l_0 = line_list[9].split(sep="/") 265 | prob.append(float(l_0[1])) 266 | l_1 = line_list[10].split(sep="/") 267 | prob.append(float(l_1[1])) 268 | l_2 = line_list[11].split(sep="/") 269 | prob.append(float(l_2[1])) 270 | #print(prob) 271 | probs.append(prob) 272 | labels.append(float(line_list[7])) 273 | preds.append(float(line_list[8].split(sep="/")[0])) 274 | #print("len of probs:",probs[0]) 275 | probs_nd=np.array(probs,dtype=np.float32) 276 | labels_nd=np.array(labels,dtype=np.int32) 277 | preds_nd=np.array(preds,dtype=np.int32) 278 | #print("shape of prob_nd",probs_nd.shape) 279 | return probs_nd,labels_nd,preds_nd 280 | 281 | def extractProb2(file): 282 | probs=[] 283 | result=[] 284 | f=open(file=file,encoding="utf-8") 285 | lines=f.readlines() 286 | for line in lines: 287 | line=line.strip() 288 | prob = [] 289 | #print(line) 290 | #print(line[0]) 291 | line_list = 
line.split(sep=" ") 292 | #print(line_list) 293 | l_0 = line_list[0] 294 | prob.append(float(l_0)) 295 | l_1 = line_list[1] 296 | prob.append(float(l_1)) 297 | l_2 = line_list[2] 298 | prob.append(float(l_2)) 299 | #print(prob) 300 | probs.append(prob) 301 | #print("len of probs:",probs[0]) 302 | probs_nd=np.array(probs,dtype=np.float32) 303 | #print("shape of prob_nd",probs_nd.shape) 304 | #print(probs_nd.dtype) 305 | return probs_nd 306 | 307 | #统计结果 308 | def statistic(type="valid"): 309 | print("CRF") 310 | prob, labels, preds = extractProb(file="./result/crf/crf_prob_"+type+".txt") 311 | #print("prob.shape", prob.shape) 312 | #print("labels.shape", labels.shape) 313 | #print("preds.shape", preds.shape) 314 | p1, f1 = eval(y_true=labels, y_pred=preds) 315 | #print("accuracy:", p1) 316 | print("f1-score:", f1) 317 | 318 | print("Alignment") 319 | prob_align = extractProb2(file="./result/alignment/alignment_prob_"+type+"_epoch5.txt") 320 | #print("prob_align.shape", prob_align.shape) 321 | # print("prob_align:",prob_align) 322 | preds_align = np.argmax(prob_align, axis=-1, ) 323 | # print(preds_align.shape) 324 | # print(preds_align) 325 | p2, f2 = eval(y_true=labels, y_pred=preds_align) 326 | #print("accuracy:", p2) 327 | print("f1-score:", f2) 328 | 329 | print("CNN") 330 | prob_cnn = extractProb2(file="./result/cnn/cnn_prob_"+type+"_epoch5.txt") 331 | #print("prob_cnn.shape", prob_cnn.shape) 332 | # print("prob_cnn:",prob_cnn) 333 | preds_cnn = np.argmax(prob_cnn, axis=-1, ) 334 | # print(preds_cnn.shape) 335 | # print(preds_align) 336 | p3, f3 = eval(y_true=labels, y_pred=preds_cnn) 337 | #print("accuracy:", p3) 338 | print("f1-score:", f3) 339 | 340 | print("Attention") 341 | prob_atten = extractProb2(file="./result/attention/attention_prob_"+type+"_epoch4.txt") 342 | #print("prob_atten.shape", prob_atten.shape) 343 | # print("prob_atten:",prob_atten) 344 | preds_atten = np.argmax(prob_atten, axis=-1, ) 345 | # print(preds_tten.shape) 346 | # print(preds_tten) 347 | p4, f4 = eval(y_true=labels, y_pred=preds_atten) 348 | #print("accuracy:", p4) 349 | print("f1-score:", f4) 350 | 351 | print("BiLSTM") 352 | prob_bilstm = extractProb2(file="./result/bilstm/bilstm_prob_"+type+"_epoch3.txt") 353 | #print("prob_bilstm.shape", prob_bilstm.shape) 354 | # print("prob_atten:",prob_atten) 355 | preds_bilstm = np.argmax(prob_bilstm, axis=-1, ) 356 | # print(preds_tten.shape) 357 | # print(preds_tten) 358 | p5, f5 = eval(y_true=labels, y_pred=preds_bilstm) 359 | #print("accuracy:", p5) 360 | print("f1-score:", f5) 361 | 362 | 363 | if __name__ =="__main__": 364 | #print("read extra info test:") 365 | #readExtraInfo(file="./data/dataset/pos_train_tag.txt") 366 | #readExtraInfo(file="./data/dataset/pos_test_tag.txt") 367 | #readExtraInfo(file="./data/dataset/length_train_tag.txt") 368 | #readExtraInfo(file="./data/dataset/length_test_tag.txt") 369 | #readEmbeddings(file="./data/embeddings/word_vec.txt") 370 | #readEmbeddings(file="./data/embeddings/char_vec.txt") 371 | #getCWE(word_embed_file="./data/embeddings/word_vec.txt",char_embed_file="./data/embeddings/char_vec.txt") 372 | statistic(type="valid") 373 | 374 | 375 | 376 | 377 | 378 | 379 | 380 | 381 | 382 | ''' 383 | def getTag(preds_pw,preds_pph,preds_iph): 384 | # get complex "#" index 385 | length = preds_pw.shape[0] 386 | complex = np.array([preds_iph, preds_pph, preds_pw]) 387 | arg = np.argmax(complex, axis=0) 388 | # print("arg:\n", arg) 389 | for i in range(length): 390 | if arg[i] == 0: 391 | if complex[0, i] == 2: 392 | arg[i] = 6 
393 | else: 394 | arg[i] = 0 395 | if arg[i] == 1: 396 | if complex[1, i] == 2: 397 | arg[i] = 4 398 | else: 399 | arg[i] = 0 400 | if arg[i] == 2: 401 | if complex[2, i] == 2: 402 | arg[i] = 2 403 | else: 404 | arg[i] = 0 405 | arg = (arg / 2).astype(dtype=np.int32) 406 | return arg 407 | 408 | #recover to original result 409 | def recover(X,preds_pw,preds_pph,preds_iph,filename): 410 | #shape of arg:[test_size,max_sentence_size] 411 | arg=np.reshape(getTag(preds_pw,preds_pph,preds_iph),newshape=(-1,parameter.MAX_SENTENCE_SIZE)) 412 | #print("arg.shape",arg.shape) 413 | #print("arg:\n", arg) 414 | #get id2words 415 | df_words_ids = pd.read_csv(filepath_or_buffer="./dataset/temptest/words_ids.csv", encoding="utf-8") 416 | #print(df_words_ids.head(5)) 417 | id2words = pd.Series(data=df_words_ids["words"].values, index=df_words_ids["id"].values) 418 | #print(id2words[2]) 419 | doc="" 420 | for i in range(X.shape[0]): 421 | sentence="" 422 | for j in range(X.shape[1]): 423 | if(X[i][j])==0: 424 | break 425 | else: 426 | sentence+=id2words[X[i][j]] 427 | if(arg[i][j]!=0): 428 | sentence+=("#"+str(arg[i][j])) 429 | sentence+="\n" 430 | doc+=sentence 431 | f=open(filename,mode="w",encoding="utf-8") 432 | f.write(doc) 433 | f.close() 434 | ''' --------------------------------------------------------------------------------
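The per-model probability files scored by statistic() in util.py are also the natural input to the fusion learners under models/ (lf.py, gbdt1.py, gbdt2.py, rf.py, xgb.py). Their exact feature construction is not shown in this dump, so the snippet below is only a minimal sketch of the general idea, assuming the per-token 3-class probabilities of each base model are concatenated column-wise and that all files are row-aligned (the same assumption statistic() already makes); the file paths are the validation-split names used in statistic().

import sys
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score

sys.path.append("..")  # assumes we run from models/, with util.py in the repo root
from util import extractProb, extractProb2

# Per-token 3-class probabilities of the base models on the validation split;
# the CRF result file also carries the reference labels.
prob_crf, labels, _ = extractProb(file="./result/crf/crf_prob_valid.txt")
prob_align = extractProb2(file="./result/alignment/alignment_prob_valid_epoch5.txt")
prob_bilstm = extractProb2(file="./result/bilstm/bilstm_prob_valid_epoch3.txt")

# Hypothetical fusion features: one row per token, columns = all models' probabilities.
X_fusion = np.hstack([prob_crf, prob_align, prob_bilstm])

# Fit a simple fusion classifier; for a real evaluation it would be fit on the
# validation files and scored on the corresponding test files instead.
fusion = GradientBoostingClassifier(n_estimators=100)
fusion.fit(X_fusion, labels)
preds_fusion = fusion.predict(X_fusion)
print("fusion f1-score:", f1_score(y_true=labels, y_pred=preds_fusion, average=None))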