├── README.md
├── convert.py
├── data
│   └── raw
│       └── README.md
├── data_processing.py
├── models
│   ├── bilstm_cbow.py
│   ├── bilstm_cwe.py
│   ├── crf.py
│   ├── draw.py
│   ├── gbdt1.py
│   ├── gbdt2.py
│   ├── lf.py
│   ├── rf.py
│   └── xgb.py
├── parameter.py
└── util.py

/README.md:
--------------------------------------------------------------------------------
1 | **Paper: _Improving Prosodic Boundaries Prediction for Mandarin Speech Synthesis by Using Enhanced Embedding Feature and Model Fusion Approach_**
2 | 
3 | ## **Requirements**
4 | >**python3.5+**
5 | 
6 | >**tensorflow>=1.6**
7 | 
8 | >**numpy**
9 | 
10 | >**pandas**
11 | 
12 | >**scikit-learn**
13 | 
14 | >**gensim**
15 | 
16 | 
17 | ## Steps
18 | ### **----------------------data processing-----------------------**
19 | #### 1. run `python convert.py`
20 | >convert `.utf-8` raw files to prosody tagged files
21 | 
22 | #### 2. run `python data_processing.py`
23 | >convert prosody tagged files to the dataset
24 | 
25 | ### **-------------------use models for prediction-----------------**
26 | #### `cd models`
27 | >enter the models directory
28 | 
29 | #### run `python bilstm_cbow.py`
30 | >use bilstm_cbow for prosody prediction
31 | 
32 | 
33 | #### run `python alignment.py`
34 | >use alignment for prosody prediction
--------------------------------------------------------------------------------
/convert.py:
--------------------------------------------------------------------------------
1 | '''
2 | convert .utf8 raw files to prosody tagged files
3 | '''
4 | 
5 | import os
6 | import pandas as pd
7 | import numpy as np
8 | import re
9 | 
10 | # convert to prosody tagged format
11 | def toProsody(inFile,outFile):
12 |     f_in=open(file=inFile,encoding="utf-8")
13 |     doc=""
14 |     lines=f_in.readlines()
15 |     for line in lines:
16 |         line=line.strip()
17 |         line_list=line.split(sep="\t")
18 |         if(line_list[0]!=""):
19 |             doc+=(line_list[0]+"#"+line_list[7])
20 |         else:
21 |             doc+="\n"
22 |     #print(doc)
23 |     f_out=open(file=outFile,mode="w",encoding="utf-8")
24 |     f_out.write(doc)
25 |     f_out.close()
26 | 
27 | 
28 | def merge(file1,file2,file3,outFile):
29 |     doc=""
30 |     f1=open(file=file1,encoding="utf-8")
31 |     lines_f1=f1.readlines()
32 |     for line_f1 in lines_f1:
33 |         doc+=line_f1
34 |     f2=open(file=file2,encoding="utf-8")
35 |     lines_f2 = f2.readlines()
36 |     for line_f2 in lines_f2:
37 |         doc += line_f2
38 |     f3 = open(file=file3, encoding="utf-8")
39 |     lines_f3 = f3.readlines()
40 |     for line_f3 in lines_f3:
41 |         doc += line_f3
42 |     f4=open(file=outFile,mode="w",encoding="utf-8")
43 |     f4.write(doc)
44 |     f4.close()
45 | 
46 | 
47 | 
48 | 
49 | # word POS info
50 | def toPos(inFile_train,inFile_valid,inFile_test):
51 |     #--------------------------------------- build the POS list --------------------------------------------#
52 |     f_train_in = open(file=inFile_train, encoding="utf-8")
53 |     f_valid_in = open(file=inFile_valid, encoding="utf-8")
54 |     f_test_in = open(file=inFile_test, encoding="utf-8")
55 | 
56 |     pos=[]
57 |     # collect all POS tags
58 |     #train
59 |     lines_train = f_train_in.readlines()
60 |     for line_train in lines_train:
61 |         line_train = line_train.strip()
62 |         line_train_list = line_train.split(sep="\t")
63 |         if (line_train_list[0] != ""):
64 |             pos.append(line_train_list[1])
65 |     f_train_in.close()
66 |     #valid
67 |     lines_valid = f_valid_in.readlines()
68 |     for line_valid in lines_valid:
69 |         line_valid = line_valid.strip()
70 |         line_valid_list = line_valid.split(sep="\t")
71 |         if (line_valid_list[0] != ""):
72 |             pos.append(line_valid_list[1])
73 |     f_valid_in.close()
74 |     #test
75 |     lines_test = f_test_in.readlines()
76 |     for line_test in 
lines_test: 77 | line_test = line_test.strip() 78 | line_test_list = line_test.split(sep="\t") 79 | if (line_test_list[0] != ""): 80 | pos.append(line_test_list[1]) 81 | f_test_in.close() 82 | 83 | #print(pos) 84 | print("origin len of pos:",len(pos)) 85 | sr_all_pos = pd.Series(data=pos) # 列表做成pandas的Series 86 | pos = (sr_all_pos.value_counts()).index # pos列表.统计每个pos类型出现的频率,同时相当于去重复,得到字的集合(这里还是Serieas的index对象) 87 | print(pos) 88 | print("len of cleaned:",pos.shape) 89 | pos_id = range(1, len(pos) + 1) # 字的id列表,从1开始,因为准备把0作为填充值 90 | 91 | # words以及对应的id组件 92 | df_pos_ids=pd.DataFrame(data={"pos": pos, "id": pos_id}) 93 | df_pos_ids. to_csv(path_or_buf="./data/dataset/pos_ids.csv", index=False, encoding="utf_8") 94 | 95 | pos2id = pd.Series(data=df_pos_ids["id"].values, index=df_pos_ids["pos"].values) 96 | id2pos = pd.Series(data=df_pos_ids["pos"].values, index=df_pos_ids["id"].values) 97 | 98 | #print("pos2id:\n",pos2id.head(10)) 99 | #print("shape of pos2id:",pos2id.shape) 100 | #print("id2pos:\n",id2pos.head(10)) 101 | #print("shape of id2pos:",id2pos.shape) 102 | 103 | 104 | #---------------------------------------生成pos标注文件-----------------------------------------# 105 | #training corpus 106 | f_train_in = open(file=inFile_train, encoding="utf-8") 107 | doc_pos = "" 108 | doc_ids="" 109 | lines_train = f_train_in.readlines() 110 | for line_train in lines_train: 111 | line_train = line_train.strip() 112 | line_train_list = line_train.split(sep="\t") 113 | #print("line_train_list:",line_train_list) 114 | if (line_train_list[0] != ""): 115 | id=pos2id[line_train_list[1]] 116 | doc_pos+=(line_train_list[0]+"/"+str(id)) 117 | doc_ids+=(str(id)+" ") 118 | else: 119 | doc_pos += "\n" 120 | doc_ids+="\n" 121 | #save 2 files 122 | #f_train_out = open(file="./data/dataset/pos_train.txt", mode="w", encoding="utf-8") 123 | #f_train_out.write(doc_pos) 124 | #f_train_out.close() 125 | 126 | f_train_out = open(file="./data/dataset/pos_train_tag.txt", mode="w", encoding="utf-8") 127 | f_train_out.write(doc_ids) 128 | f_train_out.close() 129 | 130 | # validing corpus 131 | f_valid_in = open(file=inFile_valid, encoding="utf-8") 132 | doc_pos = "" 133 | doc_ids = "" 134 | lines_valid = f_valid_in.readlines() 135 | for line_valid in lines_valid: 136 | line_valid = line_valid.strip() 137 | line_valid_list = line_valid.split(sep="\t") 138 | # print("line_valid_list:",line_valid_list) 139 | if (line_valid_list[0] != ""): 140 | id = pos2id[line_valid_list[1]] 141 | doc_pos += (line_valid_list[0] + "/" + str(id)) 142 | doc_ids += (str(id) + " ") 143 | else: 144 | doc_pos += "\n" 145 | doc_ids += "\n" 146 | # save 2 files 147 | #f_valid_out = open(file="./data/dataset/pos_valid.txt", mode="w", encoding="utf-8") 148 | #f_valid_out.write(doc_pos) 149 | #f_valid_out.close() 150 | 151 | f_valid_out = open(file="./data/dataset/pos_valid_tag.txt", mode="w", encoding="utf-8") 152 | f_valid_out.write(doc_ids) 153 | f_valid_out.close() 154 | 155 | #test corpus 156 | f_test_in = open(file=inFile_test, encoding="utf-8") 157 | doc_pos = "" 158 | doc_ids = "" 159 | lines_test = f_test_in.readlines() 160 | for line_test in lines_test: 161 | line_test = line_test.strip() 162 | line_test_list = line_test.split(sep="\t") 163 | if (line_test_list[0] != ""): 164 | id = pos2id[line_test_list[1]] 165 | doc_pos += (line_test_list[0] + "/" + str(id)) 166 | doc_ids += (str(id) + " ") 167 | else: 168 | doc_pos += "\n" 169 | doc_ids += "\n" 170 | #f_test_out = open(file="./data/dataset/pos_test.txt", mode="w", encoding="utf-8") 171 | 
#f_test_out.write(doc_pos) 172 | #f_test_out.close() 173 | 174 | f_test_out = open(file="./data/dataset/pos_test_tag.txt", mode="w", encoding="utf-8") 175 | f_test_out.write(doc_ids) 176 | f_test_out.close() 177 | 178 | 179 | #word length info(每个词的长度) 180 | def toWordLength(inFile_train,inFile_valid,inFile_test): 181 | # ---------------------------------------生成length标注文件-----------------------------------------# 182 | # training corpus 183 | f_train_in = open(file=inFile_train, encoding="utf-8") 184 | doc_length = "" 185 | doc_ids = "" 186 | lines_train = f_train_in.readlines() 187 | for line_train in lines_train: 188 | line_train = line_train.strip() 189 | line_train_list = line_train.split(sep="\t") 190 | # print("line_train_list:",line_train_list) 191 | if (line_train_list[0] != ""): 192 | doc_length += (line_train_list[0] + "/" +line_train_list[2]) 193 | doc_ids += (line_train_list[2] + " ") 194 | else: 195 | doc_length += "\n" 196 | doc_ids += "\n" 197 | # save 2 files 198 | #f_train_out = open(file="./data/dataset/length_train.txt", mode="w", encoding="utf-8") 199 | #f_train_out.write(doc_length) 200 | #f_train_out.close() 201 | 202 | f_train_out = open(file="./data/dataset/length_train_tag.txt", mode="w", encoding="utf-8") 203 | f_train_out.write(doc_ids) 204 | f_train_out.close() 205 | 206 | # validing corpus 207 | f_valid_in = open(file=inFile_valid, encoding="utf-8") 208 | doc_length = "" 209 | doc_ids = "" 210 | lines_valid = f_valid_in.readlines() 211 | for line_valid in lines_valid: 212 | line_valid = line_valid.strip() 213 | line_valid_list = line_valid.split(sep="\t") 214 | # print("line_valid_list:",line_valid_list) 215 | if (line_valid_list[0] != ""): 216 | doc_length += (line_valid_list[0] + "/" + line_valid_list[2]) 217 | doc_ids += (line_valid_list[2] + " ") 218 | else: 219 | doc_length += "\n" 220 | doc_ids += "\n" 221 | # save 2 files 222 | #f_valid_out = open(file="./data/dataset/length_valid.txt", mode="w", encoding="utf-8") 223 | #f_valid_out.write(doc_length) 224 | #f_valid_out.close() 225 | 226 | f_valid_out = open(file="./data/dataset/length_valid_tag.txt", mode="w", encoding="utf-8") 227 | f_valid_out.write(doc_ids) 228 | f_valid_out.close() 229 | 230 | # test corpus 231 | f_test_in = open(file=inFile_test, encoding="utf-8") 232 | doc_length = "" 233 | doc_ids = "" 234 | lines_test = f_test_in.readlines() 235 | for line_test in lines_test: 236 | line_test = line_test.strip() 237 | line_test_list = line_test.split(sep="\t") 238 | if (line_test_list[0] != ""): 239 | doc_length += (line_test_list[0] + "/" + line_test_list[2]) 240 | doc_ids += (line_test_list[2] + " ") 241 | else: 242 | doc_length += "\n" 243 | doc_ids += "\n" 244 | #f_test_out = open(file="./data/dataset/length_test.txt", mode="w", encoding="utf-8") 245 | #f_test_out.write(doc_length) 246 | #f_test_out.close() 247 | 248 | f_test_out = open(file="./data/dataset/length_test_tag.txt", mode="w", encoding="utf-8") 249 | f_test_out.write(doc_ids) 250 | f_test_out.close() 251 | 252 | #word position info 253 | def toWordAccum(inFile_train,inFile_valid,inFile_test): 254 | # ---------------------------------------生成accum标注文件-----------------------------------------# 255 | # training corpus 256 | f_train_in = open(file=inFile_train, encoding="utf-8") 257 | doc_position = "" 258 | doc_ids = "" 259 | lines_train = f_train_in.readlines() 260 | for line_train in lines_train: 261 | line_train = line_train.strip() 262 | line_train_list = line_train.split(sep="\t") 263 | # print("line_train_list:",line_train_list) 
264 | if (line_train_list[0] != ""): 265 | doc_position += (line_train_list[0] + "/" + line_train_list[4]) 266 | doc_ids += (line_train_list[4] + " ") 267 | else: 268 | doc_position += "\n" 269 | doc_ids += "\n" 270 | # save 2 files 271 | f_train_out = open(file="./data/dataset/accum_train.txt", mode="w", encoding="utf-8") 272 | f_train_out.write(doc_position) 273 | f_train_out.close() 274 | 275 | f_train_out = open(file="./data/dataset/accum_train_tag.txt", mode="w", encoding="utf-8") 276 | f_train_out.write(doc_ids) 277 | f_train_out.close() 278 | 279 | # validing corpus 280 | f_valid_in = open(file=inFile_valid, encoding="utf-8") 281 | doc_position = "" 282 | doc_ids = "" 283 | lines_valid = f_valid_in.readlines() 284 | for line_valid in lines_valid: 285 | line_valid = line_valid.strip() 286 | line_valid_list = line_valid.split(sep="\t") 287 | # print("line_valid_list:",line_valid_list) 288 | if (line_valid_list[0] != ""): 289 | doc_position += (line_valid_list[0] + "/" + line_valid_list[4]) 290 | doc_ids += (line_valid_list[4] + " ") 291 | else: 292 | doc_position += "\n" 293 | doc_ids += "\n" 294 | # save 2 files 295 | f_valid_out = open(file="./data/dataset/accum_valid.txt", mode="w", encoding="utf-8") 296 | f_valid_out.write(doc_position) 297 | f_valid_out.close() 298 | 299 | f_valid_out = open(file="./data/dataset/accum_valid_tag.txt", mode="w", encoding="utf-8") 300 | f_valid_out.write(doc_ids) 301 | f_valid_out.close() 302 | 303 | # test corpus 304 | f_test_in = open(file=inFile_test, encoding="utf-8") 305 | doc_position = "" 306 | doc_ids = "" 307 | lines_test = f_test_in.readlines() 308 | for line_test in lines_test: 309 | line_test = line_test.strip() 310 | line_test_list = line_test.split(sep="\t") 311 | if (line_test_list[0] != ""): 312 | doc_position += (line_test_list[0] + "/" + line_test_list[4]) 313 | doc_ids += (line_test_list[4] + " ") 314 | else: 315 | doc_position += "\n" 316 | doc_ids += "\n" 317 | f_test_out = open(file="./data/dataset/accum_test.txt", mode="w", encoding="utf-8") 318 | f_test_out.write(doc_position) 319 | f_test_out.close() 320 | 321 | f_test_out = open(file="./data/dataset/accum_test_tag.txt", mode="w", encoding="utf-8") 322 | f_test_out.write(doc_ids) 323 | f_test_out.close() 324 | 325 | def toWordAccumReverse(inFile_train,inFile_valid,inFile_test): 326 | # ---------------------------------------生成accum标注文件-----------------------------------------# 327 | # training corpus 328 | f_train_in = open(file=inFile_train, encoding="utf-8") 329 | doc_position = "" 330 | doc_ids = "" 331 | lines_train = f_train_in.readlines() 332 | for line_train in lines_train: 333 | line_train = line_train.strip() 334 | line_train_list = line_train.split(sep="\t") 335 | # print("line_train_list:",line_train_list) 336 | if (line_train_list[0] != ""): 337 | doc_position += (line_train_list[0] + "/" + line_train_list[5]) 338 | doc_ids += (line_train_list[5] + " ") 339 | else: 340 | doc_position += "\n" 341 | doc_ids += "\n" 342 | # save 2 files 343 | #f_train_out = open(file="./data/dataset/accum_reverse_train.txt", mode="w", encoding="utf-8") 344 | #f_train_out.write(doc_position) 345 | #f_train_out.close() 346 | 347 | f_train_out = open(file="./data/dataset/accum_reverse_train_tag.txt", mode="w", encoding="utf-8") 348 | f_train_out.write(doc_ids) 349 | f_train_out.close() 350 | 351 | # validing corpus 352 | f_valid_in = open(file=inFile_valid, encoding="utf-8") 353 | doc_position = "" 354 | doc_ids = "" 355 | lines_valid = f_valid_in.readlines() 356 | for line_valid in 
lines_valid: 357 | line_valid = line_valid.strip() 358 | line_valid_list = line_valid.split(sep="\t") 359 | # print("line_valid_list:",line_valid_list) 360 | if (line_valid_list[0] != ""): 361 | doc_position += (line_valid_list[0] + "/" + line_valid_list[5]) 362 | doc_ids += (line_valid_list[5] + " ") 363 | else: 364 | doc_position += "\n" 365 | doc_ids += "\n" 366 | # save 2 files 367 | #f_valid_out = open(file="./data/dataset/accum_reverse_valid.txt", mode="w", encoding="utf-8") 368 | #f_valid_out.write(doc_position) 369 | #f_valid_out.close() 370 | 371 | f_valid_out = open(file="./data/dataset/accum_reverse_valid_tag.txt", mode="w", encoding="utf-8") 372 | f_valid_out.write(doc_ids) 373 | f_valid_out.close() 374 | 375 | # test corpus 376 | f_test_in = open(file=inFile_test, encoding="utf-8") 377 | doc_position = "" 378 | doc_ids = "" 379 | lines_test = f_test_in.readlines() 380 | for line_test in lines_test: 381 | line_test = line_test.strip() 382 | line_test_list = line_test.split(sep="\t") 383 | if (line_test_list[0] != ""): 384 | doc_position += (line_test_list[0] + "/" + line_test_list[5]) 385 | doc_ids += (line_test_list[5] + " ") 386 | else: 387 | doc_position += "\n" 388 | doc_ids += "\n" 389 | #f_test_out = open(file="./data/dataset/accum_reverse_test.txt", mode="w", encoding="utf-8") 390 | #f_test_out.write(doc_position) 391 | #f_test_out.close() 392 | 393 | f_test_out = open(file="./data/dataset/accum_reverse_test_tag.txt", mode="w", encoding="utf-8") 394 | f_test_out.write(doc_ids) 395 | f_test_out.close() 396 | 397 | #word position info 398 | def toWordPosition(inFile_train,inFile_valid,inFile_test): 399 | # ---------------------------------------生成position标注文件-----------------------------------------# 400 | # training corpus 401 | f_train_in = open(file=inFile_train, encoding="utf-8") 402 | doc_position = "" 403 | doc_ids = "" 404 | lines_train = f_train_in.readlines() 405 | i=1 406 | for line_train in lines_train: 407 | line_train = line_train.strip() 408 | line_train_list = line_train.split(sep="\t") 409 | # print("line_train_list:",line_train_list) 410 | if (line_train_list[0] != ""): 411 | doc_position += (line_train_list[0] + "/" + str(i)) 412 | doc_ids += (str(i) + " ") 413 | i+=1 414 | else: 415 | doc_position += "\n" 416 | doc_ids += "\n" 417 | i=1 418 | # save 2 files 419 | #f_train_out = open(file="./data/dataset/position_train.txt", mode="w", encoding="utf-8") 420 | #f_train_out.write(doc_position) 421 | #f_train_out.close() 422 | 423 | f_train_out = open(file="./data/dataset/position_train_tag.txt", mode="w", encoding="utf-8") 424 | f_train_out.write(doc_ids) 425 | f_train_out.close() 426 | 427 | # validing corpus 428 | f_valid_in = open(file=inFile_valid, encoding="utf-8") 429 | doc_position = "" 430 | doc_ids = "" 431 | lines_valid = f_valid_in.readlines() 432 | i = 1 433 | for line_valid in lines_valid: 434 | line_valid = line_valid.strip() 435 | line_valid_list = line_valid.split(sep="\t") 436 | # print("line_valid_list:",line_valid_list) 437 | if (line_valid_list[0] != ""): 438 | doc_position += (line_valid_list[0] + "/" + str(i)) 439 | doc_ids += (str(i) + " ") 440 | i += 1 441 | else: 442 | doc_position += "\n" 443 | doc_ids += "\n" 444 | i = 1 445 | # save 2 files 446 | #f_valid_out = open(file="./data/dataset/position_valid.txt", mode="w", encoding="utf-8") 447 | #f_valid_out.write(doc_position) 448 | #f_valid_out.close() 449 | 450 | f_valid_out = open(file="./data/dataset/position_valid_tag.txt", mode="w", encoding="utf-8") 451 | 
f_valid_out.write(doc_ids) 452 | f_valid_out.close() 453 | 454 | # test corpus 455 | f_test_in = open(file=inFile_test, encoding="utf-8") 456 | doc_position = "" 457 | doc_ids = "" 458 | lines_test = f_test_in.readlines() 459 | i=1 460 | for line_test in lines_test: 461 | line_test = line_test.strip() 462 | line_test_list = line_test.split(sep="\t") 463 | if (line_test_list[0] != ""): 464 | doc_position += (line_test_list[0] + "/" + str(i)) 465 | doc_ids += (str(i) + " ") 466 | i+=1 467 | else: 468 | doc_position += "\n" 469 | doc_ids += "\n" 470 | i=1 471 | #f_test_out = open(file="./data/dataset/position_test.txt", mode="w", encoding="utf-8") 472 | #f_test_out.write(doc_position) 473 | #f_test_out.close() 474 | 475 | f_test_out = open(file="./data/dataset/position_test_tag.txt", mode="w", encoding="utf-8") 476 | f_test_out.write(doc_ids) 477 | f_test_out.close() 478 | 479 | 480 | 481 | 482 | if __name__ =="__main__": 483 | if not os.path.exists("./data/corpus"): 484 | os.mkdir("./data/corpus/") 485 | if not os.path.exists("./data/dataset"): 486 | os.mkdir("./data/dataset/") 487 | if not os.path.exists("./result"): 488 | os.mkdir("./result") 489 | 490 | print("[1]-> Conver raw .utf-8 files to prosody tagged files") 491 | toProsody(inFile="./data/raw/prosody_test_tag.utf8",outFile="./data/corpus/prosody_test.txt") 492 | toProsody(inFile="./data/raw/prosody_train_tag.utf8", outFile="./data/corpus/prosody_train.txt") 493 | toProsody(inFile="./data/raw/prosody_valid_tag.utf8", outFile="./data/corpus/prosody_valid.txt") 494 | 495 | 496 | print("[2]->merge prosody_train and prosody_valid and prosody_test files") 497 | merge( 498 | file1="./data/corpus/prosody_train.txt", 499 | file2="data/corpus/prosody_valid.txt", 500 | file3="data/corpus/prosody_test.txt", 501 | outFile="data/corpus/prosody.txt" 502 | ) 503 | 504 | 505 | print("[3]->generate pos files") 506 | toPos(inFile_train="./data/raw/prosody_train_tag.utf8", 507 | inFile_valid="./data/raw/prosody_valid_tag.utf8", 508 | inFile_test="./data/raw/prosody_test_tag.utf8" 509 | ) 510 | 511 | print("[4]->generate length files") 512 | toWordLength(inFile_train="./data/raw/prosody_train_tag.utf8", 513 | inFile_valid="./data/raw/prosody_valid_tag.utf8", 514 | inFile_test="./data/raw/prosody_test_tag.utf8" 515 | ) 516 | 517 | print("[5]->generate accmulate files") 518 | toWordAccum(inFile_train="./data/raw/prosody_train_tag.utf8", 519 | inFile_valid="./data/raw/prosody_valid_tag.utf8", 520 | inFile_test="./data/raw/prosody_test_tag.utf8" 521 | ) 522 | 523 | print("[6]->generate accmulate reverse files") 524 | toWordAccumReverse(inFile_train="./data/raw/prosody_train_tag.utf8", 525 | inFile_valid="./data/raw/prosody_valid_tag.utf8", 526 | inFile_test="./data/raw/prosody_test_tag.utf8" 527 | ) 528 | 529 | print("[7]->generate position files") 530 | toWordPosition(inFile_train="./data/raw/prosody_train_tag.utf8", 531 | inFile_valid="./data/raw/prosody_valid_tag.utf8", 532 | inFile_test="./data/raw/prosody_test_tag.utf8" 533 | ) 534 | 535 | -------------------------------------------------------------------------------- /data/raw/README.md: -------------------------------------------------------------------------------- 1 | you should put corpus in this folder. 
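(Judging from `convert.py`, each `.utf8` file is expected to be tab-separated with one word per line: column 0 holds the word, column 1 its POS tag, column 2 the word length, columns 4 and 5 the forward and reverse accumulated word positions, and column 7 the prosodic-break label; sentences are separated by blank lines.)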
2 | 3 | ---`prosody_train_tag.utf8` 4 | 5 | ---`prosody_valid_tag.utf8` 6 | 7 | ---`prosody_test_tag.utf8` -------------------------------------------------------------------------------- /data_processing.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 清洗数据,转换语料格式,得到词嵌入 3 | author:xierhacker 4 | time:2018.1.22 5 | ''' 6 | import re 7 | import os 8 | import time 9 | import pandas as pd 10 | import numpy as np 11 | from itertools import chain 12 | #from gensim.models import word2vec 13 | from parameter import MAX_SENTENCE_SIZE 14 | from parameter import WORD_EMBEDDING_SIZE 15 | from parameter import CHAR_EMBEDDING_SIZE 16 | 17 | #原始语料转换为不带任何标记的语料,可以训练字向量 18 | def toCharCorpus(inFile,outFile): 19 | doc = "" 20 | file = open(file=inFile, encoding="utf-8") 21 | lines = file.readlines() 22 | # 匹配#标记 23 | pattern1 = re.compile(r"#[0,1,2,3,4]", flags=re.U) 24 | # 每个字匹配一次 25 | pattern2 =re.compile(r"[^\s]") 26 | for line in lines: 27 | string = re.sub(pattern=pattern1, repl="", string=line) #去掉# 28 | string=" ".join(re.findall(pattern=pattern2,string=string)) #每个字加上空格 29 | string+="\n" 30 | doc += string 31 | # write to file 32 | f = open(file=outFile, mode="w", encoding="utf-8") 33 | f.write(doc) 34 | f.close() 35 | 36 | 37 | #训练字向量并且存储 38 | def toCharEmbeddings(inFile): 39 | sentences = word2vec.Text8Corpus(inFile) 40 | model = word2vec.Word2Vec( 41 | sentences=sentences, 42 | size=CHAR_EMBEDDING_SIZE, #词向量维度 43 | window=5, #window大小 44 | min_count=0, #频率小于这个值被忽略 45 | sg=0, #sg==0->cbow; sg==1->skip-gram 46 | hs=1, #use hierarchical softmax 47 | negative=5, #use negative sampling 48 | sorted_vocab=1, #按照词频率从高到低排序 49 | ) 50 | # save embeddings file 51 | if not os.path.exists("./data/embeddings"): 52 | os.mkdir(path="./data/embeddings") 53 | model.wv.save_word2vec_format("./data/embeddings/char_vec.txt", binary=False) 54 | #生成char和id相互索引的.csv文件 55 | if os.path.exists("./data/embeddings/char_vec.txt"): 56 | f=open(file="./data/embeddings/char_vec.txt",encoding="utf-8") 57 | lines = f.readlines() 58 | # first row is info 59 | info = lines[0].strip() 60 | info_list = info.split(sep=" ") 61 | vocab_size = int(info_list[0]) 62 | embedding_dims = int(info_list[1]) 63 | chars=[] 64 | ids=[] 65 | for i in range(1,vocab_size+1): 66 | embed=lines[i].strip() 67 | embed_list=embed.split(sep=" ") 68 | chars.append(embed_list[0]) 69 | ids.append(i) 70 | pd.DataFrame(data={"chars": chars, "id": ids}). 
\ 71 | to_csv(path_or_buf="./data/dataset/chars_ids.csv", index=False, encoding="utf_8") 72 | else: 73 | print("there is no embedings files") 74 | 75 | 76 | 77 | #原始语料转换为不带任何标记的语料,可以训练词向量 78 | def toWordCorpus(inFile,outFile): 79 | doc = "" 80 | file = open(file=inFile, encoding="utf-8") 81 | lines = file.readlines() 82 | # 匹配#标记 83 | pattern1 = re.compile(r"#[0,1,2,3,4]", flags=re.U) 84 | # 每个字匹配一次 85 | pattern2 =re.compile(r"[^\s]") 86 | for line in lines: 87 | string = re.sub(pattern=pattern1, repl=" ", string=line) #去掉# 88 | #string=" ".join(re.findall(pattern=pattern2,string=string)) #每个字加上空格 89 | string+="\n" 90 | doc += string 91 | # write to file 92 | f = open(file=outFile, mode="w", encoding="utf-8") 93 | f.write(doc) 94 | f.close() 95 | 96 | 97 | #训练词向量并且存储 98 | def toWordEmbeddings(inFile): 99 | #--------------------------------train word embeddings--------------------------------- 100 | sentences = word2vec.Text8Corpus(inFile) 101 | model = word2vec.Word2Vec( 102 | sentences=sentences, 103 | size=WORD_EMBEDDING_SIZE, # 词向量维度 104 | window=5, # window大小 105 | min_count=0, # 频率小于这个值被忽略 106 | sg=0, # sg==0->cbow; sg==1->skip-gram 107 | hs=1, # use hierarchical softmax 108 | negative=5, # use negative sampling 109 | sorted_vocab=1, # 按照词频率从高到低排序 110 | ) 111 | # save embeddings file 112 | if not os.path.exists("./data/embeddings"): 113 | os.mkdir(path="./data/embeddings") 114 | model.wv.save_word2vec_format("./data/embeddings/word_vec.txt", binary=False) 115 | 116 | # ----------------------------------生成word和id相互索引的.csv文件------------------------- 117 | if os.path.exists("./data/embeddings/word_vec.txt"): 118 | f = open(file="./data/embeddings/word_vec.txt", encoding="utf-8") 119 | lines = f.readlines() 120 | # first row is info 121 | info = lines[0].strip() 122 | info_list = info.split(sep=" ") 123 | vocab_size = int(info_list[0]) 124 | embedding_dims = int(info_list[1]) 125 | words = [] 126 | ids = [] 127 | for i in range(1, vocab_size + 1): 128 | embed = lines[i].strip() 129 | embed_list = embed.split(sep=" ") 130 | words.append(embed_list[0]) 131 | ids.append(i) 132 | pd.DataFrame(data={"words": words, "id": ids}). 
\ 133 | to_csv(path_or_buf="./data/dataset/words_ids.csv", index=False, encoding="utf_8") 134 | else: 135 | print("there is no embedings files") 136 | 137 | 138 | #转换原始corpus为韵律词(PW)格式标记 139 | def toPW(inFile,outFile): 140 | doc="" 141 | file = open(file=inFile, encoding="utf-8") 142 | lines = file.readlines() 143 | # 匹配#0标记,替换为/n 144 | pattern1 = re.compile(r"#0", flags=re.U) 145 | # 匹配#1 #2标记,替换为/b 146 | pattern2 = re.compile(r"#[1,2]", flags=re.U) 147 | for line in lines: 148 | line=line.strip() 149 | string = re.sub(pattern=pattern1, repl="/n", string=line) # #0替换为/n 150 | string = re.sub(pattern=pattern2, repl="/b", string=string)+"\n" # #1替换为/b 151 | doc += string 152 | # write to file 153 | f = open(file=outFile, mode="w", encoding="utf-8") 154 | f.write(doc) 155 | f.close() 156 | 157 | 158 | #转换原始corpus为韵律短语(PPH)格式标记 159 | def toPPH(inFile,outFile): 160 | doc="" 161 | file = open(file=inFile, encoding="utf-8") 162 | lines = file.readlines() 163 | # 匹配#0,#1标记,替换为/n 164 | pattern1 = re.compile(r"#[0,1]", flags=re.U) 165 | # 不是/或者b 166 | pattern2 = re.compile(r"#2", flags=re.U) 167 | for line in lines: 168 | line=line.strip() #去掉一些影响的空格和换行 169 | string = re.sub(pattern=pattern1, repl="/n", string=line) # #0和#1替换为/n 170 | string = re.sub(pattern=pattern2, repl="/b", string=string)+"\n" # #2替换为/b 171 | doc += string 172 | # write to file 173 | f = open(file=outFile, mode="w", encoding="utf-8") 174 | f.write(doc) 175 | f.close() 176 | 177 | 178 | #转换原始corpus为语调短语(IPH)格式标记 179 | def toIPH(filename): 180 | doc = "" 181 | file = open(file=filename, encoding="utf-8") 182 | lines = file.readlines() 183 | # 匹配#1和#2(因为要先去掉#1和#2) 184 | pattern = re.compile(r"#[1,2]") 185 | # 匹配#标记 186 | pattern1 = re.compile(r"#[3,4]", flags=re.U) 187 | # 不是/或者b 188 | pattern2 = re.compile(r"(?![/b])") 189 | # 去掉b后面的/n 190 | pattern3 = re.compile(r"b/n") 191 | # 去掉开头的/n 192 | pattern4 = re.compile(r"^/n") 193 | for line in lines: 194 | line = line.strip() # 去掉一些影响的空格和换行 195 | string = re.sub(pattern=pattern, repl="", string=line) # 去掉#1 196 | string = re.sub(pattern=pattern1, repl="/b", string=string) # 去掉# 197 | string = re.sub(pattern=pattern2, repl="/n", string=string) 198 | string = re.sub(pattern=pattern3, repl="b", string=string) 199 | string = re.sub(pattern=pattern4, repl="", string=string) + "\n" 200 | doc += string 201 | # write to file 202 | f = open(file="./data/corpus/prosody_iph.txt", mode="w+", encoding="utf-8") 203 | f.write(doc) 204 | f.close() 205 | 206 | 207 | #清洗 208 | def clean(s): 209 | if u'“/s' not in s: # 句子中间的引号不应去掉 210 | return s.replace(u' ”/s', '') 211 | elif u'”/s' not in s: 212 | return s.replace(u'“/s ', '') 213 | elif u'‘/s' not in s: 214 | return s.replace(u' ’/s', '') 215 | elif u'’/s' not in s: 216 | return s.replace(u'‘/s ', '') 217 | else: 218 | return s 219 | 220 | def file2corpus(filename): 221 | ''' 222 | :param filename: 223 | :return: 语料文件文件转换为一个原始语料句子的list 224 | ''' 225 | with open(filename, 'rb') as inp: 226 | corpus = inp.read().decode('UTF-8') #原始语料 str对象 227 | corpus = corpus.split('\r') #换行切分,得到一个简陋列表 228 | corpus = u''.join(map(clean, corpus)) # 把所有处理的句子连接起来,这里中间连接不用其他字符 str对象 229 | corpus = re.split(u"\n", corpus) # 以换行为分割,把语料划分为一个"句子"列表 230 | #corpus = re.split(u'[,。!?、‘’“”]/[bems]', corpus) # 以换行为分割,把语料划分为一个"句子"列表 231 | return corpus #[人/b 们/e 常/s 说/s 生/b 活/e 是/s 一/s 部/s 教/b 科/m 书/e ,xxx,....] 
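# A worked example of the tag conventions produced by toPW()/toPPH() above and consumed by
# make_component() below (illustrative sentence, not taken from the corpus): for the merged
# prosody line "充分#1发挥#2重要#1作用#2",
#   toPW()  gives "充分/b发挥/b重要/b作用/b"   (#0 -> /n, #1 and #2 -> /b)
#   toPPH() gives "充分/n发挥/b重要/n作用/b"   (#0 and #1 -> /n, #2 -> /b)
# make_component() then splits every "token/tag" pair with the regex ([^/]*)/(.),
# so the PPH line yields sentences = ['充分','发挥','重要','作用'] and tags = ['n','b','n','b'].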
232 | 233 | 234 | def make_component(corpus): 235 | ''' 236 | :param corpus: 传入原始语料句子corpus列表得到的字数据datas和对应的labels数据都放到dataframe里面存储,方便后面的处理 237 | :return: df_data 238 | ''' 239 | sentences= [] 240 | tags = [] 241 | for s in corpus: #corpus列表得到每句corpus想应的sentence以及对应的labels 242 | sentence_tags = re.findall('([^/]*)/(.)', s) # sentence_tags:[('人', 'b'), ('们', 'e'), ('常', 's'), ('说', 's')] 243 | #print("sentence_tags:",sentence_tags) 244 | if sentence_tags: # 顺便去除了一些空样本 245 | sentence_tags = np.array(sentence_tags) 246 | sentences.append(sentence_tags[:, 0]) #sentences每一个元素表示一个sentence['人' '们' '常' '说' '生' '活' '是' '一' '部' '教' '科' '书'] 247 | tags.append(sentence_tags[:, 1]) #tags每一个元素表示的是一个句子对应的标签['b' 'e' 's' 's' 'b' 'e' 's' 's' 's' 'b' 'm' 'e'] 248 | 249 | #使用pandas处理,简化流程 250 | df_data = pd.DataFrame({'sentences': sentences, 'tags': tags}, index=range(len(sentences))) 251 | df_data['sentence_len'] = df_data['sentences'].apply(lambda sentences: len(sentences)) # 每句话长度 252 | print("max sentence length:",df_data["sentence_len"].max()) 253 | 254 | tags = ['n', 'b'] #tag列表 255 | tags_id = range(len(tags)) #tag的id列表 256 | 257 | # tags以及对应的id组件 258 | pd.DataFrame(data={"tags":tags,"id":tags_id}).\ 259 | to_csv(path_or_buf="./data/dataset/tags_ids.csv",index=False,encoding="utf_8") 260 | #存储df_data 261 | df_data.to_csv(path_or_buf="./data/dataset/df_data.csv",index=False,encoding="utf-8") 262 | return df_data #暂时不保存,返回 263 | 264 | 265 | #read basic component from .csv files 266 | def read_component(): 267 | #读取words和ids的dataframe 268 | df_words_ids=pd.read_csv(filepath_or_buffer="./data/dataset/words_ids.csv",encoding="utf-8") 269 | #读取tags和ids的dataframe 270 | df_tags_ids=pd.read_csv(filepath_or_buffer="./data/dataset/tags_ids.csv",encoding="utf-8") 271 | 272 | #转换为words2id, id2words, tags2id, id2tags 273 | #df_data=pd.DataFrame(data={}) 274 | words2id=pd.Series(data=df_words_ids["id"].values,index=df_words_ids["words"].values) 275 | id2words=pd.Series(data=df_words_ids["words"].values,index=df_words_ids["id"].values) 276 | tags2id = pd.Series(data=df_tags_ids["id"].values, index=df_tags_ids["tags"].values) 277 | id2tags = pd.Series(data=df_tags_ids["tags"].values, index=df_tags_ids["id"].values) 278 | return words2id, id2words, tags2id, id2tags 279 | 280 | #转换为最后模型适合的数据集,name表示转换后的数据集存储在哪个文件下面./data/dataset/ 281 | def make_dataset(inFile,outFile): 282 | corpus = file2corpus(inFile) 283 | #print("----corpus contains ", len(corpus), " sentences.") 284 | #保存基本组件,并且返回df_data 285 | print("----saving component ") 286 | df_data=make_component(corpus) 287 | 288 | #读取组件,并且装换为合适的格式 289 | words2id, id2words, tags2id, id2tags =read_component() 290 | #print("words2id.shape:",words2id.shape) 291 | print("----dataset contains ",df_data.shape[0]," sentences.") 292 | 293 | #padding 294 | def X_padding(sentence): 295 | ids = list(words2id[sentence]) 296 | if len(ids) > MAX_SENTENCE_SIZE: # 超过就截断 297 | return ids[:MAX_SENTENCE_SIZE] 298 | if len(ids) < MAX_SENTENCE_SIZE: # 短了就补齐 299 | ids.extend([0] * (MAX_SENTENCE_SIZE - len(ids))) 300 | return ids 301 | 302 | def y_padding(tags): 303 | ids = list(tags2id[tags]) 304 | if len(ids) > MAX_SENTENCE_SIZE: # 超过就截断 305 | return ids[:MAX_SENTENCE_SIZE] 306 | if len(ids) < MAX_SENTENCE_SIZE: # 短了就补齐 307 | ids.extend([0] * (MAX_SENTENCE_SIZE - len(ids))) 308 | return ids 309 | 310 | #把数据转换为ids表示的的形式 311 | print("----convert data and label to 'ids' represented") 312 | df_data['X'] = df_data['sentences'].apply(X_padding) 313 | df_data['y'] = df_data['tags'].apply(y_padding) 314 
| #print(df_data["X"].head(5)) 315 | #print(df_data["y"].head(5)) 316 | 317 | #数据集切分 318 | df_data_train=df_data[:50000] 319 | df_data_valid=df_data[50000:60000] 320 | df_data_test = df_data[60000:] 321 | 322 | 323 | #保存最终数据到pkl文件 324 | print("----saving final dataset <"+outFile+"_summary_train.pkl>") 325 | df_data_train.to_pickle(path="./data/dataset/"+"/"+outFile+"_summary_train.pkl") 326 | df_data_train.to_csv(path_or_buf="./data/dataset/" + outFile + "_df_data_train_final.csv", index=False, encoding="utf-8") 327 | 328 | print("----saving final dataset <"+outFile+"_summary_valida.pkl>") 329 | df_data_valid.to_pickle(path="./data/dataset/"+outFile+"_summary_valid.pkl") 330 | df_data_valid.to_csv(path_or_buf="./data/dataset/" + outFile + "_df_data_valid_final.csv", index=False, 331 | encoding="utf-8") 332 | 333 | print("----saving final dataset <" + outFile + "_summary_test.pkl>") 334 | df_data_test.to_pickle(path="./data/dataset/" + "/" + outFile + "_summary_test.pkl") 335 | df_data_test.to_csv(path_or_buf="./data/dataset/" + outFile + "_df_data_test_final.csv", index=False, 336 | encoding="utf-8") 337 | 338 | df_data.to_csv(path_or_buf="./data/dataset/" + outFile + "_df_data_final.csv", index=False, encoding="utf-8") 339 | 340 | 341 | #summary_train.pkl 342 | if __name__=="__main__": 343 | start_time = time.time() 344 | print("[1]-->trans corpus to char corpus and char embeddings...") 345 | toCharCorpus(inFile="./data/corpus/prosody.txt",outFile="./data/corpus/prosody_char.txt") 346 | #toCharEmbeddings(inFile="./data/corpus/prosody_char.txt") 347 | 348 | print("[2]-->trans corpus to word corpus and word embeddings...") 349 | toWordCorpus(inFile="./data/corpus/prosody.txt", outFile="./data/corpus/prosody_word.txt") 350 | #toWordEmbeddings(inFile="./data/corpus/prosody_word.txt") 351 | 352 | print("[3]-->trans corpus to PW format......") 353 | toPW(inFile="./data/corpus/prosody.txt",outFile="./data/corpus/prosody_pw.txt") 354 | 355 | print("[4]-->trans corpus to PPH format......") 356 | toPPH(inFile="./data/corpus/prosody.txt", outFile="./data/corpus/prosody_pph.txt") 357 | 358 | #print("[5]-->trans corpus to IPH format......") 359 | #toIPH("./data/corpus/prosody.txt") 360 | 361 | print("[6]-->trans corpus_pw to dataset......") 362 | make_dataset(inFile="./data/corpus/prosody_pw.txt",outFile="pw") 363 | 364 | print("[7]-->trans corpus_pph to dataset......") 365 | make_dataset(inFile="./data/corpus/prosody_pph.txt", outFile="pph") 366 | 367 | #print("[8]-->trans corpus_iph to dataset......") 368 | #make_dataset(in_filename="./data/corpus/prosody_iph.txt", out_filename="iph") 369 | duration = time.time() - start_time; 370 | print("END! 
this operation spends ", round(duration / 60, 2), " mins") -------------------------------------------------------------------------------- /models/bilstm_cbow.py: -------------------------------------------------------------------------------- 1 | ''' 2 | model with CWS and pos information 3 | ''' 4 | import sys 5 | sys.path.append("..") 6 | import numpy as np 7 | import pandas as pd 8 | import tensorflow as tf 9 | import tensorflow.contrib.rnn as rnn 10 | import tensorflow.contrib.seq2seq as seq2seq 11 | import time 12 | import os 13 | import parameter 14 | import util 15 | 16 | #指定显卡 17 | os.environ['CUDA_VISIBLE_DEVICES']='2' 18 | config=tf.ConfigProto() 19 | config.gpu_options.allow_growth=True 20 | 21 | 22 | class BiLSTM(): 23 | def __init__(self): 24 | # basic environment 25 | self.graph = tf.Graph() 26 | self.session = tf.Session(graph=self.graph,config=config) 27 | 28 | # basic parameters 29 | self.learning_rate = parameter.LEARNING_RATE 30 | self.max_epoch = parameter.MAX_EPOCH 31 | 32 | self.class_num = parameter.CLASS_NUM 33 | self.pos_num = parameter.POS_NUM 34 | self.length_num = parameter.LENGTH_NUM 35 | self.hidden_units_num = parameter.HIDDEN_UNITS_NUM 36 | self.hidden_units_num2 = parameter.HIDDEN_UNITS_NUM2 37 | self.layer_num = parameter.LAYER_NUM 38 | self.max_sentence_size = parameter.MAX_SENTENCE_SIZE 39 | 40 | # self.vocab_size = parameter.VOCAB_SIZE 41 | self.word_vocab_size = parameter.WORD_VOCAB_SIZE 42 | self.embedding_size = parameter.CHAR_EMBEDDING_SIZE 43 | self.word_embedding_size = parameter.WORD_EMBEDDING_SIZE 44 | 45 | self.batch_size = parameter.BATCH_SIZE 46 | self.lambda_pw = parameter.LAMBDA_PW 47 | self.lambda_pph = parameter.LAMBDA_PPH 48 | self.lambda_iph = parameter.LAMBDA_IPH 49 | 50 | self.keep_prob = parameter.KEEP_PROB 51 | self.input_keep_prob = parameter.INPUT_KEEP_PROB 52 | self.output_keep_prob = parameter.OUTPUT_KEEP_PROB 53 | 54 | self.decay_rate = parameter.DECAY 55 | 56 | 57 | #full inference process of each hierachy 58 | def hierarchy(self,inputs,y_masked,seq_length,scope_name,reuse=False): 59 | if scope_name=="pw": 60 | encoder_scope_name="en_lstm_pw" 61 | decoder_scope_name = "de_lstm_pw" 62 | elif scope_name=="pph": 63 | encoder_scope_name = "en_lstm_pph" 64 | decoder_scope_name = "de_lstm_pph" 65 | else: 66 | encoder_scope_name = "en_lstm_iph" 67 | decoder_scope_name = "de_lstm_iph" 68 | 69 | with tf.variable_scope(name_or_scope=scope_name,reuse=reuse): 70 | #forward part 71 | lstm_forward1=rnn.BasicLSTMCell(num_units=self.hidden_units_num) 72 | # 加attention(这里的attention和encoder-decoder架构的attention稍有不同) 73 | lstm_forward1 = rnn.AttentionCellWrapper(cell=lstm_forward1, attn_length=5) 74 | 75 | lstm_forward2=rnn.BasicLSTMCell(num_units=self.hidden_units_num) 76 | #加attention 77 | lstm_forward2 = rnn.AttentionCellWrapper(cell=lstm_forward2, attn_length=5) 78 | 79 | lstm_forward=rnn.MultiRNNCell(cells=[lstm_forward1,lstm_forward2]) 80 | # dropout 81 | lstm_forward = rnn.DropoutWrapper( 82 | cell=lstm_forward, 83 | input_keep_prob=self.input_keep_prob_p, 84 | output_keep_prob=self.output_keep_prob_p 85 | ) 86 | 87 | #backward part 88 | lstm_backward1 = rnn.BasicLSTMCell(num_units=self.hidden_units_num) 89 | # 加attention 90 | lstm_backward1 = rnn.AttentionCellWrapper(cell=lstm_backward1, attn_length=5) 91 | 92 | lstm_backward2 = rnn.BasicLSTMCell(num_units=self.hidden_units_num) 93 | # 加attention 94 | lstm_backward2 = rnn.AttentionCellWrapper(cell=lstm_backward2, attn_length=5) 95 | 96 | lstm_backward = 
rnn.MultiRNNCell(cells=[lstm_backward1, lstm_backward2]) 97 | #drop out 98 | lstm_backward = rnn.DropoutWrapper( 99 | cell=lstm_backward, 100 | input_keep_prob=self.input_keep_prob_p, 101 | output_keep_prob=self.output_keep_prob_p 102 | ) 103 | 104 | outputs, states = tf.nn.bidirectional_dynamic_rnn( 105 | cell_fw=lstm_forward, 106 | cell_bw=lstm_backward, 107 | inputs=inputs, 108 | sequence_length=seq_length, 109 | dtype=tf.float32, 110 | scope=decoder_scope_name 111 | ) 112 | outputs_forward = outputs[0] # shape of h is [batch_size, max_time, cell_fw.output_size] 113 | outputs_backward = outputs[1] # shape of h is [batch_size, max_time, cell_bw.output_size] 114 | # concat final outputs [batch_size, max_time, cell_fw.output_size*2] 115 | final_outputs = tf.concat(values=[outputs_forward, outputs_backward], axis=2) 116 | #shape of h: [batch * time_steps, hidden_units * 2] 117 | h = tf.reshape(tensor=final_outputs, shape=(-1, self.hidden_units_num * 2)) 118 | 119 | # 全连接dropout 120 | h = tf.nn.dropout(x=h, keep_prob=self.keep_prob_p) 121 | 122 | # fully connect layer(projection) 123 | weight=tf.get_variable( 124 | name="Weight", 125 | shape=(self.hidden_units_num * 2, self.class_num), 126 | dtype=tf.float32, 127 | initializer=tf.contrib.layers.xavier_initializer() 128 | ) 129 | bias=tf.get_variable( 130 | name="Bias", 131 | shape=(self.class_num,), 132 | dtype=tf.float32, 133 | initializer=tf.contrib.layers.xavier_initializer() 134 | ) 135 | # logits:[batch_size*max_time, 2] 136 | #logits =tf.nn.elu(features=tf.matmul(h, weight) + bias) 137 | logits= tf.matmul(h, weight) + bias 138 | 139 | # logits in an normal way:[batch_size,max_time_stpes,2] 140 | logits_normal = tf.reshape( 141 | tensor=logits, 142 | shape=(-1, self.max_sentence_size, self.class_num), 143 | name="logits_normal" 144 | ) 145 | # logits_pw_masked [seq_len1+seq_len2+..+seq_lenn, 2] 146 | logits_masked = tf.boolean_mask( 147 | tensor=logits_normal, 148 | mask=self.mask, 149 | name="logits_masked" 150 | ) 151 | #print("logits_masked.shape", logits_masked.shape) 152 | 153 | # softmax 154 | prob_masked = tf.nn.softmax(logits=logits_masked, axis=-1, name="prob_pw_masked") 155 | #print("prob_masked.shape", prob_masked.shape) 156 | 157 | # prediction 158 | # pred:[batch_size*max_time,] 159 | pred = tf.cast(tf.argmax(logits, 1), tf.int32, name="pred") 160 | # pred in an normal way,[batch_size, max_time] 161 | pred_normal = tf.reshape( 162 | tensor=pred, 163 | shape=(-1, self.max_sentence_size), 164 | name="pred_normal" 165 | ) 166 | # one-hot the pred_normal:[batch_size, max_time,class_num] 167 | pred_normal_one_hot = tf.one_hot( 168 | indices=pred_normal, 169 | depth=self.class_num, 170 | name="pred_normal_one_hot" 171 | ) 172 | # pred_masked [seq_len1+seq_len2+....+,] 173 | pred_masked = tf.boolean_mask( 174 | tensor=pred_normal, 175 | mask=self.mask, 176 | name="pred_masked" 177 | ) 178 | 179 | # loss 180 | loss = tf.losses.sparse_softmax_cross_entropy( 181 | labels=y_masked, 182 | logits=logits_masked 183 | ) + tf.contrib.layers.l2_regularizer(self.lambda_pw)(weight) 184 | 185 | return loss,prob_masked,pred,pred_masked,pred_normal_one_hot 186 | 187 | 188 | # forward process and training process 189 | def fit(self, X_train, y_train, len_train, pos_train, length_train, position_train, 190 | X_valid, y_valid, len_valid, pos_valid, length_valid, position_valid, 191 | X_test, y_test, len_test, pos_test, length_test, position_test, name, print_log=True): 192 | # handle data 193 | y_train_pw = y_train[0] 194 | y_train_pph = y_train[1] 
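        # Expected input shapes (an assumption inferred from the placeholders defined below,
        # since util.loadData() itself is not shown in this listing):
        #   X_train / pos_train / length_train / position_train : [num_samples, MAX_SENTENCE_SIZE] int ids, zero-padded
        #   y_train_pw / y_train_pph                             : [num_samples, MAX_SENTENCE_SIZE] labels (0 = 'n', 1 = 'b')
        #   len_train                                            : [num_samples] true (unpadded) sentence lengths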
195 | # y_train_iph = y_train[2] 196 | 197 | y_valid_pw = y_valid[0] 198 | y_valid_pph = y_valid[1] 199 | # y_valid_iph = y_valid[2] 200 | 201 | y_test_pw = y_test[0] 202 | y_test_pph = y_test[1] 203 | # y_valid_iph = y_valid[2] 204 | 205 | 206 | # ------------------------------------------define graph---------------------------------------------# 207 | with self.graph.as_default(): 208 | #***********************Dataset API**************************** 209 | # create dataset_train object 210 | dataset_train = tf.data.Dataset.from_tensor_slices( 211 | tensors=(X_train, y_train_pw, y_train_pph, len_train, pos_train, length_train, position_train) 212 | ).repeat().batch(batch_size=self.batch_size).shuffle(buffer_size=2) 213 | 214 | # create iterator_train object 215 | iterator_train = dataset_train.make_one_shot_iterator() 216 | 217 | # get batch 218 | batch_train = iterator_train.get_next() 219 | #print("batch_train:", batch_train) 220 | 221 | # dataset_valid= 222 | # dataset_test= 223 | #*************************************************************** 224 | 225 | #****************** data place holder*************************** 226 | self.X_p = tf.placeholder(dtype=tf.int32,shape=(None, self.max_sentence_size),name="input_p") 227 | self.y_p_pw = tf.placeholder(dtype=tf.int32,shape=(None, self.max_sentence_size),name="label_p_pw") 228 | self.y_p_pph = tf.placeholder(dtype=tf.int32,shape=(None, self.max_sentence_size),name="label_p_pph") 229 | #self.y_p_iph = tf.placeholder(dtype=tf.int32,shape=(None, self.max_sentence_size),name="label_p_iph") 230 | 231 | # 相应序列的长度占位 232 | self.seq_len_p = tf.placeholder(dtype=tf.int32, shape=(None,), name="seq_len") 233 | 234 | # 用来去掉padding的mask 235 | self.mask = tf.sequence_mask(lengths=self.seq_len_p,maxlen=self.max_sentence_size,name="mask") 236 | 237 | # 去掉padding之后的labels,shape[seq_len1+seq_len2+....+,] 238 | y_p_pw_masked = tf.boolean_mask(tensor=self.y_p_pw,mask=self.mask,name="y_p_pw_masked") 239 | y_p_pph_masked = tf.boolean_mask(tensor=self.y_p_pph,mask=self.mask,name="y_p_pph_masked") 240 | # y_p_iph_masked = tf.boolean_mask(tensor=self.y_p_iph,mask=self.mask,name="y_p_iph_masked") 241 | 242 | # pos info placeholder 243 | self.pos_p = tf.placeholder(dtype=tf.int32,shape=(None, self.max_sentence_size),name="pos_p") 244 | self.pos_one_hot = tf.one_hot(indices=self.pos_p, depth=self.pos_num, name="pos_one_hot") 245 | #print("shape of pos_one_hot:", self.pos_one_hot.shape) 246 | 247 | # length info placeholder 248 | self.length_p = tf.placeholder(dtype=tf.int32,shape=(None, self.max_sentence_size),name="length_p") 249 | self.length_one_hot = tf.one_hot(indices=self.length_p, depth=self.length_num, name="pos_one_hot") 250 | #print("shape of length_one_hot:", self.length_one_hot.shape) 251 | 252 | # position info placeholder 253 | self.position_p = tf.placeholder(dtype=tf.int32,shape=(None, self.max_sentence_size),name="position_p") 254 | self.position_one_hot = tf.one_hot(indices=self.position_p, depth=self.max_sentence_size,name="pos_one_hot") 255 | #print("shape of position_one_hot:", self.position_one_hot.shape) 256 | 257 | # dropout 占位 258 | self.keep_prob_p = tf.placeholder(dtype=tf.float32, shape=[], name="keep_prob_p") 259 | self.input_keep_prob_p = tf.placeholder(dtype=tf.float32, shape=[], name="input_keep_prob_p") 260 | self.output_keep_prob_p = tf.placeholder(dtype=tf.float32, shape=[], name="output_keep_prob_p") 261 | 262 | # word embeddings 263 | self.word_embeddings = tf.Variable( 264 | 
initial_value=util.readEmbeddings(file="../data/embeddings/word_vec.txt"), 265 | trainable=False, 266 | name="word_embeddings" 267 | ) 268 | print("wordembedding.shape", self.word_embeddings.shape) 269 | 270 | # -------------------------------------PW----------------------------------------------------- 271 | # embeded inputs:[batch_size,MAX_TIME_STPES,embedding_size] 272 | inputs_pw = tf.nn.embedding_lookup(params=self.word_embeddings, ids=self.X_p, name="embeded_input_pw") 273 | print("shape of inputs_pw:", inputs_pw.shape) 274 | inputs_pw = tf.concat( 275 | values=[inputs_pw, self.pos_one_hot, self.length_one_hot, self.position_one_hot], 276 | axis=2, 277 | name="input_pw" 278 | ) 279 | print("shape of cancated inputs_pw:", inputs_pw.shape) 280 | self.loss_pw,prob_pw_masked,pred_pw,pred_pw_masked,pred_normal_one_hot_pw=self.hierarchy( 281 | inputs=inputs_pw, 282 | y_masked=y_p_pw_masked, 283 | seq_length=self.seq_len_p, 284 | scope_name="pw" 285 | ) 286 | 287 | # ----------------------------------PPH-------------------------------------------------- 288 | # embeded inputs:[batch_size,MAX_TIME_STPES,embedding_size] 289 | inputs_pph = tf.nn.embedding_lookup(params=self.word_embeddings, ids=self.X_p, name="embeded_input_pph") 290 | print("input_pph.shape", inputs_pph.shape) 291 | # concat all information 292 | inputs_pph = tf.concat( 293 | values=[inputs_pph, self.pos_one_hot, self.length_one_hot, self.position_one_hot,pred_normal_one_hot_pw], 294 | axis=2, 295 | name="inputs_pph" 296 | ) 297 | print("shape of input_pph:", inputs_pph.shape) 298 | 299 | self.loss_pph, prob_pph_masked,pred_pph, pred_pph_masked, pred_normal_one_hot_pph = self.hierarchy( 300 | inputs=inputs_pph, 301 | y_masked=y_p_pph_masked, 302 | seq_length=self.seq_len_p, 303 | scope_name="pph" 304 | ) 305 | 306 | # adjust learning rate 307 | global_step = tf.Variable(initial_value=1, trainable=False) 308 | start_learning_rate = self.learning_rate 309 | learning_rate = tf.train.exponential_decay( 310 | learning_rate=start_learning_rate, 311 | global_step=global_step, 312 | decay_steps=(X_train.shape[0] // self.batch_size) + 1, 313 | decay_rate=self.decay_rate, 314 | staircase=True, 315 | name="decay_learning_rate" 316 | ) 317 | 318 | # loss 319 | self.loss = self.loss_pw + self.loss_pph 320 | 321 | # optimizer 322 | self.optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(self.loss,global_step=global_step) 323 | self.init_op = tf.global_variables_initializer() 324 | self.init_local_op = tf.local_variables_initializer() 325 | 326 | # --------------------------------------------Session------------------------------------------------- 327 | with self.session as sess: 328 | print("Training Start") 329 | sess.run(self.init_op) # initialize all variables 330 | sess.run(self.init_local_op) 331 | 332 | train_Size = X_train.shape[0]; 333 | validation_Size = X_valid.shape[0] 334 | test_Size = X_test.shape[0] 335 | 336 | self.best_validation_loss = 1000 # best validation accuracy in training process 337 | # store result 338 | if not os.path.exists("../result/bilstm/"): 339 | os.mkdir("../result/bilstm/") 340 | 341 | # epoch 342 | for epoch in range(1, self.max_epoch + 1): 343 | print("Epoch:", epoch) 344 | start_time = time.time() # time evaluation 345 | # training loss/accuracy in every mini-batch 346 | self.train_losses = [] 347 | self.train_accus_pw = [] 348 | self.train_accus_pph = [] 349 | # self.train_accus_iph = [] 350 | 351 | self.c1_f_pw = []; 352 | self.c2_f_pw = [] # each class's f1 score 353 | 
self.c1_f_pph = []; 354 | self.c2_f_pph = [] 355 | # self.c1_f_iph = []; 356 | # self.c2_f_iph = [] 357 | lrs = [] 358 | 359 | # mini batch 360 | for i in range(0, (train_Size // self.batch_size)): 361 | elements=sess.run(batch_train) 362 | # 注意:这里获取的都是mask之后的值 363 | _, train_loss, lr,y_train_pw_masked, y_train_pph_masked, \ 364 | train_pred_pw, train_pred_pph, \ 365 | train_prob_pw_masked, train_prob_pph_masked = sess.run( 366 | fetches=[self.optimizer, self.loss,learning_rate,y_p_pw_masked, y_p_pph_masked, 367 | pred_pw_masked, pred_pph_masked, prob_pw_masked, prob_pph_masked ], 368 | feed_dict={ 369 | self.X_p: elements[0], 370 | self.y_p_pw: elements[1], 371 | self.y_p_pph: elements[2], 372 | self.seq_len_p: elements[3], 373 | self.pos_p: elements[4], 374 | self.length_p: elements[5], 375 | self.position_p: elements[6], 376 | self.keep_prob_p: self.keep_prob, 377 | self.input_keep_prob_p: self.input_keep_prob, 378 | self.output_keep_prob_p: self.output_keep_prob 379 | } 380 | ) 381 | 382 | # write the prob to files 383 | util.writeProb( 384 | prob_pw=train_prob_pw_masked, 385 | prob_pph=train_prob_pph_masked, 386 | outFile="../result/bilstm/bilstm_prob_train_epoch" + str(epoch) + ".txt" 387 | ) 388 | 389 | lrs.append(lr) 390 | # loss 391 | self.train_losses.append(train_loss) 392 | # metrics 393 | accuracy_pw, f1_pw = util.eval(y_true=y_train_pw_masked, y_pred=train_pred_pw) # pw 394 | accuracy_pph, f1_pph = util.eval(y_true=y_train_pph_masked, y_pred=train_pred_pph) # pph 395 | # accuracy_iph, f1_1_iph, f1_2_iph = util.eval(y_true=y_train_iph_masked,y_pred=train_pred_iph) # iph 396 | 397 | self.train_accus_pw.append(accuracy_pw) 398 | self.train_accus_pph.append(accuracy_pph) 399 | # self.train_accus_iph.append(accuracy_iph) 400 | # F1-score 401 | self.c1_f_pw.append(f1_pw[0]); 402 | self.c2_f_pw.append(f1_pw[1]) 403 | self.c1_f_pph.append(f1_pph[0]); 404 | self.c2_f_pph.append(f1_pph[1]) 405 | # self.c1_f_iph.append(f1_1_iph); 406 | # self.c2_f_iph.append(f1_2_iph) 407 | 408 | # ----------------------------------validation in every epoch---------------------------------- 409 | self.valid_loss, y_valid_pw_masked, y_valid_pph_masked, \ 410 | valid_pred_pw_masked, valid_pred_pph_masked, valid_pred_pw, valid_pred_pph, \ 411 | valid_prob_pw_masked, valid_prob_pph_masked = sess.run( 412 | fetches=[self.loss, y_p_pw_masked, y_p_pph_masked, 413 | pred_pw_masked, pred_pph_masked, pred_pw, pred_pph, 414 | prob_pw_masked, prob_pph_masked 415 | ], 416 | feed_dict={ 417 | self.X_p: X_valid, 418 | self.y_p_pw: y_valid_pw, 419 | self.y_p_pph: y_valid_pph, 420 | self.seq_len_p: len_valid, 421 | self.pos_p: pos_valid, 422 | self.length_p: length_valid, 423 | self.position_p: position_valid, 424 | self.keep_prob_p: 1.0, 425 | self.input_keep_prob_p: 1.0, 426 | self.output_keep_prob_p: 1.0 427 | } 428 | ) 429 | #write the prob to files 430 | util.writeProb( 431 | prob_pw=valid_prob_pw_masked, 432 | prob_pph=valid_prob_pph_masked, 433 | outFile="../result/bilstm/bilstm_prob_valid_epoch" + str(epoch) + ".txt" 434 | ) 435 | 436 | # metrics 437 | self.valid_accuracy_pw, self.valid_f1_pw = util.eval( 438 | y_true=y_valid_pw_masked, 439 | y_pred=valid_pred_pw_masked 440 | ) 441 | self.valid_accuracy_pph, self.valid_f1_pph = util.eval( 442 | y_true=y_valid_pph_masked, 443 | y_pred=valid_pred_pph_masked 444 | ) 445 | # recover to original corpus txt 446 | # shape of valid_pred_pw,valid_pred_pw,valid_pred_pw:[corpus_size*time_stpes] 447 | util.recover2( 448 | X=X_valid, 449 | preds_pw=valid_pred_pw, 450 | 
preds_pph=valid_pred_pph, 451 | filename="../result/bilstm/valid_recover_epoch_" + str(epoch) + ".txt" 452 | ) 453 | # ---------------------------------------------------------------------------------------- 454 | 455 | # ----------------------------------test in every epoch---------------------------------- 456 | self.test_loss, y_test_pw_masked, y_test_pph_masked, \ 457 | test_pred_pw_masked, test_pred_pph_masked, test_pred_pw, test_pred_pph, \ 458 | test_prob_pw_masked, test_prob_pph_masked = sess.run( 459 | fetches=[self.loss, y_p_pw_masked, y_p_pph_masked, 460 | pred_pw_masked, pred_pph_masked, pred_pw, pred_pph, 461 | prob_pw_masked, prob_pph_masked 462 | ], 463 | feed_dict={ 464 | self.X_p: X_test, 465 | self.y_p_pw: y_test_pw, 466 | self.y_p_pph: y_test_pph, 467 | self.seq_len_p: len_test, 468 | self.pos_p: pos_test, 469 | self.length_p: length_test, 470 | self.position_p: position_test, 471 | self.keep_prob_p: 1.0, 472 | self.input_keep_prob_p: 1.0, 473 | self.output_keep_prob_p: 1.0 474 | } 475 | ) 476 | # write the prob to files 477 | util.writeProb( 478 | prob_pw=test_prob_pw_masked, 479 | prob_pph=test_prob_pph_masked, 480 | outFile="../result/bilstm/bilstm_prob_test_epoch" + str(epoch) + ".txt" 481 | ) 482 | 483 | # metrics 484 | self.test_accuracy_pw, self.test_f1_pw = util.eval( 485 | y_true=y_test_pw_masked, 486 | y_pred=test_pred_pw_masked 487 | ) 488 | self.test_accuracy_pph, self.test_f1_pph = util.eval( 489 | y_true=y_test_pph_masked, 490 | y_pred=test_pred_pph_masked 491 | ) 492 | # recover to original corpus txt 493 | # shape of test_pred_pw,test_pred_pw,test_pred_pw:[corpus_size*time_stpes] 494 | util.recover2( 495 | X=X_test, 496 | preds_pw=test_pred_pw, 497 | preds_pph=test_pred_pph, 498 | filename="../result/bilstm/test_recover_epoch_" + str(epoch) + ".txt" 499 | ) 500 | # ----------------------------------------------------------------------------------- 501 | 502 | # self.valid_accuracy_iph, self.valid_f1_1_iph, self.valid_f1_2_iph = util.eval(y_true=y_valid_iph_masked,y_pred=valid_pred_iph) 503 | 504 | # show information 505 | print("Epoch ", epoch, " finished.", "spend ", round((time.time() - start_time) / 60, 2), " mins") 506 | print("learning rate:", sum(lrs) / len(lrs)) 507 | self.showInfo(type="training") 508 | self.showInfo(type="validation") 509 | self.showInfo(type="test") 510 | 511 | # when we get a new best validation accuracy,we store the model 512 | if self.best_validation_loss < self.valid_loss: 513 | self.best_validation_loss = self.valid_loss 514 | print("New Best loss ", self.best_validation_loss, " On Validation set! ") 515 | print("Saving Models......\n\n") 516 | # exist ./models folder? 517 | if not os.path.exists("./models/"): 518 | os.mkdir(path="./models/") 519 | if not os.path.exists("./models/" + name): 520 | os.mkdir(path="./models/" + name) 521 | if not os.path.exists("./models/" + name + "/bilstm"): 522 | os.mkdir(path="./models/" + name + "/bilstm") 523 | # create saver 524 | saver = tf.train.Saver() 525 | saver.save(sess, "./models/" + name + "/bilstm/my-model-10000") 526 | # Generates MetaGraphDef. 
527 | saver.export_meta_graph("./models/" + name + "/bilstm/my-model-10000.meta") 528 | print("\n\n") 529 | 530 | 531 | # 返回预测的结果或者准确率,y not None的时候返回准确率,y ==None的时候返回预测值 532 | def pred(self, name, X, y=None, ): 533 | start_time = time.time() # compute time 534 | if y is None: 535 | with self.session as sess: 536 | # restore model 537 | new_saver = tf.train.import_meta_graph( 538 | meta_graph_or_file="./models/" + name + "/bilstm/my-model-10000.meta", 539 | clear_devices=True 540 | ) 541 | new_saver.restore(sess, "./models/" + name + "/bilstm/my-model-10000") 542 | # get default graph 543 | graph = tf.get_default_graph() 544 | # get opration from the graph 545 | pred_normal = graph.get_operation_by_name("pred_normal").outputs[0] 546 | X_p = graph.get_operation_by_name("input_placeholder").outputs[0] 547 | pred = sess.run(fetches=pred_normal, feed_dict={X_p: X}) 548 | print("this operation spends ", round((time.time() - start_time) / 60, 2), " mins") 549 | return pred 550 | else: 551 | with self.session as sess: 552 | # restore model 553 | new_saver = tf.train.import_meta_graph( 554 | meta_graph_or_file="./models/" + name + "/bilstm/my-model-10000.meta", 555 | clear_devices=True 556 | ) 557 | new_saver.restore(sess, "./models/" + name + "/bilstm/my-model-10000") 558 | graph = tf.get_default_graph() 559 | # get opration from the graph 560 | accuracy = graph.get_operation_by_name("accuracy").outputs[0] 561 | X_p = graph.get_operation_by_name("input_placeholder").outputs[0] 562 | y_p = graph.get_operation_by_name("label_placeholder").outputs[0] 563 | # forward and get the results 564 | accu = sess.run(fetches=accuracy, feed_dict={X_p: X, y_p: y}) 565 | print("this operation spends ", round((time.time() - start_time) / 60, 2), " mins") 566 | return accu 567 | 568 | def showInfo(self, type): 569 | if type == "training": 570 | # training information 571 | print(" /**Training info**/") 572 | print("----avarage training loss:", sum(self.train_losses) / len(self.train_losses)) 573 | print("PW:") 574 | print("----avarage accuracy:", sum(self.train_accus_pw) / len(self.train_accus_pw)) 575 | # print("----avarage f1-Score of N:", sum(self.c1_f_pw) / len(self.c1_f_pw)) 576 | print("----avarage f1-Score of B:", sum(self.c2_f_pw) / len(self.c2_f_pw)) 577 | print("PPH:") 578 | print("----avarage accuracy :", sum(self.train_accus_pph) / len(self.train_accus_pph)) 579 | # print("----avarage f1-Score of N:", sum(self.c1_f_pph) / len(self.c1_f_pph)) 580 | print("----avarage f1-Score of B:", sum(self.c2_f_pph) / len(self.c2_f_pph)) 581 | # print("IPH:") 582 | # print("----avarage accuracy:", sum(self.train_accus_iph) / len(self.train_accus_iph)) 583 | # print("----avarage f1-Score of N:", sum(self.c1_f_iph) / len(self.c1_f_iph)) 584 | # print("----avarage f1-Score of B:", sum(self.c2_f_iph) / len(self.c2_f_iph)) 585 | elif type == "validation": 586 | print(" /**Validation info**/") 587 | print("----avarage validation loss:", self.valid_loss) 588 | print("PW:") 589 | print("----avarage accuracy:", self.valid_accuracy_pw) 590 | # print("----avarage f1-Score of N:", self.valid_f1_pw[0]) 591 | print("----avarage f1-Score of B:", self.valid_f1_pw[1]) 592 | print("PPH:") 593 | print("----avarage accuracy :", self.valid_accuracy_pph) 594 | # print("----avarage f1-Score of N:", self.valid_f1_pph[0]) 595 | print("----avarage f1-Score of B:", self.valid_f1_pph[1]) 596 | # print("IPH:") 597 | # print("----avarage accuracy:", self.valid_accuracy_iph) 598 | # print("----avarage f1-Score of N:", self.valid_f1_1_iph) 599 | # 
print("----avarage f1-Score of B:", self.valid_f1_2_iph) 600 | else: 601 | print(" /**testation info**/") 602 | print("----avarage test loss:", self.test_loss) 603 | print("PW:") 604 | print("----avarage accuracy:", self.test_accuracy_pw) 605 | # print("----avarage f1-Score of N:", self.test_f1_pw[0]) 606 | print("----avarage f1-Score of B:", self.test_f1_pw[1]) 607 | print("PPH:") 608 | print("----avarage accuracy :", self.test_accuracy_pph) 609 | # print("----avarage f1-Score of N:", self.test_f1_pph[0]) 610 | print("----avarage f1-Score of B:", self.test_f1_pph[1]) 611 | # print("IPH:") 612 | # print("----avarage accuracy:", self.test_accuracy_iph) 613 | # print("----avarage f1-Score of N:", self.test_f1_1_iph) 614 | # print("----avarage f1-Score of B:", self.test_f1_2_iph) 615 | 616 | 617 | # train && test 618 | if __name__ == "__main__": 619 | # 读数据 620 | print("Loading Data...") 621 | X_train, y_train, len_train, pos_train, length_train, position_train, \ 622 | X_valid, y_valid, len_valid, pos_valid, length_valid, position_valid, \ 623 | X_test, y_test, len_test, pos_test, length_test, position_test=util.loadData() 624 | 625 | # print("Run Model...\n\n\n") 626 | model = BiLSTM() 627 | model.fit( 628 | X_train, y_train, len_train, pos_train, length_train, position_train, 629 | X_valid, y_valid, len_valid, pos_valid, length_valid, position_valid, 630 | X_test, y_test, len_test, pos_test, length_test, position_test, "test", False) -------------------------------------------------------------------------------- /models/bilstm_cwe.py: -------------------------------------------------------------------------------- 1 | ''' 2 | BILSTM+CBOW 3 | ''' 4 | 5 | import sys 6 | sys.path.append("..") 7 | import numpy as np 8 | import pandas as pd 9 | import tensorflow as tf 10 | import tensorflow.contrib.rnn as rnn 11 | import time 12 | import os 13 | import parameter 14 | import util 15 | 16 | class BiLSTM_CWE(): 17 | def __init__(self): 18 | # basic environment 19 | self.graph = tf.Graph() 20 | self.session = tf.Session(graph=self.graph) 21 | 22 | # basic parameters 23 | self.learning_rate = parameter.LEARNING_RATE 24 | self.max_epoch = parameter.MAX_EPOCH 25 | 26 | self.class_num = parameter.CLASS_NUM 27 | self.pos_num=parameter.POS_NUM 28 | self.length_num=parameter.LENGTH_NUM 29 | self.hidden_units_num = parameter.HIDDEN_UNITS_NUM 30 | self.hidden_units_num2 = parameter.HIDDEN_UNITS_NUM2 31 | self.layer_num = parameter.LAYER_NUM 32 | self.max_sentence_size = parameter.MAX_SENTENCE_SIZE 33 | 34 | #self.vocab_size = parameter.VOCAB_SIZE 35 | self.word_vocab_size=parameter.WORD_VOCAB_SIZE 36 | self.char_embedding_size = parameter.CHAR_EMBEDDING_SIZE 37 | self.word_embedding_size=parameter.WORD_EMBEDDING_SIZE 38 | 39 | self.batch_size = parameter.BATCH_SIZE 40 | self.lambda_pw=parameter.LAMBDA_PW 41 | self.lambda_pph=parameter.LAMBDA_PPH 42 | self.lambda_iph=parameter.LAMBDA_IPH 43 | 44 | self.keep_prob = parameter.KEEP_PROB 45 | self.input_keep_prob=parameter.INPUT_KEEP_PROB 46 | self.output_keep_prob=parameter.OUTPUT_KEEP_PROB 47 | 48 | self.decay_rate=parameter.DECAY 49 | 50 | 51 | # forward process and training process 52 | def fit(self, X_train, y_train, len_train,pos_train,length_train,position_train, 53 | X_validation, y_validation, len_validation, pos_validation,length_validation,position_validation, 54 | name, print_log=True): 55 | # ---------------------------------------forward computation--------------------------------------------# 56 | y_train_pw = y_train[0] 57 | y_train_pph = 
y_train[1] 58 | #y_train_iph = y_train[2] 59 | 60 | y_validation_pw = y_validation[0] 61 | y_validation_pph = y_validation[1] 62 | #y_validation_iph = y_validation[2] 63 | # ---------------------------------------define graph---------------------------------------------# 64 | with self.graph.as_default(): 65 | # data place holder 66 | self.X_p = tf.placeholder( 67 | dtype=tf.int32, 68 | shape=(None, self.max_sentence_size), 69 | name="input_placeholder" 70 | ) 71 | 72 | # pos info placeholder 73 | self.pos_p = tf.placeholder( 74 | dtype=tf.int32, 75 | shape=(None, self.max_sentence_size), 76 | name="pos_placeholder" 77 | ) 78 | 79 | # length info placeholder 80 | self.length_p = tf.placeholder( 81 | dtype=tf.int32, 82 | shape=(None, self.max_sentence_size), 83 | name="length_placeholder" 84 | ) 85 | 86 | # position info placeholder 87 | self.position_p = tf.placeholder( 88 | dtype=tf.int32, 89 | shape=(None, self.max_sentence_size), 90 | name="length_placeholder" 91 | ) 92 | 93 | self.y_p_pw = tf.placeholder( 94 | dtype=tf.int32, 95 | shape=(None, self.max_sentence_size), 96 | name="label_placeholder_pw" 97 | ) 98 | self.y_p_pph = tf.placeholder( 99 | dtype=tf.int32, 100 | shape=(None, self.max_sentence_size), 101 | name="label_placeholder_pph" 102 | ) 103 | 104 | #self.y_p_iph = tf.placeholder( 105 | # dtype=tf.int32, 106 | # shape=(None, self.max_sentence_size), 107 | # name="label_placeholder_iph" 108 | #) 109 | # dropout 占位 110 | self.keep_prob_p = tf.placeholder(dtype=tf.float32, shape=[], name="keep_prob_p") 111 | self.input_keep_prob_p = tf.placeholder(dtype=tf.float32, shape=[], name="input_keep_prob_p") 112 | self.output_keep_prob_p=tf.placeholder(dtype=tf.float32, shape=[], name="output_keep_prob_p") 113 | 114 | # 相应序列的长度占位 115 | self.seq_len_p = tf.placeholder( 116 | dtype=tf.int32, 117 | shape=(None,), 118 | name="seq_len" 119 | ) 120 | 121 | #用来去掉padding的mask 122 | self.mask = tf.sequence_mask( 123 | lengths=self.seq_len_p, 124 | maxlen=self.max_sentence_size, 125 | name="mask" 126 | ) 127 | 128 | #去掉padding之后的labels 129 | y_p_pw_masked = tf.boolean_mask( #shape[seq_len1+seq_len2+....+,] 130 | tensor=self.y_p_pw, 131 | mask=self.mask, 132 | name="y_p_pw_masked" 133 | ) 134 | 135 | y_p_pph_masked = tf.boolean_mask( # shape[seq_len1+seq_len2+....+,] 136 | tensor=self.y_p_pph, 137 | mask=self.mask, 138 | name="y_p_pph_masked" 139 | ) 140 | 141 | #y_p_iph_masked = tf.boolean_mask( # shape[seq_len1+seq_len2+....+,] 142 | # tensor=self.y_p_iph, 143 | # mask=self.mask, 144 | # name="y_p_iph_masked" 145 | #) 146 | 147 | # embeddings 148 | #self.embeddings = tf.Variable( 149 | # initial_value=tf.zeros(shape=(self.vocab_size, self.embedding_size), dtype=tf.float32), 150 | # name="embeddings" 151 | #) 152 | 153 | self.word_embeddings=tf.Variable( 154 | initial_value=util.getCWE( 155 | word_embed_file="../data/embeddings/word_vec.txt", 156 | char_embed_file="../data/embeddings/char_vec.txt" 157 | ), 158 | name="word_embeddings" 159 | ) 160 | 161 | print("word_embeddings.shape",self.word_embeddings.shape) 162 | 163 | # pos one-hot 164 | self.pos_one_hot = tf.one_hot( 165 | indices=self.pos_p, 166 | depth=self.pos_num, 167 | name="pos_one_hot" 168 | ) 169 | print("shape of pos_one_hot:", self.pos_one_hot.shape) 170 | 171 | # length one-hot 172 | self.length_one_hot = tf.one_hot( 173 | indices=self.length_p, 174 | depth=self.length_num, 175 | name="pos_one_hot" 176 | ) 177 | print("shape of length_one_hot:", self.length_one_hot.shape) 178 | 179 | # position one-hot 180 | 
self.position_one_hot = tf.one_hot( 181 | indices=self.position_p, 182 | depth=self.max_sentence_size, 183 | name="pos_one_hot" 184 | ) 185 | print("shape of position_one_hot:", self.position_one_hot.shape) 186 | 187 | # -------------------------------------PW----------------------------------------------------- 188 | # embeded inputs:[batch_size,MAX_TIME_STPES,embedding_size] 189 | inputs_pw = tf.nn.embedding_lookup(params=self.word_embeddings, ids=self.X_p, name="embeded_input_pw") 190 | print("shape of inputs_pw:",inputs_pw.shape) 191 | #concat all information 192 | inputs_pw = tf.concat( 193 | values=[inputs_pw, self.pos_one_hot, self.length_one_hot, self.position_one_hot], 194 | axis=2, 195 | name="input_pw" 196 | ) 197 | print("shape of cancated inputs_pw:", inputs_pw.shape) 198 | 199 | # forward part 200 | en_lstm_forward1_pw = rnn.BasicLSTMCell(num_units=self.hidden_units_num) 201 | en_lstm_forward2_pw=rnn.BasicLSTMCell(num_units=self.hidden_units_num2) 202 | en_lstm_forward_pw=rnn.MultiRNNCell(cells=[en_lstm_forward1_pw,en_lstm_forward2_pw]) 203 | #dropout 204 | en_lstm_forward_pw=rnn.DropoutWrapper( 205 | cell=en_lstm_forward_pw, 206 | input_keep_prob=self.input_keep_prob_p, 207 | output_keep_prob=self.output_keep_prob_p 208 | ) 209 | 210 | # backward part 211 | en_lstm_backward1_pw = rnn.BasicLSTMCell(num_units=self.hidden_units_num) 212 | en_lstm_backward2_pw=rnn.BasicLSTMCell(num_units=self.hidden_units_num2) 213 | en_lstm_backward_pw=rnn.MultiRNNCell(cells=[en_lstm_backward1_pw,en_lstm_backward2_pw]) 214 | #dropout 215 | en_lstm_backward_pw=rnn.DropoutWrapper( 216 | cell=en_lstm_backward_pw, 217 | input_keep_prob=self.input_keep_prob_p, 218 | output_keep_prob=self.output_keep_prob_p 219 | ) 220 | 221 | outputs, states = tf.nn.bidirectional_dynamic_rnn( 222 | cell_fw=en_lstm_forward_pw, 223 | cell_bw=en_lstm_backward_pw, 224 | inputs=inputs_pw, 225 | sequence_length=self.seq_len_p, 226 | dtype=tf.float32, 227 | scope="pw" 228 | ) 229 | 230 | outputs_forward_pw = outputs[0] # shape [batch_size, max_time, cell_fw.output_size] 231 | outputs_backward_pw = outputs[1] # shape [batch_size, max_time, cell_bw.output_size] 232 | # concat final outputs [batch_size, max_time, cell_fw.output_size*2] 233 | h_pw = tf.concat(values=[outputs_forward_pw, outputs_backward_pw], axis=2) 234 | h_pw=tf.reshape(tensor=h_pw,shape=(-1,self.hidden_units_num*2),name="h_pw") 235 | print("h_pw.shape",h_pw.shape) 236 | 237 | # 全连接dropout 238 | h_pw = tf.nn.dropout(x=h_pw, keep_prob=self.keep_prob_p, name="dropout_h_pw") 239 | 240 | # fully connect layer(projection) 241 | w_pw = tf.Variable( 242 | initial_value=tf.random_normal(shape=(self.hidden_units_num*2, self.class_num)), 243 | name="weights_pw" 244 | ) 245 | b_pw = tf.Variable( 246 | initial_value=tf.random_normal(shape=(self.class_num,)), 247 | name="bias_pw" 248 | ) 249 | #logits 250 | logits_pw = tf.matmul(h_pw, w_pw) + b_pw #logits_pw:[batch_size*max_time, 2] 251 | logits_normal_pw=tf.reshape( #logits in an normal way:[batch_size,max_time_stpes,2] 252 | tensor=logits_pw, 253 | shape=(-1,self.max_sentence_size,self.class_num), 254 | name="logits_normal_pw" 255 | ) 256 | logits_pw_masked = tf.boolean_mask( # logits_pw_masked [seq_len1+seq_len2+....+,3] 257 | tensor=logits_normal_pw, 258 | mask=self.mask, 259 | name="logits_pw_masked" 260 | ) 261 | 262 | # prediction 263 | pred_pw = tf.cast(tf.argmax(logits_pw, 1), tf.int32, name="pred_pw") # pred_pw:[batch_size*max_time,] 264 | pred_normal_pw = tf.reshape( # pred in an normal way,[batch_size, 
max_time] 265 | tensor=pred_pw, 266 | shape=(-1, self.max_sentence_size), 267 | name="pred_normal_pw" 268 | ) 269 | 270 | pred_pw_masked = tf.boolean_mask( # logits_pw_masked [seq_len1+seq_len2+....+,] 271 | tensor=pred_normal_pw, 272 | mask=self.mask, 273 | name="pred_pw_masked" 274 | ) 275 | 276 | pred_normal_one_hot_pw = tf.one_hot( # one-hot the pred_normal:[batch_size, max_time,class_num] 277 | indices=pred_normal_pw, 278 | depth=self.class_num, 279 | name="pred_normal_one_hot_pw" 280 | ) 281 | 282 | # loss 283 | self.loss_pw = tf.losses.sparse_softmax_cross_entropy( 284 | labels=y_p_pw_masked, 285 | logits=logits_pw_masked 286 | )+tf.contrib.layers.l2_regularizer(self.lambda_pw)(w_pw) 287 | # --------------------------------------------------------------------------------------- 288 | 289 | # ----------------------------------PPH-------------------------------------------------- 290 | # embeded inputs:[batch_size,MAX_TIME_STPES,embedding_size] 291 | inputs_pph = tf.nn.embedding_lookup(params=self.word_embeddings, ids=self.X_p, name="embeded_input_pph") 292 | print("shape of input_pph:", inputs_pph.shape) 293 | # concat all information 294 | inputs_pph = tf.concat( 295 | values=[inputs_pph, self.pos_one_hot, self.length_one_hot, self.position_one_hot, 296 | pred_normal_one_hot_pw], 297 | axis=2, 298 | name="inputs_pph" 299 | ) 300 | print("shape of input_pph:", inputs_pph.shape) 301 | 302 | # forward part 303 | en_lstm_forward1_pph = rnn.BasicLSTMCell(num_units=self.hidden_units_num) 304 | en_lstm_forward2_pph = rnn.BasicLSTMCell(num_units=self.hidden_units_num2) 305 | en_lstm_forward_pph = rnn.MultiRNNCell(cells=[en_lstm_forward1_pph, en_lstm_forward2_pph]) 306 | #dropout 307 | en_lstm_forward_pph=rnn.DropoutWrapper( 308 | cell=en_lstm_forward_pph, 309 | input_keep_prob=self.input_keep_prob_p, 310 | output_keep_prob=self.output_keep_prob_p 311 | ) 312 | 313 | # backward part 314 | en_lstm_backward1_pph = rnn.BasicLSTMCell(num_units=self.hidden_units_num) 315 | en_lstm_backward2_pph = rnn.BasicLSTMCell(num_units=self.hidden_units_num2) 316 | en_lstm_backward_pph = rnn.MultiRNNCell(cells=[en_lstm_backward1_pph, en_lstm_backward2_pph]) 317 | #dropout 318 | en_lstm_backward_pph=rnn.DropoutWrapper( 319 | cell=en_lstm_backward_pph, 320 | input_keep_prob=self.input_keep_prob_p, 321 | output_keep_prob=self.output_keep_prob_p 322 | ) 323 | 324 | outputs, states = tf.nn.bidirectional_dynamic_rnn( 325 | cell_fw=en_lstm_forward_pph, 326 | cell_bw=en_lstm_backward_pph, 327 | inputs=inputs_pph, 328 | sequence_length=self.seq_len_p, 329 | dtype=tf.float32, 330 | scope="pph" 331 | ) 332 | 333 | outputs_forward_pph = outputs[0] # shape [batch_size, max_time, cell_fw.output_size] 334 | outputs_backward_pph = outputs[1] # shape [batch_size, max_time, cell_bw.output_size] 335 | # concat final outputs [batch_size, max_time, cell_fw.output_size*2] 336 | h_pph = tf.concat(values=[outputs_forward_pph, outputs_backward_pph], axis=2) 337 | h_pph = tf.reshape(tensor=h_pph, shape=(-1, self.hidden_units_num * 2), name="h_pph") 338 | 339 | # 全连接dropout 340 | h_pph = tf.nn.dropout(x=h_pph, keep_prob=self.keep_prob_p, name="dropout_h_pph") 341 | 342 | # fully connect layer(projection) 343 | w_pph = tf.Variable( 344 | initial_value=tf.random_normal(shape=(self.hidden_units_num*2, self.class_num)), 345 | name="weights_pph" 346 | ) 347 | b_pph = tf.Variable( 348 | initial_value=tf.random_normal(shape=(self.class_num,)), 349 | name="bias_pph" 350 | ) 351 | # logits 352 | logits_pph = tf.matmul(h_pph, w_pph) + b_pph # 
shape of logits:[batch_size*max_time, 2] 353 | logits_normal_pph = tf.reshape( # logits in an normal way:[batch_size,max_time_stpes,2] 354 | tensor=logits_pph, 355 | shape=(-1, self.max_sentence_size, self.class_num), 356 | name="logits_normal_pph" 357 | ) 358 | logits_pph_masked = tf.boolean_mask( # [seq_len1+seq_len2+....+,3] 359 | tensor=logits_normal_pph, 360 | mask=self.mask, 361 | name="logits_pph_masked" 362 | ) 363 | 364 | # prediction 365 | pred_pph = tf.cast(tf.argmax(logits_pph, 1), tf.int32, name="pred_pph") # pred_pph:[batch_size*max_time,] 366 | pred_normal_pph = tf.reshape( # pred in an normal way,[batch_size, max_time] 367 | tensor=pred_pph, 368 | shape=(-1, self.max_sentence_size), 369 | name="pred_normal_pph" 370 | ) 371 | pred_pph_masked = tf.boolean_mask( # logits_pph_masked [seq_len1+seq_len2+....+,] 372 | tensor=pred_normal_pph, 373 | mask=self.mask, 374 | name="pred_pph_masked" 375 | ) 376 | pred_normal_one_hot_pph = tf.one_hot( # one-hot the pred_normal:[batch_size, max_time,class_num] 377 | indices=pred_normal_pph, 378 | depth=self.class_num, 379 | name="pred_normal_one_hot_pph" 380 | ) 381 | 382 | # loss 383 | self.loss_pph = tf.losses.sparse_softmax_cross_entropy( 384 | labels=y_p_pph_masked, 385 | logits=logits_pph_masked 386 | )+tf.contrib.layers.l2_regularizer(self.lambda_pph)(w_pph) 387 | # ------------------------------------------------------------------------------------ 388 | 389 | ''' 390 | # ---------------------------------------IPH------------------------------------------ 391 | # embeded inputs:[batch_size,MAX_TIME_STPES,embedding_size] 392 | inputs_iph = tf.nn.embedding_lookup(params=self.embeddings, ids=self.X_p, name="embeded_input_iph") 393 | # shape of inputs[batch_size,max_time_stpes,embeddings_dims+class_num] 394 | inputs_iph = tf.concat(values=[inputs_iph, pred_normal_one_hot_pph], axis=2, name="inputs_pph") 395 | # print("shape of input_pph:", inputs_pph.shape) 396 | # encoder cells 397 | # forward part 398 | en_lstm_forward1_iph = rnn.BasicLSTMCell(num_units=self.hidden_units_num) 399 | # en_lstm_forward2=rnn.BasicLSTMCell(num_units=self.hidden_units_num2) 400 | # en_lstm_forward=rnn.MultiRNNCell(cells=[en_lstm_forward1,en_lstm_forward2]) 401 | 402 | # backward part 403 | en_lstm_backward1_iph = rnn.BasicLSTMCell(num_units=self.hidden_units_num) 404 | # en_lstm_backward2=rnn.BasicLSTMCell(num_units=self.hidden_units_num2) 405 | # en_lstm_backward=rnn.MultiRNNCell(cells=[en_lstm_backward1,en_lstm_backward2]) 406 | 407 | # decoder cells 408 | de_lstm_iph = rnn.BasicLSTMCell(num_units=self.hidden_units_num*2) 409 | 410 | # encode 411 | encoder_outputs_iph, encoder_states_iph = self.encoder( 412 | cell_forward=en_lstm_forward1_iph, 413 | cell_backward=en_lstm_backward1_iph, 414 | inputs=inputs_iph, 415 | seq_length=self.seq_len_p, 416 | scope_name="en_lstm_iph" 417 | ) 418 | # shape of h is [batch*time_steps,hidden_units*2] 419 | h_iph = self.decoder( 420 | cell=de_lstm_iph, 421 | initial_state=encoder_states_iph, 422 | inputs=encoder_outputs_iph, 423 | scope_name="de_lstm_iph" 424 | ) 425 | 426 | # fully connect layer(projection) 427 | w_iph = tf.Variable( 428 | initial_value=tf.random_normal(shape=(self.hidden_units_num*2, self.class_num)), 429 | name="weights_iph" 430 | ) 431 | b_iph = tf.Variable( 432 | initial_value=tf.random_normal(shape=(self.class_num,)), 433 | name="bias_iph" 434 | ) 435 | # logits 436 | logits_iph = tf.matmul(h_iph, w_iph) + b_iph # shape of logits:[batch_size*max_time, 3] 437 | logits_normal_iph = tf.reshape( # 
logits in an normal way:[batch_size,max_time_stpes,3] 438 | tensor=logits_iph, 439 | shape=(-1, self.max_sentence_size, 3), 440 | name="logits_normal_iph" 441 | ) 442 | logits_iph_masked = tf.boolean_mask( # [seq_len1+seq_len2+....+,3] 443 | tensor=logits_normal_iph, 444 | mask=self.mask, 445 | name="logits_iph_masked" 446 | ) 447 | 448 | # prediction 449 | pred_iph = tf.cast(tf.argmax(logits_iph, 1), tf.int32, name="pred_iph") # pred_iph:[batch_size*max_time,] 450 | pred_normal_iph = tf.reshape( # pred in an normal way,[batch_size, max_time] 451 | tensor=pred_iph, 452 | shape=(-1, self.max_sentence_size), 453 | name="pred_normal_iph" 454 | ) 455 | pred_iph_masked = tf.boolean_mask( # logits_iph_masked [seq_len1+seq_len2+....+,] 456 | tensor=pred_normal_iph, 457 | mask=self.mask, 458 | name="pred_iph_masked" 459 | ) 460 | pred_normal_one_hot_iph = tf.one_hot( # one-hot the pred_normal:[batch_size, max_time,class_num] 461 | indices=pred_normal_iph, 462 | depth=self.class_num, 463 | name="pred_normal_one_hot_iph" 464 | ) 465 | # loss 466 | self.loss_iph = tf.losses.sparse_softmax_cross_entropy( 467 | labels=y_p_iph_masked, 468 | logits=logits_iph_masked 469 | )+tf.contrib.layers.l2_regularizer(self.lambda_iph)(w_iph) 470 | 471 | # --------------------------------------------------------------------------------------- 472 | ''' 473 | # adjust learning rate 474 | global_step = tf.Variable(initial_value=1, trainable=False) 475 | start_learning_rate = self.learning_rate 476 | learning_rate = tf.train.exponential_decay( 477 | learning_rate=start_learning_rate, 478 | global_step=global_step, 479 | decay_steps=(X_train.shape[0] // self.batch_size) + 1, 480 | decay_rate=self.decay_rate, 481 | staircase=True, 482 | name="decay_learning_rate" 483 | ) 484 | 485 | # loss 486 | self.loss = self.loss_pw + self.loss_pph 487 | 488 | 489 | # optimizer 490 | self.optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(self.loss,global_step) 491 | self.init_op = tf.global_variables_initializer() 492 | self.init_local_op = tf.local_variables_initializer() 493 | 494 | # ------------------------------------Session----------------------------------------- 495 | with self.session as sess: 496 | print("Training Start") 497 | sess.run(self.init_op) # initialize all variables 498 | sess.run(self.init_local_op) 499 | 500 | train_Size = X_train.shape[0]; 501 | validation_Size = X_validation.shape[0] 502 | self.best_validation_loss = 1000 # best validation accuracy in training process 503 | 504 | # epoch 505 | for epoch in range(1, self.max_epoch + 1): 506 | print("Epoch:", epoch) 507 | start_time = time.time() # time evaluation 508 | # training loss/accuracy in every mini-batch 509 | self.train_losses = [] 510 | self.train_accus_pw = [] 511 | self.train_accus_pph = [] 512 | #self.train_accus_iph = [] 513 | 514 | self.c1_f_pw = []; 515 | self.c2_f_pw = [] # each class's f1 score 516 | self.c1_f_pph = []; 517 | self.c2_f_pph = [] 518 | #self.c1_f_iph = []; 519 | #self.c2_f_iph = [] 520 | 521 | lrs = [] 522 | 523 | # mini batch 524 | for i in range(0, (train_Size // self.batch_size)): 525 | #注意:这里获取的都是mask之后的值 526 | _, train_loss, y_train_pw_masked,y_train_pph_masked,\ 527 | train_pred_pw, train_pred_pph,lr = sess.run( 528 | fetches=[self.optimizer, self.loss, 529 | y_p_pw_masked,y_p_pph_masked, 530 | pred_pw_masked, pred_pph_masked,learning_rate], 531 | feed_dict={ 532 | self.X_p: X_train[i * self.batch_size:(i + 1) * self.batch_size], 533 | self.y_p_pw: y_train_pw[i * self.batch_size:(i + 1) * 
self.batch_size], 534 | self.y_p_pph: y_train_pph[i * self.batch_size:(i + 1) * self.batch_size], 535 | self.seq_len_p: len_train[i * self.batch_size:(i + 1) * self.batch_size], 536 | self.pos_p: pos_train[i * self.batch_size:(i + 1) * self.batch_size], 537 | self.length_p: length_train[i * self.batch_size:(i + 1) * self.batch_size], 538 | self.position_p: position_train[i * self.batch_size:(i + 1) * self.batch_size], 539 | self.keep_prob_p: self.keep_prob, 540 | self.input_keep_prob_p:self.input_keep_prob, 541 | self.output_keep_prob_p:self.output_keep_prob 542 | } 543 | ) 544 | lrs.append(lr) 545 | 546 | # loss 547 | self.train_losses.append(train_loss) 548 | 549 | # metrics 550 | accuracy_pw, f1_pw= util.eval(y_true=y_train_pw_masked,y_pred=train_pred_pw) # pw 551 | accuracy_pph, f1_pph= util.eval(y_true=y_train_pph_masked,y_pred=train_pred_pph) # pph 552 | #accuracy_iph, f1_1_iph, f1_2_iph = util.eval(y_true=y_train_iph_masked,y_pred=train_pred_iph) # iph 553 | 554 | self.train_accus_pw.append(accuracy_pw) 555 | self.train_accus_pph.append(accuracy_pph) 556 | #self.train_accus_iph.append(accuracy_iph) 557 | # F1-score 558 | self.c1_f_pw.append(f1_pw[0]); 559 | self.c2_f_pw.append(f1_pw[1]) 560 | self.c1_f_pph.append(f1_pph[0]); 561 | self.c2_f_pph.append(f1_pph[1]) 562 | #self.c1_f_iph.append(f1_1_iph); 563 | #self.c2_f_iph.append(f1_2_iph) 564 | 565 | print("learning rate:", sum(lrs) / len(lrs)) 566 | # validation in every epoch 567 | self.validation_loss, y_valid_pw_masked,y_valid_pph_masked,\ 568 | valid_pred_pw, valid_pred_pph = sess.run( 569 | fetches=[self.loss, y_p_pw_masked,y_p_pph_masked, 570 | pred_pw_masked, pred_pph_masked], 571 | feed_dict={ 572 | self.X_p: X_validation, 573 | self.y_p_pw: y_validation_pw, 574 | self.y_p_pph: y_validation_pph, 575 | self.seq_len_p: len_validation, 576 | self.pos_p: pos_validation, 577 | self.length_p: length_validation, 578 | self.position_p: position_validation, 579 | self.keep_prob_p: 1.0, 580 | self.input_keep_prob_p:1.0, 581 | self.output_keep_prob_p:1.0 582 | } 583 | ) 584 | # print("valid_pred_pw.shape:",valid_pred_pw.shape) 585 | # print("valid_pred_pph.shape:",valid_pred_pph.shape) 586 | # print("valid_pred_iph.shape:",valid_pred_iph.shape) 587 | 588 | # metrics 589 | self.valid_accuracy_pw, self.valid_f1_pw = util.eval(y_true=y_valid_pw_masked,y_pred=valid_pred_pw) 590 | self.valid_accuracy_pph, self.valid_f1_pph = util.eval(y_true=y_valid_pph_masked,y_pred=valid_pred_pph) 591 | 592 | #self.valid_accuracy_iph, self.valid_f1_1_iph, self.valid_f1_2_iph = util.eval(y_true=y_valid_iph_masked,y_pred=valid_pred_iph) 593 | print("Epoch ", epoch, " finished.", "spent ", round((time.time() - start_time) / 60, 2), " mins") 594 | self.showInfo(type="training") 595 | self.showInfo(type="validation") 596 | 597 | 598 | # when we get a new best (lowest) validation loss, we store the model 599 | if self.validation_loss < self.best_validation_loss: 600 | self.best_validation_loss = self.validation_loss 601 | print("New Best loss ", self.best_validation_loss, " On Validation set! ") 602 | print("Saving Models......\n\n") 603 | # exist ./models folder? 
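# A more compact alternative (just a sketch, not the code used in this repo): on Python 3,
# os.makedirs can create the whole nested checkpoint directory in a single call, e.g.
#     os.makedirs("./models/" + name + "/bilstm", exist_ok=True)
# The chained os.path.exists / os.mkdir checks below build the same path step by step.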
604 | if not os.path.exists("./models/"): 605 | os.mkdir(path="./models/") 606 | if not os.path.exists("./models/" + name): 607 | os.mkdir(path="./models/" + name) 608 | if not os.path.exists("./models/" + name + "/bilstm"): 609 | os.mkdir(path="./models/" + name + "/bilstm") 610 | # create saver 611 | saver = tf.train.Saver() 612 | saver.save(sess, "./models/" + name + "/bilstm/my-model-10000") 613 | # Generates MetaGraphDef. 614 | saver.export_meta_graph("./models/" + name + "/bilstm/my-model-10000.meta") 615 | print("\n\n") 616 | 617 | # test:using X_validation_pw 618 | test_pred_pw, test_pred_pph = sess.run( 619 | fetches=[pred_pw, pred_pph], 620 | feed_dict={ 621 | self.X_p: X_validation, 622 | self.seq_len_p: len_validation, 623 | self.pos_p: pos_validation, 624 | self.length_p: length_validation, 625 | self.position_p: position_validation, 626 | self.keep_prob_p: 1.0, 627 | self.input_keep_prob_p:1.0, 628 | self.output_keep_prob_p:1.0 629 | } 630 | ) 631 | 632 | # recover to original corpus txt 633 | # shape of valid_pred_pw,valid_pred_pw,valid_pred_pw:[corpus_size*time_stpes] 634 | util.recover2( 635 | X=X_validation, 636 | preds_pw=test_pred_pw, 637 | preds_pph=test_pred_pph, 638 | filename="../result/bilstm_cwe/recover_epoch_" + str(epoch) + ".txt" 639 | ) 640 | 641 | # 返回预测的结果或者准确率,y not None的时候返回准确率,y ==None的时候返回预测值 642 | def pred(self, name, X, y=None, ): 643 | start_time = time.time() # compute time 644 | if y is None: 645 | with self.session as sess: 646 | # restore model 647 | new_saver = tf.train.import_meta_graph( 648 | meta_graph_or_file="./models/" + name + "/bilstm/my-model-10000.meta", 649 | clear_devices=True 650 | ) 651 | new_saver.restore(sess, "./models/" + name + "/bilstm/my-model-10000") 652 | # get default graph 653 | graph = tf.get_default_graph() 654 | # get opration from the graph 655 | pred_normal = graph.get_operation_by_name("pred_normal").outputs[0] 656 | X_p = graph.get_operation_by_name("input_placeholder").outputs[0] 657 | pred = sess.run(fetches=pred_normal, feed_dict={X_p: X}) 658 | print("this operation spends ", round((time.time() - start_time) / 60, 2), " mins") 659 | return pred 660 | else: 661 | with self.session as sess: 662 | # restore model 663 | new_saver = tf.train.import_meta_graph( 664 | meta_graph_or_file="./models/" + name + "/bilstm/my-model-10000.meta", 665 | clear_devices=True 666 | ) 667 | new_saver.restore(sess, "./models/" + name + "/bilstm/my-model-10000") 668 | graph = tf.get_default_graph() 669 | # get opration from the graph 670 | accuracy = graph.get_operation_by_name("accuracy").outputs[0] 671 | X_p = graph.get_operation_by_name("input_placeholder").outputs[0] 672 | y_p = graph.get_operation_by_name("label_placeholder").outputs[0] 673 | # forward and get the results 674 | accu = sess.run(fetches=accuracy, feed_dict={X_p: X, y_p: y}) 675 | print("this operation spends ", round((time.time() - start_time) / 60, 2), " mins") 676 | return accu 677 | 678 | 679 | def showInfo(self, type): 680 | if type == "training": 681 | # training information 682 | print(" /**Training info**/") 683 | print("----avarage training loss:", sum(self.train_losses) / len(self.train_losses)) 684 | print("PW:") 685 | print("----avarage accuracy:", sum(self.train_accus_pw) / len(self.train_accus_pw)) 686 | #print("----avarage f1-Score of N:", sum(self.c1_f_pw) / len(self.c1_f_pw)) 687 | print("----avarage f1-Score of B:", sum(self.c2_f_pw) / len(self.c2_f_pw)) 688 | print("PPH:") 689 | print("----avarage accuracy :", sum(self.train_accus_pph) / 
len(self.train_accus_pph)) 690 | #print("----avarage f1-Score of N:", sum(self.c1_f_pph) / len(self.c1_f_pph)) 691 | print("----avarage f1-Score of B:", sum(self.c2_f_pph) / len(self.c2_f_pph)) 692 | #print("IPH:") 693 | #print("----avarage accuracy:", sum(self.train_accus_iph) / len(self.train_accus_iph)) 694 | #print("----avarage f1-Score of N:", sum(self.c1_f_iph) / len(self.c1_f_iph)) 695 | #print("----avarage f1-Score of B:", sum(self.c2_f_iph) / len(self.c2_f_iph)) 696 | else: 697 | print(" /**Validation info**/") 698 | print("----avarage validation loss:", self.validation_loss) 699 | print("PW:") 700 | print("----avarage accuracy:", self.valid_accuracy_pw) 701 | #print("----avarage precision of N:", self.valid_precision_1_pw) 702 | #print("----avarage recall of N:", self.valid_recall_1_pw) 703 | #print("----avarage f1-Score of N:", self.valid_f1_1_pw) 704 | #print("----avarage precision of B:", self.valid_precision_2_pw) 705 | #print("----avarage recall of B:", self.valid_recall_2_pw) 706 | print("----avarage f1-Score of B:", self.valid_f1_pw[0]) 707 | print("PPH:") 708 | print("----avarage accuracy :", self.valid_accuracy_pph) 709 | #print("----avarage precision of N:", self.valid_precision_1_pph) 710 | #print("----avarage recall of N:", self.valid_recall_1_pph) 711 | #print("----avarage f1-Score of N:", self.valid_f1_1_pph) 712 | #print("----avarage precision of B:", self.valid_precision_2_pph) 713 | #print("----avarage recall of B:", self.valid_recall_2_pph) 714 | print("----avarage f1-Score of B:", self.valid_f1_pph[1]) 715 | #print("----avarage f1-Score of N:", self.valid_f1_1_pph) 716 | #print("----avarage f1-Score of B:", self.valid_f1_2_pph) 717 | #print("IPH:") 718 | #print("----avarage accuracy:", self.valid_accuracy_iph) 719 | #print("----avarage f1-Score of N:", self.valid_f1_1_iph) 720 | #print("----avarage f1-Score of B:", self.valid_f1_2_iph) 721 | 722 | 723 | # train && test 724 | if __name__ == "__main__": 725 | # 读数据 726 | # pw 727 | df_train_pw = pd.read_pickle(path="../data/dataset/pw_summary_train.pkl") 728 | df_validation_pw = pd.read_pickle(path="../data/dataset/pw_summary_validation.pkl") 729 | # pph 730 | df_train_pph = pd.read_pickle(path="../data/dataset/pph_summary_train.pkl") 731 | df_validation_pph = pd.read_pickle(path="../data/dataset/pph_summary_validation.pkl") 732 | 733 | # iph 734 | #df_train_iph = pd.read_pickle(path="./dataset/temptest/iph_summary_train.pkl") 735 | #df_validation_iph = pd.read_pickle(path="./dataset/temptest/iph_summary_validation.pkl") 736 | 737 | # 实际上,X里面的内容都是一样的,所以这里统一使用pw的X来作为所有的X 738 | # 但是标签是不一样的,所以需要每个都要具体定义 739 | X_train = np.asarray(list(df_train_pw['X'].values)) 740 | X_validation = np.asarray(list(df_validation_pw['X'].values)) 741 | #print("X_train:\n",X_train) 742 | #print("X_train.shape\n",X_train.shape) 743 | #print("X_validation:\n",X_validation) 744 | #print("X_validation.shape:\n",X_validation.shape) 745 | 746 | # tags 747 | y_train_pw = np.asarray(list(df_train_pw['y'].values)) 748 | y_validation_pw = np.asarray(list(df_validation_pw['y'].values)) 749 | 750 | #print("y_train_pw:",y_train_pw) 751 | #print("y_validation_pw:",y_validation_pw) 752 | 753 | y_train_pph = np.asarray(list(df_train_pph['y'].values)) 754 | y_validation_pph = np.asarray(list(df_validation_pph['y'].values)) 755 | 756 | #y_train_iph = np.asarray(list(df_train_iph['y'].values)) 757 | #y_validation_iph = np.asarray(list(df_validation_iph['y'].values)) 758 | 759 | # length每一行序列的长度 760 | # 因为都一样,所以统一使用pw的 761 | len_train = 
np.asarray(list(df_train_pw['sentence_len'].values)) 762 | len_validation = np.asarray(list(df_validation_pw['sentence_len'].values)) 763 | #print("len_train:", len_train.shape) 764 | #print("len_validation:", len_validation.shape) 765 | 766 | y_train = [y_train_pw, y_train_pph] 767 | y_validation = [y_validation_pw, y_validation_pph] 768 | #print("y_train_pw:\n", y_train_pw); 769 | #print(y_train_pw.shape) 770 | #print("y_train_pph:\n", y_train_pph); 771 | #print(y_train_pph.shape) 772 | # print("y_train_iph:\n", y_train_iph); 773 | # print(y_train_iph.shape) 774 | 775 | #-----------------------------------Extra Info--------------------------------------------- 776 | #pos 777 | pos_train = util.readExtraInfo(file="../data/dataset/pos_train_tag.txt") 778 | pos_validation = util.readExtraInfo(file="../data/dataset/pos_test_tag.txt") 779 | 780 | # length 781 | length_train = util.readExtraInfo(file="../data/dataset/length_train_tag.txt") 782 | length_validation = util.readExtraInfo(file="../data/dataset/length_test_tag.txt") 783 | # print("shape of length_train:",length_train.shape) 784 | # print("shape of length_test:",length_validation.shape) 785 | 786 | # position 787 | position_train = util.readExtraInfo(file="../data/dataset/position_train_tag.txt") 788 | position_validation = util.readExtraInfo(file="../data/dataset/position_test_tag.txt") 789 | #print("shape of position_train:", position_train.shape) 790 | #print("shape of positon_test:", position_validation.shape) 791 | # accum 792 | accum_train = util.readExtraInfo(file="../data/dataset/accum_train_tag.txt") 793 | accum_validation = util.readExtraInfo(file="../data/dataset/accum_test_tag.txt") 794 | #print("shape of accum_train:", accum_train.shape) 795 | #print("shape of accum_test:", accum_validation.shape) 796 | 797 | # accum reverse 798 | accumR_train = util.readExtraInfo(file="../data/dataset/accum_reverse_train_tag.txt") 799 | accumR_validation = util.readExtraInfo(file="../data/dataset/accum_reverse_test_tag.txt") 800 | #print("shape of accumR_train:", accumR_train.shape) 801 | #print("shape of accumR_test:", accumR_validation.shape) 802 | 803 | model = BiLSTM_CWE() 804 | model.fit(X_train, y_train, len_train,pos_train,length_train,position_train, 805 | X_validation, y_validation, len_validation, pos_validation,length_validation,position_validation, 806 | "test", False) -------------------------------------------------------------------------------- /models/crf.py: -------------------------------------------------------------------------------- 1 | ''' 2 | use CRF++ tools 3 | ''' 4 | -------------------------------------------------------------------------------- /models/draw.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | 5 | def readData(file): 6 | list=[] 7 | f=open(file=file) 8 | lines=f.readlines() 9 | for line in lines: 10 | line=line.strip() 11 | list.append(float(line)) 12 | return list 13 | 14 | 15 | if __name__ =="__main__": 16 | #plt.xlabel("Mini-Batch") 17 | #plt.ylabel("Accuracy") 18 | #list=readData(file="train_accuracy_epoch1.txt") 19 | #plt.plot(list,"r") 20 | list2=readData(file="train_loss_epoch1.txt") 21 | plt.xlabel("Mini-Batch") 22 | plt.ylabel("Loss") 23 | plt.plot(list2,"r") 24 | plt.show() -------------------------------------------------------------------------------- /models/gbdt1.py: -------------------------------------------------------------------------------- 1 | import 
numpy as np 2 | import pandas as pd 3 | import tensorflow as tf 4 | import alignment 5 | import alignment_cwe 6 | import crf 7 | import bilstm_cbow 8 | import bilstm_cwe 9 | import util 10 | from sklearn.ensemble import GradientBoostingClassifier 11 | from sklearn.metrics import accuracy_score 12 | from sklearn.metrics import f1_score 13 | 14 | class GBDT1(): 15 | def __init__(self): 16 | self.n_estimators=30 17 | self.learning_rate=0.08 18 | self.sub_sample=0.8 19 | self.loss_type="deviance" 20 | 21 | self.gbdt=GradientBoostingClassifier( 22 | loss=self.loss_type, 23 | learning_rate=self.learning_rate, 24 | n_estimators=self.n_estimators, 25 | subsample=self.sub_sample 26 | ) 27 | 28 | def fit(self,X_train,y_train,X_test,y_test): 29 | self.gbdt.fit(X=X_train,y=y_train) 30 | pred=self.gbdt.predict(X=X_test) 31 | print(pred.shape) 32 | print("accracy:",accuracy_score(y_true=y_test,y_pred=pred)) 33 | print("f1-score:",f1_score(y_true=y_test,y_pred=pred,average=None)) 34 | 35 | def deMask(self): 36 | pass 37 | 38 | def pred(self,X): 39 | pass 40 | 41 | #depadding and will reduce dimension 42 | def mask(length,X): 43 | list = [] 44 | for i in range(length.shape[0]): 45 | sentenece_len = length[i] 46 | for j in range(sentenece_len): 47 | list.append(X[i, j]) 48 | return np.array(list,dtype=np.int32) 49 | 50 | def onehot(array): 51 | a=np.zeros(shape=(array.shape[0],37),dtype=np.int32) 52 | for i in range(array.shape[0]): 53 | a[i,array[i]-1]=1 54 | return a 55 | 56 | if __name__=="__main__": 57 | 58 | print("loading data....") 59 | #training data 60 | # pw 为了获取长度信息 61 | #df_train_pw = pd.read_pickle(path="../data/dataset/pw_summary_train.pkl") 62 | #len_train = np.asarray(list(df_train_pw['sentence_len'].values)) 63 | 64 | #X_train_crf,labels_train,preds_train_crf=util.extractProb(file="../result/crf/crf_prob_train.txt") 65 | #X_train_alignment=util.extractProb2(file="../result/alignment/alignment_prob_train.txt") 66 | #X_train_cnn = util.extractProb2(file="../result/cnn/cnn_prob_train.txt") 67 | #print("X_train_cnn.shape",X_train_cnn.shape) 68 | 69 | #pos_train=util.readExtraInfo(file="../data/dataset/pos_train_tag.txt") 70 | #pos_train_masked=mask(length=len_train,X=pos_train) 71 | #print(pos_train_masked.shape) 72 | #pos_train_onehot=onehot(pos_train_masked) 73 | #print(pos_train_onehot.shape) 74 | #X_train=np.concatenate((X_train_cnn,X_train_alignment,pos_train_onehot),axis=1) 75 | 76 | #valid data 77 | df_valid_pw = pd.read_pickle(path="../data/dataset/pw_summary_valid.pkl") 78 | len_valid = np.asarray(list(df_valid_pw['sentence_len'].values)) 79 | X_valid_crf, labels_valid, preds_valid_crf = util.extractProb(file="../result/crf/crf_prob_valid.txt") 80 | X_valid_alignment = util.extractProb2(file="../result/alignment/alignment_prob_valid_epoch5.txt") 81 | X_valid_cnn = util.extractProb2(file="../result/cnn/cnn_prob_valid_epoch5.txt") 82 | X_valid_attention=util.extractProb2(file="../result/attention/attention_prob_valid_epoch4.txt") 83 | X_valid_bilstm=util.extractProb2(file="../result/bilstm/bilstm_prob_valid_epoch3.txt") 84 | 85 | pos_valid = util.readExtraInfo(file="../data/dataset/pos_valid_tag.txt") 86 | pos_valid_masked = mask(length=len_valid, X=pos_valid) 87 | print(pos_valid_masked.shape) 88 | pos_valid_onehot = onehot(pos_valid_masked) 89 | print(pos_valid_onehot.shape) 90 | X_valid = np.concatenate( 91 | (X_valid_crf,X_valid_cnn, X_valid_alignment,X_valid_attention, X_valid_bilstm,pos_valid_onehot), 92 | axis=1 93 | ) 94 | 95 | # test data 96 | df_test_pw = 
pd.read_pickle(path="../data/dataset/pw_summary_test.pkl") 97 | len_test = np.asarray(list(df_test_pw['sentence_len'].values)) 98 | X_test_crf, labels_test, preds_test_crf = util.extractProb(file="../result/crf/crf_prob_test.txt") 99 | X_test_alignment = util.extractProb2(file="../result/alignment/alignment_prob_test_epoch5.txt") 100 | X_test_cnn = util.extractProb2(file="../result/cnn/cnn_prob_test_epoch5.txt") 101 | X_test_attention = util.extractProb2(file="../result/attention/attention_prob_test_epoch4.txt") 102 | X_test_bilstm=util.extractProb2(file="../result/bilstm/bilstm_prob_test_epoch3.txt") 103 | 104 | pos_test = util.readExtraInfo(file="../data/dataset/pos_test_tag.txt") 105 | pos_test_masked = mask(length=len_test, X=pos_test) 106 | print(pos_test_masked.shape) 107 | pos_test_onehot = onehot(pos_test_masked) 108 | print(pos_test_onehot.shape) 109 | X_test = np.concatenate( 110 | (X_test_crf,X_test_cnn, X_test_alignment, X_test_attention,X_test_bilstm,pos_test_onehot), 111 | axis=1 112 | ) 113 | 114 | print("run model....") 115 | model=GBDT1() 116 | #model.fit(X_train=X_train, y_train=labels_train, X_test=X_valid, y_test=labels_valid) 117 | model.fit(X_train=X_valid,y_train=labels_valid,X_test=X_test,y_test=labels_test) 118 | 119 | 120 | -------------------------------------------------------------------------------- /models/gbdt2.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import alignment 4 | import alignment_cwe 5 | import crf 6 | import bilstm_cbow 7 | import bilstm_cwe 8 | import util 9 | from sklearn.ensemble import GradientBoostingClassifier 10 | from sklearn.metrics import accuracy_score 11 | from sklearn.metrics import f1_score 12 | 13 | 14 | class GBDT2(): 15 | def __init__(self): 16 | self.n_estimators=50 17 | self.learning_rate=0.05 18 | self.sub_sample=0.8 19 | self.loss_type="deviance" 20 | 21 | self.gbdt=GradientBoostingClassifier( 22 | loss=self.loss_type, 23 | learning_rate=self.learning_rate, 24 | n_estimators=self.n_estimators, 25 | subsample=self.sub_sample 26 | ) 27 | 28 | def fit(self,X_train,y_train,X_test,y_test): 29 | self.gbdt.fit(X=X_train,y=y_train) 30 | pred=self.gbdt.predict(X=X_test) 31 | print(pred.shape) 32 | print("accracy:",accuracy_score(y_true=y_test,y_pred=pred)) 33 | print("f1-score:",f1_score(y_true=y_test,y_pred=pred,average=None)) 34 | 35 | 36 | def pred(self,X): 37 | pass 38 | 39 | 40 | if __name__=="__main__": 41 | print("loading data....") 42 | #training data 43 | X_train_crf,labels_train,preds_train_crf=util.extractProb(file="../result/crf/crf_prob_train.txt") 44 | X_train_alignment=util.extractProb2(file="../result/alignment/alignment_prob_train.txt") 45 | X_train_cnn = util.extractProb2(file="../result/cnn/cnn_prob_train.txt") 46 | X_train=np.concatenate((X_train_cnn,X_train_alignment,X_train_crf),axis=1) 47 | 48 | #valid data 49 | X_valid_crf, labels_valid, preds_valid_crf = util.extractProb(file="../result/crf/crf_prob_valid.txt") 50 | X_valid_alignment = util.extractProb2(file="../result/alignment/alignment_prob_valid.txt") 51 | X_valid_cnn = util.extractProb2(file="../result/cnn/cnn_prob_valid.txt") 52 | X_valid = np.concatenate((X_valid_cnn, X_valid_alignment,X_valid_crf), axis=1) 53 | 54 | # test data 55 | X_test_crf, labels_test, preds_test_crf = util.extractProb(file="../result/crf/crf_prob_test.txt") 56 | X_test_alignment = util.extractProb2(file="../result/alignment/alignment_prob_test.txt") 57 | X_test_cnn = 
util.extractProb2(file="../result/cnn/cnn_prob_test.txt") 58 | X_test = np.concatenate((X_test_cnn, X_test_alignment,X_test_crf), axis=1) 59 | 60 | 61 | print("run model....") 62 | model=GBDT2() 63 | model.fit(X_train=X_train, y_train=labels_train, X_test=X_valid, y_test=labels_valid) 64 | model.fit(X_train=X_train,y_train=labels_train,X_test=X_test,y_test=labels_test) -------------------------------------------------------------------------------- /models/lf.py: -------------------------------------------------------------------------------- 1 | def get_dataset(images_paths,labels,batch_size,shuffle=True,last_batch=True): 2 | def _decode_images(file_path, label): 3 | image_string = tf.read_file(file_path) 4 | image_decoded = tf.image.decode_png(image_string) 5 | image = tf.cast(image_decoded, tf.float32) / 255. 6 | return image, label 7 | 8 | dataset = tf.data.Dataset.from_tensor_slices((tf.constant(images_paths), tf.constant(labels))) 9 | dataset = dataset.map(_decode_images) 10 | # buffer_size = 10 * batch_size 11 | buffer_size = 50000 12 | if shuffle: 13 | dataset = dataset.shuffle(buffer_size) 14 | if not last_batch: 15 | dataset.filter(lambda x, y: tf.equal(tf.shape(x)[0], batch_size)) 16 | dataset = dataset.batch(batch_size) 17 | return dataset -------------------------------------------------------------------------------- /models/rf.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import data_utils 3 | import os 4 | import math 5 | import sys 6 | import time 7 | 8 | class Seq2SeqModel(object): 9 | def __init__(self, learning_rate, learning_rate_decay_factor, source_vocab_size=40000, target_vocab_size=40000, num_steps=100, num_epochs=10, 10 | is_training=True): 11 | self.min_loss = float(sys.maxint) 12 | self.batch_size = 100 13 | self.dropout_rate = 0.5 14 | self.max_gradient_norm = 5 15 | self.learning_rate = tf.Variable(float(learning_rate), trainable=False) 16 | self.learning_rate_decay_op = self.learning_rate.assign(self.learning_rate * learning_rate_decay_factor) 17 | 18 | self.num_layers = 1 19 | self.emb_dim = 100 20 | self.hidden_dim = 100 21 | self.attention_hidden_dim = 100 22 | self.num_epochs = num_epochs 23 | self.num_steps = num_steps 24 | self.source_vocab_size = source_vocab_size 25 | self.target_vocab_size = target_vocab_size 26 | self.global_step = tf.Variable(0, trainable=False) 27 | 28 | # placeholder of encoder_inputs, decoder_inputs, y_outputs 29 | self.encoder_inputs, self.decoder_inputs, self.y_outputs, self.target_weights = self.create_placeholder() 30 | 31 | # source and target word embedding 32 | self.source_embedding = tf.Variable(tf.random_uniform([self.source_vocab_size, self.emb_dim], 0.0, 1.0), name="source_emb") 33 | self.target_embedding = tf.Variable(tf.random_uniform([self.target_vocab_size, self.emb_dim], 0.0, 1.0), name="target_emb") 34 | 35 | self.softmax_w = tf.Variable(tf.random_uniform([self.hidden_dim * 2, self.target_vocab_size], 0.0, 1.0), name="softmax_w", dtype=tf.float32) 36 | self.softmax_b = tf.Variable(tf.random_uniform([self.target_vocab_size], 0.0, 1.0), name="softmax_b", dtype=tf.float32) 37 | 38 | self.attention_W = tf.Variable(tf.random_uniform([self.hidden_dim * 4, self.attention_hidden_dim], 0.0, 1.0), name="attention_W") 39 | self.attention_U = tf.Variable(tf.random_uniform([self.hidden_dim * 2, self.attention_hidden_dim], 0.0, 1.0), name="attention_U") 40 | self.attention_V = tf.Variable(tf.random_uniform([self.attention_hidden_dim, 1], 0.0, 1.0), 
name="attention_V") 41 | 42 | self.encoder_inputs_emb = tf.nn.embedding_lookup(self.source_embedding, self.encoder_inputs) 43 | self.encoder_inputs_emb = tf.transpose(self.encoder_inputs_emb, [1, 0, 2]) 44 | # self.encoder_inputs_emb = tf.reshape(self.encoder_inputs_emb, [-1, self.emb_dim]) 45 | # self.encoder_inputs_emb = tf.split(0, self.num_steps, self.encoder_inputs_emb) 46 | 47 | self.decoder_inputs_emb = tf.nn.embedding_lookup(self.target_embedding, self.decoder_inputs) 48 | self.decoder_inputs_emb = tf.transpose(self.decoder_inputs_emb, [1, 0, 2]) 49 | self.decoder_inputs_emb = tf.reshape(self.decoder_inputs_emb, [-1, self.emb_dim]) 50 | self.decoder_inputs_emb = tf.split(self.decoder_inputs_emb, self.num_steps, 0) 51 | 52 | # lstm cell 53 | self.enc_lstm_cell_fw = tf.contrib.rnn.BasicLSTMCell(self.hidden_dim, state_is_tuple=False) 54 | self.enc_lstm_cell_bw = tf.contrib.rnn.BasicLSTMCell(self.hidden_dim, state_is_tuple=False) 55 | self.dec_lstm_cell = tf.contrib.rnn.BasicLSTMCell(self.hidden_dim * 2, state_is_tuple=False) 56 | 57 | # dropout 58 | if is_training: 59 | # self.enc_lstm_cell_fw = tf.nn.rnn_cell.DropoutWrapper(self.enc_lstm_cell_fw, output_keep_prob=(1 - self.dropout_rate)) 60 | # self.enc_lstm_cell_bw = tf.nn.rnn_cell.DropoutWrapper(self.enc_lstm_cell_bw, output_keep_prob=(1 - self.dropout_rate)) 61 | self.dec_lstm_cell = tf.contrib.rnn.DropoutWrapper(self.dec_lstm_cell, output_keep_prob=(1 - self.dropout_rate)) 62 | 63 | # get the length of each sample 64 | self.source_length = tf.reduce_sum(tf.sign(self.encoder_inputs), reduction_indices=1) 65 | self.source_length = tf.cast(self.source_length, tf.int32) 66 | self.target_length = tf.reduce_sum(tf.sign(self.decoder_inputs), reduction_indices=1) 67 | self.target_length = tf.cast(self.target_length, tf.int32) 68 | 69 | # encode and decode 70 | enc_outputs, enc_state = self.encode(self.enc_lstm_cell_fw, self.enc_lstm_cell_bw) 71 | if is_training: 72 | self.dec_outputs = self.decode(self.dec_lstm_cell, enc_state, enc_outputs) 73 | else: 74 | self.dec_outputs = self.decode(self.dec_lstm_cell, enc_state, enc_outputs, self.loop_function) 75 | # softmax 76 | self.outputs = tf.reshape(tf.concat(self.dec_outputs, axis=1), [-1, self.hidden_dim * 2]) 77 | self.logits = tf.add(tf.matmul(self.outputs, self.softmax_w), self.softmax_b) 78 | self.prediction = tf.nn.softmax(self.logits) 79 | 80 | self.y_output = tf.reshape(self.y_outputs, [-1]) 81 | self.y_output = tf.one_hot(self.y_output, depth=self.target_vocab_size, on_value=1.0, off_value=0.0) 82 | 83 | self.target_weight = tf.reshape(self.target_weights, [-1]) 84 | 85 | cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=self.y_output) 86 | self.cross_entropy_loss = tf.reduce_mean(tf.multiply(self.target_weight, cross_entropy)) 87 | 88 | # Gradients and SGD update operation for training the model. 
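# The update below is plain SGD with global-norm gradient clipping:
# tf.gradients computes d(loss)/d(param) for every trainable variable,
# tf.clip_by_global_norm rescales each gradient by max_gradient_norm / max(global_norm, max_gradient_norm),
# so the joint L2 norm of all gradients never exceeds max_gradient_norm (5 here),
# and apply_gradients then performs the update while incrementing global_step.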
89 | params = tf.trainable_variables() 90 | self.optimizer = tf.train.GradientDescentOptimizer(self.learning_rate) 91 | 92 | gradients = tf.gradients(self.cross_entropy_loss, params) 93 | clipped_gradients, _ = tf.clip_by_global_norm(gradients, self.max_gradient_norm) 94 | self.updates = self.optimizer.apply_gradients(zip(clipped_gradients, params), global_step=self.global_step) 95 | 96 | self.saver = tf.train.Saver(tf.global_variables()) 97 | 98 | def create_placeholder(self): 99 | encoder_input_pl = tf.placeholder(tf.int64, [None, self.num_steps]) 100 | decoder_input_pl = tf.placeholder(tf.int64, [None, self.num_steps]) 101 | y_output_pl = tf.placeholder(tf.int64, [None, self.num_steps]) 102 | target_weight = tf.placeholder(tf.float32, [None, self.num_steps]) 103 | return encoder_input_pl, decoder_input_pl, y_output_pl, target_weight 104 | 105 | def encode(self, cell_fw, cell_bw): 106 | enc_outputs, (output_state_fw, output_state_bw) = tf.nn.bidirectional_dynamic_rnn( 107 | cell_fw, 108 | cell_bw, 109 | self.encoder_inputs_emb, 110 | dtype=tf.float32, 111 | sequence_length=self.source_length, 112 | time_major=True 113 | ) 114 | enc_state = tf.concat([output_state_fw, output_state_bw], axis=1) 115 | enc_outputs = tf.concat(enc_outputs, axis=2) 116 | enc_outputs = tf.reshape(enc_outputs, [-1, self.emb_dim * 2]) 117 | enc_outputs = tf.split(enc_outputs, self.num_steps, 0) 118 | return enc_outputs, enc_state 119 | 120 | def attention(self, prev_state, enc_outputs): 121 | """ 122 | Attention model for Neural Machine Translation 123 | :param prev_state: the decoder hidden state at time i-1 124 | :param enc_outputs: the encoder outputs, a length 'T' list. 125 | """ 126 | e_i = [] 127 | c_i = [] 128 | for output in enc_outputs: 129 | atten_hidden = tf.tanh(tf.add(tf.matmul(prev_state, self.attention_W), tf.matmul(output, self.attention_U))) 130 | e_i_j = tf.matmul(atten_hidden, self.attention_V) 131 | e_i.append(e_i_j) 132 | e_i = tf.concat(e_i, axis=1) 133 | # e_i = tf.exp(e_i) 134 | alpha_i = tf.nn.softmax(e_i) 135 | alpha_i = tf.split(alpha_i, self.num_steps, 1) 136 | for alpha_i_j, output in zip(alpha_i, enc_outputs): 137 | c_i_j = tf.multiply(alpha_i_j, output) 138 | c_i.append(c_i_j) 139 | c_i = tf.reshape(tf.concat(c_i, axis=1), [-1, self.num_steps, self.hidden_dim * 2]) 140 | c_i = tf.reduce_sum(c_i, 1) 141 | return c_i 142 | 143 | def decode(self, cell, init_state, enc_outputs, loop_function=None): 144 | outputs = [] 145 | prev = None 146 | state = init_state 147 | for i, inp in enumerate(self.decoder_inputs_emb): 148 | 149 | if loop_function is not None and prev is not None: 150 | with tf.variable_scope("loop_function", reuse=True): 151 | inp = loop_function(prev, i) 152 | if i > 0: 153 | tf.get_variable_scope().reuse_variables() 154 | c_i = self.attention(state, enc_outputs) 155 | inp = tf.concat([inp, c_i], axis=1) 156 | output, state = cell(inp, state) 157 | # print output.eval() 158 | outputs.append(output) 159 | if loop_function is not None: 160 | prev = output 161 | return outputs 162 | 163 | def loop_function(self, prev, _): 164 | """ 165 | :param prev: the output of t-1 time 166 | :param _: 167 | :return: the embedding of t-1 output 168 | """ 169 | prev = tf.add(tf.matmul(prev, self.softmax_w), self.softmax_b) 170 | prev_sympol = tf.arg_max(prev, 1) 171 | 172 | emb_prev = tf.nn.embedding_lookup(self.target_embedding, prev_sympol) 173 | return emb_prev 174 | 175 | def train(self, sess, save_path, train_set, val_set, steps_per_checkpoint, train_log): 176 | num_iterations = 
int(math.ceil(1.0 * len(train_set) / self.batch_size)) 177 | print("Number of iterations: %d" % num_iterations) 178 | 179 | step_time, loss = 0.0, 0.0 180 | current_step = 0 181 | previous_losses = [] 182 | while True: 183 | log_file = open(train_log, 'a') 184 | start_time = time.time() 185 | batch_encoder_inputs, batch_decoder_inputs, batch_y_outputs, batch_target_weights = \ 186 | data_utils.nextRandomBatch(train_set, batch_size=self.batch_size) 187 | _, step_loss = \ 188 | sess.run( 189 | [ 190 | self.updates, 191 | self.cross_entropy_loss, 192 | ], 193 | feed_dict={ 194 | self.encoder_inputs: batch_encoder_inputs, 195 | self.decoder_inputs: batch_decoder_inputs, 196 | self.y_outputs: batch_y_outputs 197 | }) 198 | step_time += (time.time() - start_time) / steps_per_checkpoint 199 | loss += step_loss / steps_per_checkpoint 200 | current_step += 1 201 | 202 | # Once in a while, we save checkpoint, print statistics, and run evals. 203 | if current_step % steps_per_checkpoint == 0: 204 | perplexity = math.exp(float(loss)) if loss < 300 else float("inf") 205 | log_file.write("global step %d learning rate %.4f step-time %.2f perplexity " 206 | "%.2f" % (self.global_step.eval(), self.learning_rate.eval(), 207 | step_time, perplexity)) 208 | log_file.write("\n") 209 | if len(previous_losses) > 2 and loss > max(previous_losses[-3:]): 210 | sess.run(self.learning_rate_decay_op) 211 | previous_losses.append(loss) 212 | checkpoint_path = os.path.join(save_path, "translate.ckpt") 213 | self.saver.save(sess, checkpoint_path, global_step=self.global_step) 214 | step_time, loss = 0.0, 0.0 215 | 216 | if current_step % 1000 == 0: 217 | batch_encoder_val, batch_decoder_val, batch_y_val, batch_target_weights_val = \ 218 | data_utils.nextRandomBatch(val_set, batch_size=self.batch_size) 219 | loss_val = \ 220 | sess.run( 221 | self.cross_entropy_loss, 222 | feed_dict={ 223 | self.encoder_inputs: batch_encoder_val, 224 | self.decoder_inputs: batch_decoder_val, 225 | self.y_outputs: batch_y_val, 226 | self.target_weights: batch_target_weights_val 227 | }) 228 | eval_ppl = math.exp(float(loss_val)) if loss_val < 300 else float("inf") 229 | log_file.write("global step %d eval: perplexity %.2f" % (self.global_step.eval(), eval_ppl)) 230 | log_file.write("\n") 231 | sys.stdout.flush() 232 | log_file.close() 233 | 234 | def test(self, sess, token_ids): 235 | # We decode one sentence at a time. 236 | token_ids = data_utils.padding(token_ids) 237 | target_ids = data_utils.padding([data_utils.GO_ID]) 238 | y_ids = data_utils.padding([data_utils.EOS_ID]) 239 | encoder_inputs, decoder_inputs, _ = data_utils.nextRandomBatch([(token_ids, target_ids, y_ids)], batch_size=1) 240 | prediction = sess.run(self.prediction, feed_dict={ 241 | self.encoder_inputs: encoder_inputs, 242 | self.decoder_inputs: decoder_inputs 243 | }) 244 | pred_max = tf.arg_max(prediction, 1) 245 | # prediction = tf.split(0, self.num_steps, prediction) 246 | # # This is a greedy decoder - outputs are just argmaxes of output_logits. 247 | # outputs = [int(np.argmax(predict)) for predict in prediction] 248 | # # If there is an EOS symbol in outputs, cut them at that point. 
249 | # if data_utils.EOS_ID in outputs: 250 | # outputs = outputs[:outputs.index(data_utils.EOS_ID)] 251 | return pred_max.eval() -------------------------------------------------------------------------------- /models/xgb.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XierHacker/Model_Fusion_Based_Prosody_Prediction/ef174fe63eded966c61880ffce041242fdc0b1ff/models/xgb.py -------------------------------------------------------------------------------- /parameter.py: -------------------------------------------------------------------------------- 1 | #basic architecture 2 | CHAR_EMBEDDING_SIZE=128 #字嵌入维度 3 | WORD_EMBEDDING_SIZE=128 #词嵌入维度 4 | INPUT_SIZE=WORD_EMBEDDING_SIZE #词嵌入维度 5 | 6 | MAX_EPOCH=5 #最大迭代次数 7 | LAYER_NUM=2 #lstm层数2 8 | HIDDEN_UNITS_NUM=256 #隐藏层结点数量 9 | HIDDEN_UNITS_NUM2=256 #隐藏层2结点数量 10 | BATCH_SIZE=20 #batch大小 11 | 12 | #learning rate 13 | LEARNING_RATE=0.003 #学习率 14 | DECAY=0.2 #衰减系数 15 | 16 | #Weaken Overfitting 17 | KEEP_PROB=0.5 #全连接 dropout 比率 18 | INPUT_KEEP_PROB=1.0 #rnn input dropout比率 19 | OUTPUT_KEEP_PROB=0.5 #rnn output dropout 比率 20 | LAMBDA_PW=0.001 #PW层级正则化系数 21 | LAMBDA_PPH=0.001 #PW层级正则化系数 22 | LAMBDA_IPH=0.005 #PW层级正则化系数 23 | 24 | 25 | #can't modify 26 | CLASS_NUM=2 #类别数量2(N,B) 27 | POS_NUM=37 #词性信息数量 28 | LENGTH_NUM=8 #长度信息数量 29 | MAX_SENTENCE_SIZE=28 #固定句子长度为28 (从整个数据集得来) 30 | TIMESTEP_SIZE=MAX_SENTENCE_SIZE #LSTM的time_step应该和句子长度一致 31 | WORD_VOCAB_SIZE=393256 # 样本中不同字的个数+1(padding 0),根据处理数据的时候得到 32 | -------------------------------------------------------------------------------- /util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import tensorflow as tf 4 | import parameter 5 | from sklearn.metrics import precision_score 6 | from sklearn.metrics import recall_score 7 | from sklearn.metrics import accuracy_score 8 | from sklearn.metrics import f1_score 9 | from sklearn.preprocessing import OneHotEncoder 10 | 11 | #load data 12 | def loadData(): 13 | # pw 14 | df_train_pw = pd.read_pickle(path="../data/dataset/pw_summary_train.pkl") 15 | df_valid_pw = pd.read_pickle(path="../data/dataset/pw_summary_valid.pkl") 16 | df_test_pw = pd.read_pickle(path="../data/dataset/pw_summary_test.pkl") 17 | 18 | # pph 19 | df_train_pph = pd.read_pickle(path="../data/dataset/pph_summary_train.pkl") 20 | df_valid_pph = pd.read_pickle(path="../data/dataset/pph_summary_valid.pkl") 21 | df_test_pph = pd.read_pickle(path="../data/dataset/pph_summary_test.pkl") 22 | 23 | # iph 24 | # df_train_iph = pd.read_pickle(path="./dataset/temptest/iph_summary_train.pkl") 25 | # df_validation_iph = pd.read_pickle(path="./dataset/temptest/iph_summary_validation.pkl") 26 | 27 | # 实际上,X里面的内容都是一样的,所以这里统一使用pw的X来作为所有的X 28 | # 但是标签是不一样的,所以需要每个都要具体定义 29 | X_train = np.asarray(list(df_train_pw['X'].values)) 30 | X_valid = np.asarray(list(df_valid_pw['X'].values)) 31 | X_test = np.asarray(list(df_test_pw['X'].values)) 32 | 33 | # print("X_train:\n",X_train) 34 | # print("X_train.shape",X_train.shape) 35 | # print("X_valid:\n",X_valid) 36 | # print("X_valid.shape:",X_valid.shape) 37 | # print("X_test:\n", X_test) 38 | # print("X_test.shape", X_test.shape) 39 | 40 | # tags 41 | y_train_pw = np.asarray(list(df_train_pw['y'].values)) 42 | y_valid_pw = np.asarray(list(df_valid_pw['y'].values)) 43 | y_test_pw = np.asarray(list(df_test_pw['y'].values)) 44 | 45 | y_train_pph = np.asarray(list(df_train_pph['y'].values)) 46 | y_valid_pph = 
np.asarray(list(df_valid_pph['y'].values)) 47 | y_test_pph = np.asarray(list(df_test_pph['y'].values)) 48 | 49 | # y_train_iph = np.asarray(list(df_train_iph['y'].values)) 50 | # y_validation_iph = np.asarray(list(df_validation_iph['y'].values)) 51 | 52 | # length每一行序列的长度,因为都一样,所以统一使用pw的 53 | len_train = np.asarray(list(df_train_pw['sentence_len'].values)) 54 | len_valid = np.asarray(list(df_valid_pw['sentence_len'].values)) 55 | len_test = np.asarray(list(df_test_pw['sentence_len'].values)) 56 | # print("len_train:", len_train.shape) 57 | # print("len_valid:", len_valid.shape) 58 | # print("len_test:", len_test.shape) 59 | 60 | # ----------------------------------------Extra Info-------------------------------- 61 | # pos 62 | pos_train = readExtraInfo(file="../data/dataset/pos_train_tag.txt") 63 | pos_valid = readExtraInfo(file="../data/dataset/pos_valid_tag.txt") 64 | pos_test = readExtraInfo(file="../data/dataset/pos_test_tag.txt") 65 | # print("pos_train.shape",pos_train.shape) 66 | # print("pos_valid.shape",pos_valid.shape) 67 | # print("pos_test.shape", pos_test.shape) 68 | 69 | # length 70 | length_train = readExtraInfo(file="../data/dataset/length_train_tag.txt") 71 | length_valid = readExtraInfo(file="../data/dataset/length_valid_tag.txt") 72 | length_test = readExtraInfo(file="../data/dataset/length_test_tag.txt") 73 | # print("shape of length_train:",length_train.shape) 74 | # print("shape of length_valid:",length_valid.shape) 75 | # print("shape of length_test:", length_test.shape) 76 | 77 | # position 78 | position_train = readExtraInfo(file="../data/dataset/position_train_tag.txt") 79 | position_valid = readExtraInfo(file="../data/dataset/position_valid_tag.txt") 80 | position_test = readExtraInfo(file="../data/dataset/position_test_tag.txt") 81 | # print("shape of position_train:",position_train.shape) 82 | # print("shape of positon_valid:",position_valid.shape) 83 | # print("shape of positon_test:", position_test.shape) 84 | 85 | # accum 86 | accum_train = readExtraInfo(file="../data/dataset/accum_train_tag.txt") 87 | accum_valid = readExtraInfo(file="../data/dataset/accum_valid_tag.txt") 88 | accum_test = readExtraInfo(file="../data/dataset/accum_test_tag.txt") 89 | # print("shape of accum_train:", accum_train.shape) 90 | # print("shape of accum_valid:", accum_valid.shape) 91 | # print("shape of accum_test:", accum_test.shape) 92 | 93 | # accum reverse 94 | accumR_train = readExtraInfo(file="../data/dataset/accum_reverse_train_tag.txt") 95 | accumR_valid = readExtraInfo(file="../data/dataset/accum_reverse_valid_tag.txt") 96 | accumR_test = readExtraInfo(file="../data/dataset/accum_reverse_test_tag.txt") 97 | # print("shape of accumR_train:", accumR_train.shape) 98 | # print("shape of accumR_valid:", accumR_valid.shape) 99 | # print("shape of accumR_test:", accumR_test.shape) 100 | 101 | y_train = [y_train_pw, y_train_pph] 102 | y_valid = [y_valid_pw, y_valid_pph] 103 | y_test = [y_test_pw, y_test_pph] 104 | 105 | return X_train, y_train, len_train, pos_train, length_train, position_train,\ 106 | X_valid, y_valid, len_valid, pos_valid, length_valid, position_valid,\ 107 | X_test, y_test, len_test, pos_test, length_test, position_test 108 | 109 | # print("Run Model...\n\n\n") 110 | model = Alignment() 111 | model.fit( 112 | X_train, y_train, len_train, pos_train, length_train, position_train, 113 | X_valid, y_valid, len_valid, pos_valid, length_valid, position_valid, 114 | X_test, y_test, len_test, pos_test, length_test, position_test, "test", False) 115 | 116 | 117 | 
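# The `model = Alignment()` / `model.fit(...)` block above appears to be a leftover
# driver snippet from one of the model scripts: `Alignment` is not imported in
# util.py, and X_train, y_train, etc. only exist inside loadData(), so these lines
# are either unreachable (if still indented under loadData's return) or raise a
# NameError when the module is loaded. They presumably belong in the model file,
# guarded by `if __name__ == "__main__":`.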
#compute accuracy and per-class f1-score 118 | def eval(y_true,y_pred): 119 | #accuracy 120 | accuracy=accuracy_score(y_true=y_true,y_pred=y_pred) 121 | #f1-score 122 | f_1=f1_score(y_true=y_true,y_pred=y_pred,average=None) 123 | return accuracy,f_1 124 | 125 | #combine prob_pw and prob_pph into the overall 3-class prob and save it 126 | def writeProb(prob_pw,prob_pph,outFile): 127 | f=open(file=outFile,mode="a+",encoding="utf-8") 128 | for i in range(prob_pw.shape[0]): 129 | prob_0=prob_pw[i,0]*prob_pph[i,0] 130 | prob_1=prob_pw[i,1]*prob_pph[i,0] 131 | prob_2=prob_pw[i,0]*prob_pph[i,1]+prob_pw[i,1]*prob_pph[i,1] 132 | s=str(prob_0)+" "+str(prob_1)+" "+str(prob_2)+"\n" 133 | f.write(s) 134 | f.close() 135 | 136 | def getTag2(preds_pw,preds_pph): 137 | # get complex "#" index 138 | length = preds_pw.shape[0] 139 | complex = np.array([preds_pph, preds_pw]) 140 | arg = np.argmax(complex, axis=0) 141 | # print("arg:\n", arg) 142 | for i in range(length): 143 | if arg[i] == 0: 144 | if complex[0, i] == 1: 145 | arg[i] = 4 146 | else: 147 | arg[i] = 0 148 | if arg[i] == 1: 149 | if complex[1, i] == 1: 150 | arg[i] = 2 151 | else: 152 | arg[i] = 0 153 | arg = (arg / 2).astype(dtype=np.int32) 154 | return arg 155 | 156 | #recover to .txt format 157 | def recover2(X,preds_pw,preds_pph,filename): 158 | arg=getTag2(preds_pw,preds_pph) 159 | arg=np.reshape(arg,newshape=(-1,parameter.MAX_SENTENCE_SIZE)) #[test_size,max_sentence_size] 160 | #print("arg.shape",arg.shape) 161 | #print("arg:\n", arg) 162 | #get id2words 163 | df_words_ids = pd.read_csv(filepath_or_buffer="../data/dataset/words_ids.csv", encoding="utf-8") 164 | #print(df_words_ids.head(5)) 165 | id2words = pd.Series(data=df_words_ids["words"].values, index=df_words_ids["id"].values) 166 | #print(id2words[2]) 167 | doc="" 168 | for i in range(X.shape[0]): 169 | sentence="" 170 | for j in range(X.shape[1]): 171 | if(X[i][j])==0: 172 | break 173 | else: 174 | sentence+=id2words[X[i][j]] 175 | if(arg[i][j]!=0): 176 | sentence+=("#"+str(arg[i][j])) 177 | sentence+="\n" 178 | doc+=sentence 179 | f=open(filename,mode="w",encoding="utf-8") 180 | f.write(doc) 181 | f.close() 182 | 183 | #read extra information from file, like pos info of a word, or position info etc.
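# writeProb folds the two binary distributions into one 3-class distribution over
# {no boundary, "#1" (pw), "#2" (pph)}, taking column 0 as N and column 1 as B,
# consistent with getTag2 above:
#   p(#0) = P_pw(N) * P_pph(N)
#   p(#1) = P_pw(B) * P_pph(N)
#   p(#2) = P_pw(N) * P_pph(B) + P_pw(B) * P_pph(B) = P_pph(B)
# so a predicted pph boundary overrides the pw decision, and the three values still
# sum to 1. Worked example with P_pw = (0.3, 0.7) and P_pph = (0.8, 0.2):
#   p(#0) = 0.3 * 0.8 = 0.24,  p(#1) = 0.7 * 0.8 = 0.56,  p(#2) = 0.2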
184 | def readExtraInfo(file): 185 | f = open(file=file, encoding="utf-8") 186 | lines = f.readlines() 187 | #print("lines numbers:",len(lines)) 188 | X=np.zeros(shape=(len(lines),parameter.MAX_SENTENCE_SIZE),dtype=np.int32) 189 | i = 0 190 | for line in lines: 191 | # print(line) 192 | line = line.strip() 193 | line_list = line.split(sep=" ") 194 | # print(line_list) 195 | j = 0 196 | for id in line_list: 197 | X[i, j] = id 198 | j += 1 199 | i += 1 200 | return X 201 | 202 | 203 | #读取预训练的embeddings 204 | def readEmbeddings(file): 205 | f=open(file=file,encoding="utf-8") 206 | lines=f.readlines() 207 | #first row is info 208 | info=lines[0].strip() 209 | info_list=info.split(sep=" ") 210 | vocab_size=int(info_list[0]) 211 | embedding_dims=int(info_list[1]) 212 | embeddings=np.zeros(shape=(vocab_size+1,embedding_dims),dtype=np.float32) 213 | for i in range(1,vocab_size+1): 214 | embed=lines[i].strip() 215 | embed_list=embed.split(sep=" ") 216 | for j in range(1,embedding_dims+1): 217 | embeddings[i][j-1]=embed_list[j] 218 | #print(embeddings.shape) 219 | return embeddings 220 | 221 | #返回字增强之后的word-embeddings 222 | def getCWE(word_embed_file,char_embed_file): 223 | word_embeddings=readEmbeddings(file=word_embed_file) 224 | print("shape of word_embeddings:",word_embeddings.shape) 225 | char_embeddings=readEmbeddings(file=char_embed_file) 226 | print("shape of char_embeddings:",char_embeddings.shape) 227 | 228 | #load id-word df 229 | df_words_ids = pd.read_csv(filepath_or_buffer="../data/dataset/words_ids.csv", encoding="utf-8") 230 | id2words = pd.Series(data=df_words_ids["words"].values, index=df_words_ids["id"].values) 231 | 232 | #load id-char df 233 | df_chars_ids = pd.read_csv(filepath_or_buffer="../data/dataset/chars_ids.csv", encoding="utf-8") 234 | chars2id = pd.Series(data=df_chars_ids["id"].values, index=df_chars_ids["chars"].values) 235 | 236 | for i in range(1,word_embeddings.shape[0]): 237 | #print(id2words[i]) 238 | word=id2words[i] 239 | sum_char_embeddings=np.zeros(shape=(128,),dtype=np.float32) 240 | for char in word: 241 | char_id=chars2id[char] 242 | sum_char_embeddings+=char_embeddings[char_id] 243 | sum_char_embeddings/=len(word) 244 | word_embeddings[i]+=sum_char_embeddings 245 | cwe=word_embeddings/2 246 | return cwe 247 | 248 | 249 | #从crf结果文件中抽取概率,并且返回ndarray类型 250 | def extractProb(file): 251 | probs=[] 252 | labels=[] 253 | preds=[] 254 | f=open(file=file,encoding="utf-8") 255 | lines=f.readlines() 256 | for line in lines: 257 | line=line.strip() 258 | if line!="": 259 | if line[0]!="#": 260 | prob = [] 261 | #print(line) 262 | #print(line[0]) 263 | line_list = line.split(sep="\t") 264 | l_0 = line_list[9].split(sep="/") 265 | prob.append(float(l_0[1])) 266 | l_1 = line_list[10].split(sep="/") 267 | prob.append(float(l_1[1])) 268 | l_2 = line_list[11].split(sep="/") 269 | prob.append(float(l_2[1])) 270 | #print(prob) 271 | probs.append(prob) 272 | labels.append(float(line_list[7])) 273 | preds.append(float(line_list[8].split(sep="/")[0])) 274 | #print("len of probs:",probs[0]) 275 | probs_nd=np.array(probs,dtype=np.float32) 276 | labels_nd=np.array(labels,dtype=np.int32) 277 | preds_nd=np.array(preds,dtype=np.int32) 278 | #print("shape of prob_nd",probs_nd.shape) 279 | return probs_nd,labels_nd,preds_nd 280 | 281 | def extractProb2(file): 282 | probs=[] 283 | result=[] 284 | f=open(file=file,encoding="utf-8") 285 | lines=f.readlines() 286 | for line in lines: 287 | line=line.strip() 288 | prob = [] 289 | #print(line) 290 | #print(line[0]) 291 | line_list = 
line.split(sep=" ") 292 | #print(line_list) 293 | l_0 = line_list[0] 294 | prob.append(float(l_0)) 295 | l_1 = line_list[1] 296 | prob.append(float(l_1)) 297 | l_2 = line_list[2] 298 | prob.append(float(l_2)) 299 | #print(prob) 300 | probs.append(prob) 301 | #print("len of probs:",probs[0]) 302 | probs_nd=np.array(probs,dtype=np.float32) 303 | #print("shape of prob_nd",probs_nd.shape) 304 | #print(probs_nd.dtype) 305 | return probs_nd 306 | 307 | #统计结果 308 | def statistic(type="valid"): 309 | print("CRF") 310 | prob, labels, preds = extractProb(file="./result/crf/crf_prob_"+type+".txt") 311 | #print("prob.shape", prob.shape) 312 | #print("labels.shape", labels.shape) 313 | #print("preds.shape", preds.shape) 314 | p1, f1 = eval(y_true=labels, y_pred=preds) 315 | #print("accuracy:", p1) 316 | print("f1-score:", f1) 317 | 318 | print("Alignment") 319 | prob_align = extractProb2(file="./result/alignment/alignment_prob_"+type+"_epoch5.txt") 320 | #print("prob_align.shape", prob_align.shape) 321 | # print("prob_align:",prob_align) 322 | preds_align = np.argmax(prob_align, axis=-1, ) 323 | # print(preds_align.shape) 324 | # print(preds_align) 325 | p2, f2 = eval(y_true=labels, y_pred=preds_align) 326 | #print("accuracy:", p2) 327 | print("f1-score:", f2) 328 | 329 | print("CNN") 330 | prob_cnn = extractProb2(file="./result/cnn/cnn_prob_"+type+"_epoch5.txt") 331 | #print("prob_cnn.shape", prob_cnn.shape) 332 | # print("prob_cnn:",prob_cnn) 333 | preds_cnn = np.argmax(prob_cnn, axis=-1, ) 334 | # print(preds_cnn.shape) 335 | # print(preds_align) 336 | p3, f3 = eval(y_true=labels, y_pred=preds_cnn) 337 | #print("accuracy:", p3) 338 | print("f1-score:", f3) 339 | 340 | print("Attention") 341 | prob_atten = extractProb2(file="./result/attention/attention_prob_"+type+"_epoch4.txt") 342 | #print("prob_atten.shape", prob_atten.shape) 343 | # print("prob_atten:",prob_atten) 344 | preds_atten = np.argmax(prob_atten, axis=-1, ) 345 | # print(preds_tten.shape) 346 | # print(preds_tten) 347 | p4, f4 = eval(y_true=labels, y_pred=preds_atten) 348 | #print("accuracy:", p4) 349 | print("f1-score:", f4) 350 | 351 | print("BiLSTM") 352 | prob_bilstm = extractProb2(file="./result/bilstm/bilstm_prob_"+type+"_epoch3.txt") 353 | #print("prob_bilstm.shape", prob_bilstm.shape) 354 | # print("prob_atten:",prob_atten) 355 | preds_bilstm = np.argmax(prob_bilstm, axis=-1, ) 356 | # print(preds_tten.shape) 357 | # print(preds_tten) 358 | p5, f5 = eval(y_true=labels, y_pred=preds_bilstm) 359 | #print("accuracy:", p5) 360 | print("f1-score:", f5) 361 | 362 | 363 | if __name__ =="__main__": 364 | #print("read extra info test:") 365 | #readExtraInfo(file="./data/dataset/pos_train_tag.txt") 366 | #readExtraInfo(file="./data/dataset/pos_test_tag.txt") 367 | #readExtraInfo(file="./data/dataset/length_train_tag.txt") 368 | #readExtraInfo(file="./data/dataset/length_test_tag.txt") 369 | #readEmbeddings(file="./data/embeddings/word_vec.txt") 370 | #readEmbeddings(file="./data/embeddings/char_vec.txt") 371 | #getCWE(word_embed_file="./data/embeddings/word_vec.txt",char_embed_file="./data/embeddings/char_vec.txt") 372 | statistic(type="valid") 373 | 374 | 375 | 376 | 377 | 378 | 379 | 380 | 381 | 382 | ''' 383 | def getTag(preds_pw,preds_pph,preds_iph): 384 | # get complex "#" index 385 | length = preds_pw.shape[0] 386 | complex = np.array([preds_iph, preds_pph, preds_pw]) 387 | arg = np.argmax(complex, axis=0) 388 | # print("arg:\n", arg) 389 | for i in range(length): 390 | if arg[i] == 0: 391 | if complex[0, i] == 2: 392 | arg[i] = 6 
393 | else: 394 | arg[i] = 0 395 | if arg[i] == 1: 396 | if complex[1, i] == 2: 397 | arg[i] = 4 398 | else: 399 | arg[i] = 0 400 | if arg[i] == 2: 401 | if complex[2, i] == 2: 402 | arg[i] = 2 403 | else: 404 | arg[i] = 0 405 | arg = (arg / 2).astype(dtype=np.int32) 406 | return arg 407 | 408 | #recover to original result 409 | def recover(X,preds_pw,preds_pph,preds_iph,filename): 410 | #shape of arg:[test_size,max_sentence_size] 411 | arg=np.reshape(getTag(preds_pw,preds_pph,preds_iph),newshape=(-1,parameter.MAX_SENTENCE_SIZE)) 412 | #print("arg.shape",arg.shape) 413 | #print("arg:\n", arg) 414 | #get id2words 415 | df_words_ids = pd.read_csv(filepath_or_buffer="./dataset/temptest/words_ids.csv", encoding="utf-8") 416 | #print(df_words_ids.head(5)) 417 | id2words = pd.Series(data=df_words_ids["words"].values, index=df_words_ids["id"].values) 418 | #print(id2words[2]) 419 | doc="" 420 | for i in range(X.shape[0]): 421 | sentence="" 422 | for j in range(X.shape[1]): 423 | if(X[i][j])==0: 424 | break 425 | else: 426 | sentence+=id2words[X[i][j]] 427 | if(arg[i][j]!=0): 428 | sentence+=("#"+str(arg[i][j])) 429 | sentence+="\n" 430 | doc+=sentence 431 | f=open(filename,mode="w",encoding="utf-8") 432 | f.write(doc) 433 | f.close() 434 | ''' --------------------------------------------------------------------------------
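The per-model probability files scored by statistic() in util.py are also the natural input to the fusion learners under models/ (lf.py, gbdt1.py, gbdt2.py, rf.py, xgb.py). Their exact feature construction is not shown in this dump, so the snippet below is only a minimal sketch of the general idea, assuming the per-token 3-class probabilities of each base model are concatenated column-wise and that all files are row-aligned (the same assumption statistic() already makes); the file paths are the validation-split names used in statistic().

import sys
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score

sys.path.append("..")  # assumes we run from models/, with util.py in the repo root
from util import extractProb, extractProb2

# Per-token 3-class probabilities of the base models on the validation split;
# the CRF result file also carries the reference labels.
prob_crf, labels, _ = extractProb(file="./result/crf/crf_prob_valid.txt")
prob_align = extractProb2(file="./result/alignment/alignment_prob_valid_epoch5.txt")
prob_bilstm = extractProb2(file="./result/bilstm/bilstm_prob_valid_epoch3.txt")

# Hypothetical fusion features: one row per token, columns = all models' probabilities.
X_fusion = np.hstack([prob_crf, prob_align, prob_bilstm])

# Fit a simple fusion classifier; for a real evaluation it would be fit on the
# validation files and scored on the corresponding test files instead.
fusion = GradientBoostingClassifier(n_estimators=100)
fusion.fit(X_fusion, labels)
preds_fusion = fusion.predict(X_fusion)
print("fusion f1-score:", f1_score(y_true=labels, y_pred=preds_fusion, average=None))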