├── README.md
├── alignment.py
├── attention_based_model.py
├── data
│   └── corpus
│       └── prosody.txt
├── data_processing.py
├── model.py
├── parameter.py
├── seq2seq.py
├── temp_test.py
└── util.py

/README.md:
--------------------------------------------------------------------------------
1 | # Attention Based Chinese Prosody Prediction
2 | 
3 | ## **Requirements**
4 | >python3.5+
5 | 
6 | >tensorflow>=1.4
7 | 
8 | >numpy
9 | 
10 | >pandas
11 | 
12 | >scikit-learn
13 | 
14 | ## **File Description**
15 | `alignment.py`: prosody prediction with the alignment model
16 | 
17 | `data_processing.py`: converts the corpus and builds the dataset.
18 | 
19 | 
20 | ## **Steps**
21 | ##### 1. run `python data_processing.py`
22 | >convert the corpus and build the dataset.
23 | 
24 | ##### 2. run `python alignment.py`
25 | >use the basic alignment model to do prosody prediction
26 | 
27 | 
28 | 
--------------------------------------------------------------------------------
/alignment.py:
--------------------------------------------------------------------------------
1 | '''
2 | basic alignment model for prosody prediction (no attention)
3 | '''
4 | 
5 | import numpy as np
6 | import pandas as pd
7 | import tensorflow as tf
8 | import tensorflow.contrib.rnn as rnn
9 | import tensorflow.contrib.seq2seq as seq2seq
10 | import time
11 | import os
12 | import parameter
13 | import util
14 | 
15 | class Alignment_Seq2Seq():
16 |     def __init__(self):
17 |         # basic environment
18 |         self.graph = tf.Graph()
19 |         self.session = tf.Session(graph=self.graph)
20 | 
21 |         # basic parameters
22 |         self.learning_rate = parameter.LEARNING_RATE
23 |         self.max_epoch = parameter.MAX_EPOCH
24 |         self.embedding_size = parameter.EMBEDDING_SIZE
25 |         self.class_num = parameter.CLASS_NUM
26 |         self.hidden_units_num = parameter.HIDDEN_UNITS_NUM
27 |         self.hidden_units_num2 = parameter.HIDDEN_UNITS_NUM2
28 |         self.layer_num = parameter.LAYER_NUM
29 |         self.max_sentence_size = parameter.MAX_SENTENCE_SIZE
30 |         self.vocab_size = parameter.VOCAB_SIZE
31 |         self.batch_size = parameter.BATCH_SIZE
32 |         self.lambda_pw = parameter.LAMBDA_PW
33 |         self.lambda_pph = parameter.LAMBDA_PPH
34 |         self.lambda_iph = parameter.LAMBDA_IPH
35 | 
36 |     # encoder: takes the forward cell, the backward cell and the inputs
37 |     # returns the concatenated encoder outputs and the concatenated final states
38 |     def encoder(self, cell_forward, cell_backward, inputs, seq_length, scope_name):
39 |         outputs, states = tf.nn.bidirectional_dynamic_rnn(
40 |             cell_fw=cell_forward,
41 |             cell_bw=cell_backward,
42 |             inputs=inputs,
43 |             sequence_length=seq_length,
44 |             dtype=tf.float32,
45 |             scope=scope_name
46 |         )
47 | 
48 |         outputs_forward = outputs[0]   # shape is [batch_size, max_time, cell_fw.output_size]
49 |         outputs_backward = outputs[1]  # shape is [batch_size, max_time, cell_bw.output_size]
50 |         states_forward = states[0]     # .c:[batch_size,num_units]  .h:[batch_size,num_units]
51 |         states_backward = states[1]
52 |         # concat final outputs [batch_size, max_time, cell_fw.output_size*2]
53 |         encoder_outputs = tf.concat(values=[outputs_forward, outputs_backward], axis=2)
54 |         # concat final states
55 |         state_h_concat = tf.concat(values=[states_forward.h, states_backward.h], axis=1, name="state_h_concat")
56 |         # print("state_h_concat:", state_h_concat)
57 |         state_c_concat = tf.concat(values=[states_forward.c, states_backward.c], axis=1, name="state_c_concat")
58 |         # print("state_c_concat:", state_c_concat)
59 |         encoder_states = rnn.LSTMStateTuple(c=state_c_concat, h=state_h_concat)
60 | 
61 |         return encoder_outputs, encoder_states
62 | 
63 |     def decoder(self, cell, initial_state, inputs, scope_name):
64 |         # outputs:[batch_size,time_steps,hidden_size*2]
65 |         outputs, states = 
tf.nn.dynamic_rnn( 66 | cell=cell, 67 | inputs=inputs, 68 | initial_state=initial_state, 69 | scope=scope_name 70 | ) 71 | #[batch_size*time_steps,hidden_size*2] 72 | decoder_outputs = tf.reshape(tensor=outputs, shape=(-1, self.hidden_units_num*2)) 73 | return decoder_outputs 74 | 75 | # forward process and training process 76 | def fit(self, X_train, y_train, len_train, X_validation, y_validation, len_validation, name, print_log=True): 77 | # ---------------------------------------forward computation--------------------------------------------# 78 | y_train_pw = y_train[0] 79 | y_train_pph = y_train[1] 80 | y_train_iph = y_train[2] 81 | 82 | y_validation_pw = y_validation[0] 83 | y_validation_pph = y_validation[1] 84 | y_validation_iph = y_validation[2] 85 | # ---------------------------------------define graph---------------------------------------------# 86 | with self.graph.as_default(): 87 | # data place holder 88 | self.X_p = tf.placeholder( 89 | dtype=tf.int32, 90 | shape=(None, self.max_sentence_size), 91 | name="input_placeholder" 92 | ) 93 | 94 | self.y_p_pw = tf.placeholder( 95 | dtype=tf.int32, 96 | shape=(None, self.max_sentence_size), 97 | name="label_placeholder_pw" 98 | ) 99 | self.y_p_pph = tf.placeholder( 100 | dtype=tf.int32, 101 | shape=(None, self.max_sentence_size), 102 | name="label_placeholder_pph" 103 | ) 104 | self.y_p_iph = tf.placeholder( 105 | dtype=tf.int32, 106 | shape=(None, self.max_sentence_size), 107 | name="label_placeholder_iph" 108 | ) 109 | 110 | # 相应序列的长度占位 111 | self.seq_len_p = tf.placeholder( 112 | dtype=tf.int32, 113 | shape=(None,), 114 | name="seq_len" 115 | ) 116 | 117 | #用来去掉padding的mask 118 | self.mask = tf.sequence_mask( 119 | lengths=self.seq_len_p, 120 | maxlen=self.max_sentence_size, 121 | name="mask" 122 | ) 123 | 124 | #去掉padding之后的labels 125 | y_p_pw_masked = tf.boolean_mask( #shape[seq_len1+seq_len2+....+,] 126 | tensor=self.y_p_pw, 127 | mask=self.mask, 128 | name="y_p_pw_masked" 129 | ) 130 | y_p_pph_masked = tf.boolean_mask( # shape[seq_len1+seq_len2+....+,] 131 | tensor=self.y_p_pph, 132 | mask=self.mask, 133 | name="y_p_pph_masked" 134 | ) 135 | y_p_iph_masked = tf.boolean_mask( # shape[seq_len1+seq_len2+....+,] 136 | tensor=self.y_p_iph, 137 | mask=self.mask, 138 | name="y_p_iph_masked" 139 | ) 140 | 141 | # embeddings 142 | self.embeddings = tf.Variable( 143 | initial_value=tf.zeros(shape=(self.vocab_size, self.embedding_size), dtype=tf.float32), 144 | name="embeddings" 145 | ) 146 | 147 | # -------------------------------------PW----------------------------------------------------- 148 | # embeded inputs:[batch_size,MAX_TIME_STPES,embedding_size] 149 | inputs_pw = tf.nn.embedding_lookup(params=self.embeddings, ids=self.X_p, name="embeded_input_pw") 150 | 151 | # encoder cells 152 | # forward part 153 | en_lstm_forward1_pw = rnn.BasicLSTMCell(num_units=self.hidden_units_num) 154 | # en_lstm_forward2=rnn.BasicLSTMCell(num_units=self.hidden_units_num2) 155 | # en_lstm_forward=rnn.MultiRNNCell(cells=[en_lstm_forward1,en_lstm_forward2]) 156 | 157 | # backward part 158 | en_lstm_backward1_pw = rnn.BasicLSTMCell(num_units=self.hidden_units_num) 159 | # en_lstm_backward2=rnn.BasicLSTMCell(num_units=self.hidden_units_num2) 160 | # en_lstm_backward=rnn.MultiRNNCell(cells=[en_lstm_backward1,en_lstm_backward2]) 161 | 162 | # decoder cells 163 | de_lstm_pw = rnn.BasicLSTMCell(num_units=self.hidden_units_num*2) 164 | 165 | # encode 166 | encoder_outputs_pw, encoder_states_pw = self.encoder( 167 | cell_forward=en_lstm_forward1_pw, 168 | 
cell_backward=en_lstm_backward1_pw, 169 | inputs=inputs_pw, 170 | seq_length=self.seq_len_p, 171 | scope_name="en_lstm_pw" 172 | ) 173 | # decode 174 | h_pw = self.decoder( # shape of h is [batch*time_steps,hidden_units*2] 175 | cell=de_lstm_pw, 176 | initial_state=encoder_states_pw, 177 | inputs=encoder_outputs_pw, 178 | scope_name="de_lstm_pw" 179 | ) 180 | 181 | # fully connect layer(projection) 182 | w_pw = tf.Variable( 183 | initial_value=tf.random_normal(shape=(self.hidden_units_num*2, self.class_num)), 184 | name="weights_pw" 185 | ) 186 | b_pw = tf.Variable( 187 | initial_value=tf.random_normal(shape=(self.class_num,)), 188 | name="bias_pw" 189 | ) 190 | #logits 191 | logits_pw = tf.matmul(h_pw, w_pw) + b_pw #logits_pw:[batch_size*max_time, 3] 192 | logits_normal_pw=tf.reshape( #logits in an normal way:[batch_size,max_time_stpes,3] 193 | tensor=logits_pw, 194 | shape=(-1,self.max_sentence_size,3), 195 | name="logits_normal_pw" 196 | ) 197 | logits_pw_masked = tf.boolean_mask( # logits_pw_masked [seq_len1+seq_len2+....+,3] 198 | tensor=logits_normal_pw, 199 | mask=self.mask, 200 | name="logits_pw_masked" 201 | ) 202 | 203 | # prediction 204 | pred_pw = tf.cast(tf.argmax(logits_pw, 1), tf.int32, name="pred_pw") # pred_pw:[batch_size*max_time,] 205 | pred_normal_pw = tf.reshape( # pred in an normal way,[batch_size, max_time] 206 | tensor=pred_pw, 207 | shape=(-1, self.max_sentence_size), 208 | name="pred_normal_pw" 209 | ) 210 | 211 | pred_pw_masked = tf.boolean_mask( # logits_pw_masked [seq_len1+seq_len2+....+,] 212 | tensor=pred_normal_pw, 213 | mask=self.mask, 214 | name="pred_pw_masked" 215 | ) 216 | 217 | pred_normal_one_hot_pw = tf.one_hot( # one-hot the pred_normal:[batch_size, max_time,class_num] 218 | indices=pred_normal_pw, 219 | depth=self.class_num, 220 | name="pred_normal_one_hot_pw" 221 | ) 222 | 223 | # loss 224 | self.loss_pw = tf.losses.sparse_softmax_cross_entropy( 225 | labels=y_p_pw_masked, 226 | logits=logits_pw_masked 227 | )+tf.contrib.layers.l2_regularizer(self.lambda_pw)(w_pw) 228 | # --------------------------------------------------------------------------------------- 229 | 230 | # ----------------------------------PPH-------------------------------------------------- 231 | # embeded inputs:[batch_size,MAX_TIME_STPES,embedding_size] 232 | inputs_pph = tf.nn.embedding_lookup(params=self.embeddings, ids=self.X_p, name="embeded_input_pph") 233 | # shape of inputs[batch_size,max_time_stpes,embeddings_dims+class_num] 234 | inputs_pph = tf.concat(values=[inputs_pph, pred_normal_one_hot_pw], axis=2, name="inputs_pph") 235 | # print("shape of input_pph:", inputs_pph.shape) 236 | 237 | # encoder cells 238 | # forward part 239 | en_lstm_forward1_pph = rnn.BasicLSTMCell(num_units=self.hidden_units_num) 240 | # en_lstm_forward2=rnn.BasicLSTMCell(num_units=self.hidden_units_num2) 241 | # en_lstm_forward=rnn.MultiRNNCell(cells=[en_lstm_forward1,en_lstm_forward2]) 242 | 243 | # backward part 244 | en_lstm_backward1_pph = rnn.BasicLSTMCell(num_units=self.hidden_units_num) 245 | # en_lstm_backward2=rnn.BasicLSTMCell(num_units=self.hidden_units_num2) 246 | # en_lstm_backward=rnn.MultiRNNCell(cells=[en_lstm_backward1,en_lstm_backward2]) 247 | 248 | # decoder cells 249 | de_lstm_pph = rnn.BasicLSTMCell(num_units=self.hidden_units_num*2) 250 | 251 | # encode 252 | encoder_outputs_pph, encoder_states_pph = self.encoder( 253 | cell_forward=en_lstm_forward1_pph, 254 | cell_backward=en_lstm_backward1_pph, 255 | inputs=inputs_pph, 256 | seq_length=self.seq_len_p, 257 | 
scope_name="en_lstm_pph" 258 | ) 259 | # shape of h is [batch*time_steps,hidden_units*2] 260 | h_pph = self.decoder( 261 | cell=de_lstm_pph, 262 | initial_state=encoder_states_pph, 263 | inputs=encoder_outputs_pph, 264 | scope_name="de_lstm_pph" 265 | ) 266 | 267 | # fully connect layer(projection) 268 | w_pph = tf.Variable( 269 | initial_value=tf.random_normal(shape=(self.hidden_units_num*2, self.class_num)), 270 | name="weights_pph" 271 | ) 272 | b_pph = tf.Variable( 273 | initial_value=tf.random_normal(shape=(self.class_num,)), 274 | name="bias_pph" 275 | ) 276 | # logits 277 | logits_pph = tf.matmul(h_pph, w_pph) + b_pph # shape of logits:[batch_size*max_time, 3] 278 | logits_normal_pph = tf.reshape( # logits in an normal way:[batch_size,max_time_stpes,3] 279 | tensor=logits_pph, 280 | shape=(-1, self.max_sentence_size, 3), 281 | name="logits_normal_pph" 282 | ) 283 | logits_pph_masked = tf.boolean_mask( # [seq_len1+seq_len2+....+,3] 284 | tensor=logits_normal_pph, 285 | mask=self.mask, 286 | name="logits_pph_masked" 287 | ) 288 | 289 | # prediction 290 | pred_pph = tf.cast(tf.argmax(logits_pph, 1), tf.int32, name="pred_pph") # pred_pph:[batch_size*max_time,] 291 | pred_normal_pph = tf.reshape( # pred in an normal way,[batch_size, max_time] 292 | tensor=pred_pph, 293 | shape=(-1, self.max_sentence_size), 294 | name="pred_normal_pph" 295 | ) 296 | pred_pph_masked = tf.boolean_mask( # logits_pph_masked [seq_len1+seq_len2+....+,] 297 | tensor=pred_normal_pph, 298 | mask=self.mask, 299 | name="pred_pph_masked" 300 | ) 301 | pred_normal_one_hot_pph = tf.one_hot( # one-hot the pred_normal:[batch_size, max_time,class_num] 302 | indices=pred_normal_pph, 303 | depth=self.class_num, 304 | name="pred_normal_one_hot_pph" 305 | ) 306 | 307 | # loss 308 | self.loss_pph = tf.losses.sparse_softmax_cross_entropy( 309 | labels=y_p_pph_masked, 310 | logits=logits_pph_masked 311 | )+tf.contrib.layers.l2_regularizer(self.lambda_pph)(w_pph) 312 | # ------------------------------------------------------------------------------------ 313 | 314 | # ---------------------------------------IPH------------------------------------------ 315 | # embeded inputs:[batch_size,MAX_TIME_STPES,embedding_size] 316 | inputs_iph = tf.nn.embedding_lookup(params=self.embeddings, ids=self.X_p, name="embeded_input_iph") 317 | # shape of inputs[batch_size,max_time_stpes,embeddings_dims+class_num] 318 | inputs_iph = tf.concat(values=[inputs_iph, pred_normal_one_hot_pph], axis=2, name="inputs_pph") 319 | # print("shape of input_pph:", inputs_pph.shape) 320 | # encoder cells 321 | # forward part 322 | en_lstm_forward1_iph = rnn.BasicLSTMCell(num_units=self.hidden_units_num) 323 | # en_lstm_forward2=rnn.BasicLSTMCell(num_units=self.hidden_units_num2) 324 | # en_lstm_forward=rnn.MultiRNNCell(cells=[en_lstm_forward1,en_lstm_forward2]) 325 | 326 | # backward part 327 | en_lstm_backward1_iph = rnn.BasicLSTMCell(num_units=self.hidden_units_num) 328 | # en_lstm_backward2=rnn.BasicLSTMCell(num_units=self.hidden_units_num2) 329 | # en_lstm_backward=rnn.MultiRNNCell(cells=[en_lstm_backward1,en_lstm_backward2]) 330 | 331 | # decoder cells 332 | de_lstm_iph = rnn.BasicLSTMCell(num_units=self.hidden_units_num*2) 333 | 334 | # encode 335 | encoder_outputs_iph, encoder_states_iph = self.encoder( 336 | cell_forward=en_lstm_forward1_iph, 337 | cell_backward=en_lstm_backward1_iph, 338 | inputs=inputs_iph, 339 | seq_length=self.seq_len_p, 340 | scope_name="en_lstm_iph" 341 | ) 342 | # shape of h is [batch*time_steps,hidden_units*2] 343 | h_iph = 
self.decoder( 344 | cell=de_lstm_iph, 345 | initial_state=encoder_states_iph, 346 | inputs=encoder_outputs_iph, 347 | scope_name="de_lstm_iph" 348 | ) 349 | 350 | # fully connect layer(projection) 351 | w_iph = tf.Variable( 352 | initial_value=tf.random_normal(shape=(self.hidden_units_num*2, self.class_num)), 353 | name="weights_iph" 354 | ) 355 | b_iph = tf.Variable( 356 | initial_value=tf.random_normal(shape=(self.class_num,)), 357 | name="bias_iph" 358 | ) 359 | # logits 360 | logits_iph = tf.matmul(h_iph, w_iph) + b_iph # shape of logits:[batch_size*max_time, 3] 361 | logits_normal_iph = tf.reshape( # logits in an normal way:[batch_size,max_time_stpes,3] 362 | tensor=logits_iph, 363 | shape=(-1, self.max_sentence_size, 3), 364 | name="logits_normal_iph" 365 | ) 366 | logits_iph_masked = tf.boolean_mask( # [seq_len1+seq_len2+....+,3] 367 | tensor=logits_normal_iph, 368 | mask=self.mask, 369 | name="logits_iph_masked" 370 | ) 371 | 372 | # prediction 373 | pred_iph = tf.cast(tf.argmax(logits_iph, 1), tf.int32, name="pred_iph") # pred_iph:[batch_size*max_time,] 374 | pred_normal_iph = tf.reshape( # pred in an normal way,[batch_size, max_time] 375 | tensor=pred_iph, 376 | shape=(-1, self.max_sentence_size), 377 | name="pred_normal_iph" 378 | ) 379 | pred_iph_masked = tf.boolean_mask( # logits_iph_masked [seq_len1+seq_len2+....+,] 380 | tensor=pred_normal_iph, 381 | mask=self.mask, 382 | name="pred_iph_masked" 383 | ) 384 | pred_normal_one_hot_iph = tf.one_hot( # one-hot the pred_normal:[batch_size, max_time,class_num] 385 | indices=pred_normal_iph, 386 | depth=self.class_num, 387 | name="pred_normal_one_hot_iph" 388 | ) 389 | # loss 390 | self.loss_iph = tf.losses.sparse_softmax_cross_entropy( 391 | labels=y_p_iph_masked, 392 | logits=logits_iph_masked 393 | )+tf.contrib.layers.l2_regularizer(self.lambda_iph)(w_iph) 394 | 395 | # --------------------------------------------------------------------------------------- 396 | # loss 397 | self.loss = self.loss_pw + self.loss_pph + self.loss_iph 398 | # optimizer 399 | self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(self.loss) 400 | self.init_op = tf.global_variables_initializer() 401 | self.init_local_op = tf.local_variables_initializer() 402 | 403 | # ------------------------------------Session----------------------------------------- 404 | with self.session as sess: 405 | print("Training Start") 406 | sess.run(self.init_op) # initialize all variables 407 | sess.run(self.init_local_op) 408 | 409 | train_Size = X_train.shape[0]; 410 | validation_Size = X_validation.shape[0] 411 | self.best_validation_loss = 1000 # best validation accuracy in training process 412 | 413 | # epoch 414 | for epoch in range(1, self.max_epoch + 1): 415 | print("Epoch:", epoch) 416 | start_time = time.time() # time evaluation 417 | # training loss/accuracy in every mini-batch 418 | self.train_losses = [] 419 | self.train_accus_pw = [] 420 | self.train_accus_pph = [] 421 | self.train_accus_iph = [] 422 | 423 | self.c1_f_pw = []; 424 | self.c2_f_pw = [] # each class's f1 score 425 | self.c1_f_pph = []; 426 | self.c2_f_pph = [] 427 | self.c1_f_iph = []; 428 | self.c2_f_iph = [] 429 | 430 | # mini batch 431 | for i in range(0, (train_Size // self.batch_size)): 432 | #注意:这里获取的都是mask之后的值 433 | _, train_loss, y_train_pw_masked,y_train_pph_masked,y_train_iph_masked,\ 434 | train_pred_pw, train_pred_pph, train_pred_iph = sess.run( 435 | fetches=[self.optimizer, self.loss, 436 | y_p_pw_masked,y_p_pph_masked,y_p_iph_masked, 437 | 
pred_pw_masked, pred_pph_masked, pred_iph_masked],
438 |                         feed_dict={
439 |                             self.X_p: X_train[i * self.batch_size:(i + 1) * self.batch_size],
440 |                             self.y_p_pw: y_train_pw[i * self.batch_size:(i + 1) * self.batch_size],
441 |                             self.y_p_pph: y_train_pph[i * self.batch_size:(i + 1) * self.batch_size],
442 |                             self.y_p_iph: y_train_iph[i * self.batch_size:(i + 1) * self.batch_size],
443 |                             self.seq_len_p: len_train[i * self.batch_size:(i + 1) * self.batch_size]
444 |                         }
445 |                     )
446 | 
447 |                     # loss
448 |                     self.train_losses.append(train_loss)
449 |                     # metrics
450 | 
451 |                     accuracy_pw, f1_1_pw, f1_2_pw = util.eval(y_true=y_train_pw_masked, y_pred=train_pred_pw)      # pw
452 |                     accuracy_pph, f1_1_pph, f1_2_pph = util.eval(y_true=y_train_pph_masked, y_pred=train_pred_pph)  # pph
453 |                     accuracy_iph, f1_1_iph, f1_2_iph = util.eval(y_true=y_train_iph_masked, y_pred=train_pred_iph)  # iph
454 | 
455 |                     self.train_accus_pw.append(accuracy_pw)
456 |                     self.train_accus_pph.append(accuracy_pph)
457 |                     self.train_accus_iph.append(accuracy_iph)
458 |                     # F1-score
459 |                     self.c1_f_pw.append(f1_1_pw);
460 |                     self.c2_f_pw.append(f1_2_pw)
461 |                     self.c1_f_pph.append(f1_1_pph);
462 |                     self.c2_f_pph.append(f1_2_pph)
463 |                     self.c1_f_iph.append(f1_1_iph);
464 |                     self.c2_f_iph.append(f1_2_iph)
465 | 
466 |                 # validation in every epoch
467 |                 self.validation_loss, y_valid_pw_masked, y_valid_pph_masked, y_valid_iph_masked,\
468 |                 valid_pred_pw, valid_pred_pph, valid_pred_iph = sess.run(
469 |                     fetches=[self.loss, y_p_pw_masked, y_p_pph_masked, y_p_iph_masked,
470 |                              pred_pw_masked, pred_pph_masked, pred_iph_masked],
471 |                     feed_dict={
472 |                         self.X_p: X_validation,
473 |                         self.y_p_pw: y_validation_pw,
474 |                         self.y_p_pph: y_validation_pph,
475 |                         self.y_p_iph: y_validation_iph,
476 |                         self.seq_len_p: len_validation
477 |                     }
478 |                 )
479 |                 # print("valid_pred_pw.shape:", valid_pred_pw.shape)
480 |                 # print("valid_pred_pph.shape:", valid_pred_pph.shape)
481 |                 # print("valid_pred_iph.shape:", valid_pred_iph.shape)
482 | 
483 |                 # metrics
484 |                 self.valid_accuracy_pw, self.valid_f1_1_pw, self.valid_f1_2_pw = util.eval(y_true=y_valid_pw_masked, y_pred=valid_pred_pw)
485 |                 self.valid_accuracy_pph, self.valid_f1_1_pph, self.valid_f1_2_pph = util.eval(y_true=y_valid_pph_masked, y_pred=valid_pred_pph)
486 |                 self.valid_accuracy_iph, self.valid_f1_1_iph, self.valid_f1_2_iph = util.eval(y_true=y_valid_iph_masked, y_pred=valid_pred_iph)
487 | 
488 |                 # print information
489 |                 print("Epoch ", epoch, " finished.", "spend ", round((time.time() - start_time) / 60, 2), " mins")
490 |                 self.showInfo(type="training")
491 |                 self.showInfo(type="validation")
492 | 
493 |                 # when we get a new best loss on the validation set, we store the model
494 |                 if self.validation_loss < self.best_validation_loss:
495 |                     self.best_validation_loss = self.validation_loss
496 |                     print("New Best loss ", self.best_validation_loss, " On Validation set! ")
497 |                     print("Saving Models......\n\n")
498 |                     # does the ./models folder exist?
499 |                     if not os.path.exists("./models/"):
500 |                         os.mkdir(path="./models/")
501 |                     if not os.path.exists("./models/" + name):
502 |                         os.mkdir(path="./models/" + name)
503 |                     if not os.path.exists("./models/" + name + "/bilstm"):
504 |                         os.mkdir(path="./models/" + name + "/bilstm")
505 |                     # create saver
506 |                     saver = tf.train.Saver()
507 |                     saver.save(sess, "./models/" + name + "/bilstm/my-model-10000")
508 |                     # Generates MetaGraphDef.
509 |                     saver.export_meta_graph("./models/" + name + "/bilstm/my-model-10000.meta")
510 |                     print("\n\n")
511 |                 # test: using X_validation
512 |                 test_pred_pw, test_pred_pph, test_pred_iph = sess.run(
513 |                     fetches=[pred_pw, pred_pph, pred_iph],
514 |                     feed_dict={
515 |                         self.X_p: X_validation,
516 |                         self.seq_len_p: len_validation
517 |                     }
518 |                 )
519 |                 # recover to original corpus txt
520 |                 # shape of test_pred_pw, test_pred_pph, test_pred_iph: [corpus_size*time_steps]
521 |                 util.recover(
522 |                     X=X_validation,
523 |                     preds_pw=test_pred_pw,
524 |                     preds_pph=test_pred_pph,
525 |                     preds_iph=test_pred_iph,
526 |                     filename="recover_epoch_" + str(epoch) + ".txt"
527 |                 )
528 | 
529 |     # returns predictions or accuracy: when y is not None return the accuracy, when y is None return the predictions
530 |     def pred(self, name, X, y=None):
531 |         start_time = time.time()  # compute time
532 |         if y is None:
533 |             with self.session as sess:
534 |                 # restore model
535 |                 new_saver = tf.train.import_meta_graph(
536 |                     meta_graph_or_file="./models/" + name + "/bilstm/my-model-10000.meta",
537 |                     clear_devices=True
538 |                 )
539 |                 new_saver.restore(sess, "./models/" + name + "/bilstm/my-model-10000")
540 |                 # get default graph
541 |                 graph = tf.get_default_graph()
542 |                 # get operation from the graph
543 |                 pred_normal = graph.get_operation_by_name("pred_normal").outputs[0]
544 |                 X_p = graph.get_operation_by_name("input_placeholder").outputs[0]
545 |                 pred = sess.run(fetches=pred_normal, feed_dict={X_p: X})
546 |                 print("this operation spends ", round((time.time() - start_time) / 60, 2), " mins")
547 |                 return pred
548 |         else:
549 |             with self.session as sess:
550 |                 # restore model
551 |                 new_saver = tf.train.import_meta_graph(
552 |                     meta_graph_or_file="./models/" + name + "/bilstm/my-model-10000.meta",
553 |                     clear_devices=True
554 |                 )
555 |                 new_saver.restore(sess, "./models/" + name + "/bilstm/my-model-10000")
556 |                 graph = tf.get_default_graph()
557 |                 # get operation from the graph
558 |                 accuracy = graph.get_operation_by_name("accuracy").outputs[0]
559 |                 X_p = graph.get_operation_by_name("input_placeholder").outputs[0]
560 |                 y_p = graph.get_operation_by_name("label_placeholder").outputs[0]
561 |                 # forward and get the results
562 |                 accu = sess.run(fetches=accuracy, feed_dict={X_p: X, y_p: y})
563 |                 print("this operation spends ", round((time.time() - start_time) / 60, 2), " mins")
564 |                 return accu
565 | 
566 | 
567 |     def showInfo(self, type):
568 |         if type == "training":
569 |             # training information
570 |             print("  /**Training info**/")
571 |             print("----average training loss:", sum(self.train_losses) / len(self.train_losses))
572 |             print("PW:")
573 |             print("----average accuracy:", sum(self.train_accus_pw) / len(self.train_accus_pw))
574 |             print("----average f1-Score of N:", sum(self.c1_f_pw) / len(self.c1_f_pw))
575 |             print("----average f1-Score of B:", sum(self.c2_f_pw) / len(self.c2_f_pw))
576 |             print("PPH:")
577 |             print("----average accuracy:", sum(self.train_accus_pph) / len(self.train_accus_pph))
578 |             print("----average f1-Score of N:", sum(self.c1_f_pph) / len(self.c1_f_pph))
579 |             print("----average f1-Score of B:", sum(self.c2_f_pph) / len(self.c2_f_pph))
580 |             print("IPH:")
581 |             print("----average accuracy:", sum(self.train_accus_iph) / len(self.train_accus_iph))
582 |             print("----average f1-Score of N:", sum(self.c1_f_iph) / len(self.c1_f_iph))
583 |             print("----average f1-Score of B:", sum(self.c2_f_iph) / len(self.c2_f_iph))
584 |         else:
585 |             print("  /**Validation info**/")
586 |             print("----average validation loss:", self.validation_loss)
587 |             print("PW:")
588 |             print("----average accuracy:", self.valid_accuracy_pw)
589 |             print("----average f1-Score of N:", self.valid_f1_1_pw)
590 |             print("----average f1-Score of B:", self.valid_f1_2_pw)
591 |             print("PPH:")
592 |             print("----average accuracy:", self.valid_accuracy_pph)
593 |             print("----average f1-Score of N:", self.valid_f1_1_pph)
594 |             print("----average f1-Score of B:", self.valid_f1_2_pph)
595 |             print("IPH:")
596 |             print("----average accuracy:", self.valid_accuracy_iph)
597 |             print("----average f1-Score of N:", self.valid_f1_1_iph)
598 |             print("----average f1-Score of B:", self.valid_f1_2_iph)
599 | 
600 | 
601 | # train && test
602 | if __name__ == "__main__":
603 |     # load data
604 |     # pw
605 |     df_train_pw = pd.read_pickle(path="./dataset/temptest/pw_summary_train.pkl")
606 |     df_validation_pw = pd.read_pickle(path="./dataset/temptest/pw_summary_validation.pkl")
607 |     # pph
608 |     df_train_pph = pd.read_pickle(path="./dataset/temptest/pph_summary_train.pkl")
609 |     df_validation_pph = pd.read_pickle(path="./dataset/temptest/pph_summary_validation.pkl")
610 |     # iph
611 |     df_train_iph = pd.read_pickle(path="./dataset/temptest/iph_summary_train.pkl")
612 |     df_validation_iph = pd.read_pickle(path="./dataset/temptest/iph_summary_validation.pkl")
613 | 
614 |     # the X column is identical in all three dataframes, so the pw X is used as the single X everywhere
615 |     # the labels differ, however, so each label set has to be defined separately
616 |     X_train = np.asarray(list(df_train_pw['X'].values))
617 |     X_validation = np.asarray(list(df_validation_pw['X'].values))
618 | 
619 |     # tags
620 |     y_train_pw = np.asarray(list(df_train_pw['y'].values))
621 |     y_validation_pw = np.asarray(list(df_validation_pw['y'].values))
622 | 
623 |     y_train_pph = np.asarray(list(df_train_pph['y'].values))
624 |     y_validation_pph = np.asarray(list(df_validation_pph['y'].values))
625 | 
626 |     y_train_iph = np.asarray(list(df_train_iph['y'].values))
627 |     y_validation_iph = np.asarray(list(df_validation_iph['y'].values))
628 | 
629 |     # length of every sequence
630 |     # identical in all three dataframes, so the pw one is used
631 |     len_train = np.asarray(list(df_train_pw['sentence_len'].values))
632 |     len_validation = np.asarray(list(df_validation_pw['sentence_len'].values))
633 |     print("len_train:", len_train.shape)
634 |     print("len_validation:", len_validation.shape)
635 | 
636 |     # X_train = [X_train_pw, X_train_pph, X_train_iph]
637 |     y_train = [y_train_pw, y_train_pph, y_train_iph]
638 |     # X_validation = [X_validation_pw, X_validation_pph, X_validation_iph]
639 |     y_validation = [y_validation_pw, y_validation_pph, y_validation_iph]
640 | 
641 |     # print("X_train_pw:\n", X_train_pw); print(X_train_pw.shape)
642 |     # print("X_train_pph:\n", X_train_pph); print(X_train_pph.shape)
643 |     # print("X_train_iph:\n", X_train_iph); print(X_train_iph.shape)
644 | 
645 |     # print("y_train_pw:\n", y_train_pw);
646 |     # print(y_train_pw.shape)
647 |     # print("y_train_pph:\n", y_train_pph);
648 |     # print(y_train_pph.shape)
649 |     # print("y_train_iph:\n", y_train_iph);
650 |     # print(y_train_iph.shape)
651 | 
652 |     model = Alignment_Seq2Seq()
653 |     model.fit(X_train, y_train, len_train, X_validation, y_validation, len_validation, "test", False)
--------------------------------------------------------------------------------
/attention_based_model.py:
--------------------------------------------------------------------------------
1 | '''
2 | /******model with attention********/
3 | author:xierhacker
4 | time:2018.1.08
5 | 
6 | '''
7 | 
8 | import numpy as np
9 | import pandas as pd
10 | import tensorflow as tf
11 | import tensorflow.contrib.rnn as rnn
12 | import tensorflow.contrib.seq2seq as seq2seq
13 | import time
14 | import os
15 | import 
parameter 16 | import util 17 | 18 | class Attension_Alignment_Seq2Seq(): 19 | def __init__(self): 20 | # basic environment 21 | self.graph = tf.Graph() 22 | self.session = tf.Session(graph=self.graph) 23 | 24 | #basic parameters 25 | self.learning_rate = parameter.LEARNING_RATE 26 | self.max_epoch = parameter.MAX_EPOCH 27 | self.embedding_size = parameter.EMBEDDING_SIZE 28 | self.class_num = parameter.CLASS_NUM 29 | self.hidden_units_num = parameter.HIDDEN_UNITS_NUM 30 | self.hidden_units_num2=parameter.HIDDEN_UNITS_NUM2 31 | self.layer_num = parameter.LAYER_NUM 32 | self.max_sentence_size=parameter.MAX_SENTENCE_SIZE 33 | self.vocab_size=parameter.VOCAB_SIZE 34 | self.batch_size=parameter.BATCH_SIZE 35 | 36 | #encoder 37 | def encoder(self,cell_forward,cell_backward,inputs,scope_name): 38 | outputs, states = tf.nn.bidirectional_dynamic_rnn( 39 | cell_fw=cell_forward, 40 | cell_bw=cell_backward, 41 | inputs=inputs, 42 | dtype=tf.float32, 43 | scope=scope_name 44 | ) 45 | outputs_forward = outputs[0] # shape of h is [batch_size, max_time, cell_fw.output_size] 46 | outputs_backward = outputs[1] # shape of h is [batch_size, max_time, cell_bw.output_size] 47 | #shape of h is [batch_size, max_time, cell_fw.output_size*2] 48 | encoder_outputs = tf.concat(values=[outputs_forward, outputs_backward], axis=2) 49 | 50 | states_forward=states[0] # .c:[batch_size,cell_fw.output_size] .h:[batch_size,cell_fw.output_size] 51 | states_backward=states[1] 52 | print(type(states_forward)) 53 | #shape of encoder_states_concat[2,batch_size,cell_fw.output_size*2] 54 | #encoder_states_concat = tf.concat([states_forward, states_backward], axis=2) 55 | #print(encoder_states_concat) 56 | #encoder_states=[encoder_states_concat[0],encoder_states_concat[1]] 57 | #encoder_states=tuple(encoder_states) 58 | #print(type(encoder_states)) 59 | return encoder_outputs,states_forward 60 | 61 | def decoder(self,cell,initial_state,inputs,scope_name): 62 | outputs,states=tf.nn.dynamic_rnn( 63 | cell=cell, 64 | inputs=inputs, 65 | initial_state=initial_state, 66 | scope=scope_name 67 | ) 68 | #outputs #[batch_size,time_steps,hidden_size] 69 | decoder_outputs=tf.reshape(tensor=outputs,shape=(-1,self.hidden_units_num)) 70 | return decoder_outputs 71 | 72 | 73 | def attention(self, prev_state, enc_outputs): 74 | """ 75 | :param prev_state: the decoder hidden state at time i-1 76 | :param enc_outputs: the encoder outputs, a length 'T' list. 
77 | shape of state.h:[batch_size,hidden_units_num] 78 | shape of enc_outputs:[batch_size,time_steps,hidden_units_num*2] 79 | shape of tf.matmul(prev_state, self.attention_W): [batch_size,hidden_units_num] 80 | shape of tf.matmul(output, self.attention_U): [batch_size,hidden_units_num] 81 | shape of tf.matmul(atten_hidden, self.attention_V): [batch_size,1] 82 | 83 | e_ik=g(s_i-1,h_k) 84 | """ 85 | e_i = [] 86 | c_i = [] 87 | #c=tf.zeros(shape=(enc_outputs.shape[0],self.hidden_units_num*2)) 88 | for j in range(self.max_sentence_size): 89 | atten_hidden = tf.tanh( 90 | tf.add(tf.matmul(prev_state.h, self.attention_W), tf.matmul(enc_outputs[:,j,:], self.attention_U)) 91 | ) 92 | e_i_j = tf.matmul(atten_hidden, self.attention_V) 93 | e_i.append(e_i_j) 94 | #print("len of e_i:",len(e_i)) 95 | #print("shape of elements in e_i:",e_i[0].shape) 96 | e_i = tf.concat(e_i, axis=1) #e_i shape:[batch_size,max_time_steps] 97 | alpha_i = tf.nn.softmax(e_i) #alpha_i :[batch_size,max_time_steps] 98 | #print("shape of alpha",alpha_i.shape) 99 | #print(alpha_i[:,0].shape) 100 | #comute cz 101 | for j in range(self.max_sentence_size): 102 | alpha_time_j=alpha_i[:,j] 103 | alpha_time_j=tf.reshape(tensor=alpha_time_j,shape=(-1,1)) 104 | #print("shape of alpha_time_j:",alpha_time_j.shape) 105 | c_time_j=tf.multiply(x=alpha_time_j,y=enc_outputs[:,j,:]) 106 | c_i.append(c_time_j) 107 | c_i=sum(c_i) 108 | #print("shape of c_i:",c_i.shape) 109 | return c_i #shape of c_i[batch_size,hidden_units_num*2] 110 | 111 | 112 | def decode(self, cell, init_state, enc_outputs, loop_function=None): 113 | with tf.variable_scope(name_or_scope="decode_pw",reuse=tf.AUTO_REUSE): 114 | outputs = [] 115 | prev = None 116 | state = init_state 117 | #print("type of init state:",init_state) 118 | #print("shape of init state:",init_state.shape) 119 | for i in range(self.max_sentence_size): 120 | if i > 0: 121 | tf.get_variable_scope().reuse_variables() 122 | c_i = self.attention(state, enc_outputs) #[batch_size,hidden_units_num*2] 123 | inp=tf.concat(values=[enc_outputs[:,i,:],c_i],axis=1) #[batch_size,hidden_units_num*4] 124 | #print("shape of inp:",inp.shape) 125 | output, state = cell(inp, state,scope="de_lstm") #shape of output[batch_size,hidden_units_size] 126 | #print("shape of output:",output.shape) 127 | outputs.append(output) 128 | #print("len of output:",len(outputs)) 129 | outputs=tf.concat(values=outputs,axis=0) #outputs:[batch_size*timesteps,hiddem_units_num] 130 | #print("shape of outputs:",outputs.shape) 131 | return outputs 132 | 133 | ''' 134 | for i, inp in enumerate(self.decoder_inputs_emb): 135 | if loop_function is not None and prev is not None: 136 | with tf.variable_scope("loop_function", reuse=True): 137 | inp = loop_function(prev, i) 138 | if i > 0: 139 | tf.get_variable_scope().reuse_variables() 140 | c_i = self.attention(state, enc_outputs) 141 | inp = tf.concat([inp, c_i], axis=1) 142 | output, state = cell(inp, state) 143 | # print output.eval() 144 | outputs.append(output) 145 | if loop_function is not None: 146 | prev = output 147 | return outputs 148 | ''' 149 | 150 | def loop_function(self, prev, _): 151 | """ 152 | :param prev: the output of t-1 time 153 | :param _: 154 | :return: the embedding of t-1 output 155 | """ 156 | prev = tf.add(tf.matmul(prev, self.softmax_w), self.softmax_b) 157 | prev_sympol = tf.arg_max(prev, 1) 158 | #emb_prev = tf.nn.embedding_lookup(self.target_embedding, prev_sympol) 159 | return emb_prev 160 | 161 | # forward process and training process 162 | def 
fit(self,X_train,y_train,X_validation,y_validation,name,print_log=True): 163 | #---------------------------------------forward computation--------------------------------------------# 164 | X_train_pw = X_train[0];X_train_pph = X_train[1];X_train_iph = X_train[2] 165 | y_train_pw = y_train[0];y_train_pph = y_train[1];y_train_iph = y_train[2] 166 | 167 | X_validation_pw = X_validation[0];X_validation_pph = X_validation[1];X_validation_iph = X_validation[2] 168 | y_validation_pw = y_validation[0];y_validation_pph = y_validation[1];y_validation_iph = y_validation[2] 169 | 170 | #---------------------------------------define graph---------------------------------------------# 171 | with self.graph.as_default(): 172 | # data place holder 173 | self.X_p_pw = tf.placeholder( 174 | dtype=tf.int32, 175 | shape=(None, self.max_sentence_size), 176 | name="input_placeholder_pw" 177 | ) 178 | self.y_p_pw = tf.placeholder( 179 | dtype=tf.int32, 180 | shape=(None,self.max_sentence_size), 181 | name="label_placeholder_pw" 182 | ) 183 | 184 | self.X_p_pph = tf.placeholder( 185 | dtype=tf.int32, 186 | shape=(None, self.max_sentence_size), 187 | name="input_placeholder_pph" 188 | ) 189 | 190 | self.y_p_pph = tf.placeholder( 191 | dtype=tf.int32, 192 | shape=(None, self.max_sentence_size), 193 | name="label_placeholder_pph" 194 | ) 195 | self.X_p_iph = tf.placeholder( 196 | dtype=tf.int32, 197 | shape=(None, self.max_sentence_size), 198 | name="input_placeholder_iph" 199 | ) 200 | 201 | self.y_p_iph = tf.placeholder( 202 | dtype=tf.int32, 203 | shape=(None, self.max_sentence_size), 204 | name="label_placeholder_iph" 205 | ) 206 | 207 | 208 | #attention variables 209 | self.attention_W = tf.Variable( 210 | tf.random_uniform([self.hidden_units_num, self.hidden_units_num], 0.0, 1.0), 211 | name="attention_W" 212 | ) 213 | self.attention_U = tf.Variable( 214 | tf.random_uniform([self.hidden_units_num * 2, self.hidden_units_num], 0.0, 1.0), 215 | name="attention_U" 216 | ) 217 | 218 | self.attention_V = tf.Variable( 219 | tf.random_uniform([self.hidden_units_num, 1], 0.0, 1.0), 220 | name="attention_V" 221 | ) 222 | 223 | #embeddings 224 | self.embeddings=tf.Variable( 225 | initial_value=tf.zeros(shape=(self.vocab_size,self.embedding_size),dtype=tf.float32), 226 | name="embeddings" 227 | ) 228 | 229 | #-------------------------------------PW----------------------------------------------------- 230 | #embeded inputs:[batch_size,MAX_TIME_STPES,embedding_size] 231 | inputs_pw=tf.nn.embedding_lookup(params=self.embeddings,ids=self.X_p_pw,name="embeded_input_pw") 232 | 233 | # encoder cells 234 | # forward part 235 | en_lstm_forward1_pw = rnn.BasicLSTMCell(num_units=self.hidden_units_num) 236 | # en_lstm_forward2=rnn.BasicLSTMCell(num_units=self.hidden_units_num2) 237 | # en_lstm_forward=rnn.MultiRNNCell(cells=[en_lstm_forward1,en_lstm_forward2]) 238 | 239 | # backward part 240 | en_lstm_backward1_pw = rnn.BasicLSTMCell(num_units=self.hidden_units_num) 241 | # en_lstm_backward2=rnn.BasicLSTMCell(num_units=self.hidden_units_num2) 242 | # en_lstm_backward=rnn.MultiRNNCell(cells=[en_lstm_backward1,en_lstm_backward2]) 243 | 244 | # decoder cells 245 | de_lstm_pw = rnn.BasicLSTMCell(num_units=self.hidden_units_num,reuse=tf.AUTO_REUSE) 246 | 247 | # encode 248 | encoder_outputs_pw, encoder_states_pw = self.encoder( 249 | cell_forward=en_lstm_forward1_pw, 250 | cell_backward=en_lstm_backward1_pw, 251 | inputs=inputs_pw, 252 | scope_name="en_lstm_pw" 253 | ) 254 | #print("shape of encoder_outputs:",encoder_outputs_pw.shape) 
255 | #print("shape encoder_states_pw.h",encoder_states_pw.h.shape) 256 | #print("shape encoder_states_pw.c",encoder_states_pw.c.shape) 257 | 258 | #attention test 259 | #self.attention(prev_state=encoder_states_pw,enc_outputs=encoder_outputs_pw) 260 | 261 | #decode test 262 | h_pw=self.decode( 263 | cell=de_lstm_pw, 264 | init_state=encoder_states_pw, 265 | enc_outputs=encoder_outputs_pw 266 | ) 267 | #h_pw = self.decode(self.dec_lstm_cell, enc_state, enc_outputs) 268 | #h_pw = self.decoder( 269 | # cell=de_lstm_pw, 270 | # initial_state=encoder_states_pw, 271 | # inputs=encoder_outputs_pw, 272 | # scope_name="de_lstm_pw" 273 | #) 274 | 275 | ''' 276 | ) 277 | if is_training: 278 | self. 279 | else: 280 | self.dec_outputs = self.decode(self.dec_lstm_cell, enc_state, enc_outputs, self.loop_function) 281 | # shape of h is [batch*time_steps,hidden_units] 282 | 283 | ''' 284 | # fully connect layer(projection) 285 | w_pw = tf.Variable( 286 | initial_value=tf.random_normal(shape=(self.hidden_units_num2, self.class_num)), 287 | name="weights_pw" 288 | ) 289 | b_pw = tf.Variable( 290 | initial_value=tf.random_normal(shape=(self.class_num,)), 291 | name="bias_pw" 292 | ) 293 | logits_pw = tf.matmul(h_pw, w_pw) + b_pw # shape of logits:[batch_size*max_time, 3] 294 | 295 | # prediction 296 | # shape of pred[batch_size*max_time, 1] 297 | pred_pw = tf.cast(tf.argmax(logits_pw, 1), tf.int32, name="pred_pw") 298 | 299 | # pred in an normal way,shape is [batch_size, max_time,1] 300 | pred_normal_pw = tf.reshape( 301 | tensor=pred_pw, 302 | shape=(-1, self.max_sentence_size), 303 | name="pred_normal" 304 | ) 305 | 306 | # one-hot the pred_normal:[batch_size, max_time,class_num] 307 | pred_normal_one_hot_pw = tf.one_hot( 308 | indices=pred_normal_pw, 309 | depth=self.class_num, 310 | name="pred_normal_one_hot_pw" 311 | ) 312 | 313 | # loss 314 | self.loss_pw = tf.losses.sparse_softmax_cross_entropy( 315 | labels=tf.reshape(self.y_p_pw, shape=[-1]), 316 | logits=logits_pw 317 | ) 318 | #--------------------------------------------------------------------------------------- 319 | 320 | ''' 321 | #----------------------------------PPH-------------------------------------------------- 322 | # embeded inputs:[batch_size,MAX_TIME_STPES,embedding_size] 323 | inputs_pph = tf.nn.embedding_lookup(params=self.embeddings, ids=self.X_p_pph, name="embeded_input_pph") 324 | # shape of inputs[batch_size,max_time_stpes,embeddings_dims+class_num] 325 | inputs_pph = tf.concat(values=[inputs_pph, pred_normal_one_hot_pw], axis=2, name="inputs_pph") 326 | print("shape of input_pph:", inputs_pph.shape) 327 | 328 | # encoder cells 329 | # forward part 330 | en_lstm_forward1_pph = rnn.BasicLSTMCell(num_units=self.hidden_units_num) 331 | # en_lstm_forward2=rnn.BasicLSTMCell(num_units=self.hidden_units_num2) 332 | # en_lstm_forward=rnn.MultiRNNCell(cells=[en_lstm_forward1,en_lstm_forward2]) 333 | 334 | # backward part 335 | en_lstm_backward1_pph = rnn.BasicLSTMCell(num_units=self.hidden_units_num) 336 | # en_lstm_backward2=rnn.BasicLSTMCell(num_units=self.hidden_units_num2) 337 | # en_lstm_backward=rnn.MultiRNNCell(cells=[en_lstm_backward1,en_lstm_backward2]) 338 | 339 | # decoder cells 340 | de_lstm_pph = rnn.BasicLSTMCell(num_units=self.hidden_units_num) 341 | 342 | # encode 343 | encoder_outputs_pph, encoder_states_pph = self.encoder( 344 | cell_forward=en_lstm_forward1_pph, 345 | cell_backward=en_lstm_backward1_pph, 346 | inputs=inputs_pph, 347 | scope_name="en_lstm_pph" 348 | ) 349 | # shape of h is 
[batch*time_steps,hidden_units] 350 | h_pph = self.decoder( 351 | cell=de_lstm_pph, 352 | initial_state=encoder_states_pph, 353 | inputs=encoder_outputs_pph, 354 | scope_name="de_lstm_pph" 355 | ) 356 | 357 | # fully connect layer(projection) 358 | w_pph = tf.Variable( 359 | initial_value=tf.random_normal(shape=(self.hidden_units_num2, self.class_num)), 360 | name="weights_pph" 361 | ) 362 | b_pph = tf.Variable( 363 | initial_value=tf.random_normal(shape=(self.class_num,)), 364 | name="bias_pph" 365 | ) 366 | logits_pph = tf.matmul(h_pph, w_pph) + b_pph # shape of logits:[batch_size*max_time, 5] 367 | 368 | # prediction 369 | # shape of pred[batch_size*max_time, 1] 370 | pred_pph = tf.cast(tf.argmax(logits_pph, 1), tf.int32, name="pred_pph") 371 | 372 | # pred in an normal way,shape is [batch_size, max_time,1] 373 | pred_normal_pph = tf.reshape( 374 | tensor=pred_pph, 375 | shape=(-1, self.max_sentence_size), 376 | name="pred_normal" 377 | ) 378 | # one-hot the pred_normal:[batch_size, max_time,class_num] 379 | pred_normal_one_hot_pph = tf.one_hot( 380 | indices=pred_normal_pph, 381 | depth=self.class_num, 382 | name="pred_normal_one_hot_pph" 383 | ) 384 | 385 | # loss 386 | self.loss_pph = tf.losses.sparse_softmax_cross_entropy( 387 | labels=tf.reshape(self.y_p_pph, shape=[-1]), 388 | logits=logits_pph 389 | ) 390 | #------------------------------------------------------------------------------------ 391 | 392 | #---------------------------------------IPH------------------------------------------ 393 | # embeded inputs:[batch_size,MAX_TIME_STPES,embedding_size] 394 | inputs_iph = tf.nn.embedding_lookup(params=self.embeddings, ids=self.X_p_iph, name="embeded_input_iph") 395 | # shape of inputs[batch_size,max_time_stpes,embeddings_dims+class_num] 396 | inputs_iph = tf.concat(values=[inputs_iph, pred_normal_one_hot_pph], axis=2, name="inputs_pph") 397 | print("shape of input_pph:", inputs_pph.shape) 398 | # encoder cells 399 | # forward part 400 | en_lstm_forward1_iph = rnn.BasicLSTMCell(num_units=self.hidden_units_num) 401 | # en_lstm_forward2=rnn.BasicLSTMCell(num_units=self.hidden_units_num2) 402 | # en_lstm_forward=rnn.MultiRNNCell(cells=[en_lstm_forward1,en_lstm_forward2]) 403 | 404 | # backward part 405 | en_lstm_backward1_iph = rnn.BasicLSTMCell(num_units=self.hidden_units_num) 406 | # en_lstm_backward2=rnn.BasicLSTMCell(num_units=self.hidden_units_num2) 407 | # en_lstm_backward=rnn.MultiRNNCell(cells=[en_lstm_backward1,en_lstm_backward2]) 408 | 409 | # decoder cells 410 | de_lstm_iph = rnn.BasicLSTMCell(num_units=self.hidden_units_num) 411 | 412 | # encode 413 | encoder_outputs_iph, encoder_states_iph = self.encoder( 414 | cell_forward=en_lstm_forward1_iph, 415 | cell_backward=en_lstm_backward1_iph, 416 | inputs=inputs_iph, 417 | scope_name="en_lstm_iph" 418 | ) 419 | # shape of h is [batch*time_steps,hidden_units] 420 | h_iph = self.decoder( 421 | cell=de_lstm_iph, 422 | initial_state=encoder_states_iph, 423 | inputs=encoder_outputs_iph, 424 | scope_name="de_lstm_iph" 425 | ) 426 | 427 | # fully connect layer(projection) 428 | w_iph = tf.Variable( 429 | initial_value=tf.random_normal(shape=(self.hidden_units_num2, self.class_num)), 430 | name="weights_iph" 431 | ) 432 | b_iph = tf.Variable( 433 | initial_value=tf.random_normal(shape=(self.class_num,)), 434 | name="bias_iph" 435 | ) 436 | logits_iph = tf.matmul(h_iph, w_iph) + b_iph # shape of logits:[batch_size*max_time, 5] 437 | 438 | # prediction 439 | # shape of pred[batch_size*max_time, 1] 440 | pred_iph = 
tf.cast(tf.argmax(logits_iph, 1), tf.int32, name="pred_iph") 441 | 442 | # pred in an normal way,shape is [batch_size, max_time,1] 443 | pred_normal_iph = tf.reshape( 444 | tensor=pred_iph, 445 | shape=(-1, self.max_sentence_size), 446 | name="pred_normal" 447 | ) 448 | 449 | # one-hot the pred_normal:[batch_size, max_time,class_num] 450 | pred_normal_one_hot_iph = tf.one_hot( 451 | indices=pred_normal_iph, 452 | depth=self.class_num, 453 | name="pred_normal_one_hot_iph" 454 | ) 455 | 456 | # loss 457 | self.loss_iph = tf.losses.sparse_softmax_cross_entropy( 458 | labels=tf.reshape(self.y_p_iph, shape=[-1]), 459 | logits=logits_iph 460 | ) 461 | 462 | #--------------------------------------------------------------------------------------- 463 | ''' 464 | #loss 465 | self.loss=self.loss_pw #+self.loss_pph+self.loss_iph 466 | #optimizer 467 | self.optimizer=tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(self.loss) 468 | self.init_op=tf.global_variables_initializer() 469 | self.init_local_op=tf.local_variables_initializer() 470 | 471 | #------------------------------------Session----------------------------------------- 472 | with self.session as sess: 473 | print("Training Start") 474 | sess.run(self.init_op) # initialize all variables 475 | sess.run(self.init_local_op) 476 | 477 | train_Size = X_train_pw.shape[0]; 478 | validation_Size = X_validation_pw.shape[0] 479 | best_validation_loss = 0 # best validation accuracy in training process 480 | 481 | #epoch 482 | for epoch in range(1, self.max_epoch + 1): 483 | print("Epoch:", epoch) 484 | start_time = time.time() # time evaluation 485 | # training loss/accuracy in every mini-batch 486 | train_losses = [] 487 | train_accus_pw = [] 488 | train_accus_pph = [] 489 | train_accus_iph = [] 490 | 491 | c1_f_pw = []; c2_f_pw = [] # each class's f1 score 492 | c1_f_pph = []; c2_f_pph = [] 493 | c1_f_iph = []; c2_f_iph = [] 494 | 495 | # mini batch 496 | for i in range(0, (train_Size // self.batch_size)): 497 | _, train_loss, train_pred_pw= sess.run( 498 | fetches=[self.optimizer, self.loss, pred_pw], 499 | feed_dict={ 500 | self.X_p_pw: X_train_pw[i * self.batch_size:(i + 1) * self.batch_size], 501 | self.y_p_pw: y_train_pw[i * self.batch_size:(i + 1) * self.batch_size], 502 | } 503 | ) 504 | 505 | # loss 506 | train_losses.append(train_loss) 507 | # metrics 508 | # pw 509 | accuracy_pw, f1_1_pw, f1_2_pw = util.eval( 510 | y_true=np.reshape(y_train_pw[i * self.batch_size:(i + 1) * self.batch_size], [-1]), 511 | y_pred=train_pred_pw 512 | ) 513 | print("f1_score of N:",f1_1_pw) 514 | print("f1_score of B:",f1_2_pw) 515 | print() 516 | 517 | #c1_f_pw.append(f1_1_pw); 518 | #c2_f_pw.append(f1_2_pw) 519 | 520 | ''' 521 | # mini batch 522 | for i in range(0, (train_Size // self.batch_size)): 523 | _, train_loss, train_pred_pw,train_pred_pph,train_pred_iph= sess.run( 524 | fetches=[self.optimizer, self.loss, pred_pw,pred_pph,pred_iph], 525 | feed_dict={ 526 | self.X_p_pw: X_train_pw[i * self.batch_size:(i + 1) * self.batch_size], 527 | self.y_p_pw: y_train_pw[i * self.batch_size:(i + 1) * self.batch_size], 528 | self.X_p_pph: X_train_pph[i * self.batch_size:(i + 1) * self.batch_size], 529 | self.y_p_pph: y_train_pph[i * self.batch_size:(i + 1) * self.batch_size], 530 | self.X_p_iph: X_train_iph[i * self.batch_size:(i + 1) * self.batch_size], 531 | self.y_p_iph: y_train_iph[i * self.batch_size:(i + 1) * self.batch_size], 532 | } 533 | ) 534 | 535 | #loss 536 | train_losses.append(train_loss) 537 | # metrics 538 | #pw 539 | accuracy_pw, 
f1_1_pw,f1_2_pw = util.eval( 540 | y_true=np.reshape(y_train_pw[i * self.batch_size:(i + 1) * self.batch_size], [-1]), 541 | y_pred=train_pred_pw 542 | ) 543 | 544 | # pph 545 | accuracy_pph, f1_1_pph, f1_2_pph = util.eval( 546 | y_true=np.reshape(y_train_pph[i * self.batch_size:(i + 1) * self.batch_size], [-1]), 547 | y_pred=train_pred_pph 548 | ) 549 | 550 | # iph 551 | accuracy_iph,f1_1_iph, f1_2_iph = util.eval( 552 | y_true=np.reshape(y_train_iph[i * self.batch_size:(i + 1) * self.batch_size], [-1]), 553 | y_pred=train_pred_iph 554 | ) 555 | train_accus_pw.append(accuracy_pw) 556 | train_accus_pph.append(accuracy_pph) 557 | train_accus_iph.append(accuracy_iph) 558 | #F1-score 559 | c1_f_pw.append(f1_1_pw); c2_f_pw.append(f1_2_pw) 560 | c1_f_pph.append(f1_1_pph); c2_f_pph.append(f1_2_pph) 561 | c1_f_iph.append(f1_1_iph); c2_f_iph.append(f1_2_iph) 562 | 563 | #validation in every epoch 564 | validation_loss, valid_pred_pw,valid_pred_pph,valid_pred_iph= sess.run( 565 | fetches=[self.loss, pred_pw,pred_pph,pred_iph], 566 | feed_dict={ 567 | self.X_p_pw: X_validation_pw, self.y_p_pw: y_validation_pw, 568 | self.X_p_pph: X_validation_pph, self.y_p_pph: y_validation_pph, 569 | self.X_p_iph: X_validation_iph, self.y_p_iph: y_validation_iph 570 | } 571 | ) 572 | # metrics 573 | # pw 574 | valid_accuracy_pw, valid_f1_1_pw, valid_f1_2_pw = util.eval( 575 | y_true=np.reshape(y_validation_pw, [-1]), 576 | y_pred=valid_pred_pw 577 | ) 578 | 579 | # pph 580 | valid_accuracy_pph, valid_f1_1_pph, valid_f1_2_pph = util.eval( 581 | y_true=np.reshape(y_validation_pph, [-1]), 582 | y_pred=valid_pred_pph 583 | ) 584 | 585 | # iph 586 | valid_accuracy_iph, valid_f1_1_iph, valid_f1_2_iph = util.eval( 587 | y_true=np.reshape(y_validation_iph, [-1]), 588 | y_pred=valid_pred_iph 589 | ) 590 | 591 | 592 | 593 | # show information 594 | print("Epoch ", epoch, " finished.", "spend ", round((time.time() - start_time) / 60, 2), " mins") 595 | print(" /**Training info**/") 596 | print("----avarage training loss:", sum(train_losses) / len(train_losses)) 597 | print("PW:") 598 | print("----avarage accuracy:", sum(train_accus_pw) / len(train_accus_pw)) 599 | print("----avarage f1-Score of N:", sum(c1_f_pw) / len(c1_f_pw)) 600 | print("----avarage f1-Score of B:", sum(c2_f_pw) / len(c2_f_pw)) 601 | print("PPH:") 602 | print("----avarage accuracy :", sum(train_accus_pph) / len(train_accus_pph)) 603 | print("----avarage f1-Score of N:", sum(c1_f_pph) / len(c1_f_pph)) 604 | print("----avarage f1-Score of B:", sum(c2_f_pph) / len(c2_f_pph)) 605 | print("IPH:") 606 | print("----avarage accuracy:", sum(train_accus_iph) / len(train_accus_iph)) 607 | print("----avarage f1-Score of N:", sum(c1_f_iph) / len(c1_f_iph)) 608 | print("----avarage f1-Score of B:", sum(c2_f_iph) / len(c2_f_iph)) 609 | print() 610 | 611 | print(" /**Validation info**/") 612 | print("----avarage validation loss:", validation_loss) 613 | print("PW:") 614 | print("----avarage accuracy:", valid_accuracy_pw) 615 | print("----avarage f1-Score of N:", valid_f1_1_pw) 616 | print("----avarage f1-Score of B:", valid_f1_2_pw) 617 | print("PPH:") 618 | print("----avarage accuracy :", valid_accuracy_pph) 619 | print("----avarage f1-Score of N:", valid_f1_1_pph) 620 | print("----avarage f1-Score of B:", valid_f1_2_pph) 621 | print("IPH:") 622 | print("----avarage accuracy:", valid_accuracy_iph) 623 | print("----avarage f1-Score of N:", valid_f1_1_iph) 624 | print("----avarage f1-Score of B:", valid_f1_2_iph) 625 | print("\n\n") 626 | 627 | # when we get a new best 
validation accuracy,we store the model 628 | if best_validation_loss < validation_loss: 629 | best_validation_loss=validation_loss 630 | print("New Best loss ",best_validation_loss," On Validation set! ") 631 | print("Saving Models......") 632 | #exist ./models folder? 633 | if not os.path.exists("./models/"): 634 | os.mkdir(path="./models/") 635 | if not os.path.exists("./models/"+name): 636 | os.mkdir(path="./models/"+name) 637 | if not os.path.exists("./models/"+name+"/bilstm"): 638 | os.mkdir(path="./models/"+name+"/bilstm") 639 | #create saver 640 | saver = tf.train.Saver() 641 | saver.save(sess, "./models/"+name+"/bilstm/my-model-10000") 642 | # Generates MetaGraphDef. 643 | saver.export_meta_graph("./models/"+name+"/bilstm/my-model-10000.meta") 644 | ''' 645 | 646 | #返回预测的结果或者准确率,y not None的时候返回准确率,y ==None的时候返回预测值 647 | def pred(self,name,X,y=None,): 648 | start_time = time.time() #compute time 649 | if y is None: 650 | with self.session as sess: 651 | # restore model 652 | new_saver=tf.train.import_meta_graph( 653 | meta_graph_or_file="./models/"+name+"/bilstm/my-model-10000.meta", 654 | clear_devices=True 655 | ) 656 | new_saver.restore(sess, "./models/"+name+"/bilstm/my-model-10000") 657 | #get default graph 658 | graph = tf.get_default_graph() 659 | # get opration from the graph 660 | pred_normal = graph.get_operation_by_name("pred_normal").outputs[0] 661 | X_p = graph.get_operation_by_name("input_placeholder").outputs[0] 662 | pred = sess.run(fetches=pred_normal, feed_dict={X_p: X}) 663 | print("this operation spends ",round((time.time()-start_time)/60,2)," mins") 664 | return pred 665 | else: 666 | with self.session as sess: 667 | # restore model 668 | new_saver = tf.train.import_meta_graph( 669 | meta_graph_or_file="./models/" + name + "/bilstm/my-model-10000.meta", 670 | clear_devices=True 671 | ) 672 | new_saver.restore(sess, "./models/" + name + "/bilstm/my-model-10000") 673 | graph = tf.get_default_graph() 674 | # get opration from the graph 675 | accuracy=graph.get_operation_by_name("accuracy").outputs[0] 676 | X_p = graph.get_operation_by_name("input_placeholder").outputs[0] 677 | y_p=graph.get_operation_by_name("label_placeholder").outputs[0] 678 | #forward and get the results 679 | accu = sess.run(fetches=accuracy,feed_dict={X_p: X,y_p: y}) 680 | print("this operation spends ", round((time.time() - start_time) / 60, 2), " mins") 681 | return accu 682 | 683 | #把一个句子转成一个分词后的结构 684 | def infer(self,sentence,name): 685 | pass 686 | 687 | 688 | #train && test 689 | if __name__=="__main__": 690 | # 读数据 691 | df_train_pw = pd.read_pickle(path="./dataset/temptest/pw_summary_train.pkl") 692 | df_validation_pw = pd.read_pickle(path="./dataset/temptest/pw_summary_validation.pkl") 693 | X_train_pw = np.asarray(list(df_train_pw['X'].values)) 694 | y_train_pw = np.asarray(list(df_train_pw['y'].values)) 695 | X_validation_pw = np.asarray(list(df_validation_pw['X'].values)) 696 | y_validation_pw = np.asarray(list(df_validation_pw['y'].values)) 697 | 698 | 699 | 700 | df_train_pph = pd.read_pickle(path="./dataset/temptest/pph_summary_train.pkl") 701 | df_validation_pph = pd.read_pickle(path="./dataset/temptest/pph_summary_validation.pkl") 702 | X_train_pph = np.asarray(list(df_train_pph['X'].values)) 703 | y_train_pph = np.asarray(list(df_train_pph['y'].values)) 704 | X_validation_pph = np.asarray(list(df_validation_pph['X'].values)) 705 | y_validation_pph = np.asarray(list(df_validation_pph['y'].values)) 706 | 707 | 708 | df_train_iph = 
pd.read_pickle(path="./dataset/temptest/iph_summary_train.pkl") 709 | df_validation_iph = pd.read_pickle(path="./dataset/temptest/iph_summary_validation.pkl") 710 | X_train_iph = np.asarray(list(df_train_iph['X'].values)) 711 | y_train_iph = np.asarray(list(df_train_iph['y'].values)) 712 | X_validation_iph = np.asarray(list(df_validation_iph['X'].values)) 713 | y_validation_iph = np.asarray(list(df_validation_iph['y'].values)) 714 | 715 | 716 | X_train = [X_train_pw, X_train_pph, X_train_iph] 717 | y_train = [y_train_pw, y_train_pph, y_train_iph] 718 | X_validation = [X_validation_pw, X_validation_pph, X_validation_iph] 719 | y_validation = [y_validation_pw, y_validation_pph, y_validation_iph] 720 | 721 | model = Attension_Alignment_Seq2Seq() 722 | model.fit(X_train, y_train, X_validation, y_validation, "test", False) 723 | 724 | # testing model 725 | #accuracy = model.pred(name="test", X=X_test, y=y_test) 726 | #print(accuracy) -------------------------------------------------------------------------------- /data_processing.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 清洗数据,转换语料格式,得到字嵌入 3 | author:xierhacker 4 | time:2018.1.4 5 | ''' 6 | import re 7 | import os 8 | import time 9 | import pandas as pd 10 | import numpy as np 11 | from itertools import chain 12 | from gensim.models import word2vec 13 | from sklearn.model_selection import train_test_split 14 | from parameter import MAX_SENTENCE_SIZE 15 | 16 | #原始语料转换为不带任何标记的语料,可以训练字向量 17 | def toCharCorpus(filename): 18 | doc = "" 19 | file = open(file=filename, encoding="utf-8") 20 | lines = file.readlines() 21 | # 匹配#标记 22 | pattern1 = re.compile(r"#[1,2,3,4]", flags=re.U) 23 | # 每个字匹配一次 24 | pattern2 =re.compile(r"[^\s]") 25 | for line in lines: 26 | string = re.sub(pattern=pattern1, repl="", string=line) #去掉# 27 | string=" ".join(re.findall(pattern=pattern2,string=string)) #每个字加上空格 28 | string+="\n" 29 | doc += string 30 | # write to file 31 | f = open(file="./data/corpus/prosody_char.txt", mode="w", encoding="utf-8") 32 | f.write(doc) 33 | f.close() 34 | 35 | #训练字向量并且存储 36 | def toEmbeddings(filename): 37 | sentences = word2vec.Text8Corpus(filename) 38 | model = word2vec.Word2Vec(sentences=sentences, size=1001) 39 | # save embeddings file 40 | if not os.path.exists("./data/embeddings"): 41 | os.mkdir(path="./data/embeddings") 42 | model.wv.save_word2vec_format("./data/embeddings/vec.txt", binary=False) 43 | 44 | 45 | #转换原始corpus为韵律词(PW)格式标记 46 | def toPW(filename): 47 | doc="" 48 | file = open(file=filename, encoding="utf-8") 49 | lines = file.readlines() 50 | # 匹配#标记 51 | pattern1 = re.compile(r"#[1,2,3,4]", flags=re.U) 52 | # 不是/或者b 53 | pattern2 = re.compile(r"(?![/b])") 54 | #去掉b后面的/n 55 | pattern3= re.compile(r"b/n") 56 | #去掉开头的/n 57 | pattern4=re.compile(r"^/n") 58 | for line in lines: 59 | line=line.strip() 60 | string = re.sub(pattern=pattern1, repl="/b", string=line) # 去掉# 61 | string=re.sub(pattern=pattern2,repl="/n",string=string) 62 | string = re.sub(pattern=pattern3, repl="b", string=string) 63 | string = re.sub(pattern=pattern4, repl="", string=string)+"\n" 64 | doc += string 65 | # write to file 66 | f = open(file="./data/corpus/prosody_pw.txt", mode="w", encoding="utf-8") 67 | f.write(doc) 68 | f.close() 69 | 70 | #转换原始corpus为韵律短语(PPH)格式标记 71 | def toPPH(filename): 72 | doc="" 73 | file = open(file=filename, encoding="utf-8") 74 | lines = file.readlines() 75 | #匹配#1(因为要先去掉#1) 76 | pattern=re.compile(r"#1") 77 | # 匹配#标记 78 | pattern1 = re.compile(r"#[2,3,4]", flags=re.U) 
79 | # 不是/或者b 80 | pattern2 = re.compile(r"(?![/b])") 81 | #去掉b后面的/n 82 | pattern3= re.compile(r"b/n") 83 | #去掉开头的/n 84 | pattern4=re.compile(r"^/n") 85 | for line in lines: 86 | line=line.strip() #去掉一些影响的空格和换行 87 | string = re.sub(pattern=pattern, repl="", string=line) # 去掉#1 88 | string = re.sub(pattern=pattern1, repl="/b", string=string) # 去掉# 89 | string=re.sub(pattern=pattern2,repl="/n",string=string) 90 | string = re.sub(pattern=pattern3, repl="b", string=string) 91 | string = re.sub(pattern=pattern4, repl="", string=string)+"\n" 92 | doc += string 93 | # write to file 94 | f = open(file="./data/corpus/prosody_pph.txt", mode="w", encoding="utf-8") 95 | f.write(doc) 96 | f.close() 97 | 98 | #转换原始corpus为语调短语(IPH)格式标记 99 | def toIPH(filename): 100 | doc = "" 101 | file = open(file=filename, encoding="utf-8") 102 | lines = file.readlines() 103 | # 匹配#1和#2(因为要先去掉#1和#2) 104 | pattern = re.compile(r"#[1,2]") 105 | # 匹配#标记 106 | pattern1 = re.compile(r"#[3,4]", flags=re.U) 107 | # 不是/或者b 108 | pattern2 = re.compile(r"(?![/b])") 109 | # 去掉b后面的/n 110 | pattern3 = re.compile(r"b/n") 111 | # 去掉开头的/n 112 | pattern4 = re.compile(r"^/n") 113 | for line in lines: 114 | line = line.strip() # 去掉一些影响的空格和换行 115 | string = re.sub(pattern=pattern, repl="", string=line) # 去掉#1 116 | string = re.sub(pattern=pattern1, repl="/b", string=string) # 去掉# 117 | string = re.sub(pattern=pattern2, repl="/n", string=string) 118 | string = re.sub(pattern=pattern3, repl="b", string=string) 119 | string = re.sub(pattern=pattern4, repl="", string=string) + "\n" 120 | doc += string 121 | # write to file 122 | f = open(file="./data/corpus/prosody_iph.txt", mode="w+", encoding="utf-8") 123 | f.write(doc) 124 | f.close() 125 | 126 | #清洗 127 | def clean(s): 128 | if u'“/s' not in s: # 句子中间的引号不应去掉 129 | return s.replace(u' ”/s', '') 130 | elif u'”/s' not in s: 131 | return s.replace(u'“/s ', '') 132 | elif u'‘/s' not in s: 133 | return s.replace(u' ’/s', '') 134 | elif u'’/s' not in s: 135 | return s.replace(u'‘/s ', '') 136 | else: 137 | return s 138 | 139 | def file2corpus(filename): 140 | ''' 141 | :param filename: 142 | :return: 语料文件文件转换为一个原始语料句子的list 143 | ''' 144 | with open(filename, 'rb') as inp: 145 | corpus = inp.read().decode('UTF-8') #原始语料 str对象 146 | corpus = corpus.split('\r') #换行切分,得到一个简陋列表 147 | corpus = u''.join(map(clean, corpus)) # 把所有处理的句子连接起来,这里中间连接不用其他字符 str对象 148 | corpus = re.split(u"\n", corpus) # 以换行为分割,把语料划分为一个"句子"列表 149 | #corpus = re.split(u'[,。!?、‘’“”]/[bems]', corpus) # 以换行为分割,把语料划分为一个"句子"列表 150 | return corpus #[人/b 们/e 常/s 说/s 生/b 活/e 是/s 一/s 部/s 教/b 科/m 书/e ,xxx,....] 
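# Illustrative sketch (not part of the original pipeline): a tiny, self-contained walk-through
# of the regex chain used by toPW above, on a made-up sentence. It shows how a raw
# prosody-annotated line ends up in the char/tag form that file2corpus()/make_component()
# later parse with re.findall('(.)/(.)', s). The sample line and the expected outputs in the
# comments are illustrative only.
def _demo_prosody_tagging():
    line = "今天#1天气#3很好#4"                    # hypothetical raw line with prosody marks
    s = re.sub(r"#[1,2,3,4]", "/b", line)          # every prosody mark becomes a /b boundary tag
    s = re.sub(r"(?![/b])", "/n", s)               # insert /n at every position not followed by '/' or 'b'
    s = re.sub(r"b/n", "b", s)                     # drop the spurious /n inserted right after each /b
    s = re.sub(r"^/n", "", s)                      # drop the leading /n
    print(s)                                       # -> 今/n天/b天/n气/b很/n好/b
    print(re.findall('(.)/(.)', s))                # -> [('今','n'), ('天','b'), ('天','n'), ('气','b'), ('很','n'), ('好','b')]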
151 | 152 | 153 | def make_component(corpus,name): 154 | ''' 155 | :param corpus: 传入原始语料句子corpus列表得到的字数据datas和对应的labels数据都放到dataframe里面存储,方便后面的处理 156 | :return: df_data 157 | ''' 158 | sentences= [] 159 | tags = [] 160 | for s in corpus: #corpus列表得到每句corpus想应的sentence以及对应的labels 161 | sentence_tags = re.findall('(.)/(.)', s) # sentence_tags:[('人', 'b'), ('们', 'e'), ('常', 's'), ('说', 's')] 162 | if sentence_tags: # 顺便去除了一些空样本 163 | sentence_tags = np.array(sentence_tags) 164 | sentences.append(sentence_tags[:, 0]) #sentences每一个元素表示一个sentence['人' '们' '常' '说' '生' '活' '是' '一' '部' '教' '科' '书'] 165 | tags.append(sentence_tags[:, 1]) #tags每一个元素表示的是一个句子对应的标签['b' 'e' 's' 's' 'b' 'e' 's' 's' 's' 'b' 'm' 'e'] 166 | 167 | #使用pandas处理,简化流程 168 | df_data = pd.DataFrame({'sentences': sentences, 'tags': tags}, index=range(len(sentences))) 169 | df_data['sentence_len'] = df_data['sentences'].apply(lambda sentences: len(sentences)) # 每句话长度 170 | 171 | # 得到所有的字,这里的all_words是一个列表,存放了这个语料中所有的词 172 | all_words = list(chain(*df_data['sentences'].values)) 173 | sr_allwords = pd.Series(data=all_words) # 2.列表做成pandas的Series 174 | words = (sr_allwords.value_counts()).index #字列表.统计每个字出现的频率,同时相当于去重复,得到字的集合(这里还是Serieas的index对象) 175 | #print(lenwords) 176 | 177 | words_id = range(1, len(words) + 1) #字的id列表,从1开始,因为准备把0作为填充值 178 | tags = ['x', 'n', 'b'] #tag列表 179 | tags_id = range(len(tags)) #tag的id列表 180 | 181 | #保存基本组件,在./dataset/name/下面会有words_ids.csv,tags_ids.csv,df_data.csv三个文件 182 | if not os.path.exists("./dataset/"): 183 | os.mkdir("./dataset/") 184 | if not os.path.exists("./dataset/"+name): 185 | os.mkdir("./dataset/"+name) 186 | 187 | # words以及对应的id组件 188 | pd.DataFrame(data={"words":words,"id":words_id}).\ 189 | to_csv(path_or_buf="./dataset/"+name+"/words_ids.csv",index=False,encoding="utf_8") 190 | # tags以及对应的id组件 191 | pd.DataFrame(data={"tags":tags,"id":tags_id}).\ 192 | to_csv(path_or_buf="./dataset/"+name+"/tags_ids.csv",index=False,encoding="utf_8") 193 | print("max_sentence_size:",df_data["sentence_len"].max()) 194 | return df_data #暂时不保存,返回 195 | 196 | def read_component(name): 197 | ''' 198 | 从文件里面读取基本的component 199 | :param name: 200 | :return: words2id, id2words, tags2id, id2tags 201 | ''' 202 | #读取words和ids的dataframe 203 | df_words_ids=pd.read_csv(filepath_or_buffer="./dataset/"+name+"/words_ids.csv",encoding="utf-8") 204 | #读取tags和ids的dataframe 205 | df_tags_ids=pd.read_csv(filepath_or_buffer="./dataset/"+name+"/tags_ids.csv",encoding="utf-8") 206 | #装换为words2id, id2words, tags2id, id2tags 207 | #df_data=pd.DataFrame(data={}) 208 | words2id=pd.Series(data=df_words_ids["id"].values,index=df_words_ids["words"].values) 209 | id2words=pd.Series(data=df_words_ids["words"].values,index=df_words_ids["id"].values) 210 | tags2id = pd.Series(data=df_tags_ids["id"].values, index=df_tags_ids["tags"].values) 211 | id2tags = pd.Series(data=df_tags_ids["tags"].values, index=df_tags_ids["id"].values) 212 | return words2id, id2words, tags2id, id2tags 213 | 214 | 215 | def make_dataset(in_filename,project_name,out_filename): 216 | ''' 217 | 转换为最后模型适合的数据集,name表示转换后的数据集存储在哪个文件下面./dataset/name 218 | :param filename: 原始数据集的文件名 219 | :param name: 220 | :return: 221 | ''' 222 | print("Conver Corpus To Dataset!") 223 | start_time=time.time() 224 | corpus = file2corpus(in_filename); print(" corpus contains ", len(corpus), " sentences.") 225 | #保存基本组件,并且返回 226 | print(" ----saving component and ") 227 | df_data=make_component(corpus,project_name) 228 | #print(df_data.head(3)) 229 | 230 | #读取组件,并且装换为合适的格式 231 | words2id, 
id2words, tags2id, id2tags =read_component(project_name) 232 | print("words2id.shape:",words2id.shape) 233 | print(" dataset contains ",df_data.shape[0]," sentences.") 234 | 235 | def X_padding(sentence): 236 | ''' 237 | !!!!!!!!!!可提速!!!!!!!! 238 | 返回一句话padding之后的id列表,使用的时候,把一个字符串转为list传进来就行 239 | :param sentence: 一个句子的列表 240 | :param word2id: word2id映射 241 | :return: 一句话的padding后的 ids 242 | 243 | ''' 244 | ids = list(words2id[sentence]) 245 | if len(ids) > MAX_SENTENCE_SIZE: # 超过就截断 246 | return ids[:MAX_SENTENCE_SIZE] 247 | if len(ids) < MAX_SENTENCE_SIZE: # 短了就补齐 248 | ids.extend([0] * (MAX_SENTENCE_SIZE - len(ids))) 249 | return ids 250 | 251 | def y_padding(tags): 252 | ''' 253 | !!!!!!!!!!!可提速!!!!!!!!! 254 | #得到一个label的padding后的id 255 | :param tags: 256 | :param tags2id: 257 | :return: 258 | ''' 259 | ids = list(tags2id[tags]) 260 | if len(ids) > MAX_SENTENCE_SIZE: # 超过就截断 261 | return ids[:MAX_SENTENCE_SIZE] 262 | if len(ids) < MAX_SENTENCE_SIZE: # 短了就补齐 263 | ids.extend([0] * (MAX_SENTENCE_SIZE - len(ids))) 264 | return ids 265 | 266 | #把数据转换为ids的数据 267 | print(" convert data and label to 'ids' represented") 268 | df_data['X'] = df_data['sentences'].apply(X_padding) 269 | df_data['y'] = df_data['tags'].apply(y_padding) 270 | #print(df_data["X"].head(5)) 271 | #print(df_data["y"].head(5)) 272 | 273 | #数据集切分0.2 比例 274 | df_data_train=df_data[:47176] 275 | df_data_validation=df_data[47176:] 276 | #df_data_train,df_data_test=train_test_split(df_data,test_size=0.2) #训练集和测试集 277 | #df_data_train,df_data_validation=train_test_split(df_data_train,test_size=0.1) #训练集和验证集 278 | 279 | #保存最终数据到pkl文件 280 | print(" ----saving final dataset <"+out_filename+"_summary_train.pkl>") 281 | df_data_train.to_pickle(path="./dataset/"+project_name+"/"+out_filename+"_summary_train.pkl") 282 | 283 | print(" ----saving final dataset ") 284 | df_data_validation.to_pickle(path="./dataset/"+project_name+"/"+out_filename+"_summary_validation.pkl") 285 | 286 | duration=time.time()-start_time; 287 | print("END! 
this operation spends ",round(duration/60,2)," mins") 288 | 289 | 290 | #summary_train.pkl 291 | 292 | if __name__=="__main__": 293 | print("[1]-->trans corpus to char corpus and saving......") 294 | toCharCorpus(filename="./data/corpus/prosody.txt") 295 | print("[2]-->get embeddings and saving......") 296 | toEmbeddings(filename="./data/corpus/prosody_char.txt") 297 | 298 | print("[3]-->trans corpus to PW format......") 299 | toPW("./data/corpus/prosody.txt") 300 | print("[4]-->trans corpus to PPH format......") 301 | toPPH("./data/corpus/prosody.txt") 302 | print("[5]-->trans corpus to IPH format......") 303 | toIPH("./data/corpus/prosody.txt") 304 | 305 | print("[6]-->trans corpus_pw to dataset......") 306 | make_dataset(in_filename="./data/corpus/prosody_pw.txt",project_name="temptest",out_filename="pw") 307 | 308 | print("[7]-->trans corpus_pph to dataset......") 309 | make_dataset(in_filename="./data/corpus/prosody_pph.txt", project_name="temptest", out_filename="pph") 310 | 311 | print("[8]-->trans corpus_iph to dataset......") 312 | make_dataset(in_filename="./data/corpus/prosody_iph.txt", project_name="temptest", out_filename="iph") 313 | 314 | 315 | 316 | 317 | 318 | 319 | 320 | 321 | -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | ''' 2 | model with attention 3 | ''' 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import tensorflow as tf 8 | import tensorflow.contrib.rnn as rnn 9 | import tensorflow.contrib.seq2seq as seq2seq 10 | import time 11 | import os 12 | import parameter 13 | import util 14 | 15 | class Attension_Alignment_Seq2Seq(): 16 | def __init__(self): 17 | # basic environment 18 | self.graph = tf.Graph() 19 | self.session = tf.Session(graph=self.graph) 20 | 21 | # basic parameters 22 | self.learning_rate = parameter.LEARNING_RATE 23 | self.max_epoch = parameter.MAX_EPOCH 24 | self.embedding_size = parameter.EMBEDDING_SIZE 25 | self.class_num = parameter.CLASS_NUM 26 | self.hidden_units_num = parameter.HIDDEN_UNITS_NUM 27 | self.hidden_units_num2 = parameter.HIDDEN_UNITS_NUM2 28 | self.layer_num = parameter.LAYER_NUM 29 | self.max_sentence_size = parameter.MAX_SENTENCE_SIZE 30 | self.vocab_size = parameter.VOCAB_SIZE 31 | self.batch_size = parameter.BATCH_SIZE 32 | 33 | # encoder,传入是前向和后向的cell,还有inputs 34 | # 输出是 35 | def encoder(self, cell_forward, cell_backward, inputs, seq_length, scope_name): 36 | outputs, states = tf.nn.bidirectional_dynamic_rnn( 37 | cell_fw=cell_forward, 38 | cell_bw=cell_backward, 39 | inputs=inputs, 40 | sequence_length=seq_length, 41 | dtype=tf.float32, 42 | scope=scope_name 43 | ) 44 | 45 | outputs_forward = outputs[0] # shape of h is [batch_size, max_time, cell_fw.output_size] 46 | outputs_backward = outputs[1] # shape of h is [batch_size, max_time, cell_bw.output_size] 47 | states_forward = states[0] # .c:[batch_size,num_units] .h:[batch_size,num_units] 48 | states_backward = states[1] 49 | #concat final outputs [batch_size, max_time, cell_fw.output_size*2] 50 | encoder_outputs = tf.concat(values=[outputs_forward, outputs_backward], axis=2) 51 | #concat final states 52 | state_h_concat=tf.concat(values=[states_forward.h,states_backward.h],axis=1,name="state_h_concat") 53 | print("state_h_concat:",state_h_concat) 54 | state_c_concat=tf.concat(values=[states_forward.c,states_backward.c],axis=1,name="state_c_concat") 55 | print("state_c_concat:",state_c_concat) 56 | 
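        # After concatenating the forward and backward final states along axis 1, both c and h
        # have shape [batch_size, hidden_units_num * 2]; this is why the decoder cells are built
        # with num_units = hidden_units_num * 2, so that encoder_states can be fed directly as
        # the decoder's initial_state.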
encoder_states=rnn.LSTMStateTuple(c=state_c_concat,h=state_h_concat) 57 | 58 | return encoder_outputs, encoder_states 59 | 60 | def decoder(self, cell, initial_state, inputs, scope_name): 61 | # outputs:[batch_size,time_steps,hidden_size*2] 62 | outputs, states = tf.nn.dynamic_rnn( 63 | cell=cell, 64 | inputs=inputs, 65 | initial_state=initial_state, 66 | scope=scope_name 67 | ) 68 | #[batch_size*time_steps,hidden_size*2] 69 | decoder_outputs = tf.reshape(tensor=outputs, shape=(-1, self.hidden_units_num*2)) 70 | return decoder_outputs 71 | 72 | # forward process and training process 73 | def fit(self, X_train, y_train, len_train, X_validation, y_validation, len_validation, name, print_log=True): 74 | # ---------------------------------------forward computation--------------------------------------------# 75 | y_train_pw = y_train[0] 76 | y_train_pph = y_train[1] 77 | y_train_iph = y_train[2] 78 | 79 | y_validation_pw = y_validation[0] 80 | y_validation_pph = y_validation[1] 81 | y_validation_iph = y_validation[2] 82 | # ---------------------------------------define graph---------------------------------------------# 83 | with self.graph.as_default(): 84 | # data place holder 85 | self.X_p = tf.placeholder( 86 | dtype=tf.int32, 87 | shape=(None, self.max_sentence_size), 88 | name="input_placeholder" 89 | ) 90 | 91 | self.y_p_pw = tf.placeholder( 92 | dtype=tf.int32, 93 | shape=(None, self.max_sentence_size), 94 | name="label_placeholder_pw" 95 | ) 96 | self.y_p_pph = tf.placeholder( 97 | dtype=tf.int32, 98 | shape=(None, self.max_sentence_size), 99 | name="label_placeholder_pph" 100 | ) 101 | self.y_p_iph = tf.placeholder( 102 | dtype=tf.int32, 103 | shape=(None, self.max_sentence_size), 104 | name="label_placeholder_iph" 105 | ) 106 | 107 | # 相应序列的长度占位 108 | self.seq_len_p = tf.placeholder( 109 | dtype=tf.int32, 110 | shape=(None,), 111 | name="seq_len" 112 | ) 113 | 114 | #用来去掉padding的mask 115 | self.mask = tf.sequence_mask( 116 | lengths=self.seq_len_p, 117 | maxlen=self.max_sentence_size, 118 | name="mask" 119 | ) 120 | 121 | #去掉padding之后的labels 122 | y_p_pw_masked = tf.boolean_mask( #shape[seq_len1+seq_len2+....+,] 123 | tensor=self.y_p_pw, 124 | mask=self.mask, 125 | name="y_p_pw_masked" 126 | ) 127 | y_p_pph_masked = tf.boolean_mask( # shape[seq_len1+seq_len2+....+,] 128 | tensor=self.y_p_pph, 129 | mask=self.mask, 130 | name="y_p_pph_masked" 131 | ) 132 | y_p_iph_masked = tf.boolean_mask( # shape[seq_len1+seq_len2+....+,] 133 | tensor=self.y_p_iph, 134 | mask=self.mask, 135 | name="y_p_iph_masked" 136 | ) 137 | 138 | # embeddings 139 | self.embeddings = tf.Variable( 140 | initial_value=tf.zeros(shape=(self.vocab_size, self.embedding_size), dtype=tf.float32), 141 | name="embeddings" 142 | ) 143 | 144 | # -------------------------------------PW----------------------------------------------------- 145 | # embeded inputs:[batch_size,MAX_TIME_STPES,embedding_size] 146 | inputs_pw = tf.nn.embedding_lookup(params=self.embeddings, ids=self.X_p, name="embeded_input_pw") 147 | 148 | # encoder cells 149 | # forward part 150 | en_lstm_forward1_pw = rnn.BasicLSTMCell(num_units=self.hidden_units_num) 151 | # en_lstm_forward2=rnn.BasicLSTMCell(num_units=self.hidden_units_num2) 152 | # en_lstm_forward=rnn.MultiRNNCell(cells=[en_lstm_forward1,en_lstm_forward2]) 153 | 154 | # backward part 155 | en_lstm_backward1_pw = rnn.BasicLSTMCell(num_units=self.hidden_units_num) 156 | # en_lstm_backward2=rnn.BasicLSTMCell(num_units=self.hidden_units_num2) 157 | # 
en_lstm_backward=rnn.MultiRNNCell(cells=[en_lstm_backward1,en_lstm_backward2]) 158 | 159 | # decoder cells 160 | de_lstm_pw = rnn.BasicLSTMCell(num_units=self.hidden_units_num*2) 161 | 162 | # encode 163 | encoder_outputs_pw, encoder_states_pw = self.encoder( 164 | cell_forward=en_lstm_forward1_pw, 165 | cell_backward=en_lstm_backward1_pw, 166 | inputs=inputs_pw, 167 | seq_length=self.seq_len_p, 168 | scope_name="en_lstm_pw" 169 | ) 170 | # decode 171 | h_pw = self.decoder( # shape of h is [batch*time_steps,hidden_units*2] 172 | cell=de_lstm_pw, 173 | initial_state=encoder_states_pw, 174 | inputs=encoder_outputs_pw, 175 | scope_name="de_lstm_pw" 176 | ) 177 | 178 | # fully connect layer(projection) 179 | w_pw = tf.Variable( 180 | initial_value=tf.random_normal(shape=(self.hidden_units_num*2, self.class_num)), 181 | name="weights_pw" 182 | ) 183 | b_pw = tf.Variable( 184 | initial_value=tf.random_normal(shape=(self.class_num,)), 185 | name="bias_pw" 186 | ) 187 | #logits 188 | logits_pw = tf.matmul(h_pw, w_pw) + b_pw #logits_pw:[batch_size*max_time, 3] 189 | logits_normal_pw=tf.reshape( #logits in an normal way:[batch_size,max_time_stpes,3] 190 | tensor=logits_pw, 191 | shape=(-1,self.max_sentence_size,3), 192 | name="logits_normal_pw" 193 | ) 194 | logits_pw_masked = tf.boolean_mask( # logits_pw_masked [seq_len1+seq_len2+....+,3] 195 | tensor=logits_normal_pw, 196 | mask=self.mask, 197 | name="logits_pw_masked" 198 | ) 199 | 200 | # prediction 201 | pred_pw = tf.cast(tf.argmax(logits_pw, 1), tf.int32, name="pred_pw") # pred_pw:[batch_size*max_time,] 202 | pred_normal_pw = tf.reshape( # pred in an normal way,[batch_size, max_time] 203 | tensor=pred_pw, 204 | shape=(-1, self.max_sentence_size), 205 | name="pred_normal_pw" 206 | ) 207 | 208 | pred_pw_masked = tf.boolean_mask( # logits_pw_masked [seq_len1+seq_len2+....+,] 209 | tensor=pred_normal_pw, 210 | mask=self.mask, 211 | name="pred_pw_masked" 212 | ) 213 | 214 | pred_normal_one_hot_pw = tf.one_hot( # one-hot the pred_normal:[batch_size, max_time,class_num] 215 | indices=pred_normal_pw, 216 | depth=self.class_num, 217 | name="pred_normal_one_hot_pw" 218 | ) 219 | 220 | # loss 221 | self.loss_pw = tf.losses.sparse_softmax_cross_entropy( 222 | labels=y_p_pw_masked, 223 | logits=logits_pw_masked 224 | ) 225 | # --------------------------------------------------------------------------------------- 226 | 227 | # ----------------------------------PPH-------------------------------------------------- 228 | # embeded inputs:[batch_size,MAX_TIME_STPES,embedding_size] 229 | inputs_pph = tf.nn.embedding_lookup(params=self.embeddings, ids=self.X_p, name="embeded_input_pph") 230 | # shape of inputs[batch_size,max_time_stpes,embeddings_dims+class_num] 231 | inputs_pph = tf.concat(values=[inputs_pph, pred_normal_one_hot_pw], axis=2, name="inputs_pph") 232 | # print("shape of input_pph:", inputs_pph.shape) 233 | 234 | # encoder cells 235 | # forward part 236 | en_lstm_forward1_pph = rnn.BasicLSTMCell(num_units=self.hidden_units_num) 237 | # en_lstm_forward2=rnn.BasicLSTMCell(num_units=self.hidden_units_num2) 238 | # en_lstm_forward=rnn.MultiRNNCell(cells=[en_lstm_forward1,en_lstm_forward2]) 239 | 240 | # backward part 241 | en_lstm_backward1_pph = rnn.BasicLSTMCell(num_units=self.hidden_units_num) 242 | # en_lstm_backward2=rnn.BasicLSTMCell(num_units=self.hidden_units_num2) 243 | # en_lstm_backward=rnn.MultiRNNCell(cells=[en_lstm_backward1,en_lstm_backward2]) 244 | 245 | # decoder cells 246 | de_lstm_pph = 
rnn.BasicLSTMCell(num_units=self.hidden_units_num*2) 247 | 248 | # encode 249 | encoder_outputs_pph, encoder_states_pph = self.encoder( 250 | cell_forward=en_lstm_forward1_pph, 251 | cell_backward=en_lstm_backward1_pph, 252 | inputs=inputs_pph, 253 | seq_length=self.seq_len_p, 254 | scope_name="en_lstm_pph" 255 | ) 256 | # shape of h is [batch*time_steps,hidden_units*2] 257 | h_pph = self.decoder( 258 | cell=de_lstm_pph, 259 | initial_state=encoder_states_pph, 260 | inputs=encoder_outputs_pph, 261 | scope_name="de_lstm_pph" 262 | ) 263 | 264 | # fully connect layer(projection) 265 | w_pph = tf.Variable( 266 | initial_value=tf.random_normal(shape=(self.hidden_units_num*2, self.class_num)), 267 | name="weights_pph" 268 | ) 269 | b_pph = tf.Variable( 270 | initial_value=tf.random_normal(shape=(self.class_num,)), 271 | name="bias_pph" 272 | ) 273 | # logits 274 | logits_pph = tf.matmul(h_pph, w_pph) + b_pph # shape of logits:[batch_size*max_time, 3] 275 | logits_normal_pph = tf.reshape( # logits in an normal way:[batch_size,max_time_stpes,3] 276 | tensor=logits_pph, 277 | shape=(-1, self.max_sentence_size, 3), 278 | name="logits_normal_pph" 279 | ) 280 | logits_pph_masked = tf.boolean_mask( # [seq_len1+seq_len2+....+,3] 281 | tensor=logits_normal_pph, 282 | mask=self.mask, 283 | name="logits_pph_masked" 284 | ) 285 | 286 | # prediction 287 | pred_pph = tf.cast(tf.argmax(logits_pph, 1), tf.int32, name="pred_pph") # pred_pph:[batch_size*max_time,] 288 | pred_normal_pph = tf.reshape( # pred in an normal way,[batch_size, max_time] 289 | tensor=pred_pph, 290 | shape=(-1, self.max_sentence_size), 291 | name="pred_normal_pph" 292 | ) 293 | pred_pph_masked = tf.boolean_mask( # logits_pph_masked [seq_len1+seq_len2+....+,] 294 | tensor=pred_normal_pph, 295 | mask=self.mask, 296 | name="pred_pph_masked" 297 | ) 298 | pred_normal_one_hot_pph = tf.one_hot( # one-hot the pred_normal:[batch_size, max_time,class_num] 299 | indices=pred_normal_pph, 300 | depth=self.class_num, 301 | name="pred_normal_one_hot_pph" 302 | ) 303 | 304 | # loss 305 | self.loss_pph = tf.losses.sparse_softmax_cross_entropy( 306 | labels=y_p_pph_masked, 307 | logits=logits_pph_masked 308 | ) 309 | # ------------------------------------------------------------------------------------ 310 | 311 | # ---------------------------------------IPH------------------------------------------ 312 | # embeded inputs:[batch_size,MAX_TIME_STPES,embedding_size] 313 | inputs_iph = tf.nn.embedding_lookup(params=self.embeddings, ids=self.X_p, name="embeded_input_iph") 314 | # shape of inputs[batch_size,max_time_stpes,embeddings_dims+class_num] 315 | inputs_iph = tf.concat(values=[inputs_iph, pred_normal_one_hot_pph], axis=2, name="inputs_pph") 316 | # print("shape of input_pph:", inputs_pph.shape) 317 | # encoder cells 318 | # forward part 319 | en_lstm_forward1_iph = rnn.BasicLSTMCell(num_units=self.hidden_units_num) 320 | # en_lstm_forward2=rnn.BasicLSTMCell(num_units=self.hidden_units_num2) 321 | # en_lstm_forward=rnn.MultiRNNCell(cells=[en_lstm_forward1,en_lstm_forward2]) 322 | 323 | # backward part 324 | en_lstm_backward1_iph = rnn.BasicLSTMCell(num_units=self.hidden_units_num) 325 | # en_lstm_backward2=rnn.BasicLSTMCell(num_units=self.hidden_units_num2) 326 | # en_lstm_backward=rnn.MultiRNNCell(cells=[en_lstm_backward1,en_lstm_backward2]) 327 | 328 | # decoder cells 329 | de_lstm_iph = rnn.BasicLSTMCell(num_units=self.hidden_units_num*2) 330 | 331 | # encode 332 | encoder_outputs_iph, encoder_states_iph = self.encoder( 333 | 
cell_forward=en_lstm_forward1_iph, 334 | cell_backward=en_lstm_backward1_iph, 335 | inputs=inputs_iph, 336 | seq_length=self.seq_len_p, 337 | scope_name="en_lstm_iph" 338 | ) 339 | # shape of h is [batch*time_steps,hidden_units*2] 340 | h_iph = self.decoder( 341 | cell=de_lstm_iph, 342 | initial_state=encoder_states_iph, 343 | inputs=encoder_outputs_iph, 344 | scope_name="de_lstm_iph" 345 | ) 346 | 347 | # fully connect layer(projection) 348 | w_iph = tf.Variable( 349 | initial_value=tf.random_normal(shape=(self.hidden_units_num*2, self.class_num)), 350 | name="weights_iph" 351 | ) 352 | b_iph = tf.Variable( 353 | initial_value=tf.random_normal(shape=(self.class_num,)), 354 | name="bias_iph" 355 | ) 356 | # logits 357 | logits_iph = tf.matmul(h_iph, w_iph) + b_iph # shape of logits:[batch_size*max_time, 3] 358 | logits_normal_iph = tf.reshape( # logits in an normal way:[batch_size,max_time_stpes,3] 359 | tensor=logits_iph, 360 | shape=(-1, self.max_sentence_size, 3), 361 | name="logits_normal_iph" 362 | ) 363 | logits_iph_masked = tf.boolean_mask( # [seq_len1+seq_len2+....+,3] 364 | tensor=logits_normal_iph, 365 | mask=self.mask, 366 | name="logits_iph_masked" 367 | ) 368 | 369 | # prediction 370 | pred_iph = tf.cast(tf.argmax(logits_iph, 1), tf.int32, name="pred_iph") # pred_iph:[batch_size*max_time,] 371 | pred_normal_iph = tf.reshape( # pred in an normal way,[batch_size, max_time] 372 | tensor=pred_iph, 373 | shape=(-1, self.max_sentence_size), 374 | name="pred_normal_iph" 375 | ) 376 | pred_iph_masked = tf.boolean_mask( # logits_iph_masked [seq_len1+seq_len2+....+,] 377 | tensor=pred_normal_iph, 378 | mask=self.mask, 379 | name="pred_iph_masked" 380 | ) 381 | pred_normal_one_hot_iph = tf.one_hot( # one-hot the pred_normal:[batch_size, max_time,class_num] 382 | indices=pred_normal_iph, 383 | depth=self.class_num, 384 | name="pred_normal_one_hot_iph" 385 | ) 386 | # loss 387 | self.loss_iph = tf.losses.sparse_softmax_cross_entropy( 388 | labels=y_p_iph_masked, 389 | logits=logits_iph_masked 390 | ) 391 | 392 | # --------------------------------------------------------------------------------------- 393 | # loss 394 | self.loss = self.loss_pw + self.loss_pph + self.loss_iph 395 | # optimizer 396 | self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(self.loss) 397 | self.init_op = tf.global_variables_initializer() 398 | self.init_local_op = tf.local_variables_initializer() 399 | 400 | # ------------------------------------Session----------------------------------------- 401 | with self.session as sess: 402 | print("Training Start") 403 | sess.run(self.init_op) # initialize all variables 404 | sess.run(self.init_local_op) 405 | 406 | train_Size = X_train.shape[0]; 407 | validation_Size = X_validation.shape[0] 408 | best_validation_loss = 1000 # best validation accuracy in training process 409 | 410 | # epoch 411 | for epoch in range(1, self.max_epoch + 1): 412 | print("Epoch:", epoch) 413 | start_time = time.time() # time evaluation 414 | # training loss/accuracy in every mini-batch 415 | train_losses = [] 416 | train_accus_pw = [] 417 | train_accus_pph = [] 418 | train_accus_iph = [] 419 | 420 | c1_f_pw = []; 421 | c2_f_pw = [] # each class's f1 score 422 | c1_f_pph = []; 423 | c2_f_pph = [] 424 | c1_f_iph = []; 425 | c2_f_iph = [] 426 | 427 | # mini batch 428 | for i in range(0, (train_Size // self.batch_size)): 429 | #注意:这里获取的都是mask之后的值 430 | _, train_loss, y_train_pw_masked,y_train_pph_masked,y_train_iph_masked,\ 431 | train_pred_pw, train_pred_pph, 
train_pred_iph = sess.run( 432 | fetches=[self.optimizer, self.loss, 433 | y_p_pw_masked,y_p_pph_masked,y_p_iph_masked, 434 | pred_pw_masked, pred_pph_masked, pred_iph_masked], 435 | feed_dict={ 436 | self.X_p: X_train[i * self.batch_size:(i + 1) * self.batch_size], 437 | self.y_p_pw: y_train_pw[i * self.batch_size:(i + 1) * self.batch_size], 438 | self.y_p_pph: y_train_pph[i * self.batch_size:(i + 1) * self.batch_size], 439 | self.y_p_iph: y_train_iph[i * self.batch_size:(i + 1) * self.batch_size], 440 | self.seq_len_p: len_train[i * self.batch_size:(i + 1) * self.batch_size] 441 | } 442 | ) 443 | 444 | # loss 445 | train_losses.append(train_loss) 446 | # metrics 447 | 448 | accuracy_pw, f1_1_pw, f1_2_pw = util.eval(y_true=y_train_pw_masked,y_pred=train_pred_pw) # pw 449 | accuracy_pph, f1_1_pph, f1_2_pph = util.eval(y_true=y_train_pph_masked,y_pred=train_pred_pph) # pph 450 | accuracy_iph, f1_1_iph, f1_2_iph = util.eval(y_true=y_train_iph_masked,y_pred=train_pred_iph) # iph 451 | 452 | train_accus_pw.append(accuracy_pw) 453 | train_accus_pph.append(accuracy_pph) 454 | train_accus_iph.append(accuracy_iph) 455 | # F1-score 456 | c1_f_pw.append(f1_1_pw); 457 | c2_f_pw.append(f1_2_pw) 458 | c1_f_pph.append(f1_1_pph); 459 | c2_f_pph.append(f1_2_pph) 460 | c1_f_iph.append(f1_1_iph); 461 | c2_f_iph.append(f1_2_iph) 462 | 463 | # validation in every epoch 464 | validation_loss, y_valid_pw_masked,y_valid_pph_masked,y_valid_iph_masked,\ 465 | valid_pred_pw, valid_pred_pph, valid_pred_iph = sess.run( 466 | fetches=[self.loss, y_p_pw_masked,y_p_pph_masked,y_p_iph_masked, 467 | pred_pw_masked, pred_pph_masked, pred_iph_masked], 468 | feed_dict={ 469 | self.X_p: X_validation, 470 | self.y_p_pw: y_validation_pw, 471 | self.y_p_pph: y_validation_pph, 472 | self.y_p_iph: y_validation_iph, 473 | self.seq_len_p: len_validation 474 | } 475 | ) 476 | # print("valid_pred_pw.shape:",valid_pred_pw.shape) 477 | # print("valid_pred_pph.shape:",valid_pred_pph.shape) 478 | # print("valid_pred_iph.shape:",valid_pred_iph.shape) 479 | 480 | # metrics 481 | valid_accuracy_pw, valid_f1_1_pw, valid_f1_2_pw = util.eval(y_true=y_valid_pw_masked,y_pred=valid_pred_pw) 482 | valid_accuracy_pph, valid_f1_1_pph, valid_f1_2_pph = util.eval(y_true=y_valid_pph_masked,y_pred=valid_pred_pph) 483 | valid_accuracy_iph, valid_f1_1_iph, valid_f1_2_iph = util.eval(y_true=y_valid_iph_masked,y_pred=valid_pred_iph) 484 | 485 | # show information 486 | print("Epoch ", epoch, " finished.", "spend ", round((time.time() - start_time) / 60, 2), " mins") 487 | print(" /**Training info**/") 488 | print("----average training loss:", sum(train_losses) / len(train_losses)) 489 | print("PW:") 490 | print("----average accuracy:", sum(train_accus_pw) / len(train_accus_pw)) 491 | print("----average f1-Score of N:", sum(c1_f_pw) / len(c1_f_pw)) 492 | print("----average f1-Score of B:", sum(c2_f_pw) / len(c2_f_pw)) 493 | print("PPH:") 494 | print("----average accuracy:", sum(train_accus_pph) / len(train_accus_pph)) 495 | print("----average f1-Score of N:", sum(c1_f_pph) / len(c1_f_pph)) 496 | print("----average f1-Score of B:", sum(c2_f_pph) / len(c2_f_pph)) 497 | print("IPH:") 498 | print("----average accuracy:", sum(train_accus_iph) / len(train_accus_iph)) 499 | print("----average f1-Score of N:", sum(c1_f_iph) / len(c1_f_iph)) 500 | print("----average f1-Score of B:", sum(c2_f_iph) / len(c2_f_iph)) 501 | 502 | print(" /**Validation info**/") 503 | print("----average validation loss:", validation_loss) 504 | print("PW:") 505 | print("----average
accuracy:", valid_accuracy_pw) 506 | print("----avarage f1-Score of N:", valid_f1_1_pw) 507 | print("----avarage f1-Score of B:", valid_f1_2_pw) 508 | print("PPH:") 509 | print("----avarage accuracy :", valid_accuracy_pph) 510 | print("----avarage f1-Score of N:", valid_f1_1_pph) 511 | print("----avarage f1-Score of B:", valid_f1_2_pph) 512 | print("IPH:") 513 | print("----avarage accuracy:", valid_accuracy_iph) 514 | print("----avarage f1-Score of N:", valid_f1_1_iph) 515 | print("----avarage f1-Score of B:", valid_f1_2_iph) 516 | 517 | # when we get a new best validation accuracy,we store the model 518 | if best_validation_loss < validation_loss: 519 | best_validation_loss = validation_loss 520 | print("New Best loss ", best_validation_loss, " On Validation set! ") 521 | print("Saving Models......\n\n") 522 | # exist ./models folder? 523 | if not os.path.exists("./models/"): 524 | os.mkdir(path="./models/") 525 | if not os.path.exists("./models/" + name): 526 | os.mkdir(path="./models/" + name) 527 | if not os.path.exists("./models/" + name + "/bilstm"): 528 | os.mkdir(path="./models/" + name + "/bilstm") 529 | # create saver 530 | saver = tf.train.Saver() 531 | saver.save(sess, "./models/" + name + "/bilstm/my-model-10000") 532 | # Generates MetaGraphDef. 533 | saver.export_meta_graph("./models/" + name + "/bilstm/my-model-10000.meta") 534 | print("\n\n") 535 | # test:using X_validation_pw 536 | test_pred_pw, test_pred_pph, test_pred_iph = sess.run( 537 | fetches=[pred_pw, pred_pph, pred_iph], 538 | feed_dict={ 539 | self.X_p: X_validation, 540 | self.seq_len_p: len_validation 541 | } 542 | ) 543 | # recover to original corpus txt 544 | # shape of valid_pred_pw,valid_pred_pw,valid_pred_pw:[corpus_size*time_stpes] 545 | util.recover( 546 | X=X_validation, 547 | preds_pw=test_pred_pw, 548 | preds_pph=test_pred_pph, 549 | preds_iph=test_pred_iph, 550 | filename="recover_epoch_" + str(epoch) + ".txt" 551 | ) 552 | 553 | # 返回预测的结果或者准确率,y not None的时候返回准确率,y ==None的时候返回预测值 554 | def pred(self, name, X, y=None, ): 555 | start_time = time.time() # compute time 556 | if y is None: 557 | with self.session as sess: 558 | # restore model 559 | new_saver = tf.train.import_meta_graph( 560 | meta_graph_or_file="./models/" + name + "/bilstm/my-model-10000.meta", 561 | clear_devices=True 562 | ) 563 | new_saver.restore(sess, "./models/" + name + "/bilstm/my-model-10000") 564 | # get default graph 565 | graph = tf.get_default_graph() 566 | # get opration from the graph 567 | pred_normal = graph.get_operation_by_name("pred_normal").outputs[0] 568 | X_p = graph.get_operation_by_name("input_placeholder").outputs[0] 569 | pred = sess.run(fetches=pred_normal, feed_dict={X_p: X}) 570 | print("this operation spends ", round((time.time() - start_time) / 60, 2), " mins") 571 | return pred 572 | else: 573 | with self.session as sess: 574 | # restore model 575 | new_saver = tf.train.import_meta_graph( 576 | meta_graph_or_file="./models/" + name + "/bilstm/my-model-10000.meta", 577 | clear_devices=True 578 | ) 579 | new_saver.restore(sess, "./models/" + name + "/bilstm/my-model-10000") 580 | graph = tf.get_default_graph() 581 | # get opration from the graph 582 | accuracy = graph.get_operation_by_name("accuracy").outputs[0] 583 | X_p = graph.get_operation_by_name("input_placeholder").outputs[0] 584 | y_p = graph.get_operation_by_name("label_placeholder").outputs[0] 585 | # forward and get the results 586 | accu = sess.run(fetches=accuracy, feed_dict={X_p: X, y_p: y}) 587 | print("this operation spends ", 
round((time.time() - start_time) / 60, 2), " mins") 588 | return accu 589 | 590 | def showInfo(self, type): 591 | if type == "training": 592 | pass 593 | else: 594 | pass 595 | 596 | 597 | # train && test 598 | if __name__ == "__main__": 599 | # load the data 600 | # pw 601 | df_train_pw = pd.read_pickle(path="./dataset/temptest/pw_summary_train.pkl") 602 | df_validation_pw = pd.read_pickle(path="./dataset/temptest/pw_summary_validation.pkl") 603 | # pph 604 | df_train_pph = pd.read_pickle(path="./dataset/temptest/pph_summary_train.pkl") 605 | df_validation_pph = pd.read_pickle(path="./dataset/temptest/pph_summary_validation.pkl") 606 | # iph 607 | df_train_iph = pd.read_pickle(path="./dataset/temptest/iph_summary_train.pkl") 608 | df_validation_iph = pd.read_pickle(path="./dataset/temptest/iph_summary_validation.pkl") 609 | 610 | # in fact the X columns are identical across the three levels, so the PW X is used for all of them 611 | # the labels differ, however, so each level needs its own y 612 | X_train = np.asarray(list(df_train_pw['X'].values)) 613 | X_validation = np.asarray(list(df_validation_pw['X'].values)) 614 | 615 | # tags 616 | y_train_pw = np.asarray(list(df_train_pw['y'].values)) 617 | y_validation_pw = np.asarray(list(df_validation_pw['y'].values)) 618 | 619 | y_train_pph = np.asarray(list(df_train_pph['y'].values)) 620 | y_validation_pph = np.asarray(list(df_validation_pph['y'].values)) 621 | 622 | y_train_iph = np.asarray(list(df_train_iph['y'].values)) 623 | y_validation_iph = np.asarray(list(df_validation_iph['y'].values)) 624 | 625 | # length: the true (unpadded) length of every sequence 626 | # identical for every level, so the PW ones are used 627 | len_train = np.asarray(list(df_train_pw['sentence_len'].values)) 628 | len_validation = np.asarray(list(df_validation_pw['sentence_len'].values)) 629 | print("len_train:", len_train.shape) 630 | print("len_validation:", len_validation.shape) 631 | 632 | # X_train = [X_train_pw, X_train_pph, X_train_iph] 633 | y_train = [y_train_pw, y_train_pph, y_train_iph] 634 | # X_validation = [X_validation_pw, X_validation_pph, X_validation_iph] 635 | y_validation = [y_validation_pw, y_validation_pph, y_validation_iph] 636 | 637 | # print("X_train_pw:\n",X_train_pw); print(X_train_pw.shape) 638 | # print("X_train_pph:\n", X_train_pph); print(X_train_pph.shape) 639 | # print("X_train_iph:\n", X_train_iph); print(X_train_iph.shape) 640 | 641 | # print("y_train_pw:\n", y_train_pw); 642 | # print(y_train_pw.shape) 643 | # print("y_train_pph:\n", y_train_pph); 644 | # print(y_train_pph.shape) 645 | # print("y_train_iph:\n", y_train_iph); 646 | # print(y_train_iph.shape) 647 | 648 | model = Attension_Alignment_Seq2Seq() 649 | model.fit(X_train, y_train, len_train, X_validation, y_validation, len_validation, "test", False) -------------------------------------------------------------------------------- /parameter.py: -------------------------------------------------------------------------------- 1 | ''' 2 | ####file that contains LSTM parameters of this model 3 | ####modify this file to change LSTM parameters 4 | ''' 5 | 6 | #basic architecture 7 | MAX_SENTENCE_SIZE=59 #fixed sentence length of 59 (derived from the whole dataset) 8 | TIMESTEP_SIZE=MAX_SENTENCE_SIZE #the LSTM time_step must equal the sentence length 9 | INPUT_SIZE=EMBEDDING_SIZE=1001 #embedding dimension, which must match the input size 10 | MAX_EPOCH=10 #maximum number of training epochs 11 | LAYER_NUM=2 #number of LSTM layers 12 | HIDDEN_UNITS_NUM=128 #number of hidden units (layer 1) 13 | HIDDEN_UNITS_NUM2=128 #number of hidden units (layer 2) 14 | BATCH_SIZE=1000 #batch size 15 | 16 | #learning rate 17 | LEARNING_RATE=0.01 #learning rate 18 | DECAY=0.85 #decay coefficient 19 | 20 | #Weaken Overfitting 21 | DROPOUT_RATE=0.5 #dropout rate 22 | LAMBDA_PW=0.5 #regularization coefficient for the PW level 23 | LAMBDA_PPH=0.5 #regularization coefficient for the PPH level 24 | LAMBDA_IPH=0.5 #regularization coefficient for the IPH level 25 | 26 | 27 | #can't modify 28 |
CLASS_NUM=3 #类别数量 29 | VOCAB_SIZE=4711 # 样本中不同字的个数+1(padding 0),根据处理数据的时候得到 30 | 31 | -------------------------------------------------------------------------------- /seq2seq.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BoragoCode/AttentionBasedProsodyPrediction/4d4226fcb336c6abe1a90422db346e726ae6da42/seq2seq.py -------------------------------------------------------------------------------- /temp_test.py: -------------------------------------------------------------------------------- 1 | # logits 2 | logits_iph = tf.matmul(h_iph, w_iph) + b_iph # shape of logits:[batch_size*max_time, 3] 3 | logits_normal_iph = tf.reshape( # logits in an normal way:[batch_size,max_time_stpes,3] 4 | tensor=logits_iph, 5 | shape=(-1, self.max_sentence_size, 3), 6 | name="logits_normal_iph" 7 | ) 8 | logits_iph_masked = tf.boolean_mask( # [seq_len1+seq_len2+....+,3] 9 | tensor=logits_normal_iph, 10 | mask=self.mask, 11 | name="logits_iph_masked" 12 | ) 13 | 14 | # prediction 15 | pred_iph = tf.cast(tf.argmax(logits_iph, 1), tf.int32, name="pred_iph") # pred_iph:[batch_size*max_time,] 16 | pred_normal_iph = tf.reshape( # pred in an normal way,[batch_size, max_time] 17 | tensor=pred_iph, 18 | shape=(-1, self.max_sentence_size), 19 | name="pred_normal_iph" 20 | ) 21 | pred_iph_masked = tf.boolean_mask( # logits_iph_masked [seq_len1+seq_len2+....+,] 22 | tensor=pred_normal_iph, 23 | mask=self.mask, 24 | name="pred_iph_masked" 25 | ) 26 | pred_normal_one_hot_iph = tf.one_hot( # one-hot the pred_normal:[batch_size, max_time,class_num] 27 | indices=pred_normal_iph, 28 | depth=self.class_num, 29 | name="pred_normal_one_hot_iph" 30 | ) 31 | 32 | # loss 33 | self.loss_iph = tf.losses.softmax_cross_entropy( 34 | labels=y_p_iph_masked, 35 | logits=logits_iph_masked 36 | ) 37 | -------------------------------------------------------------------------------- /util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import tensorflow as tf 4 | from sklearn.metrics import precision_score 5 | from sklearn.metrics import recall_score 6 | from sklearn.metrics import accuracy_score 7 | from sklearn.metrics import f1_score 8 | import parameter 9 | 10 | #compute accuracy,precison,recall and f1 11 | def eval(y_true,y_pred): 12 | #accuracy 13 | accuracy=accuracy_score(y_true=y_true,y_pred=y_pred) 14 | 15 | #class 1 16 | binarized_y_true_1=binarize(sequence=y_true,positive_value=1) 17 | binarized_y_pred_1=binarize(sequence=y_pred,positive_value=1) 18 | recall_1=recall_score(y_true=binarized_y_true_1,y_pred=binarized_y_pred_1) 19 | precision_1=precision_score(y_true=binarized_y_true_1,y_pred=binarized_y_pred_1) 20 | f_1=f1_score(y_true=binarized_y_true_1,y_pred=binarized_y_pred_1) 21 | 22 | # class 2 23 | binarized_y_true_2 = binarize(sequence=y_true, positive_value=2) 24 | binarized_y_pred_2 = binarize(sequence=y_pred, positive_value=2) 25 | recall_2 = recall_score(y_true=binarized_y_true_2, y_pred=binarized_y_pred_2) 26 | precision_2 = precision_score(y_true=binarized_y_true_2, y_pred=binarized_y_pred_2) 27 | f_2 = f1_score(y_true=binarized_y_true_2, y_pred=binarized_y_pred_2) 28 | 29 | return accuracy,f_1,f_2 30 | 31 | #以positive_value为正类别,来二值化一个sequence.计算metrics用到 32 | def binarize(sequence,positive_value): 33 | #deep copy 34 | temp_sequence=sequence.copy() 35 | temp_sequence[temp_sequence!=positive_value]=0 36 | temp_sequence[temp_sequence==positive_value]=1 37 | return 
temp_sequence 38 | 39 | 40 | #recover to original result 41 | def recover(X,preds_pw,preds_pph,preds_iph,filename): 42 | #get complex "#" index 43 | length=preds_pw.shape[0] 44 | complex=np.array([preds_iph,preds_pph,preds_pw]) 45 | arg = np.argmax(complex, axis=0) 46 | #print("arg:\n", arg) 47 | for i in range(length): 48 | if arg[i] == 0: 49 | if complex[0, i] == 2: 50 | arg[i] = 6 51 | else: 52 | arg[i] = 0 53 | if arg[i] == 1: 54 | if complex[1, i] == 2: 55 | arg[i] = 4 56 | else: 57 | arg[i] = 0 58 | if arg[i] == 2: 59 | if complex[2, i] == 2: 60 | arg[i] = 2 61 | else: 62 | arg[i] = 0 63 | arg = (arg / 2).astype(dtype=np.int32) 64 | #shape of arg:[test_size,max_sentence_size] 65 | arg=np.reshape(arg,newshape=(-1,parameter.MAX_SENTENCE_SIZE)) 66 | #print("arg.shape",arg.shape) 67 | #print("arg:\n", arg) 68 | #get id2words 69 | df_words_ids = pd.read_csv(filepath_or_buffer="./dataset/temptest/words_ids.csv", encoding="utf-8") 70 | #print(df_words_ids.head(5)) 71 | id2words = pd.Series(data=df_words_ids["words"].values, index=df_words_ids["id"].values) 72 | #print(id2words[2]) 73 | doc="" 74 | for i in range(X.shape[0]): 75 | sentence="" 76 | for j in range(X.shape[1]): 77 | if(X[i][j])==0: 78 | break; 79 | else: 80 | sentence+=id2words[X[i][j]] 81 | if(arg[i][j]!=0): 82 | sentence+=("#"+str(arg[i][j])) 83 | sentence+="\n" 84 | doc+=sentence 85 | f=open(filename,mode="w",encoding="utf-8") 86 | f.write(doc) 87 | f.close() 88 | 89 | if __name__ =="__main__": 90 | #测试 91 | 92 | a=np.array([1,2,3,4,0,5,6,7,1,1,2,1,0]) 93 | print(a) 94 | result=binarize(sequence=a,positive_value=1) 95 | print(result) 96 | print(a) 97 | 98 | 99 | 100 | 101 | 102 | --------------------------------------------------------------------------------
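For clarity, here is a minimal, self-contained sketch (not part of the repo) of the label-merging rule implemented in util.recover(): the three per-level predictions are stacked, and at every character position the highest prosodic level that predicts the boundary tag 'b' (id 2) wins, which is what produces the final #1/#2/#3 marks in the recovered text. The toy predictions below are made up for illustration.

import numpy as np

preds_pw  = np.array([1, 2, 1, 2, 1, 2])   # PW level:  'b' (=2) after characters 2, 4 and 6
preds_pph = np.array([1, 1, 1, 2, 1, 2])   # PPH level: 'b' after characters 4 and 6
preds_iph = np.array([1, 1, 1, 1, 1, 2])   # IPH level: 'b' after character 6 only

stack = np.array([preds_iph, preds_pph, preds_pw])   # row 0=IPH, 1=PPH, 2=PW (same order as recover())
arg = np.argmax(stack, axis=0)                       # argmax prefers the lowest row index on ties,
                                                     # so IPH outranks PPH, which outranks PW
marks = np.zeros_like(arg)
marks[(arg == 2) & (stack[2] == 2)] = 1              # PW boundary only         -> #1
marks[(arg == 1) & (stack[1] == 2)] = 2              # PPH (and PW) boundary    -> #2
marks[(arg == 0) & (stack[0] == 2)] = 3              # IPH (and lower) boundary -> #3
print(marks)                                         # expected: [0 1 0 2 0 3]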