├── README.md
├── asp_c_model.py
└── op_p_model.py

/README.md:
--------------------------------------------------------------------------------
1 | ## opinion_mining
2 | Zhijiang Cup 2019 (之江杯2019): opinion mining on e-commerce reviews. See the [competition page](https://zhejianglab.aliyun.com/entrance/231731/information) for the dataset and task description.
3 | 
4 | ## Overall approach
5 | A two-stage pipeline is used. Stage one tags `OpinionTerms` and `Polarities` with a BIES scheme. Stage two takes **one** `OpinionTerms` span extracted by stage one and tags the corresponding `AspectTerms` (if that `OpinionTerms` has no `AspectTerms`, the `AspectTerms` sequence is labeled all O); at the same time a classifier predicts the `Categories` of the `OpinionTerms`/`AspectTerms` pair, or of the `OpinionTerms` alone. Both stages are trained in a multi-task fashion. Why not tag the categories directly with BIES in stage two? Because many opinions have no `AspectTerms`, yet a `Categories` label must still be produced for them. Time for the competition was tight, so the two stages were never scored jointly offline; a submission was made whenever either stage improved on its own, and no model ensembling was done. Both stages were trained with 5-fold cross-validation and the fold probabilities were averaged at prediction time, which gave a sizeable boost; the final rank in the second round was 30. A multi-model ensemble would likely have helped further, given more time.
6 | ### Model details
7 | Because `BERT` tags at the character level, word-segmentation information is lost during extraction. `HanLP` is therefore used for part-of-speech tagging; each word's POS tag is spread over its characters with the BIES scheme and injected into the model as an `embedding`, adding POS and segmentation information. This gave a measurable improvement offline.
8 | 
9 | ## Domain transfer
10 | Stage one only extracts `OpinionTerms` and `Polarities`, which intuitively does not require distinguishing domains (it simply extracts the `OpinionTerms` and `Polarities` of both makeup and laptop reviews), so the data of the two domains (laptop + makeup) is simply mixed. In stage two, however, the `Categories` of laptop and makeup differ considerably. The current solution is to first train stage two on the larger makeup dataset, then load those weights, replace the final `Categories` classifier, and fine-tune on the laptop data with a small learning rate (a sketch of this head-swap step is included after asp_c_model.py below).
11 | 
12 | ## Jointly trained model
13 | Inspired by Su Jianlin's (苏神) information-extraction work, I once tried merging the two stages into a single model: model 1 is trained exactly like stage one above, while model 2 samples one `OpinionTerms` per example and extracts its `AspectTerms` (the sampling is redone every epoch, so with enough epochs most of the `AspectTerms` data is still seen). The results were poor and the offline scores were low; my guess is that plain random sampling picks negative samples too often, so better sampling strategies should be tried next.
14 | 
15 | ## Showing multiple losses in Keras
16 | Because multi-task training is used, a hook lets Keras display each task's loss during training; see the code for details (a minimal standalone version of the pattern is sketched at the end of this document).
17 | `tensorflow-gpu==1.8.0 keras==2.2.4`
18 | 
19 | ## reference
20 | - [Multi-task training and data-processing reference](https://github.com/EliasCai/viewpoint-mining)
21 | - [Losslessly implementing complex (multi-input) loss functions in Keras](https://zhuanlan.zhihu.com/p/54024591)
22 | 
--------------------------------------------------------------------------------
/asp_c_model.py:
--------------------------------------------------------------------------------
1 | # @author : srtianxia
2 | # @time : 2019/9/23 8:20
3 | # @description:
4 | 
5 | import codecs
6 | import json
7 | import os
8 | from collections import Counter
9 | 
10 | import keras.backend as K
11 | import numpy as np
12 | import pandas as pd
13 | from keras import Input, Model
14 | from keras.callbacks import Callback
15 | from keras.layers import Lambda, Concatenate, RepeatVector, Dense, GlobalAveragePooling1D, Embedding
16 | from keras.losses import categorical_crossentropy
17 | from keras.utils import to_categorical
18 | from keras_bert import load_trained_model_from_checkpoint, Tokenizer
19 | from keras_bert.layers import MaskedConv1D
20 | from keras_contrib.layers import CRF
21 | from tqdm import tqdm
22 | from keras_bert import AdamWarmup, calc_train_steps
23 | import random
24 | from pyhanlp import HanLP
25 | 
26 | random.seed(200)
27 | tqdm.pandas()
28 | os.environ["CUDA_VISIBLE_DEVICES"] = '0'
29 | np.random.seed(2019)
30 | 
31 | root_path = 'bert_base/pretraining_output/'  # BERT checkpoint further pre-trained on the laptop corpus
32 | config_path = root_path + 'bert_config.json'
33 | checkpoint_path = root_path + 'model.ckpt-10000'
34 | dict_path = root_path + 'vocab.txt'
35 | MAX_LEN = 50
36 | TAG_O = ''
37 | 
38 | TEST_SIZE = 0.2
39 | BATCH_SIZE = 64
40 | 
41 | 
42 | def generate_cp_label(labels):
43 |     cps_list = Counter(labels).most_common()
44 |     cp2id = ({t: i for i, (t, c) in enumerate(cps_list)})
45 |     id2cp = {v: k for k, v in cp2id.items()}
46 |     return cp2id, id2cp
47 | 
48 | 
49 | def encode_aspect_seq(maxlen=48):
50 |     id_list = df_review['id'].to_list()
51 |     val_ids = random.sample(id_list, int(len(id_list) * 0.1))
52 |     train_reviews = df_review[~df_review['id'].isin(val_ids)]
53 | 
val_reviews = df_review[df_review['id'].isin(val_ids)] 54 | print(train_reviews.shape[0], val_reviews.shape[0]) 55 | train_labels = df_label[~df_label['id'].isin(val_ids)] 56 | val_labels = df_label[df_label['id'].isin(val_ids)] 57 | 58 | term_to_id = { 59 | "A-B": 1, 60 | "A-I": 2, 61 | "A-E": 3, 62 | "A-S": 4, 63 | } 64 | 65 | def encode_label(df_label: pd.DataFrame, df_reviews: pd.DataFrame): 66 | texts = [] 67 | c_labels = [] 68 | a_seqs = [] 69 | o_seqs = [] 70 | ids = [] 71 | lf_seqs = [] 72 | rt_seqs = [] 73 | for _, row in tqdm(df_label.iterrows(), 'encode label'): 74 | id = row['id'] 75 | ids.append(id) 76 | review = df_reviews.loc[df_reviews['id'] == id]['Reviews'].values.tolist()[0] 77 | texts.append(review) 78 | C = row['Categories'] 79 | c_labels.append(C) 80 | seq_a = np.zeros((maxlen,), dtype=np.int32) 81 | seq_o = np.zeros((maxlen,), dtype=np.int32) 82 | seq_lf = np.zeros((maxlen,), dtype=np.int32) 83 | seq_rt = np.zeros((maxlen,), dtype=np.int32) 84 | # for i_ in range(maxlen): 85 | # seq_o[i_] = 1e-10 86 | e_o = row['O_end'] 87 | opinion_terms = row['OpinionTerms'] 88 | if opinion_terms != '_' and int(e_o) < maxlen: 89 | s_o = int(row['O_start']) 90 | e_o = int(e_o) 91 | for i in range(s_o, e_o): 92 | seq_o[i] = 1 93 | for i in range(maxlen): 94 | seq_lf[i] = abs(i - s_o) 95 | seq_rt[i] = abs(i - e_o) 96 | lf_seqs.append(seq_lf) 97 | rt_seqs.append(seq_rt) 98 | o_seqs.append(seq_o) 99 | aspect_terms = row['AspectTerms'] 100 | e = row['A_end'] 101 | if aspect_terms != '_' and int(e) < maxlen: 102 | s = int(row['A_start']) 103 | e = int(e) 104 | if e - s == 1: # 单个的 105 | seq_a[s] = term_to_id["%s-S" % 'A'] 106 | else: 107 | seq_a[s] = term_to_id["%s-B" % 'A'] 108 | seq_a[e - 1] = term_to_id["%s-E" % 'A'] 109 | for p in range(s + 1, e - 1): 110 | seq_a[p] = term_to_id["%s-I" % 'A'] 111 | a_seqs.append(seq_a) 112 | return texts, a_seqs, o_seqs, lf_seqs, rt_seqs, c_labels, ids 113 | 114 | val_texts, val_a_seqs, val_o_seqs, val_lf_seqs, val_rt_seqs, val_cp_labels, val_ids = encode_label(val_labels, 115 | val_reviews) 116 | train_texts, train_a_seqs, train_o_seqs, train_lf_seqs, train_rt_seqs, train_cp_labels, train_ids = encode_label( 117 | train_labels, train_reviews) 118 | 119 | cp2id, id2cp = generate_cp_label(train_cp_labels + val_cp_labels) 120 | val_cp_labels = [cp2id[i] for i in val_cp_labels] 121 | train_cp_labels = [cp2id[i] for i in train_cp_labels] 122 | 123 | seq_id_val = np.array(val_ids) 124 | seq_id_train = np.array(train_ids) 125 | 126 | seq_A_val = np.stack(val_a_seqs) 127 | seq_A_train = np.stack(train_a_seqs) 128 | 129 | seq_A_all = np.concatenate((seq_A_val, seq_A_train)) 130 | seq_A_all = to_categorical(seq_A_all) 131 | seq_A_val = seq_A_all[:len(seq_A_val)] 132 | seq_A_train = seq_A_all[len(seq_A_val):] 133 | 134 | seq_O_val = np.stack(val_o_seqs) 135 | seq_O_train = np.stack(train_o_seqs) 136 | 137 | seq_lf_val = np.stack(val_lf_seqs) 138 | seq_lf_train = np.stack(train_lf_seqs) 139 | 140 | seq_rt_val = np.stack(val_rt_seqs) 141 | seq_rt_train = np.stack(train_rt_seqs) 142 | 143 | seq_input_val, seq_seg_val = bert_text_to_seq( 144 | val_texts, tokenizer, maxlen=MAX_LEN 145 | ) 146 | 147 | seq_input_train, seq_seg_train = bert_text_to_seq( 148 | train_texts, tokenizer, maxlen=MAX_LEN 149 | ) 150 | 151 | id_to_term = dict([(v, k) for k, v in term_to_id.items()]) 152 | 153 | all_cp_labels = train_cp_labels + val_cp_labels 154 | 155 | all_cp_labels = to_categorical(all_cp_labels) 156 | 157 | train_cp_labels = all_cp_labels[:len(train_cp_labels)] 158 | 
val_cp_labels = all_cp_labels[len(train_cp_labels):] 159 | 160 | return [seq_input_train, seq_seg_train, seq_A_train, seq_O_train, seq_lf_train, seq_rt_train, train_cp_labels, 161 | seq_id_train], \ 162 | [seq_input_val, seq_seg_val, seq_A_val, seq_O_val, seq_lf_val, seq_rt_val, val_cp_labels, seq_id_val], ( 163 | cp2id, id2cp), id_to_term, val_ids 164 | 165 | 166 | def bert_text_to_seq(texts, tokenizer, maxlen=48): 167 | input_ids = [] 168 | seg_ids = [] 169 | for idx, text in tqdm(enumerate(texts), 'bert_text_to_seq'): 170 | ids, segs = tokenizer.encode(text, max_len=maxlen) 171 | input_ids.append(ids) 172 | seg_ids.append(segs) 173 | return np.array(input_ids), np.array(seg_ids) 174 | 175 | 176 | class BertTokenizer(Tokenizer): 177 | def _tokenize(self, text): 178 | R = [] 179 | for c in text: 180 | if c in self._token_dict: 181 | R.append(c) 182 | elif self._is_space(c): 183 | R.append('[unused1]') # space类用未经训练的[unused1]表示 184 | else: 185 | R.append('[UNK]') # 剩余的字符是[UNK] 186 | return R 187 | 188 | 189 | def pos_tag(review, maxlen=MAX_LEN): 190 | pos_results = HanLP.segment(review) 191 | 192 | tag_pos = 0 193 | postag = [TAG_O] * len(review) 194 | 195 | pos_tag_pos = [] 196 | for idx, term in enumerate(pos_results): 197 | word = term.word 198 | tag = str(term.flag) 199 | words_len = len(word) 200 | pos_tag_pos.append((tag_pos, tag_pos + words_len, tag)) 201 | tag_pos += words_len 202 | 203 | for (s, e, label) in pos_tag_pos: 204 | if e - s == 1: # 单个的 205 | postag[s] = "%s-S" % label 206 | else: 207 | postag[s] = "%s-B" % label 208 | postag[e - 1] = "%s-E" % label 209 | for p in range(s + 1, e - 1): 210 | postag[p] = "%s-I" % label 211 | return postag + [TAG_O] * (maxlen - len(postag)) if len(postag) < maxlen else postag[:maxlen] 212 | 213 | 214 | def decode_asp_c(seq_id, seq_O, text_review): 215 | max_len = seq_O.shape[1] 216 | seq_idx = np.arange(max_len) 217 | assert seq_O.shape[0] == len(text_review) 218 | opinions = [] 219 | for id, s_ao, text in zip(seq_id, seq_O, text_review): 220 | idx_ob = seq_idx[np.where(s_ao == 1, True, False)] 221 | idx_oe = seq_idx[np.where(s_ao == 3, True, False)] 222 | idx_oi = seq_idx[np.where(s_ao == 4, True, False)] 223 | 224 | o_terms = [] 225 | for i_b, i_e in zip(idx_ob, idx_oe): 226 | if i_b >= i_e + 1: 227 | continue 228 | o_terms.append(text[i_b: i_e + 1]) 229 | 230 | for i_i in idx_oi: 231 | o_terms.append(text[i_i: i_i + 1]) 232 | 233 | opinions.append((id, o_terms)) 234 | return opinions 235 | 236 | 237 | class Evaluation(Callback): 238 | def __init__(self, val_data, interval=1): 239 | self.val_data = val_data 240 | self.interval = interval 241 | self.best_f1 = 0. 
242 | 243 | def on_epoch_end(self, epoch, log={}): 244 | if epoch % self.interval == 0: 245 | a_out, cp_out = self.model.predict(self.val_data[:-1], batch_size=BATCH_SIZE) 246 | # o_pred = np.argmax(o_out, axis=2) 247 | a_pred = np.argmax(a_out, axis=2) 248 | texts = [df_review[df_review['id'] == i]["Reviews"].values[0] for i in self.val_data[-1]] 249 | cp_pred = np.argmax(cp_out, -1) 250 | pred_vp_val = decode_asp_c(self.val_data[-1], a_pred, texts) 251 | cp_pred_decode = [id2cp[i] for i in cp_pred] 252 | true_df = df_label[df_label['id'].isin(val_ids)] 253 | pred_df = pd.DataFrame(pred_vp_val, columns=['id', 'AspectTerms']) 254 | pred_df['CP'] = cp_pred_decode 255 | S, P, G = 1e-10, 1e-10, 1e-10 256 | p_save_list = [] 257 | t_save_list = [] 258 | 259 | for (_, trues), (_, preds) in zip(true_df.groupby('id'), pred_df.groupby('id')): 260 | assert trues.shape[0] == preds.shape[0] 261 | id = trues['id'].values[0] 262 | R = set() 263 | T = set() 264 | for (_, true_row), (_, pred_row) in zip(trues.iterrows(), preds.iterrows()): 265 | T.add( 266 | (true_row['OpinionTerms'], true_row['AspectTerms'], true_row['Categories'], 267 | ) 268 | ) 269 | CP = pred_row['CP'] 270 | aspect_terms = pred_row['AspectTerms'] 271 | if len(aspect_terms) == 0: 272 | aspect_terms = '_' 273 | else: 274 | aspect_terms = aspect_terms[0] 275 | R.add( 276 | (true_row['OpinionTerms'], aspect_terms, CP) 277 | ) 278 | S += len(R & T) 279 | P += len(R) 280 | G += len(T) 281 | for i in R: 282 | p_save_list.append({ 283 | 'id': str(id), 284 | 'OpinionTerms': i[0], 285 | 'AspectTerms': i[1], 286 | 'Categories': i[2], 287 | 288 | }) 289 | for i in T: 290 | t_save_list.append({ 291 | 'id': str(id), 292 | 'OpinionTerms': i[0], 293 | 'AspectTerms': i[1], 294 | 'Categories': i[2], 295 | 296 | }) 297 | with codecs.open(f'./data/ea/dev_pred_{epoch}.json', 'w', encoding='utf-8') as f: # 错误分析 298 | json.dump(p_save_list, f, indent=4, ensure_ascii=False) 299 | 300 | with codecs.open(f'./data/ea/dev_true_{epoch}.json', 'w', encoding='utf-8') as f: 301 | json.dump(t_save_list, f, indent=4, ensure_ascii=False) 302 | 303 | precision, recall = S / P, S / G 304 | f1 = 2 * precision * recall / (precision + recall) 305 | if f1 > self.best_f1: 306 | self.best_f1 = f1 307 | train_model.save_weights('finetune_weight/best_f1/best_f1_mix_c.h5') 308 | 309 | print( 310 | f'precision = {precision}', 311 | f'recall = {recall}', 312 | f'f1 = {f1}', 313 | "\n", 314 | ) 315 | 316 | 317 | def generate_postag2id(train_pos_tag): 318 | reviews_list = [] 319 | for sent in train_pos_tag: 320 | reviews_list.extend(sent) 321 | ct_list = Counter(reviews_list).most_common() 322 | postag2id = ({t: i for i, (t, c) in enumerate(ct_list)}) 323 | return postag2id 324 | 325 | 326 | def dilated_gated_conv1d(seq, mask, name, dilation_rate=1): 327 | """膨胀门卷积(残差式) 328 | """ 329 | dim = K.int_shape(seq)[-1] 330 | h = MaskedConv1D(filters=dim * 2, kernel_size=3, padding='same', dilation_rate=dilation_rate, name=name)(seq) 331 | 332 | def _gate(x): 333 | dropout_rate = 0.1 334 | s, h = x 335 | g, h = h[:, :, :dim], h[:, :, dim:] 336 | g = K.in_train_phase(K.dropout(g, dropout_rate, seed=2019), g) 337 | g = K.sigmoid(g) 338 | return g * s + (1 - g) * h 339 | 340 | seq = Lambda(_gate)([seq, h]) 341 | seq = Lambda(lambda x: x[0] * x[1])([seq, mask]) 342 | return seq 343 | 344 | 345 | df_review = pd.read_csv('./data_phase2/Train_makeup_reviews.csv') 346 | df_label = pd.read_csv('./data_phase2/Train_makeup_labels.csv') 347 | 348 | token_dict = {} 349 | with codecs.open(dict_path, 'r', 
'utf8') as reader: 350 | for line in reader: 351 | token = line.strip() 352 | token_dict[token] = len(token_dict) 353 | 354 | tokenizer = BertTokenizer(token_dict) 355 | train_data, val_data, (cp2id, id2cp), id_to_term, val_ids = encode_aspect_seq(maxlen=MAX_LEN) 356 | 357 | bert_model = load_trained_model_from_checkpoint(config_path, checkpoint_path, seq_len=None) 358 | 359 | for l in bert_model.layers: 360 | l.trainable = True 361 | 362 | x1_in = Input(shape=(MAX_LEN,), name='x1_in') 363 | x2_in = Input(shape=(MAX_LEN,), name='x2_in') 364 | opinion_mask_in = Input(shape=(MAX_LEN,), name='opinion_mask_in') 365 | lf_pos_in = Input(shape=(MAX_LEN,), name='lf_pos_in') 366 | rt_pos_in = Input(shape=(MAX_LEN,), name='rt_pos_in') 367 | seq_a_in = Input(shape=(MAX_LEN, len(id_to_term) + 1), name='seq_a_in') 368 | c_in = Input(shape=(len(cp2id),), name='c_in') 369 | 370 | opinion_mask = Lambda(lambda x: K.expand_dims(K.cast(K.greater(x, 0), 'float32'), axis=1))(opinion_mask_in) 371 | opinion_mask_emb = Lambda(lambda x: K.cast(K.greater(x, 0), 'float32'))(opinion_mask_in) 372 | pos_tag = Embedding(2, 10, name='embpos')(opinion_mask_emb) 373 | lf_pos_tag = Embedding(MAX_LEN, 10, name='lf_embpos')(lf_pos_in) 374 | rt_pos_tag = Embedding(MAX_LEN, 10, name='rt_embpos')(rt_pos_in) 375 | 376 | x = bert_model([x1_in, x2_in]) 377 | 378 | opinion_vec = Lambda(lambda x: K.batch_dot(x[0], x[1]) / K.sum(x[0], keepdims=True))([opinion_mask, x]) # [?,1,768] 379 | opinion_vec_ori = Lambda(lambda x: K.squeeze(x, axis=1))(opinion_vec) 380 | opinion_vec = RepeatVector(MAX_LEN)(opinion_vec_ori) 381 | mask = Lambda(lambda x: K.cast(K.greater(K.expand_dims(x, 2), 0), 'float32'))(x1_in) 382 | 383 | x = Concatenate()([x, opinion_vec, pos_tag, lf_pos_tag, rt_pos_tag]) 384 | 385 | x = dilated_gated_conv1d(x, mask, 'CNN_1', 1) 386 | x = dilated_gated_conv1d(x, mask, 'CNN_2', 2) 387 | x = dilated_gated_conv1d(x, mask, 'CNN_3', 5) 388 | 389 | crf = CRF(len(id_to_term) + 1) 390 | a_out = crf(x) 391 | loss_A = crf.loss_function(seq_a_in, a_out) # 直接加入 Lambda层后 计算图会出错 392 | loss_A = Lambda(lambda x: K.mean(x))(loss_A) 393 | 394 | x = GlobalAveragePooling1D()(x) 395 | x = Concatenate()([x, opinion_vec_ori]) 396 | 397 | c_out = Dense(len(cp2id), activation='softmax', name='cp_out_Dense')(x) 398 | 399 | a_model = Model([x1_in, x2_in, opinion_mask_in, lf_pos_in, rt_pos_in], a_out) 400 | cp_model = Model([x1_in, x2_in, opinion_mask_in, lf_pos_in, rt_pos_in], c_out) 401 | 402 | train_model = Model([x1_in, x2_in, seq_a_in, opinion_mask_in, lf_pos_in, rt_pos_in, c_in], [a_out, c_out]) 403 | 404 | loss_c = Lambda(lambda x: K.mean(categorical_crossentropy(x[0], x[1])), name='loss_p')([c_in, c_out]) 405 | 406 | train_model.add_loss(loss_A) 407 | train_model.add_loss(loss_c) 408 | 409 | total_steps, warmup_steps = calc_train_steps( 410 | num_example=train_data[0].shape[0], 411 | batch_size=BATCH_SIZE, 412 | epochs=100, 413 | warmup_proportion=0.05, 414 | ) 415 | 416 | optimizer = AdamWarmup(total_steps, warmup_steps, lr=1e-4, min_lr=1e-6) 417 | 418 | train_model.compile(optimizer=optimizer) 419 | 420 | train_model.metrics_tensors.append(loss_A) 421 | train_model.metrics_names.append('loss_A') 422 | train_model.metrics_tensors.append(loss_c) 423 | train_model.metrics_names.append('loss_c') 424 | train_model.summary() 425 | 426 | eval_callback = Evaluation(val_data=val_data) 427 | 428 | train_model.fit(train_data[:-1], epochs=100, shuffle=True, batch_size=BATCH_SIZE, callbacks=[eval_callback]) 429 | 
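
The domain-transfer step described in the README (train stage two on the makeup data first, keep the shared layers, swap the final `Categories` classifier, then fine-tune on the laptop data with a small learning rate) is not included as a script in this dump. Below is a minimal, self-contained sketch of that head-swap pattern under assumed placeholders: the toy encoder, the category counts and the `makeup_stage2.h5` file name are illustrative, not the repo's real configuration.

```python
from keras import Input, Model
from keras.layers import Dense
from keras.optimizers import Adam

N_MAKEUP_CATEGORIES = 20  # placeholder label-set sizes, not the real counts
N_LAPTOP_CATEGORIES = 12


def build_stage2_classifier(n_categories, head_name):
    """Toy stand-in for stage two: a shared encoder plus a domain-specific Categories head."""
    features_in = Input(shape=(768,), name='sentence_features')  # stands in for the BERT/DGCNN features
    shared = Dense(256, activation='relu', name='shared_dense')(features_in)
    categories_out = Dense(n_categories, activation='softmax', name=head_name)(shared)
    return Model(features_in, categories_out)


# 1) Train on the larger makeup dataset and save the weights.
makeup_model = build_stage2_classifier(N_MAKEUP_CATEGORIES, 'categories_makeup')
makeup_model.compile(optimizer=Adam(lr=1e-4), loss='categorical_crossentropy')
# makeup_model.fit(makeup_x, makeup_y, ...) would run here
makeup_model.save_weights('makeup_stage2.h5')

# 2) Build the laptop model with a fresh head and load the shared weights by name.
#    Only layers whose names match the checkpoint are restored (here 'shared_dense');
#    the differently named laptop head keeps its fresh initialization.
laptop_model = build_stage2_classifier(N_LAPTOP_CATEGORIES, 'categories_laptop')
laptop_model.load_weights('makeup_stage2.h5', by_name=True)

# 3) Fine-tune on the laptop data with a small learning rate.
laptop_model.compile(optimizer=Adam(lr=1e-5), loss='categorical_crossentropy')
# laptop_model.fit(laptop_x, laptop_y, ...)
```

Loading by name is what makes the classifier swap cheap: every shared layer picks up its makeup-trained weights, while the new `Categories` head starts from scratch for the laptop label set.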
-------------------------------------------------------------------------------- /op_p_model.py: -------------------------------------------------------------------------------- 1 | # @author : srtianxia 2 | # @time : 2019/9/22 20:45 3 | # @description: 4 | 5 | import codecs 6 | import os 7 | import pickle 8 | 9 | import keras.backend as K 10 | import numpy as np 11 | import pandas as pd 12 | from keras import Input, Model 13 | from keras.callbacks import Callback 14 | from keras.layers import Embedding, Concatenate, Dense, Lambda 15 | from keras.losses import categorical_crossentropy 16 | from keras.utils import to_categorical 17 | from keras_bert import Tokenizer, load_trained_model_from_checkpoint 18 | from keras_contrib.layers import CRF 19 | from pyhanlp import HanLP 20 | from sklearn.model_selection import train_test_split 21 | from tensorflow import set_random_seed 22 | from tqdm import tqdm 23 | from keras_bert import AdamWarmup, calc_train_steps 24 | 25 | tqdm.pandas() 26 | set_random_seed(2019) 27 | 28 | os.environ["CUDA_VISIBLE_DEVICES"] = '0' 29 | 30 | np.random.seed(2019) 31 | MAX_LEN = 48 32 | BATCH_SIZE = 64 33 | TEST_SIZE = 0.1 34 | LEARNING_RATE = 1e-5 35 | EPOCHS = 100 36 | POS_TAG_DIM = 256 37 | 38 | TAG_PAD = '' 39 | 40 | df_review_laptop = pd.read_csv('./data_phase2/Train_laptop_reviews.csv') 41 | df_label_laptop = pd.read_csv('./data_phase2/Train_laptop_labels.csv') 42 | 43 | df_review_makeup = pd.read_csv('./data_phase2/Train_makeup_reviews.csv') 44 | df_label_makeup = pd.read_csv('./data_phase2/Train_makeup_labels.csv') 45 | 46 | nums_laptop = df_review_laptop.shape[0] 47 | df_review_makeup['id'] = df_review_makeup['id'] + nums_laptop 48 | df_label_makeup['id'] = df_label_makeup['id'] + nums_laptop 49 | 50 | df_review = pd.concat([df_review_laptop, df_review_makeup], ignore_index=True) 51 | df_label = pd.concat([df_label_laptop, df_label_makeup], ignore_index=True) 52 | 53 | 54 | def decode_seq(seq_id, seq_O, seq_P, id_to_label, text_review): 55 | max_len = seq_O.shape[1] 56 | seq_idx = np.arange(max_len) 57 | assert seq_O.shape[0] == seq_P.shape[0] == len(text_review) 58 | viewpoints = [] 59 | for id, s_ao, s_cp, text in tqdm(zip(seq_id, seq_O, seq_P, text_review), 'decode_seq'): 60 | idx_ob = seq_idx[np.where(s_ao == 1, True, False)] 61 | idx_oe = seq_idx[np.where(s_ao == 3, True, False)] 62 | idx_oi = seq_idx[np.where(s_ao == 4, True, False)] 63 | 64 | o_terms = [] 65 | 66 | for i_b, i_e in zip(idx_ob, idx_oe): 67 | if i_b >= i_e + 1: 68 | continue 69 | label = max(s_cp[i_b: i_e + 1]) 70 | o_terms.append((text[i_b: i_e + 1], id_to_label.get(label, 'O'), i_b, i_e + 1)) 71 | 72 | for i_i in idx_oi: 73 | label = max(s_cp[i_i: i_i + 1]) 74 | o_terms.append((text[i_i: i_i + 1], id_to_label.get(label, 'O'), i_i, i_i + 1)) 75 | 76 | viewpoints.append((id, o_terms)) 77 | return viewpoints 78 | 79 | 80 | def encode_seq(df_label, maxlen=48): 81 | label_to_id = { 82 | '中性': 1, 83 | '负面': 2, 84 | '正面': 3 # 这样解码的时候可能更合理 85 | } 86 | 87 | term_to_id = { 88 | "O-B": 1, 89 | "O-I": 2, 90 | "O-E": 3, 91 | "O-S": 4, 92 | } 93 | 94 | def encode_term(pos, label): 95 | seq = np.zeros((maxlen,), dtype=np.int32) 96 | for (s, e) in pos: 97 | if e - s == 1: # 单个的 98 | seq[s] = term_to_id["%s-S" % label] 99 | else: 100 | seq[s] = term_to_id["%s-B" % label] 101 | seq[e - 1] = term_to_id["%s-E" % label] 102 | for p in range(s + 1, e - 1): 103 | seq[p] = term_to_id["%s-I" % label] 104 | return seq.reshape((1, -1)) 105 | 106 | def encode_label(pos_o, label): 107 | seq = np.zeros((maxlen,), 
dtype=np.int32) 108 | for (s, e), l in zip(pos_o, label): 109 | if s == " " or int(e) >= maxlen: 110 | continue 111 | s = int(s) 112 | e = int(e) 113 | if e - s == 1: 114 | seq[s] = label_to_id[l] 115 | else: 116 | seq[s] = label_to_id[l] 117 | seq[e - 1] = label_to_id[l] 118 | for p in range(s + 1, e - 1): 119 | seq[p] = label_to_id[l] 120 | return seq.reshape((1, -1)) 121 | 122 | seq_O = df_label.groupby("id").apply( # 所有的O 123 | lambda x: encode_term( 124 | [ 125 | (int(s), int(e)) 126 | for s, e in zip(x["O_start"], x["O_end"]) 127 | if s != " " and int(e) < maxlen 128 | ], 129 | "O", 130 | ) 131 | ) 132 | 133 | seq_CP = df_label.groupby("id").apply( 134 | lambda x: encode_label( 135 | [(s, e) for s, e in zip(x["O_start"], x["O_end"])], 136 | [p for p in x["Polarities"]], 137 | ) 138 | ) 139 | 140 | seq_id = np.array(df_label.groupby("id").apply(lambda x: list(x['id'])[0]).to_list()) 141 | seq_O = np.vstack(seq_O) 142 | 143 | seq_P = np.vstack(seq_CP) 144 | 145 | id_to_label = dict([(v, k) for k, v in label_to_id.items()]) 146 | id_to_term = dict([(v, k) for k, v in term_to_id.items()]) 147 | return seq_id, seq_O, seq_P, id_to_label, id_to_term 148 | 149 | 150 | def cal_opinion_metrics(pred_vp, true_vp): 151 | true_df = pd.DataFrame(true_vp, columns=['id', 'o', 'p', 's', 'e']) 152 | S, P, G = 1e-10, 1e-10, 1e-10 153 | pred_df = pd.DataFrame(pred_vp, columns=['id', 'o_pred']) 154 | for idx, trues in tqdm(true_df.groupby('id'), 'cal_opinion_metrics'): 155 | id = trues['id'].values[0] 156 | T = set() 157 | for _, true_row in trues.iterrows(): 158 | T.add((true_row['o'], true_row['p'], str(true_row['s']), str(true_row['e']))) 159 | pred_list = pred_df.loc[pred_df['id'] == id]['o_pred'].values.tolist()[0] 160 | R = set() 161 | for pred_row in pred_list: 162 | R.add((pred_row[0], pred_row[1], str(pred_row[2]), str(pred_row[3]))) 163 | S += len(R & T) 164 | P += len(R) 165 | G += len(T) 166 | 167 | precision, recall = S / P, S / G 168 | f1 = 2 * precision * recall / (precision + recall) 169 | 170 | print( 171 | f'precision = {precision}', 172 | f'recall = {recall}', 173 | f'f1 = {f1}', 174 | "\n", 175 | ) 176 | return precision, recall, f1 177 | 178 | 179 | def split_viewpoints(seq_id, seq_input, seq_mask, seq_AO, seq_P, seq_postag): 180 | idx = np.random.permutation(range(seq_id.shape[0])) 181 | tr_idx, te_idx = train_test_split(idx, test_size=TEST_SIZE, random_state=2019) 182 | return ( 183 | [ 184 | seq_id[tr_idx], 185 | seq_input[tr_idx], 186 | seq_mask[tr_idx], 187 | seq_postag[tr_idx], 188 | seq_AO[tr_idx], 189 | seq_P[tr_idx] 190 | ], 191 | [ 192 | seq_id[te_idx], 193 | seq_input[te_idx], 194 | seq_mask[te_idx], 195 | seq_postag[te_idx], 196 | seq_AO[te_idx], 197 | seq_P[te_idx] 198 | ], 199 | ) 200 | 201 | 202 | root_path = './chinese_L-12_H-768_A-12/' 203 | config_path = root_path + 'bert_config.json' 204 | checkpoint_path = root_path + 'bert_model.ckpt' 205 | dict_path = root_path + 'vocab.txt' 206 | 207 | token_dict = {} 208 | with codecs.open(dict_path, 'r', 'utf8') as reader: 209 | for line in reader: 210 | token = line.strip() 211 | token_dict[token] = len(token_dict) 212 | 213 | 214 | class BertTokenizer(Tokenizer): 215 | def _tokenize(self, text): 216 | R = [] 217 | for c in text: 218 | if c in self._token_dict: 219 | R.append(c) 220 | elif self._is_space(c): 221 | R.append('[unused1]') 222 | else: 223 | R.append('[UNK]') 224 | return R 225 | 226 | 227 | def bert_text_to_seq(texts, tokenizer, maxlen=48): 228 | input_ids = [] 229 | seg_ids = [] 230 | for idx, text in 
tqdm(enumerate(texts), 'bert_text_to_seq'): 231 | ids, segs = tokenizer.encode(text, max_len=maxlen) 232 | input_ids.append(ids) 233 | seg_ids.append(segs) 234 | return np.array(input_ids), np.array(seg_ids) 235 | 236 | 237 | def pos_tag(review, maxlen=MAX_LEN): 238 | pos_results = HanLP.segment(review) 239 | tag_pos = 0 240 | postag = [TAG_PAD] * len(review) 241 | pos_tag_pos = [] 242 | for idx, term in enumerate(pos_results): 243 | word = term.word 244 | tag = str(term.nature) 245 | words_len = len(word) 246 | pos_tag_pos.append((tag_pos, tag_pos + words_len, tag)) 247 | tag_pos += words_len 248 | 249 | for (s, e, label) in pos_tag_pos: 250 | if e - s == 1: # 单个的 251 | postag[s] = "%s-S" % label 252 | else: 253 | postag[s] = "%s-B" % label 254 | postag[e - 1] = "%s-E" % label 255 | for p in range(s + 1, e - 1): 256 | postag[p] = "%s-I" % label 257 | return postag + [TAG_PAD] * (maxlen - len(postag)) if len(postag) < maxlen else postag[:maxlen] 258 | 259 | 260 | def main(): 261 | seq_id, seq_O, seq_P, id_to_label, id_to_term = encode_seq(df_label=df_label, maxlen=MAX_LEN) 262 | 263 | class Evaluation(Callback): 264 | def __init__(self, val_data, interval=1): 265 | self.val_data = val_data 266 | self.interval = interval 267 | self.best_f1 = 0. 268 | 269 | self.true_vp_val = [ 270 | ( 271 | row["id"], 272 | row["OpinionTerms"], 273 | row["Polarities"], 274 | row['O_start'], 275 | row['O_end'] 276 | ) 277 | for rowid, row in df_label[df_label['id'].isin(self.val_data[0])].iterrows() 278 | ] 279 | 280 | def on_epoch_end(self, epoch, log={}): 281 | if epoch % self.interval == 0: 282 | o_out, p_out = pred_model.predict(self.val_data[1:4], batch_size=BATCH_SIZE) # CRF概率 283 | o_pred = np.argmax(o_out, axis=2) 284 | p_pred = np.argmax(p_out, axis=2) 285 | 286 | texts = [df_review[df_review['id'] == i]["Reviews"].values[0] for i in self.val_data[0]] 287 | 288 | pred_vp_val = decode_seq( 289 | self.val_data[0], o_pred, p_pred, id_to_label, texts) 290 | 291 | precision, recall, f1 = cal_opinion_metrics(pred_vp_val, self.true_vp_val) 292 | if f1 > self.best_f1: 293 | self.best_f1 = f1 294 | self.model.save_weights(f'./model_op/op_model_0924_viteb.weights') 295 | print(f'best = {f1}') 296 | 297 | tokenizer = BertTokenizer(token_dict) 298 | 299 | seq_input, seq_seg = bert_text_to_seq( 300 | list(df_review["Reviews"]), tokenizer, maxlen=MAX_LEN 301 | ) 302 | 303 | true_vp = [ 304 | ( 305 | row["id"], 306 | row["OpinionTerms"], 307 | row["Polarities"], 308 | row['O_start'], 309 | row['O_end'] 310 | ) 311 | for rowid, row in df_label.iterrows() 312 | ] 313 | 314 | pred_vp = decode_seq( 315 | seq_id, seq_O, seq_P, id_to_label, list(df_review["Reviews"]) 316 | ) 317 | 318 | cal_opinion_metrics(pred_vp, true_vp) 319 | 320 | seq_O = to_categorical(seq_O) 321 | 322 | seq_P = to_categorical(seq_P) 323 | 324 | df_review['pos_tag'] = df_review['Reviews'].progress_apply(pos_tag) 325 | 326 | with open('./data/postag2id_0922_laptop_make_up.pkl', 'rb') as f: 327 | postag2id = pickle.load(f) 328 | 329 | df_review['pos_tag'] = df_review['pos_tag'].progress_apply(lambda postag: [postag2id[x] for x in postag]) 330 | 331 | seq_postag = np.array(df_review['pos_tag'].values.tolist()) 332 | 333 | view_train, view_val = split_viewpoints(seq_id, seq_input, seq_seg, seq_O, seq_P, seq_postag) 334 | 335 | print(view_val[0]) 336 | print('------------------- 保存验证集的id ---------------------') 337 | print('保存final 验证集的val ids') 338 | 339 | # np.save('./data/final_makeup_laptop_val_ids', view_val[0]) 340 | print('------------------- 
保存完毕 ---------------------------') 341 | # exit() 342 | bert_model = load_trained_model_from_checkpoint(config_path, checkpoint_path, seq_len=None) 343 | for l in bert_model.layers: 344 | l.trainable = True 345 | 346 | x1_in = Input(shape=(MAX_LEN,), name='x1_in') 347 | x2_in = Input(shape=(MAX_LEN,), name='x2_in') 348 | o_in = Input(shape=(MAX_LEN, len(id_to_term) + 1,), name='o_in') 349 | p_in = Input(shape=(MAX_LEN, len(id_to_label) + 1,), name='p_in') 350 | 351 | pos_tag_in = Input(shape=(MAX_LEN,), name='pos_tag_in') 352 | pos_tag_emb = Embedding(len(postag2id), POS_TAG_DIM, trainable=True)(pos_tag_in) 353 | 354 | x = bert_model([x1_in, x2_in]) 355 | x = Concatenate()([x, pos_tag_emb]) 356 | 357 | p_out = Dense(len(id_to_label) + 1, activation='softmax')(x) # p_out 是极性的输出 358 | crf = CRF(len(id_to_term) + 1) 359 | o_out = crf(x) 360 | loss_seq_O = crf.loss_function(o_in, o_out) # 直接加入 Lambda层后 计算图会出错 361 | loss_seq_O = Lambda(lambda x: K.mean(x))(loss_seq_O) 362 | # loss_seq_O = Lambda(lambda x: K.mean(categorical_crossentropy(x[0], x[1])), name='loss_seq_O')([o_in, o_out]) 363 | 364 | loss_p = Lambda(lambda x: K.mean(categorical_crossentropy(x[0], x[1])), name='loss_c')([p_in, p_out]) 365 | 366 | train_model = Model([x1_in, x2_in, pos_tag_in, o_in, p_in], [o_out, p_out]) 367 | pred_model = Model([x1_in, x2_in, pos_tag_in], [o_out, p_out]) 368 | train_model._losses = [] 369 | train_model._per_input_losses = {} 370 | train_model.add_loss(loss_seq_O) 371 | train_model.add_loss(loss_p) 372 | 373 | print(view_train[0].shape[0]) 374 | 375 | total_steps, warmup_steps = calc_train_steps( 376 | num_example=view_train[0].shape[0], 377 | batch_size=BATCH_SIZE, 378 | epochs=EPOCHS, 379 | warmup_proportion=0.1, 380 | ) 381 | # optimizer = Adam(lr=1e-5) 382 | optimizer = AdamWarmup(total_steps, warmup_steps, lr=5e-5, min_lr=1e-6) 383 | 384 | train_model.compile(optimizer=optimizer) 385 | train_model.metrics_tensors.append(loss_seq_O) 386 | train_model.metrics_names.append('loss_seq_O') 387 | train_model.metrics_tensors.append(loss_p) 388 | train_model.metrics_names.append('loss_p') 389 | train_model.summary() 390 | 391 | eval_callback = Evaluation(val_data=view_val) 392 | 393 | train_model.fit(view_train[1:], epochs=EPOCHS, shuffle=True, batch_size=BATCH_SIZE, callbacks=[eval_callback]) 394 | 395 | 396 | if __name__ == "__main__": 397 | main() 398 | --------------------------------------------------------------------------------
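
To make the "Showing multiple losses in Keras" trick from the README concrete, here is the pattern both scripts use (keras==2.2.4) reduced to a minimal, self-contained example; the toy two-task model and random data are illustrative, only the mechanism matches the repo: target tensors enter the graph as extra `Input`s, each task loss is registered with `add_loss`, the model is compiled without a `loss` argument, and the same tensors are appended to `metrics_tensors`/`metrics_names` so Keras prints them during training.

```python
import numpy as np
import keras.backend as K
from keras import Input, Model
from keras.layers import Dense, Lambda
from keras.losses import categorical_crossentropy, mean_squared_error

# A toy two-task model: a 3-way classifier and a regressor sharing one hidden layer.
x_in = Input(shape=(16,), name='x_in')
y_cls_in = Input(shape=(3,), name='y_cls_in')  # targets enter the graph as inputs
y_reg_in = Input(shape=(1,), name='y_reg_in')

h = Dense(32, activation='relu')(x_in)
cls_out = Dense(3, activation='softmax', name='cls_out')(h)
reg_out = Dense(1, name='reg_out')(h)

# Each task loss is built as a scalar tensor inside the graph.
loss_cls = Lambda(lambda t: K.mean(categorical_crossentropy(t[0], t[1])), name='loss_cls')([y_cls_in, cls_out])
loss_reg = Lambda(lambda t: K.mean(mean_squared_error(t[0], t[1])), name='loss_reg')([y_reg_in, reg_out])

train_model = Model([x_in, y_cls_in, y_reg_in], [cls_out, reg_out])
train_model.add_loss(loss_cls)
train_model.add_loss(loss_reg)
train_model.compile(optimizer='adam')  # no `loss=`: the added losses form the training objective

# The hook: register the same tensors as named metrics so the progress bar
# shows `loss_cls` and `loss_reg` next to the total `loss` every batch.
train_model.metrics_tensors.append(loss_cls)
train_model.metrics_names.append('loss_cls')
train_model.metrics_tensors.append(loss_reg)
train_model.metrics_names.append('loss_reg')

# Random data, just enough to see the per-task losses appear during fit().
x = np.random.rand(64, 16)
y_cls = np.eye(3)[np.random.randint(0, 3, 64)]
y_reg = np.random.rand(64, 1)
train_model.fit([x, y_cls, y_reg], epochs=1, batch_size=16)
```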