├── Joint_Model ├── Entity_Relation_addlabelembedding_version7.py ├── Entity_Relationship_Version6.py ├── Entity_Relationship_version1.py └── Entity_relationship_version2.py ├── README.md ├── Relation_Model ├── Relationship_Version3.py └── Relationship_Version5.py ├── data └── data.rar └── data_process.py /Joint_Model/Entity_Relation_addlabelembedding_version7.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 使用BERT, 30000条 3 | F值最高78.9% 4 | 缺陷:无法进行实体重叠的关系抽取 5 | ''' 6 | #! -*- coding:utf-8 -*- 7 | import codecs 8 | import os 9 | import numpy as np 10 | import tensorflow as tf 11 | from data_process import id2predicate, list_find, predicate2id 12 | import json 13 | from transformers import BertTokenizer, TFBertModel 14 | from tqdm import tqdm 15 | 16 | 17 | import logging 18 | logging.disable(30) 19 | 20 | os.environ["CUDA_VISIBLE_DEVICES"] = "0" 21 | 22 | train_data = json.load(open('./data_trans/train_data_me.json', encoding='utf-8')) 23 | dev_data = json.load(open('./data_trans/dev_data_me.json', encoding='utf-8')) 24 | 25 | # bert配置 26 | checkpoint_path = "./bert_model/chinese_L-12_H-768_A-12/" 27 | tokenizer = BertTokenizer.from_pretrained(checkpoint_path, lowercase=True, add_special_tokens=True) 28 | bert_model = TFBertModel.from_pretrained(checkpoint_path) 29 | 30 | num_class = 50 31 | label_class = 3 32 | lr = 2e-5 33 | epsilon = 1e-06 34 | num_epochs = 20 35 | batch_size = 6 36 | dropout = 0.5 37 | ''' 38 | ner:预测subject/object 39 | perdicate:预测头部关系矩阵(128*128) 40 | ''' 41 | def get_input_bert(data): 42 | input_x, input_segment, input_mask, input_ner, input_re = [], [], [], [], [] 43 | for l in tqdm(range(30000)): 44 | items = {} 45 | line = data[l] 46 | text = line['text'][0:126] 47 | word_list = [key for key in text] 48 | word_list.insert(0, "[CLS]") 49 | word_list.append("[SEP]") 50 | spo = line['spo_list'] 51 | #token_ids = tokenizer.encode(text, max_length=128) 52 | token_ids = tokenizer.convert_tokens_to_ids(word_list) 53 | segment_ids = np.zeros(len(token_ids)) 54 | mask = np.ones(len(token_ids)) 55 | for sp in spo: 56 | sp = (tokenizer.convert_tokens_to_ids([key for key in sp[0]]), sp[1], tokenizer.convert_tokens_to_ids([key for key in sp[2]])) 57 | subjectid = list_find(token_ids, sp[0]) 58 | objectid = list_find(token_ids, sp[2]) 59 | if subjectid != -1 and objectid != -1: 60 | key = (subjectid, subjectid + len(sp[0])) 61 | if key not in items: 62 | items[key] = [] 63 | items[key].append((objectid, 64 | objectid + len(sp[2]), 65 | predicate2id[sp[1]] + 1)) 66 | if items: 67 | input_x.append(token_ids) 68 | input_segment.append(segment_ids) 69 | input_mask.append(mask) 70 | #seq_len.append(len(text2id)) 71 | ner_s = np.zeros(len(token_ids), dtype=np.int32) 72 | er_s = np.zeros((128, 128), dtype=np.int32) 73 | for j in items: 74 | ner_s[j[0]] = 1 75 | ner_s[j[0]+1:j[1]] = 2 76 | for k in items[j]: 77 | ner_s[k[0]] = 1 78 | ner_s[k[0]+1:k[1]] = 2 79 | er_s[j[0]][k[0]] = k[2] 80 | #print(ner_s) 81 | input_ner.append(ner_s) 82 | input_re.append(er_s) 83 | 84 | #seq_len = np.array(seq_len, dtype=np.int32) 85 | input_re = np.array(input_re, dtype=np.int32) 86 | input_x = tf.keras.preprocessing.sequence.pad_sequences(input_x, 128, padding='post', truncating='post') 87 | input_segment = tf.keras.preprocessing.sequence.pad_sequences(input_segment, 128, padding='post', truncating='post') 88 | input_mask = tf.keras.preprocessing.sequence.pad_sequences(input_mask, 128, padding='post', truncating='post') 89 | input_ner = 
tf.keras.preprocessing.sequence.pad_sequences(input_ner, 128, padding='post', truncating='post') 90 | return input_x, input_segment, input_mask, input_ner, input_re 91 | 92 | # input_x, input_segment, input_ner, input_re = get_input_bert(train_data) 93 | # print(train_data[0]) 94 | # print(input_x[0]) 95 | # print(input_segment[0]) 96 | # print(input_ner[0]) 97 | # print(input_re[0][21]) 98 | 99 | class data_loader(): 100 | def __init__(self): 101 | self.input_x, self.input_segment,self.input_mask, self.input_ner, self.input_re = get_input_bert(train_data) 102 | self.input_x = self.input_x.astype(np.int32) 103 | self.input_segment = self.input_segment.astype(np.int32) 104 | self.input_mask = self.input_mask.astype(np.int32) 105 | self.input_ner = self.input_ner.astype(np.int32) 106 | self.input_re = self.input_re.astype(np.int32) 107 | self.num_train = self.input_x.shape[0] 108 | self.db_train = tf.data.Dataset.from_tensor_slices((self.input_x, self.input_segment, self.input_mask, self.input_ner, self.input_re)) 109 | self.db_train = self.db_train.shuffle(self.num_train).batch(batch_size, drop_remainder=True) 110 | 111 | def get_batch(self, batch_s): 112 | indics = np.random.randint(0, self.num_train, batch_s) 113 | return self.input_x[indics], self.input_segment[indics], self.input_mask[indics], self.input_ner[indics], self.input_re[indics] 114 | 115 | ''' 116 | epoch20, 最大F=81.1 117 | ''' 118 | class Ner_model(tf.keras.Model): 119 | def __init__(self, bert_model): 120 | super(Ner_model, self).__init__() 121 | self.bert = bert_model 122 | #self.dense_fuc = tf.keras.layers.Dense(100, use_bias=False) #全连接层 123 | self.dense = tf.keras.layers.Dense(label_class) 124 | 125 | def call(self, inputs, mask, segment): 126 | output_encode, _ = self.bert([inputs, mask, segment]) 127 | #x = self.dense_fuc(output_encode) 128 | x = self.dense(output_encode) 129 | x = tf.nn.softmax(x) 130 | return x, output_encode 131 | 132 | class ER_model(tf.keras.Model): 133 | def __init__(self): 134 | super(ER_model, self).__init__() 135 | self.label_embedding = tf.keras.layers.Embedding(3, 256) 136 | self.dense_label = tf.keras.layers.Dense(128, use_bias=True) 137 | self.dense_left = tf.keras.layers.Dense(128, use_bias=False) 138 | self.dense_right = tf.keras.layers.Dense(128, use_bias=False) 139 | self.dropout = tf.keras.layers.Dropout(dropout) 140 | self.dense = tf.keras.layers.Dense(num_class) 141 | 142 | def call(self, encode_input, ner): 143 | label_embedding = self.label_embedding(ner) 144 | label_embedding = self.dense_label(label_embedding) 145 | encode_input = tf.concat([encode_input, label_embedding], axis=-1) 146 | left = self.dense_left(encode_input) 147 | right = self.dense_right(encode_input) 148 | outer_sum = broadcasting(left, right) 149 | output = tf.tanh(outer_sum) 150 | output = self.dropout(output) 151 | output = self.dense(output) 152 | output = tf.sigmoid(output) 153 | return output 154 | 155 | def broadcasting(left, right): 156 | left = tf.transpose(left, perm=[1, 0, 2]) 157 | left = tf.expand_dims(left, 3) 158 | right = tf.transpose(right, perm=[0, 2, 1]) 159 | right = tf.expand_dims(right, 0) 160 | B = left + right 161 | B = tf.transpose(B, perm=[1, 0, 3, 2]) 162 | return B 163 | 164 | def loss_function(ner, re_pred, input_nerd, input_red): 165 | ner_one_hot = tf.one_hot(input_nerd, depth=3, dtype=tf.float32) 166 | loss_ner = tf.keras.losses.categorical_crossentropy(y_true=ner_one_hot, y_pred=ner) 167 | loss_ner = tf.reduce_sum(loss_ner) 168 | 169 | input_re_onehot = tf.one_hot(input_red, 
depth=num_class, dtype=tf.float32) 170 | loss_re = tf.keras.losses.binary_crossentropy(y_true=input_re_onehot, y_pred=re_pred) 171 | loss_re = tf.reduce_sum(loss_re) 172 | 173 | loss = (loss_ner + loss_re) 174 | return loss, loss_ner, loss_re 175 | 176 | 177 | class Extra_result(object): 178 | def __init__(self, text, spo_list): 179 | self.text = text 180 | self.spo = spo_list 181 | def call(self): 182 | result = [] 183 | word_list = [key for key in self.text] 184 | word_list.insert(0, "[CLS]") 185 | word_list.append("[SEP]") 186 | segment_ids = np.zeros(len(word_list)) 187 | mask = np.ones(len(word_list)) 188 | token = tf.constant(tokenizer.convert_tokens_to_ids(word_list), dtype=tf.int32)[None, :] 189 | segment_ids = tf.constant(segment_ids, dtype=tf.int32)[None, :] 190 | mask = tf.constant(mask, dtype=tf.int32)[None, :] 191 | Model_ner = model_Ner 192 | ner, encode = Model_ner(token, mask, segment_ids) 193 | subjects, new_ner = self.extra_sujects(ner) 194 | new_ner = tf.constant(tf.convert_to_tensor(new_ner), dtype=tf.int32)[None, :] 195 | Model_er = model_Er 196 | re = Model_er(encode, new_ner) 197 | relationship = self.extra_er(subjects, re) 198 | print(subjects) 199 | print(relationship) 200 | result.extend(relationship) 201 | return result 202 | 203 | def extra_sujects(self, ner_label): 204 | ner = ner_label[0] 205 | ner = tf.round(ner) 206 | ner = [tf.argmax(ner[k]) for k in range(ner.shape[0])] 207 | new_ner = list(np.array(ner)) 208 | ner = list(np.array(ner))[1:-1] 209 | ner.append(0)#防止最后一位不为0 210 | text_list = [key for key in self.text] 211 | subject = [] 212 | for i, k in enumerate(text_list): 213 | if int(ner[i]) == 0 or int(ner[i]) == 2: 214 | continue 215 | elif int(ner[i]) == 1: 216 | ner_back = [int(j) for j in ner[i + 1:]] 217 | if 1 in ner_back and 0 in ner_back: 218 | indics_1 = ner_back.index(1) + i 219 | indics_0 = ner_back.index(0) + i 220 | subject.append((''.join(text_list[i: min(indics_0, indics_1) + 1]), i + 1)) 221 | elif 1 not in ner_back: 222 | indics = ner_back.index(0) + i 223 | subject.append((''.join(text_list[i:indics + 1]), i + 1)) 224 | return subject, new_ner 225 | 226 | def extra_er(self, subjects, re): 227 | position = [key[1] for key in subjects] 228 | subjects_ = [key[0] for key in subjects] 229 | re = re[0] 230 | relationship = [] 231 | re = tf.argmax(re, axis=-1) 232 | length = re.shape[0] 233 | for k in range(length): 234 | for i, key in enumerate(list(np.array(re[k]))): 235 | if int(key) > 0: 236 | if k in position and i in position: 237 | subject = subjects_[position.index(k)] 238 | object = subjects_[position.index(i)] 239 | predicate = id2predicate[key - 1] 240 | relationship.append((subject, predicate, object)) 241 | return relationship 242 | 243 | 244 | class Evaluate(object): 245 | def __init__(self): 246 | pass 247 | def reset(self,spo_list): 248 | xx = [] 249 | for key in spo_list: 250 | xx.append((key[0], key[1], key[2])) 251 | return xx 252 | def evaluate(self, data): 253 | A, B, C = 1e-10, 1e-10, 1e-10 254 | for d in data[0:10]: 255 | extra_items = Extra_result(d['text'], self.reset(d['spo_list'])) 256 | R = set(extra_items.call()) 257 | T = set(self.reset(d['spo_list'])) 258 | A += len(R & T)#抽取正确数量 259 | B += len(R) #抽取数量 260 | C += len(T)#原正确数量 261 | return (2 * A / (B + C)), (A / B), (A / C) 262 | 263 | #建立模型 264 | 265 | model_Ner = Ner_model(bert_model) 266 | model_Er = ER_model() 267 | optimizer = tf.keras.optimizers.Adam(learning_rate=lr) 268 | 269 | #保存模型 270 | # checkpoint_dir = 
'./save/Entity_Relationshaip_version2_checkpoints' 271 | # checkpoint_prefix = os.path.join(checkpoint_dir, 'ckpt') 272 | checkpoint = tf.train.Checkpoint(optimizer=optimizer, model_Ner=model_Ner, model_Er=model_Er) 273 | 274 | evaluate = Evaluate() 275 | data_loader = data_loader() 276 | best = 0.0 277 | 278 | #训练模型 279 | for epoch in range(num_epochs): 280 | print('Epoch:', epoch + 1) 281 | 282 | num_batchs = int(data_loader.num_train / batch_size) + 1 283 | for batch_index in range(num_batchs): 284 | input_x, input_segment, input_mask, input_ner, input_re = data_loader.get_batch(batch_size) 285 | with tf.GradientTape() as tape: 286 | y_ner, encode_output = model_Ner(input_x, input_mask, input_segment) #预测ner 287 | y_re = model_Er(encode_output, input_ner) #预测关系 288 | loss, loss1, loss2 = loss_function(y_ner, y_re, input_ner, input_re) 289 | if (batch_index+1) % 100 == 0: 290 | print("batch %d: loss %f: loss1 %f: loss2 %f" % (batch_index+1, loss.numpy(), loss1.numpy(), loss2.numpy())) 291 | 292 | variables = (model_Ner.variables + model_Er.variables) 293 | grads = tape.gradient(loss, variables) 294 | optimizer.apply_gradients(grads_and_vars=zip(grads, variables)) 295 | 296 | #f, p, r = evaluate.evaluate(train_data) 297 | F, P, R = evaluate.evaluate(dev_data) 298 | #print('训练集:', "f %f: p %f: r %f: " % (f, p, r)) 299 | print('测试集:', "F %f: P %f: R %f: " % (F, P, F)) 300 | if round(F, 2) > best and round(F, 2) > 0.50: 301 | best = F 302 | print('saving_model') 303 | #model.save('./save/Entity_Relationshaip_version2.h5') 304 | checkpoint.save('./save/Entity_Relationship/version7_checkpoints.ckpt') 305 | 306 | -------------------------------------------------------------------------------- /Joint_Model/Entity_Relationship_Version6.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 使用BERT, 24000条 3 | F值最高81.8% 4 | 缺陷:无法进行实体重叠的关系抽取 5 | ''' 6 | #! 
-*- coding:utf-8 -*- 7 | import codecs 8 | import os 9 | import numpy as np 10 | import tensorflow as tf 11 | from data_process import id2predicate, list_find, predicate2id 12 | import json 13 | from transformers import BertTokenizer, TFBertModel 14 | from tqdm import tqdm 15 | 16 | 17 | import logging 18 | logging.disable(30) 19 | 20 | 21 | os.environ["CUDA_VISIBLE_DEVICES"] = "0" 22 | 23 | train_data = json.load(open('./data_trans/train_data_me.json', encoding='utf-8')) 24 | dev_data = json.load(open('./data_trans/dev_data_me.json', encoding='utf-8')) 25 | 26 | # bert配置 27 | checkpoint_path = "./bert_model/chinese_L-12_H-768_A-12/" 28 | tokenizer = BertTokenizer.from_pretrained(checkpoint_path, lowercase=True, add_special_tokens=True) 29 | bert_model = TFBertModel.from_pretrained(checkpoint_path) 30 | 31 | num_class = 50 32 | label_class = 3 33 | lr = 2e-5 34 | epsilon = 1e-06 35 | num_epochs = 20 36 | batch_size = 6 37 | dropout = 0.5 38 | ''' 39 | ner:预测subject/object 40 | perdicate:预测头部关系矩阵(128*128) 41 | ''' 42 | def get_input_bert(data): 43 | input_x, input_segment, input_mask, input_ner, input_re = [], [], [], [], [] 44 | for l in tqdm(range(24000)): 45 | items = {} 46 | line = data[l] 47 | text = line['text'][0:126] 48 | word_list = [key for key in text] 49 | word_list.insert(0, "[CLS]") 50 | word_list.append("[SEP]") 51 | spo = line['spo_list'] 52 | #token_ids = tokenizer.encode(text, max_length=128) 53 | token_ids = tokenizer.convert_tokens_to_ids(word_list) 54 | segment_ids = np.zeros(len(token_ids)) 55 | mask = np.ones(len(token_ids)) 56 | for sp in spo: 57 | sp = (tokenizer.convert_tokens_to_ids([key for key in sp[0]]), sp[1], tokenizer.convert_tokens_to_ids([key for key in sp[2]])) 58 | subjectid = list_find(token_ids, sp[0]) 59 | objectid = list_find(token_ids, sp[2]) 60 | if subjectid != -1 and objectid != -1: 61 | key = (subjectid, subjectid + len(sp[0])) 62 | if key not in items: 63 | items[key] = [] 64 | items[key].append((objectid, 65 | objectid + len(sp[2]), 66 | predicate2id[sp[1]] + 1)) 67 | if items: 68 | input_x.append(token_ids) 69 | input_segment.append(segment_ids) 70 | input_mask.append(mask) 71 | #seq_len.append(len(text2id)) 72 | ner_s = np.zeros(len(token_ids), dtype=np.int32) 73 | er_s = np.zeros((128, 128), dtype=np.int32) 74 | for j in items: 75 | ner_s[j[0]] = 1 76 | ner_s[j[0]+1:j[1]] = 2 77 | for k in items[j]: 78 | ner_s[k[0]] = 1 79 | ner_s[k[0]+1:k[1]] = 2 80 | er_s[j[0]][k[0]] = k[2] 81 | #print(ner_s) 82 | input_ner.append(ner_s) 83 | input_re.append(er_s) 84 | 85 | #seq_len = np.array(seq_len, dtype=np.int32) 86 | input_re = np.array(input_re, dtype=np.int32) 87 | input_x = tf.keras.preprocessing.sequence.pad_sequences(input_x, 128, padding='post', truncating='post') 88 | input_segment = tf.keras.preprocessing.sequence.pad_sequences(input_segment, 128, padding='post', truncating='post') 89 | input_mask = tf.keras.preprocessing.sequence.pad_sequences(input_mask, 128, padding='post', truncating='post') 90 | input_ner = tf.keras.preprocessing.sequence.pad_sequences(input_ner, 128, padding='post', truncating='post') 91 | return input_x, input_segment, input_mask, input_ner, input_re 92 | 93 | # input_x, input_segment, input_ner, input_re = get_input_bert(train_data) 94 | # print(train_data[0]) 95 | # print(input_x[0]) 96 | # print(input_segment[0]) 97 | # print(input_ner[0]) 98 | # print(input_re[0][21]) 99 | 100 | class data_loader(): 101 | def __init__(self): 102 | self.input_x, self.input_segment,self.input_mask, self.input_ner, self.input_re = 
get_input_bert(train_data) 103 | self.input_x = self.input_x.astype(np.int32) 104 | self.input_segment = self.input_segment.astype(np.int32) 105 | self.input_mask = self.input_mask.astype(np.int32) 106 | self.input_ner = self.input_ner.astype(np.int32) 107 | self.input_re = self.input_re.astype(np.int32) 108 | self.num_train = self.input_x.shape[0] 109 | self.db_train = tf.data.Dataset.from_tensor_slices((self.input_x, self.input_segment, self.input_mask, self.input_ner, self.input_re)) 110 | self.db_train = self.db_train.shuffle(self.num_train).batch(batch_size, drop_remainder=True) 111 | 112 | def get_batch(self, batch_s): 113 | indics = np.random.randint(0, self.num_train, batch_s) 114 | return self.input_x[indics], self.input_segment[indics], self.input_mask[indics], self.input_ner[indics], self.input_re[indics] 115 | 116 | ''' 117 | epoch20, 最大F=81.1 118 | ''' 119 | class Ner_model(tf.keras.Model): 120 | def __init__(self, bert_model): 121 | super(Ner_model, self).__init__() 122 | self.bert = bert_model 123 | #self.dense_fuc = tf.keras.layers.Dense(100, use_bias=False) #全连接层 124 | self.dense = tf.keras.layers.Dense(label_class) 125 | 126 | def call(self, inputs, mask, segment): 127 | output_encode, _ = self.bert([inputs, mask, segment]) 128 | #x = self.dense_fuc(output_encode) 129 | x = self.dense(output_encode) 130 | x = tf.nn.softmax(x) 131 | return x, output_encode 132 | 133 | class ER_model(tf.keras.Model): 134 | def __init__(self): 135 | super(ER_model, self).__init__() 136 | self.dense_left = tf.keras.layers.Dense(128, use_bias=False) 137 | self.dense_right = tf.keras.layers.Dense(128, use_bias=False) 138 | self.dropout = tf.keras.layers.Dropout(dropout) 139 | self.dense = tf.keras.layers.Dense(num_class) 140 | 141 | def call(self, encode_input): 142 | left = self.dense_left(encode_input) 143 | right = self.dense_right(encode_input) 144 | outer_sum = broadcasting(left, right) 145 | output = tf.tanh(outer_sum) 146 | output = self.dropout(output) 147 | output = self.dense(output) 148 | output = tf.sigmoid(output) 149 | return output 150 | 151 | def broadcasting(left, right): 152 | left = tf.transpose(left, perm=[1, 0, 2]) 153 | left = tf.expand_dims(left, 3) 154 | right = tf.transpose(right, perm=[0, 2, 1]) 155 | right = tf.expand_dims(right, 0) 156 | B = left + right 157 | B = tf.transpose(B, perm=[1, 0, 3, 2]) 158 | return B 159 | 160 | def loss_function(ner, re_pred, input_nerd, input_red): 161 | ner_one_hot = tf.one_hot(input_nerd, depth=3, dtype=tf.float32) 162 | loss_ner = tf.keras.losses.categorical_crossentropy(y_true=ner_one_hot, y_pred=ner) 163 | loss_ner = tf.reduce_sum(loss_ner) 164 | 165 | input_re_onehot = tf.one_hot(input_red, depth=num_class, dtype=tf.float32) 166 | loss_re = tf.keras.losses.binary_crossentropy(y_true=input_re_onehot, y_pred=re_pred) 167 | loss_re = tf.reduce_sum(loss_re) 168 | 169 | loss = (loss_ner + loss_re) 170 | return loss, loss_ner, loss_re 171 | 172 | 173 | class Extra_result(object): 174 | def __init__(self, text, spo_list): 175 | self.text = text 176 | self.spo =spo_list 177 | def call(self): 178 | result = [] 179 | word_list = [key for key in self.text] 180 | word_list.insert(0, "[CLS]") 181 | word_list.append("[SEP]") 182 | segment_ids = np.zeros(len(word_list)) 183 | mask = np.ones(len(word_list)) 184 | token = tf.constant(tokenizer.convert_tokens_to_ids(word_list), dtype=tf.int32)[None, :] 185 | segment_ids = tf.constant(segment_ids, dtype=tf.int32)[None, :] 186 | mask = tf.constant(mask, dtype=tf.int32)[None, :] 187 | Model_ner = 
model_Ner 188 | ner, encode = Model_ner(token, mask, segment_ids) 189 | subjects= self.extra_sujects(ner) 190 | Model_er = model_Er 191 | re = Model_er(encode) 192 | relationship = self.extra_er(subjects, re) 193 | print(subjects) 194 | print(relationship) 195 | result.extend(relationship) 196 | return result 197 | 198 | def extra_sujects(self, ner_label): 199 | ner = ner_label[0] 200 | ner = tf.round(ner) 201 | ner = [tf.argmax(ner[k]) for k in range(ner.shape[0])] 202 | ner = list(np.array(ner))[1:-1] 203 | ner.append(0)#防止最后一位不为0 204 | text_list = [key for key in self.text] 205 | subject = [] 206 | for i, k in enumerate(text_list): 207 | if int(ner[i]) == 0 or int(ner[i]) == 2: 208 | continue 209 | elif int(ner[i]) == 1: 210 | ner_back = [int(j) for j in ner[i + 1:]] 211 | if 1 in ner_back and 0 in ner_back: 212 | indics_1 = ner_back.index(1) + i 213 | indics_0 = ner_back.index(0) + i 214 | subject.append((''.join(text_list[i: min(indics_0, indics_1) + 1]), i + 1)) 215 | elif 1 not in ner_back: 216 | indics = ner_back.index(0) + i 217 | subject.append((''.join(text_list[i:indics + 1]), i + 1)) 218 | return subject 219 | 220 | def extra_er(self, subjects, re): 221 | position = [key[1] for key in subjects] 222 | subjects_ = [key[0] for key in subjects] 223 | re = re[0] 224 | relationship = [] 225 | re = tf.argmax(re, axis=-1) 226 | length = re.shape[0] 227 | for k in range(length): 228 | for i, key in enumerate(list(np.array(re[k]))): 229 | if int(key) > 0: 230 | if k in position and i in position: 231 | subject = subjects_[position.index(k)] 232 | object = subjects_[position.index(i)] 233 | predicate = id2predicate[key - 1] 234 | relationship.append((subject, predicate, object)) 235 | return relationship 236 | 237 | 238 | class Evaluate(object): 239 | def __init__(self): 240 | pass 241 | def reset(self,spo_list): 242 | xx = [] 243 | for key in spo_list: 244 | xx.append((key[0], key[1], key[2])) 245 | return xx 246 | def evaluate(self, data): 247 | A, B, C = 1e-10, 1e-10, 1e-10 248 | for d in data[0:10]: 249 | extra_items = Extra_result(d['text'], self.reset(d['spo_list'])) 250 | R = set(extra_items.call()) 251 | T = set(self.reset(d['spo_list'])) 252 | A += len(R & T)#抽取正确数量 253 | B += len(R) #抽取数量 254 | C += len(T)#原正确数量 255 | return (2 * A / (B + C)), (A / B), (A / C) 256 | 257 | #建立模型 258 | 259 | model_Ner = Ner_model(bert_model) 260 | model_Er = ER_model() 261 | optimizer = tf.keras.optimizers.Adam(learning_rate=lr) 262 | 263 | #保存模型 264 | # checkpoint_dir = './save/Entity_Relationshaip_version2_checkpoints' 265 | # checkpoint_prefix = os.path.join(checkpoint_dir, 'ckpt') 266 | checkpoint = tf.train.Checkpoint(optimizer=optimizer, model_Ner=model_Ner, model_Er=model_Er) 267 | 268 | evaluate = Evaluate() 269 | data_loader = data_loader() 270 | best = 0.0 271 | 272 | for epoch in range(num_epochs): 273 | print('Epoch:', epoch + 1) 274 | 275 | num_batchs = int(data_loader.num_train / batch_size) + 1 276 | for batch_index in range(num_batchs): 277 | input_x, input_segment,input_mask, input_ner, input_re = data_loader.get_batch(batch_size) 278 | with tf.GradientTape() as tape: 279 | y_ner, encode_output = model_Ner(input_x, input_mask, input_segment) #预测ner 280 | y_re = model_Er(encode_output) #预测关系 281 | loss, loss1, loss2 = loss_function(y_ner, y_re, input_ner, input_re) 282 | if (batch_index+1) % 100 == 0: 283 | print("batch %d: loss %f: loss1 %f: loss2 %f" % (batch_index+1, loss.numpy(), loss1.numpy(), loss2.numpy())) 284 | 285 | variables = (model_Ner.variables + model_Er.variables) 
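# joint update: the summed NER + relation loss is back-propagated through both sub-models in one step,
# so gradients from the relation head also flow into the shared BERT encoder via encode_output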
286 | grads = tape.gradient(loss, variables) 287 | optimizer.apply_gradients(grads_and_vars=zip(grads, variables)) 288 | 289 | #f, p, r = evaluate.evaluate(train_data) 290 | F, P, R = evaluate.evaluate(dev_data) 291 | #print('训练集:', "f %f: p %f: r %f: " % (f, p, r)) 292 | print('测试集:', "F %f: P %f: R %f: " % (F, P, F)) 293 | if round(F, 2) > best and round(F, 2) > 0.50: 294 | best = F 295 | print('saving_model') 296 | #model.save('./save/Entity_Relationshaip_version2.h5') 297 | checkpoint.save('./save/Entity_Relationship/version6_checkpoints.ckpt') 298 | 299 | -------------------------------------------------------------------------------- /Joint_Model/Entity_Relationship_version1.py: -------------------------------------------------------------------------------- 1 | #! -*- coding:utf-8 -*- 2 | import numpy as np 3 | import tensorflow as tf 4 | from data_process import Token, get_input, id2predicate 5 | import json 6 | 7 | train_data = json.load(open('./data_trans/train_data_me.json', encoding='utf-8')) 8 | dev_data = json.load(open('./data_trans/dev_data_me.json', encoding='utf-8')) 9 | 10 | num_class = 49 11 | lr = 0.001 12 | num_epochs = 20 13 | batch_size = 32 14 | 15 | class data_loader(): 16 | def __init__(self): 17 | self.input_x, self.input_ner1, self.input_ner2, self.input_re1, self.input_re2, self.p_s, self.p_e = get_input(train_data) 18 | self.input_x = self.input_x.astype(np.int32) 19 | self.input_ner1 = self.input_ner1.astype(np.int32) 20 | self.input_ner2 = self.input_ner2.astype(np.int32) 21 | self.input_re1 = self.input_re1.astype(np.float32) 22 | self.input_re2 = self.input_re2.astype(np.float32) 23 | self.p_s = self.p_s.astype(np.int32) 24 | self.p_e = self.p_e.astype(np.int32) 25 | self.num_train = self.input_x.shape[0] 26 | self.db_train = tf.data.Dataset.from_tensor_slices((self.input_x, self.input_ner1, self.input_ner2, self.input_re1, self.input_re2, self.p_s, self.p_e)) 27 | self.db_train = self.db_train.shuffle(self.num_train).batch(batch_size, drop_remainder=True) 28 | 29 | def get_batch(self, batch_s): 30 | indics = np.random.randint(0, self.num_train, batch_s) 31 | return self.input_x[indics], self.input_ner1[indics], self.input_ner2[indics], self.input_re1[indics], self.input_re2[indics], self.p_s[indics], self.p_e[indics] 32 | 33 | 34 | class Ner_model(tf.keras.Model): 35 | def __init__(self): 36 | super(Ner_model, self).__init__() 37 | self.char_embedding = tf.keras.layers.Embedding(4996, 64, mask_zero=True) 38 | self.bi_gru = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(64, return_sequences=True)) 39 | self.dense_1 = tf.keras.layers.Dense(1) 40 | self.dense_2 = tf.keras.layers.Dense(1) 41 | 42 | def call(self, inputs): 43 | x = self.char_embedding(inputs) 44 | mask = self.char_embedding.compute_mask(inputs) 45 | x_gru = self.bi_gru(x, mask=mask) 46 | x_1 = tf.nn.sigmoid(self.dense_1(x_gru)) 47 | x_2 = tf.nn.sigmoid(self.dense_2(x_gru)) 48 | return x_1, x_2, x_gru 49 | 50 | 51 | class ER_model(tf.keras.Model): 52 | def __init__(self): 53 | super(ER_model, self).__init__() 54 | self.dense_1 = tf.keras.layers.Dense(num_class) 55 | self.dense_2 = tf.keras.layers.Dense(num_class) 56 | self.average = tf.keras.layers.Average() 57 | 58 | def call(self, x_lstm, position_s, position_e): 59 | add_encode = np.zeros_like(x_lstm) 60 | for i, k in enumerate(position_s): 61 | gru_v = x_lstm[i, :, :] 62 | v_s = gru_v[k, :] 63 | v_e = gru_v[position_e[i], :] 64 | v_subject = self.average([v_s, v_e]) 65 | add_encode[i, k, :] = v_subject 66 | add_encode[i, position_e[i], 
:] = v_subject 67 | 68 | x = x_lstm + add_encode 69 | output1 = tf.sigmoid(self.dense_1(x)) 70 | output2 = tf.sigmoid(self.dense_2(x)) 71 | return output1, output2 72 | 73 | 74 | def loss_function(y_1, y_2, y_re1, y_re2, input_ner1, input_ner2, input_re1, input_re2): 75 | input_ner1 = tf.expand_dims(input_ner1, 2) 76 | loss_ner1 = tf.keras.losses.binary_crossentropy(y_true=input_ner1, y_pred=y_1) 77 | loss_ner1 = tf.reduce_sum(loss_ner1) 78 | 79 | input_ner2 = tf.expand_dims(input_ner2, 2) 80 | loss_ner2 = tf.keras.losses.binary_crossentropy(y_true=input_ner2, y_pred=y_2) 81 | loss_ner2 = tf.reduce_sum(loss_ner2) 82 | 83 | loss_re1 = tf.reduce_sum(tf.keras.losses.binary_crossentropy(y_true=input_re1, y_pred=y_re1), axis=-1, keepdims=True) 84 | loss_re1 = tf.reduce_sum(loss_re1) 85 | 86 | loss_re2 = tf.reduce_sum(tf.keras.losses.binary_crossentropy(y_true=input_re2, y_pred=y_re2), axis=-1, keepdims=True) 87 | loss_re2 = tf.reduce_sum(loss_re2) 88 | loss = (loss_ner1 + loss_ner2) + (loss_re1 + loss_re2) 89 | 90 | return loss, (loss_ner1 + loss_ner2), (loss_re1 + loss_re2) 91 | 92 | 93 | class Extra_result(object): 94 | def __init__(self, text): 95 | self.text = text 96 | def call(self): 97 | result = [] 98 | token = np.zeros(len(self.text)) 99 | text2id = Token(self.text) 100 | token[0:len(text2id)] = text2id 101 | Model_ner = model_Ner 102 | Model_er = model_Er 103 | ner1, ner2, out_lm = Model_ner(np.array([token], dtype=np.int32)) 104 | subjects = self.extra_sujects(ner1, ner2) 105 | for i, key in enumerate(subjects): 106 | ids1 = key[1] 107 | ids2 = key[2] 108 | re1, re2 = Model_er(out_lm, np.array([ids1], dtype=np.int32), np.array([ids2], dtype=np.int32)) 109 | relationship = self.extra_er(key[0], re1, re2) 110 | result.extend(relationship) 111 | print(subjects) 112 | print(result) 113 | return result 114 | 115 | def extra_sujects(self, ner_1, ner_2): 116 | subject = [] 117 | ner_1, ner_2 = np.where(ner_1[0] > 0.5)[0], np.where(ner_2[0] > 0.5)[0] 118 | if len(ner_1) > 0: 119 | for i in ner_1: 120 | j = ner_2[ner_2 >= i] 121 | if len(j) > 0: 122 | j = j[0] 123 | _subject = self.text[i: j+1] 124 | subject.append((_subject, i, j)) 125 | return subject 126 | 127 | def extra_er(self, key, re1, re2): 128 | relationship = [] 129 | o_re1, o_re2 = np.where(re1[0] > 0.5), np.where(re2[0] > 0.5) 130 | for _re1, c1 in zip(*o_re1): 131 | for _re2, c2 in zip(*o_re2): 132 | if _re1 <= _re2 and c1 == c2: 133 | _object = self.text[_re1: _re2 + 1] 134 | _predicate = id2predicate[c1] 135 | relationship.append((key, _predicate, _object)) 136 | break 137 | return relationship 138 | 139 | 140 | class Evaluate(object): 141 | def __init__(self): 142 | pass 143 | def reset(self,spo_list): 144 | xx = [] 145 | for key in spo_list: 146 | xx.append((key[0], key[1], key[2])) 147 | return xx 148 | def evaluate(self, data): 149 | A, B, C = 1e-10, 1e-10, 1e-10 150 | for d in data[0:10]: 151 | extra_items = Extra_result(d['text']) 152 | R = set(extra_items.call()) 153 | T = set(self.reset(d['spo_list'])) 154 | A += len(R & T) 155 | B += len(R) 156 | C += len(T) 157 | return 2 * A / (B + C), A / B, A / C 158 | 159 | model_Ner = Ner_model() 160 | model_Er = ER_model() 161 | optimizer = tf.keras.optimizers.Adam(learning_rate=lr) 162 | checkpoint = tf.train.Checkpoint(optimizer=optimizer, model_Ner=model_Ner, model_Er=model_Er) 163 | evaluate = Evaluate() 164 | data_loader = data_loader() 165 | best = 0.0 166 | 167 | for epoch in range(num_epochs): 168 | print('Epoch:', epoch + 1) 169 | 170 | num_batchs = 
int(data_loader.num_train / batch_size) + 1 171 | for batch_index in range(num_batchs): 172 | input_x, input_ner1, input_ner2, input_re1, input_re2, position_s, position_e = data_loader.get_batch(batch_size) 173 | 174 | with tf.GradientTape() as tape: 175 | y_1, y_2, out_lstm = model_Ner(input_x) #预测ner 176 | y_re1, y_re2 = model_Er(out_lstm, position_s, position_e) 177 | loss, loss1, loss2 = loss_function(y_1, y_2, y_re1, y_re2, input_ner1, input_ner2, input_re1, input_re2) 178 | if (batch_index+1) % 500 == 0: 179 | print("batch %d: loss %f: loss1 %f: loss2 %f" % (batch_index+1, loss.numpy(), loss1.numpy(), loss2.numpy())) 180 | 181 | variables = (model_Ner.variables + model_Er.variables) 182 | grads = tape.gradient(loss, variables) 183 | optimizer.apply_gradients(grads_and_vars=zip(grads, variables)) 184 | F, P, R = evaluate.evaluate(dev_data) 185 | print('测试集:', "F %f: P %f: R %f: " % (F, P, F)) 186 | if round(F, 2) > best and round(F, 2) > 0.50: 187 | best = F 188 | print('saving_model') 189 | #model.save('./save/Entity_Relationshaip_version2.h5') 190 | checkpoint.save('./save/Entity_Relationship/version1_checkpoints.ckpt') -------------------------------------------------------------------------------- /Joint_Model/Entity_relationship_version2.py: -------------------------------------------------------------------------------- 1 | #! -*- coding:utf-8 -*- 2 | import numpy as np 3 | import tensorflow as tf 4 | from data_process import Token, get_input_so, id2predicate 5 | import json 6 | 7 | train_data = json.load(open('./data_trans/train_data_me.json', encoding='utf-8')) 8 | dev_data = json.load(open('./data_trans/dev_data_me.json', encoding='utf-8')) 9 | 10 | len_char = 4996 11 | char_dim = 128 12 | label_class = 3 13 | num_class = 50 14 | lr = 0.005 15 | num_epochs = 20 16 | batch_size = 16 17 | dropout = 0.5 18 | 19 | # def load_pickle(file_path): 20 | # with open(file_path, 'rb') as f: 21 | # fileobj = pickle.load(f) 22 | # return fileobj 23 | # embedding_matrix = load_pickle('./data_trans/embedding_matrix.pkl') 24 | 25 | class data_loader(): 26 | def __init__(self): 27 | self.input_x, self.input_ner, self.input_re = get_input_so(train_data) 28 | self.input_x = self.input_x.astype(np.int32) 29 | self.input_ner = self.input_ner.astype(np.int32) 30 | self.input_re = self.input_re.astype(np.int32) 31 | self.num_train = self.input_x.shape[0] 32 | self.db_train = tf.data.Dataset.from_tensor_slices((self.input_x, self.input_ner, self.input_re)) 33 | self.db_train = self.db_train.shuffle(self.num_train).batch(batch_size, drop_remainder=True) 34 | 35 | def get_batch(self, batch_s): 36 | indics = np.random.randint(0, self.num_train, batch_s) 37 | return self.input_x[indics], self.input_ner[indics], self.input_re[indics] 38 | 39 | 40 | class Ner_model(tf.keras.Model): 41 | def __init__(self): 42 | super(Ner_model, self).__init__() 43 | self.char_embedding = tf.keras.layers.Embedding(4996, 64, mask_zero=True) 44 | self.bi_gru = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(64, return_sequences=True)) 45 | self.dropout = tf.keras.layers.Dropout(dropout) 46 | self.dense = tf.keras.layers.Dense(label_class) 47 | 48 | def call(self, inputs): 49 | x = self.char_embedding(inputs) 50 | mask = self.char_embedding.compute_mask(inputs) 51 | x_gru = self.bi_gru(x, mask=mask) 52 | x = self.dense(x_gru) 53 | x = self.dropout(x) 54 | x_ = tf.nn.softmax(x) 55 | return x_, x_gru 56 | 57 | 58 | class ER_model(tf.keras.Model): 59 | def __init__(self): 60 | super(ER_model, self).__init__() 61 | 
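# the label-embedding / extra-dropout lines below are left commented out in this version:
# here the relation head scores token pairs from the BiGRU encoding alone
# (the BERT-based version7 adds a label embedding back on top of the encoder output)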
#self.label_embedding = tf.keras.layers.Embedding(3, 64) 62 | # self.dropout = tf.keras.layers.Dropout(dropout) 63 | self.dense_left = tf.keras.layers.Dense(100, use_bias=False) 64 | self.dense_right = tf.keras.layers.Dense(100, use_bias=False) 65 | self.dropout = tf.keras.layers.Dropout(dropout) 66 | self.dense = tf.keras.layers.Dense(num_class) 67 | 68 | # @tf.function(input_signature=[tf.TensorSpec(shape=(None, 128), dtype=tf.int32), 69 | # tf.TensorSpec(shape=(None, 128), dtype=tf.float32)]) 70 | def call(self, encode_input): 71 | #label_embedding = self.label_embedding(ner) 72 | #mask = self.label_embedding.compute_mask(ner) 73 | #encode_input_hidden_size = encode_input.shape[-1] 74 | # u_a = tf.Variable("u_a", [encode_input_hidden_size + 64, hidden_size_n1]) 75 | # w_a = tf.Variable("w_a", [encode_input_hidden_size + 64, hidden_size_n1]) 76 | # v = tf.Variable("v", [hidden_size_n1, num_class]) 77 | # b_s = tf.Variable("b_s", [hidden_size_n1]) 78 | # print(u_a.shape) 79 | #encode_input = tf.concat([encode_input, label_embedding], axis=-1) 80 | left = self.dense_left(encode_input) 81 | right = self.dense_right(encode_input) 82 | outer_sum = broadcasting(left, right) 83 | output = tf.tanh(outer_sum) 84 | output = self.dropout(output) 85 | output = self.dense(output) 86 | output = tf.sigmoid(output) 87 | return output 88 | 89 | def broadcasting(left, right): 90 | left = tf.transpose(left, perm=[1, 0, 2]) 91 | left = tf.expand_dims(left, 3) 92 | right = tf.transpose(right, perm=[0, 2, 1]) 93 | right = tf.expand_dims(right, 0) 94 | B = left + right 95 | B = tf.transpose(B, perm=[1, 0, 3, 2]) 96 | return B 97 | 98 | 99 | # def Mask(inputs): 100 | # mask = tf.math.logical_not(tf.math.equal(inputs, 0)) 101 | # mask = tf.cast(mask, tf.float32) 102 | # # mask = tf.keras.layers.Lambda(lambda x: tf.cast(tf.keras.backend.greater(tf.expand_dims(x,2), 0), tf.float32))(inputs) 103 | # return mask 104 | 105 | def loss_function(ner, re_pred, input_nerd, input_red): 106 | ner_one_hot = tf.one_hot(input_nerd, depth=3, dtype=tf.float32) 107 | loss_ner = tf.keras.losses.categorical_crossentropy(y_true=ner_one_hot, y_pred=ner) 108 | loss_ner = tf.reduce_sum(loss_ner) 109 | 110 | input_re_onehot = tf.one_hot(input_red, depth=num_class, dtype=tf.float32) 111 | loss_re = tf.keras.losses.binary_crossentropy(y_true=input_re_onehot, y_pred=re_pred) 112 | loss_re = tf.reduce_sum(loss_re) 113 | 114 | loss = (loss_ner + loss_re) 115 | return loss, loss_ner, loss_re 116 | 117 | 118 | class Extra_result(object): 119 | def __init__(self, text): 120 | self.text = text 121 | def call(self): 122 | result = [] 123 | token = np.zeros(len(self.text)) 124 | text2id = Token(self.text) 125 | token[0:len(text2id)] = text2id 126 | Model_ner = model_Ner 127 | Model_er = model_Er 128 | ner, out_lm = Model_ner(np.array([token], dtype=np.int32)) 129 | subjects, ner_list = self.extra_sujects(ner) 130 | re = Model_er(out_lm) 131 | relationship = self.extra_er(subjects, re) 132 | # print(subjects) 133 | # print(relationship) 134 | result.extend(relationship) 135 | return result 136 | 137 | def extra_sujects(self, ner_label): 138 | ner = ner_label[0] 139 | ner = tf.round(ner) 140 | ner = [tf.argmax(ner[k]) for k in range(ner.shape[0])] 141 | ner = list(np.array(ner)) 142 | ner.append(0)#防止最后一位不为0 143 | text_list = [key for key in self.text] 144 | subject = [] 145 | for i, k in enumerate(text_list): 146 | if int(ner[i]) == 0 or int(ner[i]) == 2: 147 | continue 148 | elif int(ner[i]) == 1: 149 | ner_back = [int(j) for j in ner[i + 1:]] 150 
| if 1 in ner_back and 0 in ner_back: 151 | indics_1 = ner_back.index(1) + i 152 | indics_0 = ner_back.index(0) + i 153 | subject.append((''.join(text_list[i: min(indics_0, indics_1) + 1]), i)) 154 | elif 1 not in ner_back: 155 | indics = ner_back.index(0) + i 156 | subject.append((''.join(text_list[i:indics + 1]), i)) 157 | return subject, ner[:-1] 158 | 159 | def extra_er(self, subjects, re): 160 | position = [key[1] for key in subjects] 161 | subjects_ = [key[0] for key in subjects] 162 | re = re[0] 163 | relationship = [] 164 | re = tf.argmax(re, axis=-1) 165 | length = re.shape[0] 166 | for k in range(length): 167 | for i, key in enumerate(list(np.array(re[k]))): 168 | if int(key) > 0: 169 | if k in position and i in position: 170 | subject = subjects_[position.index(k)] 171 | object = subjects_[position.index(i)] 172 | predicate = id2predicate[key - 1] 173 | relationship.append((subject, predicate, object)) 174 | return relationship 175 | 176 | 177 | class Evaluate(object): 178 | def __init__(self): 179 | pass 180 | def reset(self,spo_list): 181 | xx = [] 182 | for key in spo_list: 183 | xx.append((key[0], key[1], key[2])) 184 | return xx 185 | def evaluate(self, data): 186 | A, B, C = 1e-10, 1e-10, 1e-10 187 | for d in data[0:100]: 188 | extra_items = Extra_result(d['text']) 189 | R = set(extra_items.call()) 190 | T = set(self.reset(d['spo_list'])) 191 | A += len(R & T) 192 | B += len(R) 193 | C += len(T) 194 | return 2 * A / (B + C), A / B, A / C 195 | 196 | #建立模型 197 | model_Ner = Ner_model() 198 | model_Er = ER_model() 199 | optimizer = tf.keras.optimizers.Adam(learning_rate=lr) 200 | 201 | #保存模型 202 | # checkpoint_dir = './save/Entity_Relationshaip_version2_checkpoints' 203 | # checkpoint_prefix = os.path.join(checkpoint_dir, 'ckpt') 204 | checkpoint = tf.train.Checkpoint(optimizer=optimizer, model_Ner=model_Ner, model_Er=model_Er) 205 | 206 | evaluate = Evaluate() 207 | data_loader = data_loader() 208 | best = 0.0 209 | 210 | for epoch in range(num_epochs): 211 | print('Epoch:', epoch + 1) 212 | 213 | num_batchs = int(data_loader.num_train / batch_size) + 1 214 | for batch_index in range(num_batchs): 215 | input_x, input_ner, input_re = data_loader.get_batch(batch_size) 216 | with tf.GradientTape() as tape: 217 | y_ner, encode_gru = model_Ner(input_x) #预测ner 218 | y_re = model_Er(encode_gru) #预测关系 219 | # mask_ner = Mask(input_x) 220 | # mask_re = Mask(input_re) 221 | loss, loss1, loss2 = loss_function(y_ner, y_re, input_ner, input_re) 222 | if (batch_index+1) % 100 == 0: 223 | print("batch %d: loss %f: loss1 %f: loss2 %f" % (batch_index+1, loss.numpy(), loss1.numpy(), loss2.numpy())) 224 | 225 | variables = (model_Ner.variables + model_Er.variables) 226 | grads = tape.gradient(loss, variables) 227 | optimizer.apply_gradients(grads_and_vars=zip(grads, variables)) 228 | 229 | #f, p, r = evaluate.evaluate(train_data) 230 | F, P, R = evaluate.evaluate(dev_data) 231 | #print('训练集:', "f %f: p %f: r %f: " % (f, p, r)) 232 | print('测试集:', "F %f: P %f: R %f: " % (F, P, F)) 233 | if round(F, 2) > best and round(F, 2) > 0.50: 234 | best = F 235 | print('saving_model') 236 | #model.save('./save/Entity_Relationshaip_version2.h5') 237 | checkpoint.save('./save/Entity_Relationship/version2_checkpoints.ckpt') 238 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 
Entity relation extraction is a fundamental and necessary task in information extraction. Entity relations may be one-to-one, many-to-one, many-to-many, and so on. This repository walks through the task from a practical point of view: it is only meant as a simple introduction to the practical workflow of entity relation extraction, the models have not been carefully tuned, and they are therefore not suitable for production use. The main model structures are a bidirectional GRU network and BERT. ![Joint entity recognition and relation extraction as a multi-head selection problem](https://upload-images.jianshu.io/upload_images/12779090-f6b57e875738d096.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
2 | The model is adapted from the paper "Joint entity recognition and relation extraction as a multi-head selection problem", with the Label Embeddings part dropped. No CRF layer is used for entity recognition, mainly because no CRF library compatible with TF 2.0 was found at the time, so Softmax is used instead. (TF 2.2 can be paired with the CRF in TensorFlow Addons.)
3 | 
4 | ## Data source
5 | Information extraction task of the Baidu 2019 Language and Intelligence Challenge
6 | 
7 | ## Requirements
8 | Tensorflow-gpu=2.0.0
9 | transformers
10 | 
11 | # 1. Joint entity and relation extraction
12 | The goal is to extract triples [subject, predicate, object];
13 | Keeping things as simple as possible, subjects and objects are extracted with the entity tag set {O: 0, B: 1, I: 2};
14 | The relation part uses a multi-head selection strategy (the main method introduced here): for each sequence, build a [sequence_length, sequence_length] matrix in which the subject head position (row N) is paired with the object head position (column M), and the number stored in that cell is the id of relation C.
15 | 
16 | ## Model 1
17 | A joint entity and relation extraction model built with a bidirectional GRU.
18 | Character vectors are used; a dictionary is built first, keeping the top 4996 characters;
19 | The model inputs are built with a maximum length of 128: text tokens, entity labels, and relation labels;
20 | Entities are extracted with bidirectional GRU output + softmax; relations with bidirectional GRU output + sigmoid.
21 | 
22 | Trained on 64000 samples and validated on the first 100 examples of the test set, the best F1 is 51%;
23 | The main reason is that a bidirectional GRU alone does not have enough learning capacity.
24 | 
25 | ## Model 2
26 | A joint entity and relation extraction model built with BERT.
27 | The model inputs are built with a maximum length of 128; the BERT input consists of three parts [text tokens, mask tokens, segment tokens], plus entity labels and relation labels;
28 | Entities are extracted with BERT output + softmax; relations with BERT output + sigmoid.
29 | 
30 | Trained on 30000 samples and validated on the first 100 examples of the test set, the best F1 is 81.8%;
31 | The model is heavy, so not much data was used for training; the score should still be improvable.
32 | 
33 | A label_embeddings layer was added on top of Model 2, but it brought no improvement (F1 78.9%). A likely reason is that the entity labels used here are too coarse (in the original paper the labels carry entity-type information), so too much information is lost and combining the label embedding with the BERT output does not help.
34 | 
35 | # 2. Relation extraction
36 | ## Model 1
37 | No entity extraction is performed; entities are assumed to be given and relations are predicted directly.
38 | A relation extraction model built with a bidirectional GRU.
39 | Character vectors are used; a dictionary is built first, keeping the top 4996 characters;
40 | The model inputs are built with a maximum length of 128: text tokens, entity labels, and relation labels; relations are extracted with bidirectional GRU output + sigmoid.
41 | 
42 | Trained on 64000 samples and validated on the first 100 examples of the test set, the best relation-extraction F1 is 81.8%. Compared with joint Model 1, this also confirms that a bidirectional GRU alone is not strong enough for the joint task.
43 | 
44 | ## Model 2
45 | A relation extraction model built with BERT.
46 | The model inputs are built with a maximum length of 128; the BERT input consists of three parts [text tokens, mask tokens, segment tokens], plus entity labels and relation labels;
47 | Relations are extracted with BERT output + sigmoid.
48 | 
49 | Trained on 30000 samples and validated on the first 100 examples of the test set; the run was not finished, but within the first 10 epochs the F1 already exceeded 85%.
50 | 
51 | 
52 | 
53 | # Newly added model
54 | 
55 | ![A Novel Cascade Binary Tagging Framework for Relational Triple Extraction](C:\Users\admin\AppData\Roaming\Typora\typora-user-images\image-20200603154644370.png)
56 | 
57 | "A Novel Cascade Binary Tagging Framework for Relational Triple Extraction" [ACL 2020]: the subject is predicted first, and relations are then modeled as functions that map head entities (subjects) to tail entities (objects), so relation and object are predicted together. With this pointer-tagging scheme, relation extraction with multiple overlapping triples can be handled effectively.
58 | 
59 | Implementation: Entity_Relationship_version1.py
60 | 
61 | BiGRU is used in place of the BERT structure (purely to save time).
62 | 
63 | After 20 epochs the best F1 is 68 (which feels quite good for BiGRU only); it looks like training could continue and the loss can still go down.
64 | 
65 | 
66 | 
67 | 
68 | 
69 | 
70 | 
71 | 
-------------------------------------------------------------------------------- /Relation_Model/Relationship_Version3.py: -------------------------------------------------------------------------------- 1 | '''
2 | Bidirectional GRU
3 | Best F1: 81.8%
4 | Limitation: cannot extract relations with overlapping entities
5 | '''
6 | 
7 | #! 
-*- coding:utf-8 -*- 8 | import numpy as np 9 | import tensorflow as tf 10 | from data_process import Token, get_input_so, id2predicate 11 | import json 12 | 13 | train_data = json.load(open('./data_trans/train_data_me.json', encoding='utf-8')) 14 | dev_data = json.load(open('./data_trans/dev_data_me.json', encoding='utf-8')) 15 | 16 | len_char = 4996 17 | char_dim = 128 18 | num_class = 50 19 | lr = 0.005 20 | num_epochs = 20 21 | batch_size = 16 22 | dropout = 0.5 23 | 24 | class data_loader(): 25 | def __init__(self): 26 | self.input_x, self.input_ner, self.input_re = get_input_so(train_data) 27 | self.input_x = self.input_x.astype(np.int32) 28 | self.input_ner = self.input_ner.astype(np.int32) 29 | self.input_re = self.input_re.astype(np.int32) 30 | self.num_train = self.input_x.shape[0] 31 | self.db_train = tf.data.Dataset.from_tensor_slices((self.input_x, self.input_ner, self.input_re)) 32 | self.db_train = self.db_train.shuffle(self.num_train).batch(batch_size, drop_remainder=True) 33 | 34 | def get_batch(self, batch_s): 35 | indics = np.random.randint(0, self.num_train, batch_s) 36 | return self.input_x[indics], self.input_ner[indics], self.input_re[indics] 37 | ''' 38 | epoch20, 最大F=81.1 39 | ''' 40 | class ER_model(tf.keras.Model): 41 | def __init__(self): 42 | super(ER_model, self).__init__() 43 | self.char_embedding = tf.keras.layers.Embedding(4996, 64, mask_zero=True) # 44 | self.bi_lstm = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(64, return_sequences=True)) 45 | self.dense_left = tf.keras.layers.Dense(100, use_bias=False) 46 | self.dense_right = tf.keras.layers.Dense(100, use_bias=False) 47 | self.dropout = tf.keras.layers.Dropout(dropout) 48 | self.dense = tf.keras.layers.Dense(num_class) 49 | 50 | #对子类化的模型或层的‘call’方法中,掩码不能被自动传播,所以你需手动将掩码参数传递任何需要它的层。 51 | def call(self, inputs): 52 | embedding = self.char_embedding(inputs) 53 | mask = self.char_embedding.compute_mask(inputs) 54 | encode_input = self.bi_lstm(embedding, mask=mask) 55 | left = self.dense_left(encode_input) 56 | right = self.dense_right(encode_input) 57 | outer_sum = broadcasting(left, right) 58 | output = tf.tanh(outer_sum) 59 | output = self.dropout(output) 60 | output = self.dense(output) 61 | output = tf.sigmoid(output) 62 | return output 63 | 64 | def broadcasting(left, right): 65 | left = tf.transpose(left, perm=[1, 0, 2]) 66 | left = tf.expand_dims(left, 3) 67 | right = tf.transpose(right, perm=[0, 2, 1]) 68 | right = tf.expand_dims(right, 0) 69 | B = left + right 70 | B = tf.transpose(B, perm=[1, 0, 3, 2]) 71 | return B 72 | 73 | def loss_function(re_pred, input_red): 74 | 75 | input_re_onehot = tf.one_hot(input_red, depth=num_class, dtype=tf.float32) 76 | loss_re = tf.keras.losses.binary_crossentropy(y_true=input_re_onehot, y_pred=re_pred) 77 | loss_re = tf.reduce_sum(loss_re) 78 | loss = (loss_re) 79 | return loss 80 | 81 | 82 | class Extra_result(object): 83 | def __init__(self, text, spo_list): 84 | self.text = text 85 | self.spo =spo_list 86 | def call(self): 87 | result = [] 88 | token = np.zeros(len(self.text)) 89 | text2id = Token(self.text) 90 | token[0:len(text2id)] = text2id 91 | Model_er = model_Er 92 | subjects = self.extra_sujects() 93 | re = Model_er(np.array([token], dtype=np.int32)) 94 | relationship = self.extra_er(subjects, re) 95 | # print(subjects) 96 | print(relationship) 97 | result.extend(relationship) 98 | return result 99 | 100 | def extra_sujects(self): 101 | subject = [] 102 | subject_ = [] 103 | for key in self.spo: 104 | subject_.append(key[0]) 105 | 
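# relation-only model: gold entity strings are taken directly from spo_list instead of being predicted,
# so both the subject (key[0]) and the object (key[2]) strings are kept as candidate entities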
subject_.append(key[2]) 106 | 107 | subject_ = list(set(subject_)) 108 | for key in subject_: 109 | id = self.text.index(key) 110 | subject.append((key, id)) 111 | return subject 112 | 113 | def extra_er(self, subjects, re): 114 | position = [key[1] for key in subjects] 115 | subjects_ = [key[0] for key in subjects] 116 | re = re[0] 117 | 118 | relationship = [] 119 | re = tf.argmax(re, axis=-1) 120 | 121 | length = re.shape[0] 122 | for k in range(length): 123 | for i, key in enumerate(list(np.array(re[k]))): 124 | if int(key) > 0: 125 | if k in position and i in position: 126 | subject = subjects_[position.index(k)] 127 | object = subjects_[position.index(i)] 128 | predicate = id2predicate[key - 1] 129 | relationship.append((subject, predicate, object)) 130 | return relationship 131 | 132 | 133 | class Evaluate(object): 134 | def __init__(self): 135 | pass 136 | def reset(self,spo_list): 137 | xx = [] 138 | for key in spo_list: 139 | xx.append((key[0], key[1], key[2])) 140 | return xx 141 | def evaluate(self, data): 142 | A, B, C = 1e-10, 1e-10, 1e-10 143 | for d in data[0:100]: 144 | extra_items = Extra_result(d['text'], self.reset(d['spo_list'])) 145 | R = set(extra_items.call()) 146 | T = set(self.reset(d['spo_list'])) 147 | A += len(R & T)#抽取正确数量 148 | B += len(R) #抽取数量 149 | C += len(T)#原正确数量 150 | return (2 * A / (B + C)), (A / B), (A / C) 151 | 152 | #建立模型 153 | model_Er = ER_model() 154 | optimizer = tf.keras.optimizers.Adam(learning_rate=lr) 155 | 156 | #保存模型 157 | # checkpoint_dir = './save/Entity_Relationshaip_version2_checkpoints' 158 | # checkpoint_prefix = os.path.join(checkpoint_dir, 'ckpt') 159 | checkpoint = tf.train.Checkpoint(optimizer=optimizer, model_Er=model_Er) 160 | 161 | evaluate = Evaluate() 162 | data_loader = data_loader() 163 | best = 0.0 164 | 165 | for epoch in range(num_epochs): 166 | print('Epoch:', epoch + 1) 167 | num_batchs = int(data_loader.num_train / batch_size) + 1 168 | for batch_index in range(num_batchs): 169 | input_x, input_ner, input_re = data_loader.get_batch(batch_size) 170 | with tf.GradientTape() as tape: 171 | 172 | y_re = model_Er(input_x) #预测关系 173 | loss = loss_function(y_re, input_re) 174 | if (batch_index+1) % 100 == 0: 175 | print("batch %d: loss %f" % (batch_index+1, loss.numpy())) 176 | 177 | variables = (model_Er.variables) 178 | grads = tape.gradient(loss, variables) 179 | optimizer.apply_gradients(grads_and_vars=zip(grads, variables)) 180 | 181 | F, P, R = evaluate.evaluate(dev_data) 182 | print('测试集:', "F: %f, P: %f, R: %f" % (F, P, F)) 183 | if round(F, 2) > best and round(F, 2) > 0.50: 184 | best = F 185 | print('saving_model') 186 | #model.save('./save/Entity_Relationshaip_version2.h5') 187 | checkpoint.save('./save/Relationship/version3_checkpoints.ckpt') 188 | -------------------------------------------------------------------------------- /Relation_Model/Relationship_Version5.py: -------------------------------------------------------------------------------- 1 | ''' 2 | F值最高85% 3 | 缺陷:无法进行实体重叠的关系抽取 4 | ''' 5 | 6 | #! 
-*- coding:utf-8 -*- 7 | import codecs 8 | import os 9 | import numpy as np 10 | import tensorflow as tf 11 | from data_process import id2predicate, list_find, predicate2id 12 | import json 13 | from transformers import BertTokenizer, TFBertModel 14 | from tqdm import tqdm 15 | 16 | import logging 17 | logging.disable(30) 18 | 19 | 20 | os.environ["CUDA_VISIBLE_DEVICES"] = "0" 21 | 22 | train_data = json.load(open('./data_trans/train_data_me.json', encoding='utf-8')) 23 | dev_data = json.load(open('./data_trans/dev_data_me.json', encoding='utf-8')) 24 | 25 | # bert配置 26 | checkpoint_path = "./bert_model/chinese_L-12_H-768_A-12/" 27 | tokenizer = BertTokenizer.from_pretrained(checkpoint_path, lowercase=True, add_special_tokens=True) 28 | bert_model = TFBertModel.from_pretrained(checkpoint_path) 29 | 30 | num_class = 50 31 | label_class = 3 32 | lr = 2e-5 33 | epsilon = 1e-06 34 | num_epochs = 20 35 | batch_size = 6 36 | dropout = 0.5 37 | ''' 38 | ner:预测subject/object 39 | perdicate:预测头部关系矩阵(128*128) 40 | ''' 41 | def get_input_bert(data): 42 | input_x, input_segment, input_mask, input_ner, input_re = [], [], [], [], [] 43 | for l in tqdm(range(30000)): 44 | items = {} 45 | line = data[l] 46 | text = line['text'][0:126] 47 | word_list = [key for key in text] 48 | word_list.insert(0, "[CLS]") 49 | word_list.append("[SEP]") 50 | spo = line['spo_list'] 51 | #token_ids = tokenizer.encode(text, max_length=128) 52 | token_ids = tokenizer.convert_tokens_to_ids(word_list) 53 | segment_ids = np.zeros(len(token_ids)) 54 | mask = np.ones(len(token_ids)) 55 | for sp in spo: 56 | sp = (tokenizer.convert_tokens_to_ids([key for key in sp[0]]), sp[1], tokenizer.convert_tokens_to_ids([key for key in sp[2]])) 57 | subjectid = list_find(token_ids, sp[0]) 58 | objectid = list_find(token_ids, sp[2]) 59 | if subjectid != -1 and objectid != -1: 60 | key = (subjectid, subjectid + len(sp[0])) 61 | if key not in items: 62 | items[key] = [] 63 | items[key].append((objectid, 64 | objectid + len(sp[2]), 65 | predicate2id[sp[1]] + 1)) 66 | if items: 67 | input_x.append(token_ids) 68 | input_segment.append(segment_ids) 69 | input_mask.append(mask) 70 | #seq_len.append(len(text2id)) 71 | ner_s = np.zeros(len(token_ids), dtype=np.int32) 72 | er_s = np.zeros((128, 128), dtype=np.int32) 73 | for j in items: 74 | ner_s[j[0]] = 1 75 | ner_s[j[0]+1:j[1]] = 2 76 | for k in items[j]: 77 | ner_s[k[0]] = 1 78 | ner_s[k[0]+1:k[1]] = 2 79 | er_s[j[0]][k[0]] = k[2] 80 | #print(ner_s) 81 | input_ner.append(ner_s) 82 | input_re.append(er_s) 83 | 84 | #seq_len = np.array(seq_len, dtype=np.int32) 85 | input_re = np.array(input_re, dtype=np.int32) 86 | input_x = tf.keras.preprocessing.sequence.pad_sequences(input_x, 128, padding='post', truncating='post') 87 | input_segment = tf.keras.preprocessing.sequence.pad_sequences(input_segment, 128, padding='post', truncating='post') 88 | input_mask = tf.keras.preprocessing.sequence.pad_sequences(input_mask, 128, padding='post', truncating='post') 89 | input_ner = tf.keras.preprocessing.sequence.pad_sequences(input_ner, 128, padding='post', truncating='post') 90 | return input_x, input_segment, input_mask, input_ner, input_re 91 | 92 | # input_x, input_segment,input_mask, input_ner, input_re = get_input_bert(train_data) 93 | # print(train_data[10]) 94 | # print(input_x[10]) 95 | # print(input_segment[10]) 96 | # print(input_mask[10]) 97 | # print(input_ner[10]) 98 | # print(input_re[10][1]) 99 | 100 | class data_loader(): 101 | def __init__(self): 102 | self.input_x, self.input_segment, 
self.input_mask, self.input_ner, self.input_re = get_input_bert(train_data) 103 | self.input_x = self.input_x.astype(np.int32) 104 | self.input_segment = self.input_segment.astype(np.int32) 105 | self.input_mask = self.input_mask.astype(np.int32) 106 | self.input_ner = self.input_ner.astype(np.int32) 107 | self.input_re = self.input_re.astype(np.int32) 108 | self.num_train = self.input_x.shape[0] 109 | self.db_train = tf.data.Dataset.from_tensor_slices((self.input_x, self.input_segment, self.input_mask, self.input_ner, self.input_re)) 110 | self.db_train = self.db_train.shuffle(self.num_train).batch(batch_size, drop_remainder=True) 111 | 112 | def get_batch(self, batch_s): 113 | indics = np.random.randint(0, self.num_train, batch_s) 114 | return self.input_x[indics], self.input_segment[indics], self.input_mask[indics], self.input_ner[indics], self.input_re[indics] 115 | 116 | class ER_model(tf.keras.Model): 117 | def __init__(self, bert_model): 118 | 119 | super(ER_model, self).__init__() 120 | self.bert = bert_model 121 | #self.bert = TFBertModel.from_pretrained(self.checkpoint) 122 | self.dense_left = tf.keras.layers.Dense(126, use_bias=False) 123 | self.dense_right = tf.keras.layers.Dense(126, use_bias=False) 124 | self.dropout = tf.keras.layers.Dropout(dropout) 125 | self.dense = tf.keras.layers.Dense(num_class) 126 | 127 | def call(self, inputs, mask, segment): 128 | encode_input, _ = self.bert([inputs, mask, segment]) 129 | left = self.dense_left(encode_input) 130 | right = self.dense_right(encode_input) 131 | outer_sum = broadcasting(left, right) 132 | output = tf.tanh(outer_sum) 133 | output = self.dropout(output) 134 | output = self.dense(output) 135 | output = tf.sigmoid(output) 136 | return output 137 | 138 | def broadcasting(left, right): 139 | left = tf.transpose(left, perm=[1, 0, 2]) 140 | left = tf.expand_dims(left, 3) 141 | right = tf.transpose(right, perm=[0, 2, 1]) 142 | right = tf.expand_dims(right, 0) 143 | B = left + right 144 | B = tf.transpose(B, perm=[1, 0, 3, 2]) 145 | return B 146 | 147 | def loss_function(re_pred, input_red): 148 | 149 | input_re_onehot = tf.one_hot(input_red, depth=num_class, dtype=tf.float32) 150 | loss_re = tf.keras.losses.binary_crossentropy(y_true=input_re_onehot, y_pred=re_pred) 151 | loss_re = tf.reduce_sum(loss_re) 152 | loss = (loss_re) 153 | return loss 154 | 155 | 156 | class Extra_result(object): 157 | def __init__(self, text, spo_list): 158 | self.text = text 159 | self.spo = spo_list 160 | def call(self): 161 | result = [] 162 | word_list = [key for key in self.text] 163 | word_list.insert(0, "[CLS]") 164 | word_list.append("[SEP]") 165 | segment_ids = np.zeros(len(word_list)) 166 | mask = np.ones(len(word_list)) 167 | token = tf.constant(tokenizer.convert_tokens_to_ids(word_list), dtype=tf.int32)[None, :] 168 | segment_ids = tf.constant(segment_ids, dtype=tf.int32)[None, :] 169 | mask = tf.constant(mask, dtype=tf.int32)[None, :] 170 | subjects = self.extra_sujects() 171 | re = model_Er(token, mask, segment_ids) 172 | relationship = self.extra_er(subjects, re) 173 | print(subjects) 174 | print(relationship) 175 | result.extend(relationship) 176 | return result 177 | 178 | def extra_sujects(self): 179 | subject = [] 180 | subject_ = [] 181 | for key in self.spo: 182 | subject_.append(key[0]) 183 | subject_.append(key[2]) 184 | 185 | subject_ = list(set(subject_)) 186 | for key in subject_: 187 | id = self.text.index(key) 188 | subject.append((key, id + 1)) 189 | return subject 190 | 191 | def extra_er(self, subjects, re): 192 | 
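# decode the [seq_len, seq_len, num_class] score tensor (already batch-stripped to re[0]):
# argmax over the class axis gives a [seq_len, seq_len] matrix; a value v > 0 at row k, column i
# links the subject whose head token is at position k to the object whose head token is at
# position i with predicate id2predicate[v - 1]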
192 |         position = [key[1] for key in subjects]
193 |         subjects_ = [key[0] for key in subjects]
194 |         re = re[0]
195 |         relationship = []
196 |         re = tf.argmax(re, axis=-1)
197 |         print(re)
198 |         length = re.shape[0]
199 |         for k in range(length):
200 |             for i, key in enumerate(list(np.array(re[k]))):
201 |                 if int(key) > 0:
202 |                     if k in position and i in position:
203 |                         subject = subjects_[position.index(k)]
204 |                         object_ = subjects_[position.index(i)]
205 |                         predicate = id2predicate[key - 1]
206 |                         relationship.append((subject, predicate, object_))
207 |         return relationship
208 | 
209 | 
210 | class Evaluate(object):
211 |     def __init__(self):
212 |         pass
213 |     def reset(self, spo_list):
214 |         xx = []
215 |         for key in spo_list:
216 |             xx.append((key[0], key[1], key[2]))
217 |         return xx
218 |     def evaluate(self, data):
219 |         A, B, C = 1e-10, 1e-10, 1e-10
220 |         for d in data[0:100]:
221 |             extra_items = Extra_result(d['text'], self.reset(d['spo_list']))
222 |             R = set(extra_items.call())
223 |             T = set(self.reset(d['spo_list']))
224 |             A += len(R & T)  # correctly extracted triples
225 |             B += len(R)  # extracted triples
226 |             C += len(T)  # gold triples
227 |         return (2 * A / (B + C)), (A / B), (A / C)
228 | 
229 | # build the model
230 | model_Er = ER_model(bert_model)
231 | #optimizer = AdamW(lr=lr)
232 | optimizer = tf.keras.optimizers.Adam(learning_rate=lr, epsilon=epsilon)
233 | # model checkpointing
234 | # checkpoint_dir = './save/Entity_Relationshaip_version2_checkpoints'
235 | # checkpoint_prefix = os.path.join(checkpoint_dir, 'ckpt')
236 | checkpoint = tf.train.Checkpoint(optimizer=optimizer, model_Er=model_Er)
237 | 
238 | evaluate = Evaluate()
239 | data_loader = data_loader()
240 | best = 0.0
241 | 
242 | for epoch in range(num_epochs):
243 |     print('Epoch:', epoch + 1)
244 |     num_batchs = int(data_loader.num_train / batch_size) + 1
245 |     for batch_index in range(num_batchs):
246 |         input_x, input_segment, input_mask, input_ner, input_re = data_loader.get_batch(batch_size)
247 |         with tf.GradientTape() as tape:
248 |             y_re = model_Er(input_x, input_mask, input_segment)  # predict relations
249 |             loss = loss_function(y_re, input_re)
250 |             if (batch_index+1) % 100 == 0:
251 |                 print("batch %d: loss %f" % (batch_index+1, loss.numpy()))
252 | 
253 |         variables = model_Er.variables
254 |         grads = tape.gradient(loss, variables)
255 |         optimizer.apply_gradients(grads_and_vars=zip(grads, variables))
256 | 
257 |     F, P, R = evaluate.evaluate(dev_data)
258 |     print('Dev set:', "F: %f, P: %f, R: %f" % (F, P, R))
259 |     if round(F, 2) > best and round(F, 2) > 0.50:
260 |         best = F
261 |         print('saving_model')
262 |         #model.save('./save/Entity_Relationshaip_version2.h5')
263 |         checkpoint.save('./save/Relationship/version5_checkpoints.ckpt')
264 | 
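The pairwise scoring in ER_model hinges on broadcasting(): for every token pair (i, j) it builds the vector left[b, i, :] + right[b, j, :], which tanh, dropout and the final Dense layer then turn into num_class scores per pair, i.e. the 128 x 128 x num_class relation tensor that the loss and the decoder consume. A minimal, self-contained sketch with toy shapes (TF2 eager mode; the helper body is copied from the file above only so the check runs standalone):

import numpy as np
import tensorflow as tf

def broadcasting(left, right):
    # identical to the helper defined in Relationship_Version5.py
    left = tf.transpose(left, perm=[1, 0, 2])
    left = tf.expand_dims(left, 3)
    right = tf.transpose(right, perm=[0, 2, 1])
    right = tf.expand_dims(right, 0)
    B = left + right
    B = tf.transpose(B, perm=[1, 0, 3, 2])
    return B

left = tf.random.normal([2, 7, 4])     # [batch, seq_len, dim]
right = tf.random.normal([2, 7, 4])

out = broadcasting(left, right)        # [batch, seq_len, seq_len, dim]
ref = left[:, :, None, :] + right[:, None, :, :]   # out[b, i, j, :] == left[b, i, :] + right[b, j, :]
print(out.shape)                                   # (2, 7, 7, 4)
print(np.allclose(out.numpy(), ref.numpy()))       # True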
--------------------------------------------------------------------------------
/data/data.rar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/X-jun-0130/Entity-recognition-and-Relation-extraction/f553993d7432c9b9f2dd6c7cdb51969f93dac646/data/data.rar
--------------------------------------------------------------------------------
/data_process.py:
--------------------------------------------------------------------------------
1 | from random import choice
2 | import json
3 | import tensorflow as tf
4 | from tqdm import tqdm
5 | import numpy as np
6 | 
7 | max_len = 128
8 | 
9 | word2id = open('./data_trans/word2id.txt', 'r', encoding='utf-8')
10 | id2predicate, predicate2id = json.load(open('./data_trans/all_50_schemas_me.json', encoding='utf-8'))
11 | id2predicate = {int(i): j for i, j in id2predicate.items()}
12 | num_classes = len(id2predicate)
13 | word_list = [key.strip('\n') for key in word2id]
14 | 
15 | def Token(text):
16 |     text2id = []
17 |     for word in text:
18 |         if word in word_list:
19 |             text2id.append(word_list.index(word))
20 |         else:
21 |             word = '[UNK]'
22 |             text2id.append(word_list.index(word))
23 |     return text2id
24 | 
25 | 
26 | def list_find(list1, list2):
27 |     """Look for the sub-list list2 inside list1: if it occurs, return the
28 |     index of the first match; otherwise return -1.
29 |     """
30 |     n_list2 = len(list2)
31 |     for i in range(len(list1)):
32 |         if list1[i: i+n_list2] == list2:
33 |             return i
34 |     return -1
35 | '''
36 | ner: predict the subject span
37 | predicate: predict the object and relation matrices (128*num_classes)
38 | '''
39 | def get_input(data):
40 |     input_x, input_ner1, input_ner2, input_re1, input_re2, position_s, position_e = [], [], [], [], [], [], []
41 |     for l in tqdm(range(64000)):
42 |         items = {}
43 |         line = data[l]
44 |         text = line['text'][:128]
45 |         spo = line['spo_list']
46 |         text2id = Token(text)
47 |         for sp in spo:
48 |             sp = (Token(sp[0]), sp[1], Token(sp[2]))
49 |             subjectid = list_find(text2id, sp[0])
50 |             objectid = list_find(text2id, sp[2])
51 |             if subjectid != -1 and objectid != -1:
52 |                 key = (subjectid, subjectid + len(sp[0]))
53 |                 if key not in items:
54 |                     items[key] = []
55 |                 items[key].append((objectid,
56 |                                    objectid + len(sp[2]),
57 |                                    predicate2id[sp[1]]))
58 |         if items:
59 |             input_x.append(text2id)
60 |             #seq_len.append(len(text2id))
61 |             ner_s1 = np.zeros(128, dtype=np.int32)
62 |             ner_s2 = np.zeros(128, dtype=np.int32)
63 |             for j in items:
64 |                 ner_s1[j[0]] = 1  # subject head
65 |                 ner_s2[j[1]-1] = 1  # subject tail (inclusive)
66 |             #print(ner_s)
67 |             input_ner1.append(ner_s1)
68 |             input_ner2.append(ner_s2)
69 |             k1, k2 = np.array(list(items.keys())).T
70 |             k1 = choice(k1)
71 |             k2 = choice(k2[k2 >= k1])
72 |             er_s1 = np.zeros((128, num_classes), dtype=np.float32)
73 |             er_s2 = np.zeros((128, num_classes), dtype=np.float32)
74 |             position_s.append(k1)
75 |             position_e.append(k2 - 1)
76 |             for j in items.get((k1, k2), []):
77 |                 er_s1[j[0]][j[2]] = 1  # object head per relation
78 |                 er_s2[j[1] - 1][j[2]] = 1  # object tail per relation
79 |             input_re1.append(er_s1)
80 |             input_re2.append(er_s2)
81 | 
82 |     #seq_len = np.array(seq_len, dtype=np.int32)
83 |     input_re1 = np.array(input_re1, dtype=np.int32)
84 |     input_re2 = np.array(input_re2, dtype=np.int32)
85 |     input_x = tf.keras.preprocessing.sequence.pad_sequences(input_x, max_len, padding='post', truncating='post')
86 |     input_ner1 = np.array(input_ner1, dtype=np.int32)
87 |     input_ner2 = np.array(input_ner2, dtype=np.int32)
88 |     position_s = np.array(position_s, dtype=np.int32)
89 |     position_e = np.array(position_e, dtype=np.int32)
90 |     return input_x, input_ner1, input_ner2, input_re1, input_re2, position_s, position_e
91 | 
92 | # dev_data = json.load(open('./data_trans/dev_data_me.json', encoding='utf-8'))
93 | # input_x, input_ner1, input_ner2, input_re1, input_re2, position_s, position_e = get_input(dev_data)
94 | # print(input_ner1[1])
95 | # print(input_ner1[2])
96 | # print(input_ner2[1])
97 | # print(input_ner2[2])
98 | # input_ner1 = tf.one_hot(input_ner1[10], depth=2, dtype=tf.float32)
99 | # print(input_ner1)
100 | # input_ner1 = tf.argmax(input_ner1, axis=-1)
101 | # input_ner1 = np.array(input_ner1, dtype=np.float32)
102 | # print(np.where(input_ner1>0.5))
103 | '''
104 | ner: tag subject/object spans
105 | predicate: predict the 128*128 head-to-head relation matrix
106 | '''
107 | def get_input_so(data):
108 |     input_x, input_ner, input_re = [], [], []
109 |     for l in tqdm(range(32)):
110 |         items = {}
111 |         line = data[l]
112 |         text = line['text'][:128]
113 |         spo = line['spo_list']
114 |         text2id = Token(text)
115 |         for sp in spo:
116 |             sp = 
(Token(sp[0]), sp[1], Token(sp[2])) 117 | subjectid = list_find(text2id, sp[0]) 118 | objectid = list_find(text2id, sp[2]) 119 | if subjectid != -1 and objectid != -1: 120 | key = (subjectid, subjectid + len(sp[0])) 121 | if key not in items: 122 | items[key] = [] 123 | items[key].append((objectid, 124 | objectid + len(sp[2]), 125 | predicate2id[sp[1]] + 1)) 126 | if items: 127 | input_x.append(text2id) 128 | #seq_len.append(len(text2id)) 129 | ner_s = np.zeros(len(text2id), dtype=np.int32) 130 | er_s = np.zeros((128, 128), dtype=np.int32) 131 | #mask_ = np.ones(len(text2id), dtype=np.int32) 132 | for j in items: 133 | ner_s[j[0]] = 1 134 | ner_s[j[0]+1:j[1]] = 2 135 | for k in items[j]: 136 | ner_s[k[0]] = 1 137 | ner_s[k[0]+1:k[1]] = 2 138 | er_s[j[0]][k[0]] = k[2] 139 | #print(ner_s) 140 | input_ner.append(ner_s) 141 | input_re.append(er_s) 142 | #mask.append(mask_) 143 | 144 | 145 | #seq_len = np.array(seq_len, dtype=np.int32) 146 | input_re = np.array(input_re, dtype=np.int32) 147 | input_x = tf.keras.preprocessing.sequence.pad_sequences(input_x, max_len, padding='post', truncating='post') 148 | input_ner = tf.keras.preprocessing.sequence.pad_sequences(input_ner, max_len, padding='post', truncating='post') 149 | #mask = tf.keras.preprocessing.sequence.pad_sequences(mask, max_len, padding='post', truncating='post') 150 | return input_x, input_ner, input_re 151 | 152 | # train_data = json.load(open('train_data_me.json', encoding='utf-8')) 153 | # input_x, input_ner, input_re = get_input_so(train_data) 154 | # print(train_data[0]) 155 | # print(input_x[0]) 156 | # print(input_ner[0]) 157 | # print(input_re[0][21]) 158 | --------------------------------------------------------------------------------
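Taken together, get_input_so above and Extra_result in Relationship_Version5.py use one encoding: the ner vector marks entity heads with 1 and interiors with 2, and the 128*128 matrix stores predicate2id + 1 at [subject_head][object_head], with 0 meaning "no relation". A minimal sketch of that round trip, using toy sizes and a hypothetical two-relation schema (id2predicate_toy is made up for the example):

import numpy as np

id2predicate_toy = {0: '出生地', 1: '作者'}               # hypothetical 2-relation schema

seq_len = 8
ner = np.zeros(seq_len, dtype=np.int32)                   # 1 = entity head, 2 = entity interior
re_matrix = np.zeros((seq_len, seq_len), dtype=np.int32)  # 0 = no relation, k + 1 = predicate k

# encode one toy triple: subject span [1, 3), object span [5, 7), predicate id 1
ner[1], ner[2] = 1, 2
ner[5], ner[6] = 1, 2
re_matrix[1][5] = 1 + 1                                   # predicate2id[...] + 1, as in get_input_so

def decode(ner, re_matrix):
    # every non-zero cell links a subject head to an object head
    heads = [i for i, tag in enumerate(ner) if tag == 1]
    triples = []
    for i in heads:
        for j in heads:
            if re_matrix[i][j] > 0:
                triples.append((i, id2predicate_toy[re_matrix[i][j] - 1], j))
    return triples

print(decode(ner, re_matrix))                             # [(1, '作者', 5)]

The real decoder (Extra_result.extra_er) performs the same walk over tf.argmax of the model output, except that it restricts candidate heads to the known gold entities and maps head positions back to their surface strings.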