├── .idea
│   ├── Joint-Entity-and-Relation-Extraction.iml
│   ├── modules.xml
│   └── workspace.xml
├── README.md
├── build_data.py
├── comparative_model.py
├── config
│   └── CoNLL04
│       └── bio_config
├── data
│   └── CoNLL04
│       ├── BIO2id.json
│       ├── README.md
│       ├── char2id.json
│       ├── dev.txt
│       ├── dev_me.json
│       ├── relation2id.json
│       ├── test.txt
│       ├── test_me.json
│       ├── train.txt
│       ├── train_me.json
│       ├── vecs.lc.over100freq.zip
│       └── word2id.json
├── eval.py
├── layers.py
├── models.py
├── train.py
└── utils.py
/README.md:
--------------------------------------------------------------------------------
1 | # Joint-Entity-and-Relation-Extraction
2 | Keras implementation of the papers *Joint entity recognition and relation extraction as a multi-head selection problem* and *Adversarial training for multi-context joint entity and relation extraction*.
3 |
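4 | ## Quick start
5 | 
6 | A minimal sketch of the intended workflow (paths and property names come from `config/CoNLL04/bio_config`):
7 | 
8 | 1. Run `python build_data.py` to convert the CoNLL04 splits in `data/CoNLL04/` into the `*_me.json` files and to generate the id mappings (`word2id.json`, `char2id.json`, `BIO2id.json`, ...).
9 | 2. Run `python train.py` to train the NER part (a BiLSTM with attention); the weights with the best dev F1 are saved to `save_model/ner_model.weights`, and dev/test scores are printed every epoch.
10 | 
11 | The relation (multi-head selection) part is still a TODO in `train.py`.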
--------------------------------------------------------------------------------
/build_data.py:
--------------------------------------------------------------------------------
1 | import json
2 | import numpy as np
3 | from utils import read_properties,collect_BIO2id,collect_char2id,collect_data_set,collect_word2id,readFile,collect_n_char2id
4 |
5 | config_file= read_properties('config/CoNLL04/bio_config')
6 | filename_train = config_file.getProperty("filename_train")
7 | filename_test = config_file.getProperty("filename_test")
8 | filename_dev = config_file.getProperty("filename_dev")
9 |
10 | filename_train_me = config_file.getProperty("filename_train_me")
11 | filename_test_me = config_file.getProperty("filename_test_me")
12 | filename_dev_me = config_file.getProperty("filename_dev_me")
13 |
14 | filename_char2id = config_file.getProperty("filename_char2id")
15 | filename_n_char2id = config_file.getProperty("filename_n_char2id")
16 |
17 | filename_word2id = config_file.getProperty("filename_word2id")
18 | filename_BIO2id = config_file.getProperty("filename_BIO2id")
19 | filename_relation2id = config_file.getProperty("filename_relation2id")
20 |
21 | train_data_me = collect_data_set(readFile(filename_train),filename_train_me)
22 | dev_data_me = collect_data_set(readFile(filename_dev),filename_dev_me)
23 | test_data_me = collect_data_set(readFile(filename_test),filename_test_me)
24 |
25 | collect_char2id(train_data_me+dev_data_me+test_data_me,filename_char2id)
26 | collect_n_char2id(train_data_me+dev_data_me+test_data_me,filename_n_char2id,3)
27 | collect_word2id(train_data_me+dev_data_me+test_data_me,filename_word2id)
28 | collect_BIO2id(train_data_me+dev_data_me+test_data_me,filename_BIO2id)
29 | # collect_relations2id(train_data_me+dev_data_me+test_data_me,filename_relation2id)
30 |
31 |
32 |
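33 | # Sanity-check sketch (added for illustration): each *2id.json written above stores a
34 | # [id2x, x2id] pair, like the committed data/CoNLL04/BIO2id.json; the exact ids depend
35 | # on the regenerated files, so only the structure is inspected here.
36 | id2BIO, BIO2id = json.load(open(filename_BIO2id, encoding='utf-8'))
37 | print(len(BIO2id), sorted(BIO2id))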
--------------------------------------------------------------------------------
/comparative_model.py:
--------------------------------------------------------------------------------
1 | from layers import Position_Embedding,Attention_Layer,Self_Attention_Layer,Gate_Add_Lyaer,seq_and_vec,MaskedConv1D,MaskedLSTM,MaskFlatten,MaskPermute,MaskRepeatVector
2 | from keras.models import Model
3 | from keras.layers import *
4 | import keras
5 | from keras_contrib.layers import CRF
6 | from keras_multi_head import MultiHead,MultiHeadAttention
7 | from keras_self_attention import SeqSelfAttention as self_attention
8 | from keras_pos_embd import TrigPosEmbedding
9 | from keras_position_wise_feed_forward import FeedForward
10 |
11 | class lstm_attention_model_ner_part():
12 | def __init__(self,embedding_martrix,hidden_size,
13 | nb_head,word_embed_size,char_embed_size,word_vocab_size,char_vocab_size,multi_layers,num_classes
14 | ,maxlen_sentence,maxlen_word,word_char_embed_mode='add',learning_rate = 5e-5,embedding_dropout_prob=0.1,nn_dropout_prob=0.1,optmizer='adam',
15 | is_use_char_embedding=False):
16 | """
17 |         Test the effect of self-attention on NER.
18 | """
19 | self.embedding_martrix = embedding_martrix
20 | self.num_classes = num_classes
21 | self.hidden_size = hidden_size
22 | self.nb_head = nb_head
23 | self.word_embed_size = word_embed_size
24 | self.char_embed_size = char_embed_size
25 | # self.pos_embed_size = pos_embed_size #use the add position_embedding
26 | self.word_vocab_size = word_vocab_size
27 | self.char_vocab_size = char_vocab_size
28 | # self.maxlen = maxlen
29 | self.multi_layers = multi_layers
30 | self.maxlen_sentence = maxlen_sentence
31 | self.maxlen_word = maxlen_word
32 | self.word_char_embed_mode= word_char_embed_mode
33 | self.learning_rate = learning_rate
34 | self.embedding_dropout_prob = embedding_dropout_prob
35 | self.nn_dropout_prob = nn_dropout_prob
36 | self.is_use_char_embedding = is_use_char_embedding
37 | print(multi_layers)
38 |
39 | #char_embedding_shape [batch,sentence,word,dim]
40 | def reshape_layer_1(self, char_embedding,char_embedding_shape):
41 | def reshape(char_embedding):
42 | return K.reshape(char_embedding, shape=(-1, char_embedding_shape[-2], self.char_embed_size)) #[batch*sentence,word,dim]
43 | return Lambda(reshape)(char_embedding)
44 |
45 | def reshape_layer_2(self, char_embedding,char_embedding_shape):
46 | def reshape(char_embedding):
47 | return K.reshape(char_embedding, shape=(-1, char_embedding_shape[1], self.char_embed_size)) #[batch,sentence,dim]
48 | return Lambda(reshape)(char_embedding)
49 |
50 | def model(self):
51 | word_input = Input(shape=(self.maxlen_sentence,)) #[batch,sentence]
52 | char_input = Input(shape=(self.maxlen_sentence,self.maxlen_word,)) #[batch,word,char]
53 | ner_label = Input(shape=(self.maxlen_sentence,))
54 | # relation_label = Input(shape=(self.maxlen_sentence,))
55 |
56 | mask = Lambda(lambda x: K.cast(K.greater(K.expand_dims(x, 2), 0), 'float32'))(word_input)
57 |
58 | word_embedding = Embedding(self.word_vocab_size, self.word_embed_size,mask_zero=True,weights=[self.embedding_martrix],name='word_embedding',trainable=True)(word_input) #[batch,word,embed]
59 | char_embedding = Embedding(self.char_vocab_size,self.char_embed_size,mask_zero=True,name='char_embedding',trainable=True)(char_input) #[batch,word,char,embedd]
60 |
61 | if self.embedding_dropout_prob:
62 | word_embedding = Dropout(self.embedding_dropout_prob)(word_embedding)
63 | char_embedding = Dropout(self.embedding_dropout_prob)(char_embedding)
64 |
65 | if self.is_use_char_embedding:
66 | # char_embedding maxpooling part
67 | char_embedding_shape = K.int_shape(char_embedding) # [batch,sentence,word,dim]
68 | # char_embedding_reshaped = K.reshape(char_embedding, shape=(-1, char_embedding_shape[-2],self.char_embed_size)) # [batch*sentence,word,dim of char embedding]
69 | char_embedding_reshaped = self.reshape_layer_1(char_embedding,char_embedding_shape)
70 | char_lstm = Bidirectional(MaskedLSTM(units=self.char_embed_size // 2, return_sequences=True, name='char_lstm_layer'))(
71 | char_embedding_reshaped)
72 | attention = TimeDistributed(Dense(1, activation='tanh'))(char_lstm)
73 | attention = MaskFlatten()(attention)
74 | attention = Activation('softmax')(attention)
75 | attention = MaskRepeatVector(self.char_embed_size)(attention)
76 | attention = MaskPermute([2, 1])(attention)
77 | sent_representation = multiply([char_lstm, attention])
78 | attention = Lambda(lambda xin: K.sum(xin, axis=1))(sent_representation)
79 |
80 | # char_maxpool = GlobalMaxPooling1D(char_lstm) # [batch*sentence,hidden_size]
81 | # char_att = Attention_Layer()(char_lstm) # [batch*sentence,hidden_size]
82 | # char_embedding = K.reshape(char_maxpool, shape=[-1, char_embedding_shape[1],
83 | # self.hidden_size]) # [batch,sentence,hidden_size]
84 | # char_embedding = K.reshape(attention, shape=[-1, char_embedding_shape[-1], self.char_embed_size]) # [batch,sentence,hidden_size]
85 | char_embedding = self.reshape_layer_2(attention,char_embedding_shape)
86 | if self.word_char_embed_mode == 'concate':
87 | embedding = Concatenate(axis=-1)([word_embedding,char_embedding])
88 | else :
89 | embedding = Gate_Add_Lyaer()([word_embedding,char_embedding])
90 | # pass
91 | else:
92 | embedding = word_embedding
93 | #multi-layers self-attention for ner pred
94 | if self.embedding_dropout_prob:
95 | embedding = Dropout(self.embedding_dropout_prob)(embedding)
96 |
97 |
98 | # part1 , multi-self-attentionblock, (CNN/LSTM/FNN+self-attention)
99 | lstm = Bidirectional(MaskedLSTM(units=self.hidden_size // 2, return_sequences=True))(embedding)
100 | attention = TimeDistributed(Dense(1, activation='tanh'))(lstm)
101 | attention = MaskFlatten()(attention)
102 | attention = Activation('softmax')(attention)
103 | attention = MaskRepeatVector(self.hidden_size)(attention)
104 | attention = MaskPermute([2, 1])(attention)
105 | sent_representation = multiply([lstm, attention])
106 | attention = Lambda(lambda xin: K.sum(xin, axis=1))(sent_representation)
107 | # lstm_attention = Lambda(seq_and_vec, output_shape=(None, self.hidden_size * 2))(
108 | #     [lstm, attention]) # [consider plain addition here, or gated addition]
109 | attention = MaskRepeatVector(self.maxlen_sentence)(attention) #[batch,sentence,hidden_size]
110 | lstm = Gate_Add_Lyaer()([lstm,attention])
111 | if self.nn_dropout_prob:
112 | lstm = Dropout(self.nn_dropout_prob)(lstm)
113 |
114 | lstm_attention = MaskedConv1D(filters=self.hidden_size,kernel_size=3,activation='relu',padding='same')(lstm)
115 | bio_pred = Dense(self.num_classes, activation='softmax')(lstm_attention)
116 | pred_model =Model([word_input, char_input], bio_pred)
117 | #part2 multi-head selection for relation classification
118 | train_model = Model([word_input, char_input, ner_label], bio_pred)
119 |
120 | loss = K.sparse_categorical_crossentropy(ner_label, bio_pred)
121 | loss = K.sum(loss * mask[:, :, 0]) / K.sum(mask)
122 |
123 | train_model.summary()
124 | train_model.add_loss(loss)
125 | train_model.compile(keras.optimizers.adam(lr=self.learning_rate))
126 |
127 | return train_model,pred_model
128 |
129 |
130 | class lstm_model_ner_part():
131 | def __init__(self,embedding_martrix,hidden_size,
132 | nb_head,word_embed_size,char_embed_size,word_vocab_size,char_vocab_size,multi_layers,num_classes
133 | ,maxlen_sentence,maxlen_word,word_char_embed_mode='add',learning_rate = 5e-5,embedding_dropout_prob=0.1,nn_dropout_prob=0.1,optmizer='adam',
134 | is_use_char_embedding=False):
135 | """
136 |         A plain BiLSTM baseline for the NER part, for comparison with the self-attention variant.
137 | """
138 | self.embedding_martrix = embedding_martrix
139 | self.num_classes = num_classes
140 | self.hidden_size = hidden_size
141 | self.nb_head = nb_head
142 | self.word_embed_size = word_embed_size
143 | self.char_embed_size = char_embed_size
144 | # self.pos_embed_size = pos_embed_size #use the add position_embedding
145 | self.word_vocab_size = word_vocab_size
146 | self.char_vocab_size = char_vocab_size
147 | # self.maxlen = maxlen
148 | self.multi_layers = multi_layers
149 | self.maxlen_sentence = maxlen_sentence
150 | self.maxlen_word = maxlen_word
151 | self.word_char_embed_mode= word_char_embed_mode
152 | self.learning_rate = learning_rate
153 | self.embedding_dropout_prob = embedding_dropout_prob
154 | self.nn_dropout_prob = nn_dropout_prob
155 | self.is_use_char_embedding = is_use_char_embedding
156 | print(multi_layers)
157 |
158 | #char_embedding_shape [batch,sentence,word,dim]
159 | def reshape_layer_1(self, char_embedding,char_embedding_shape):
160 | def reshape(char_embedding):
161 | return K.reshape(char_embedding, shape=(-1, char_embedding_shape[-2], self.char_embed_size)) #[batch*sentence,word,dim]
162 | return Lambda(reshape)(char_embedding)
163 |
164 | def reshape_layer_2(self, char_embedding,char_embedding_shape):
165 | def reshape(char_embedding):
166 | return K.reshape(char_embedding, shape=(-1, char_embedding_shape[1], self.char_embed_size)) #[batch,sentence,dim]
167 | return Lambda(reshape)(char_embedding)
168 |
169 | def model(self):
170 | word_input = Input(shape=(self.maxlen_sentence,)) #[batch,sentence]
171 | char_input = Input(shape=(self.maxlen_sentence,self.maxlen_word,)) #[batch,word,char]
172 | ner_label = Input(shape=(self.maxlen_sentence,))
173 | # relation_label = Input(shape=self.maxlen_sentence,) #[batch,sentence,n_classes]
174 | mask = Lambda(lambda x: K.cast(K.greater(K.expand_dims(x, 2), 0), 'float32'))(word_input)
175 |
176 | word_embedding = Embedding(self.word_vocab_size, self.word_embed_size,mask_zero=True,weights=[self.embedding_martrix],name='word_embedding',trainable=True)(word_input) #[batch,word,embed]
177 | char_embedding = Embedding(self.char_vocab_size,self.char_embed_size,mask_zero=True,name='char_embedding',trainable=True)(char_input) #[batch,word,char,embedd]
178 |
179 | if self.embedding_dropout_prob:
180 | word_embedding = Dropout(self.embedding_dropout_prob)(word_embedding)
181 | char_embedding = Dropout(self.embedding_dropout_prob)(char_embedding)
182 |
183 | if self.is_use_char_embedding:
184 | # char_embedding maxpooling part
185 | char_embedding_shape = K.int_shape(char_embedding) # [batch,sentence,word,dim]
186 | # char_embedding_reshaped = K.reshape(char_embedding, shape=(-1, char_embedding_shape[-2],self.char_embed_size)) # [batch*sentence,word,dim of char embedding]
187 | char_embedding_reshaped = self.reshape_layer_1(char_embedding,char_embedding_shape)
188 | char_lstm = Bidirectional(MaskedLSTM(units=self.char_embed_size // 2, return_sequences=True, name='char_lstm_layer'))(
189 | char_embedding_reshaped)
190 |
191 | attention = TimeDistributed(Dense(1, activation='tanh'))(char_lstm)
192 | attention = MaskFlatten()(attention)
193 | attention = Activation('softmax')(attention)
194 | attention = MaskRepeatVector(self.char_embed_size)(attention)
195 | attention = MaskPermute([2, 1])(attention)
196 | sent_representation = multiply([char_lstm, attention])
197 | attention = Lambda(lambda xin: K.sum(xin, axis=1))(sent_representation)
198 |
199 | char_embedding = self.reshape_layer_2(attention,char_embedding_shape)
200 | if self.word_char_embed_mode == 'concate':
201 | embedding = Concatenate(axis=-1)([word_embedding,char_embedding])
202 | else :
203 | embedding = Gate_Add_Lyaer()([word_embedding,char_embedding])
204 | # pass
205 | else:
206 | embedding = word_embedding
207 | #multi-layers self-attention for ner pred
208 | if self.embedding_dropout_prob:
209 | embedding = Dropout(self.embedding_dropout_prob)(embedding)
210 |
211 | # part1 , multi-self-attentionblock, (CNN/LSTM/FNN+self-attention)
212 | lstm = Bidirectional(MaskedLSTM(units=self.hidden_size // 2, return_sequences=True), name='lstm_layer0')(embedding)
213 | if self.nn_dropout_prob:
214 | lstm = Dropout(self.nn_dropout_prob)(lstm)
215 | # # multi_lstm_layers
216 | # if self.multi_layers >= 2:
217 | # for i in range(self.multi_layers - 1):
218 | # i+=1
219 | # lstm = Bidirectional(CuDNNLSTM(self.hidden_size // 2, return_sequences=True), name='lstm_layer{}'.format(i))(lstm)
220 | # if self.nn_dropout_prob:
221 | # lstm = Dropout(self.nn_dropout_prob)(lstm)
222 | bio_pred = Dense(self.num_classes, activation='softmax')(lstm)
223 | pred_model =Model([word_input, char_input], bio_pred)
224 |
225 |
226 | train_model = Model([word_input, char_input, ner_label], bio_pred)
227 |
228 | loss = K.sparse_categorical_crossentropy(ner_label, bio_pred)
229 | # mask out the padded time steps before averaging
230 | loss = K.sum(loss * mask[:, :, 0]) / K.sum(mask)
231 | 
232 | train_model.summary()
233 | train_model.add_loss(loss)
234 | train_model.compile(keras.optimizers.adam(lr=self.learning_rate))
235 |
236 | return train_model,pred_model
237 |
238 |
239 |
--------------------------------------------------------------------------------
/config/CoNLL04/bio_config:
--------------------------------------------------------------------------------
1 | #dataset
2 | filename_dev = "data/CoNLL04/dev.txt"
3 | filename_test = "data/CoNLL04/test.txt"
4 | filename_train = "data/CoNLL04/train.txt"
5 |
6 | filename_train_me = "data/CoNLL04/train_me.json"
7 | filename_dev_me = "data/CoNLL04/dev_me.json"
8 | filename_test_me = "data/CoNLL04/test_me.json"
9 |
10 | filename_char2id = "data/CoNLL04/char2id.json"
11 | filename_word2id = "data/CoNLL04/word2id.json"
12 | filename_n_char2id = "data/CoNLL04/n_char2id.json"
13 | filename_BIO2id = "data/CoNLL04/BIO2id.json"
14 | filename_relation2id = "data/CoNLL04/relation2id.json"
15 |
16 | #training
17 | is_use_n_char=False
18 | epochs = 150
19 | batch_size = 128
20 | optimizer = Adam
21 | save_model_file = "save_model/ner_model.weights"
22 |
23 | #hyperparameters
24 | hidden_size = 128
25 | nb_head = 8
26 | word_embed_size = 100
27 | char_embed_size = 30
28 | maxlen_sentence = 100
29 | maxlen_word = 25
30 | multi_layers = 4
31 | embedding_dropout_prob = 0.25
32 | nn_dropout_prob = 0.25
33 | learning_rate = 1e-3
34 | is_use_char_embedding = True
35 |
36 |
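37 | # Note (added): utils.read_properties returns every value as a string, so the training
38 | # code casts numeric properties itself, e.g. in train.py:
39 | #   hidden_size   = int(config_file.getProperty('hidden_size'))
40 | #   learning_rate = float(config_file.getProperty('learning_rate'))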
--------------------------------------------------------------------------------
/data/CoNLL04/BIO2id.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "1": "O",
4 | "2": "B-Loc",
5 | "3": "I-Loc",
6 | "4": "B-Peop",
7 | "5": "I-Peop",
8 | "6": "B-Org",
9 | "7": "I-Org",
10 | "8": "B-Other",
11 | "9": "I-Other"
12 | },
13 | {
14 | "O": 1,
15 | "B-Loc": 2,
16 | "I-Loc": 3,
17 | "B-Peop": 4,
18 | "I-Peop": 5,
19 | "B-Org": 6,
20 | "I-Org": 7,
21 | "B-Other": 8,
22 | "I-Other": 9
23 | }
24 | ]
--------------------------------------------------------------------------------
/data/CoNLL04/README.md:
--------------------------------------------------------------------------------
1 | We use the splits defined in previous work for the CoNLL04 dataset.
2 |
3 | See our multi-head selection papers for more info.
4 |
5 | The format of the input files has been adapted to the input format of our head selection model.
6 |
7 | The original link to the dataset can be found [here](http://cogcomp.org/Data/ER/conll04.corp).
8 |
9 | The data was taken from https://github.com/bekou/multihead_joint_entity_relation_extraction/tree/master/data/CoNLL04.
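10 | 
11 | Each data line carries five tab-separated fields, read by `utils.readFile` in this repository as `token_id`, `token`, `BIO`, `relation`, `head`; sentences are separated by lines whose first field contains `#doc`.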
--------------------------------------------------------------------------------
/data/CoNLL04/char2id.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "1": "V",
4 | "2": "e",
5 | "3": "r",
6 | "4": "y",
7 | "5": "s",
8 | "6": "t",
9 | "7": "o",
10 | "8": "n",
11 | "9": "g",
12 | "10": "u",
13 | "11": "h",
14 | "12": "w",
15 | "13": "i",
16 | "14": "d",
17 | "15": "a",
18 | "16": "c",
19 | "17": "m",
20 | "18": "p",
21 | "19": ",",
22 | "20": "5",
23 | "21": "0",
24 | "22": "-",
25 | "23": "7",
26 | "24": "G",
27 | "25": "I",
28 | "26": "l",
29 | "27": "S",
30 | "28": ".",
31 | "29": "A",
32 | "30": "b",
33 | "31": "v",
34 | "32": "M",
35 | "33": "R",
36 | "34": "W",
37 | "35": "k",
38 | "36": "f",
39 | "37": "E",
40 | "38": "'",
41 | "39": "H",
42 | "40": "C",
43 | "41": "(",
44 | "42": "T",
45 | "43": "x",
46 | "44": ")",
47 | "45": "J",
48 | "46": "2",
49 | "47": "K",
50 | "48": "Y",
51 | "49": "O",
52 | "50": "D",
53 | "51": "N",
54 | "52": "F",
55 | "53": "j",
56 | "54": "z",
57 | "55": "B",
58 | "56": "1",
59 | "57": "6",
60 | "58": "3",
61 | "59": "9",
62 | "60": "4",
63 | "61": "P",
64 | "62": "`",
65 | "63": "L",
66 | "64": "q",
67 | "65": "U",
68 | "66": "$",
69 | "67": "8",
70 | "68": ":",
71 | "69": "X",
72 | "70": "\"",
73 | "71": "Q",
74 | "72": ";",
75 | "73": "Z",
76 | "74": "_",
77 | "75": "!",
78 | "76": "?",
79 | "77": "&"
80 | },
81 | {
82 | "V": 1,
83 | "e": 2,
84 | "r": 3,
85 | "y": 4,
86 | "s": 5,
87 | "t": 6,
88 | "o": 7,
89 | "n": 8,
90 | "g": 9,
91 | "u": 10,
92 | "h": 11,
93 | "w": 12,
94 | "i": 13,
95 | "d": 14,
96 | "a": 15,
97 | "c": 16,
98 | "m": 17,
99 | "p": 18,
100 | ",": 19,
101 | "5": 20,
102 | "0": 21,
103 | "-": 22,
104 | "7": 23,
105 | "G": 24,
106 | "I": 25,
107 | "l": 26,
108 | "S": 27,
109 | ".": 28,
110 | "A": 29,
111 | "b": 30,
112 | "v": 31,
113 | "M": 32,
114 | "R": 33,
115 | "W": 34,
116 | "k": 35,
117 | "f": 36,
118 | "E": 37,
119 | "'": 38,
120 | "H": 39,
121 | "C": 40,
122 | "(": 41,
123 | "T": 42,
124 | "x": 43,
125 | ")": 44,
126 | "J": 45,
127 | "2": 46,
128 | "K": 47,
129 | "Y": 48,
130 | "O": 49,
131 | "D": 50,
132 | "N": 51,
133 | "F": 52,
134 | "j": 53,
135 | "z": 54,
136 | "B": 55,
137 | "1": 56,
138 | "6": 57,
139 | "3": 58,
140 | "9": 59,
141 | "4": 60,
142 | "P": 61,
143 | "`": 62,
144 | "L": 63,
145 | "q": 64,
146 | "U": 65,
147 | "$": 66,
148 | "8": 67,
149 | ":": 68,
150 | "X": 69,
151 | "\"": 70,
152 | "Q": 71,
153 | ";": 72,
154 | "Z": 73,
155 | "_": 74,
156 | "!": 75,
157 | "?": 76,
158 | "&": 77
159 | }
160 | ]
--------------------------------------------------------------------------------
/data/CoNLL04/relation2id.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "1": "O",
4 | "2": "B-Loc",
5 | "3": "I-Loc",
6 | "4": "B-Peop",
7 | "5": "I-Peop",
8 | "6": "B-Org",
9 | "7": "I-Org",
10 | "8": "B-Other",
11 | "9": "I-Other"
12 | },
13 | {
14 | "O": 1,
15 | "B-Loc": 2,
16 | "I-Loc": 3,
17 | "B-Peop": 4,
18 | "I-Peop": 5,
19 | "B-Org": 6,
20 | "I-Org": 7,
21 | "B-Other": 8,
22 | "I-Other": 9
23 | }
24 | ]
--------------------------------------------------------------------------------
/data/CoNLL04/vecs.lc.over100freq.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cdjasonj/Joint-Entity-and-Relation-Extraction/e5eb78ae5b1b9730019706c5f415a6fba61ec777/data/CoNLL04/vecs.lc.over100freq.zip
--------------------------------------------------------------------------------
/eval.py:
--------------------------------------------------------------------------------
1 | import json
2 | import utils
3 | from utils import read_properties
4 | # def NER_result_Evaluator(outputs,targets):
5 | # """
6 | #     Evaluate the F1 score of the NER results here.
7 | # :return:
8 | # """
9 | #
10 | # right,true,pred = 1e-10, 1e-10, 1e-10
11 | #
12 | # for i in range(len(outputs)):
13 | # output = outputs[i]
14 | # target = targets[i]
15 | # output = output[:len(target)]
16 | # for j in range(len(output)):
17 | # if output[j] != 0:
18 | # pred += 1
19 | # if target[j] == output[j]:
20 | # right += 1
21 | # for j in range(len(target)):
22 | # if target[j] != 0 :
23 | # true+=1
24 | # R = right/pred
25 | # P = right/true
26 | # F = (2*P*R)/(P+R)
27 | # return P,R,F
28 |
29 |
30 | def NER_result_Evaluator(outputs,targets):
31 | config_file = read_properties('config/CoNLL04/bio_config')
32 | filename_BIO2id = config_file.getProperty("filename_BIO2id")
33 | id2BIO, BIO2id = json.load(open(filename_BIO2id, encoding='utf-8'))
34 |     right, true, pred = 1e-10, 1e-10, 1e-10
35 |     for idx in range(len(outputs)):
36 |         output = outputs[idx]
37 |         target = targets[idx]
38 |         output = output[:len(target)]
39 |         output_pred = []
40 |         target_pred = []
41 |         # record the first predicted entity span: the index of its B tag plus the indices of the following I tags
42 |         for i in range(len(output)):
43 |             bio = id2BIO[str(output[i])]
44 |             if bio[0] == 'B':
45 |                 output_pred.append(i)
46 |                 for j in range(i + 1, len(output)):
47 |                     bio = id2BIO[str(output[j])]
48 |                     if bio[0] == 'I':
49 |                         output_pred.append(j)
50 |                     else:
51 |                         break
52 |                 break
53 |         # record the first gold entity span in the same way
54 |         for i in range(len(target)):
55 |             bio = id2BIO[str(target[i])]
56 |             if bio[0] == 'B':
57 |                 target_pred.append(i)
58 |                 for j in range(i + 1, len(target)):
59 |                     bio = id2BIO[str(target[j])]
60 |                     if bio[0] == 'I':
61 |                         target_pred.append(j)
62 |                     else:
63 |                         break
64 |                 break
65 | 
66 |         if output_pred:
67 |             pred += 1
68 |         if target_pred:
69 |             true += 1
70 |         if output_pred and output_pred == target_pred:
71 |             right += 1
72 | 
73 |     P = right / pred
74 |     R = right / true
75 |     F = (2 * P * R) / (P + R)
76 |     return P, R, F
77 | 
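78 | if __name__ == '__main__':
79 |     # Tiny sanity-check sketch (added for illustration). The ids follow the committed
80 |     # data/CoNLL04/BIO2id.json mapping (O=1, B-Loc=2, I-Loc=3, B-Peop=4), and the script
81 |     # must be run from the repository root so config/CoNLL04/bio_config can be found.
82 |     outputs = [[1, 2, 3, 1], [4, 1, 1, 1]]
83 |     targets = [[1, 2, 3, 1], [1, 1, 4, 1]]
84 |     print(NER_result_Evaluator(outputs, targets))  # roughly (0.5, 0.5, 0.5)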
--------------------------------------------------------------------------------
/layers.py:
--------------------------------------------------------------------------------
1 | import keras
2 | from keras import backend as K
3 | from keras.layers import *
4 |
5 |
6 | def seq_and_vec(x):
7 | """seq has shape [None, seq_len, s_size] and vec has shape [None, v_size];
8 | repeat vec seq_len times and concatenate it onto seq,
9 | giving a [None, seq_len, s_size+v_size] tensor.
10 | """
11 | seq, vec = x
12 | vec = K.expand_dims(vec, 1)
13 | vec = K.zeros_like(seq[:, :, :1]) + vec
14 | return K.concatenate([seq, vec], 2)
15 |
16 | def attention_like_tensor(x):
17 | """
18 | Broadcast the attention vector to every time step.
19 | :param x: [batch,dim]
20 | :return: [batch,sentence,dim]
21 | """
22 |
23 | class Attention_Layer(keras.layers.Layer):
24 |
25 | """
26 | dot attention for word_char_embedding
27 | q, k, v define the attention
28 | score = softmax(dot(q, k))
29 | attention = sum(score * v)
30 | """
31 |     # A minimal completion of the dot-attention sketch that was left commented out here:
32 |     # score each time step with a learned vector, softmax over time, return the weighted sum
33 |     # ([batch, time, dim] -> [batch, dim]).
34 |     def build(self, input_shape):
35 |         self.W = self.add_weight(name='W', shape=(input_shape[-1], 1), initializer='glorot_normal')
36 |         super(Attention_Layer, self).build(input_shape)
37 | 
38 |     def call(self, inputs, mask=None):
39 |         score = K.softmax(K.squeeze(K.dot(inputs, self.W), axis=-1))
40 |         return K.sum(inputs * K.expand_dims(score, axis=-1), axis=1)
40 |
41 |
42 | class Gate_Add_Lyaer(keras.layers.Layer):
43 | """
44 | gate add mechanism for word_char embedding
45 | z = sigmoid(W3 * tanh(W1 * word_embedding + W2 * char_att))
46 | word_char_embedding = z * word_embedding + (1 - z) * char_att
47 |
48 | """
49 | def __init__(self,**kwargs):
50 | """
51 |
52 | :param word_embedding: shape [batch,sentence,dim of word_embedding]
53 | :param char_att: shape [batch,sentence,dim of char_embedding]
54 | :param kwargs:
55 | """
56 | super(Gate_Add_Lyaer,self).__init__(**kwargs)
57 | self.supports_masking = True
58 |
59 | def build(self, input_shape):
60 | assert input_shape[0][2] == input_shape[1][2]
61 |
62 | self.W1 = self.add_weight(name='W1',shape=(input_shape[0][-1],input_shape[0][-1]),initializer='glorot_normal') #[dim,dim]
63 | self.W2 = self.add_weight(name='W2',shape=(input_shape[0][-1],input_shape[0][-1]),initializer='glorot_normal')
64 | self.W3 = self.add_weight(name='W3',shape=(input_shape[0][-1],input_shape[0][-1]),initializer='glorot_normal')
65 |
66 | super(Gate_Add_Lyaer, self).build(input_shape)
67 |
68 | def call(self,inputs,mask=None):
69 | # inputs[0]:word_embedding ,inputs[1]:char_embedding
70 | word_embedding_shape = K.int_shape(inputs[0]) #[batch,sentence,dim of word embedding]
71 | char_embedding_shape = K.int_shape(inputs[1]) #[batch,sentence,dim of char embedding]
72 | # word_embedding_reshaped = K.reshape(inputs[0],shape=(-1,word_embedding_shape[-1])) #[batch*sentence,dim of word embedding]
73 | # char_embedding_reshaped = K.reshape(inputs[1],shape=(-1,char_embedding_shape[-1])) #[batch*sentence, dim of char embedding]
74 | word_embedding = K.dot(inputs[0],self.W1)
75 | char_embedding = K.dot(inputs[1],self.W2)
76 | wc_tanh = K.tanh(word_embedding+char_embedding)
77 | z = K.sigmoid(K.dot(wc_tanh,self.W3))
78 | embedding = z*inputs[0]+(1-z)*inputs[1]
79 | # z = K.sigmoid(K.dot(K.tanh(K.dot(word_embedding_reshaped,self.W1) + K.dot(char_embedding_shape,self.W2)),self.W3))
80 | # embedding = z*word_embedding_reshaped + (1-z)*char_embedding_reshaped #[batch*sentence,]
81 | # embedding = K.reshape(embedding,shape=(-1,word_embedding_reshaped[1],word_embedding_reshaped[-1]))# [batch,sentecen,dim]
82 | return embedding
83 |
84 | def compute_mask(self, inputs, mask=None):
85 | return mask
86 |
87 |
88 | def compute_output_shape(self, input_shape):
89 | return (input_shape[0][0],input_shape[0][1],input_shape[0][2])
90 |
91 |
92 | class Position_Embedding(Layer):
93 |
94 | def __init__(self, size=None, mode='sum', **kwargs):
95 | self.size = size  # must be an even number
96 | self.mode = mode
97 | super(Position_Embedding, self).__init__(**kwargs)
98 |
99 | def call(self, x,mask=None):
100 | if (self.size == None) or (self.mode == 'sum'):
101 | self.size = int(x.shape[-1])
102 | batch_size, seq_len = K.shape(x)[0], K.shape(x)[1]
103 | position_j = 1. / K.pow(10000., 2 * K.arange(self.size / 2, dtype='float32') / self.size)
104 | position_j = K.expand_dims(position_j, 0)
105 | position_i = K.cumsum(K.ones_like(x[:, :, 0]), 1) - 1  # K.arange does not support variable length, so positions are generated this way
106 | position_i = K.expand_dims(position_i, 2)
107 | position_ij = K.dot(position_i, position_j)
108 | position_ij = K.concatenate([K.cos(position_ij), K.sin(position_ij)], 2)
109 | if self.mode == 'sum':
110 | return position_ij + x
111 | elif self.mode == 'concat':
112 | return K.concatenate([position_ij, x], 2)
113 |
114 | def compute_output_shape(self, input_shape):
115 | if self.mode == 'sum':
116 | return input_shape
117 | elif self.mode == 'concat':
118 | return (input_shape[0], input_shape[1], input_shape[2] + self.size)
119 |
120 |
121 | class Self_Attention_Layer(Layer):
122 | """Multi-head attention mechanism
123 | """
124 |
125 | def __init__(self, nb_head, size_per_head, **kwargs):
126 | self.nb_head = nb_head
127 | self.size_per_head = size_per_head
128 | self.out_dim = nb_head * size_per_head
129 | super(Self_Attention_Layer, self).__init__(**kwargs)
130 |
131 | def build(self, input_shape):
132 | q_in_dim = input_shape[0][-1]
133 | k_in_dim = input_shape[1][-1]
134 | v_in_dim = input_shape[2][-1]
135 | self.q_kernel = self.add_weight(name='q_kernel',
136 | shape=(q_in_dim, self.out_dim),
137 | initializer='glorot_normal')
138 | self.k_kernel = self.add_weight(name='k_kernel',
139 | shape=(k_in_dim, self.out_dim),
140 | initializer='glorot_normal')
141 | self.v_kernel = self.add_weight(name='v_kernel',
142 | shape=(v_in_dim, self.out_dim),
143 | initializer='glorot_normal')
144 |
145 | def mask(self, x, mask, mode='mul'):
146 | if mask is None:
147 | return x
148 | else:
149 | for _ in range(K.ndim(x) - K.ndim(mask)):
150 | mask = K.expand_dims(mask, K.ndim(mask))
151 | if mode == 'mul':
152 | return x * mask
153 | else:
154 | return x - (1 - mask) * 1e10
155 |
156 | def call(self, inputs):
157 | q, k, v = inputs[:3]
158 | v_mask, q_mask = None, None
159 | if len(inputs) > 3:
160 | v_mask = inputs[3]
161 | if len(inputs) > 4:
162 | q_mask = inputs[4]
163 | # linear transformations
164 | qw = K.dot(q, self.q_kernel)
165 | kw = K.dot(k, self.k_kernel)
166 | vw = K.dot(v, self.v_kernel)
167 | # qw = Dense(self.out_dim,activation='relu')(q)
168 | # kw = Dense(self.out_dim, activation='relu')(k)
169 | # vw = Dense(self.out_dim, activation='relu')(v)
170 | # reshape to split the heads
171 | qw = K.reshape(qw, (-1, K.shape(qw)[1], self.nb_head, self.size_per_head))
172 | kw = K.reshape(kw, (-1, K.shape(kw)[1], self.nb_head, self.size_per_head))
173 | vw = K.reshape(vw, (-1, K.shape(vw)[1], self.nb_head, self.size_per_head))
174 | # permute dimensions
175 | qw = K.permute_dimensions(qw, (0, 2, 1, 3))
176 | kw = K.permute_dimensions(kw, (0, 2, 1, 3))
177 | vw = K.permute_dimensions(vw, (0, 2, 1, 3))
178 | # Attention
179 | a = K.batch_dot(qw, kw, [3, 3]) / self.size_per_head ** 0.5
180 | a = K.permute_dimensions(a, (0, 3, 2, 1))
181 | a = self.mask(a, v_mask, 'add')
182 | a = K.permute_dimensions(a, (0, 3, 2, 1))
183 | a = K.softmax(a)
184 | # produce the output
185 | o = K.batch_dot(a, vw, [3, 2])
186 | o = K.permute_dimensions(o, (0, 2, 1, 3))
187 | o = K.reshape(o, (-1, K.shape(o)[1], self.out_dim))
188 | o = self.mask(o, q_mask, 'mul')
189 | return o
190 |
191 | def compute_output_shape(self, input_shape):
192 | return (input_shape[0][0], input_shape[0][1], self.out_dim)
193 |
194 |
195 | class MaskedConv1D(keras.layers.Conv1D):
196 |
197 | def __init__(self, **kwargs):
198 | super(MaskedConv1D, self).__init__(**kwargs)
199 | self.supports_masking = True
200 |
201 | def compute_mask(self, inputs, mask=None):
202 | return mask
203 |
204 | def call(self, inputs, mask=None):
205 | if mask is not None:
206 | mask = K.cast(mask, K.floatx())
207 | inputs *= K.expand_dims(mask, axis=-1)
208 | return super(MaskedConv1D, self).call(inputs)
209 |
210 |
211 | class MaskedLSTM(keras.layers.CuDNNLSTM):
212 |
213 | def __init__(self, **kwargs):
214 | super(MaskedLSTM, self).__init__(**kwargs)
215 | self.supports_masking = True
216 |
217 | def compute_mask(self, inputs, mask=None):
218 | return mask
219 |
220 | def call(self, inputs, mask=None, training=None, initial_state=None):
221 | if mask is not None:
222 | mask = K.cast(mask, K.floatx())
223 | inputs *= K.expand_dims(mask, axis=-1)
224 | return super(MaskedLSTM, self).call(inputs)
225 |
226 |
227 | class MaskFlatten(keras.layers.Flatten):
228 |
229 | def __init__(self, **kwargs):
230 | super(MaskFlatten, self).__init__(**kwargs)
231 | self.supports_masking = True
232 |
233 | def compute_mask(self, inputs, mask=None):
234 | return mask
235 |
236 | def call(self, inputs, mask=None):
237 | # if mask is not None:
238 | # mask = K.cast(mask, K.floatx())
239 | # inputs *= K.expand_dims(mask, axis=-1)
240 | return super(MaskFlatten, self).call(inputs)  # call the parent class's call with the inputs
241 |
242 |
243 | class MaskRepeatVector(keras.layers.RepeatVector):
244 |
245 | def __init__(self, n,**kwargs):
246 | super(MaskRepeatVector, self).__init__(n,**kwargs)
247 | self.supports_masking = True
248 |
249 | def compute_mask(self, inputs, mask=None):
250 | return mask
251 |
252 | def call(self, inputs, mask=None):
253 | return super(MaskRepeatVector, self).call(inputs)
254 |
255 | class MaskPermute(keras.layers.Permute):
256 |
257 | def __init__(self, dims,**kwargs):
258 | super(MaskPermute, self).__init__(dims,**kwargs)
259 | self.supports_masking = True
260 |
261 | def compute_mask(self, inputs, mask=None):
262 | return mask
263 |
264 | def call(self, inputs, mask=None):
265 | return super(MaskPermute, self).call(inputs)
266 |
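267 | if __name__ == '__main__':
268 |     # Minimal usage sketch (added for illustration): Gate_Add_Lyaer fuses two embeddings
269 |     # of equal last dimension with the learned gate z described in its docstring.
270 |     # The shapes below are arbitrary examples, not values used elsewhere in the repository.
271 |     word_emb = Input(shape=(100, 128))              # [batch, sentence, dim]
272 |     char_emb = Input(shape=(100, 128))              # [batch, sentence, dim]
273 |     fused = Gate_Add_Lyaer()([word_emb, char_emb])  # same shape as the inputs
274 |     print(K.int_shape(fused))                       # (None, 100, 128)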
--------------------------------------------------------------------------------
/models.py:
--------------------------------------------------------------------------------
1 | import keras
2 | import numpy as np
3 | from keras.layers import *
4 | from keras.models import Model
5 | from layers import Position_Embedding,Attention_Layer,Self_Attention_Layer,Gate_Add_Lyaer,seq_and_vec,MaskedConv1D,MaskedLSTM,MaskFlatten,MaskPermute,MaskRepeatVector
6 |
7 | class word_char_lstm_model():
8 |
9 | def __init__(self, hidden_size, word_embed_size, char_embed_size, word_vocab_size,char_vocab_size, multi_layers, maxlen, maxlen_word,
10 | num_classes_part1,num_classes_part2, learning_rate=5e-5, embedding_dropout_prob=0.1,nn_dropout_prob = 0.1, optmizer='adam'):
11 | """
12 |         try the gated-add fusion for word_char_embedding; char_embedding comes from char-level attention over each word
13 |
14 | :param hidden_size:
15 | :param embed_size:
16 | :param vocab_size:
17 | :param dropout_prob:
18 | """
19 | self.num_classes_part1 = num_classes_part1
20 | self.num_classes_part2 = num_classes_part2
21 | self.hidden_size = hidden_size
22 | self.word_embed_size = word_embed_size
23 | self.char_embed_size = char_embed_size
24 | self.word_vocab_size = word_vocab_size
25 | self.char_vocab_size = char_vocab_size
26 | self.maxlen = maxlen
27 | self.maxlen_word = maxlen_word
28 | self.multi_layers = multi_layers
29 | self.learning_rate = learning_rate
30 | self.embedding_dropout_prob = embedding_dropout_prob
31 | self.nn_dropout_prob = nn_dropout_prob
32 |
33 | def model(self):
34 | """
35 |         TODO: add masking later
36 | part1 : word_embed=>LSTM*2=>attention=>dense=>outputs
37 | part2 : outputs_concat LSTM => dense=>outputs
38 | :return:
39 | """
40 |         word_input = Input(shape=(self.maxlen,))  # [batch_size,sentence]
41 |         char_input = Input(shape=(self.maxlen, self.maxlen_word,))  # [batch_size,sentence,word]
42 |         outputs_part1 = Input(shape=(self.maxlen,))
43 |         outputs_part2 = Input(shape=(self.maxlen, None,))  # [batch,sentence,sentence*rel_counts]
44 | # word_embedding_layer
45 | word_embedding = Embedding(self.word_vocab_size, self.word_embed_size, name='word_embedding')(word_input)
46 | char_embedding = Embedding(self.char_vocab_size, self.char_embed_size, name='char_embedding')(char_input) # [batch,sentence,word,dim of char embedding]
47 | if self.embedding_dropout_prob:
48 | word_embedding = Dropout(self.embedding_dropout_prob)(word_embedding)
49 | char_embedding = Dropout(self.embedding_dropout_prob)(char_embedding)
50 |
51 |         # char_embedding attention part
52 |         char_embedding_shape = K.int_shape(char_embedding)  # [batch,sentence,word,dim]
53 |         # wrap the backend reshapes in Lambda layers so the results stay Keras tensors
54 |         char_embedding_reshaped = Lambda(
55 |             lambda x: K.reshape(x, (-1, char_embedding_shape[-2], self.char_embed_size)))(char_embedding)  # [batch*sentence,word,dim of char embedding]
56 |         char_lstm = Bidirectional(CuDNNLSTM(self.hidden_size // 2, return_sequences=True, name='char_lstm_layer'))(
57 |             char_embedding_reshaped)
58 |         # char_maxpool = GlobalMaxPooling1D(char_lstm) # [batch*sentence,hidden_size]
59 |         char_att = Attention_Layer()(char_lstm)  # [batch*sentence,hidden_size]
60 |         char_embedding = Lambda(
61 |             lambda x: K.reshape(x, (-1, char_embedding_shape[1], self.hidden_size)))(char_att)  # [batch,sentence,hidden_size]
62 | # embedding = Concatenate(axis=-1)([word_embedding, char_embedding])
63 | embedding = Gate_Add_Lyaer()([word_embedding,char_embedding])
64 | # part1 , entity_pred
65 | lstm = Bidirectional(CuDNNLSTM(self.hidden_size // 2, return_sequences=True, name='lstm_layer0'))(embedding)
66 | if self.nn_dropout_prob:
67 | lstm = Dropout(self.nn_dropout_prob)(lstm)
68 | # multi_lstm_layers
69 | if self.multi_layers >= 2:
70 | for i in range(self.multi_layers - 1):
71 | lstm = Bidirectional(
72 | CuDNNLSTM(self.hidden_size // 2, return_sequences=True, name='lstm_layer{}'.format(i + 1)))(lstm)
73 | if self.nn_dropout_prob:
74 | lstm = Dropout(self.nn_dropout_prob)(lstm)
75 |
76 | attention = TimeDistributed(Dense(1, activation='tanh'))(lstm)
77 | attention = Flatten()(attention)
78 | attention = Activation('softmax')(attention)
79 | attention = RepeatVector(self.hidden_size)(attention)
80 | attention = Permute([2, 1])(attention)
81 | sent_representation = multiply([lstm, attention])
82 | attention = Lambda(lambda xin: K.sum(xin, axis=1))(sent_representation)
83 | lstm_attention = Lambda(seq_and_vec, output_shape=(None, self.hidden_size * 2))(
84 |             [lstm, attention])  # [consider plain addition here, or gated addition]
85 | entity_pred = Dense(self.num_classes_part1, activation='softmax')(lstm_attention)
86 |         entity_model = Model([word_input, char_input], [entity_pred])
87 | 
88 |         # part2 multi-head selection for relation classification
89 |         h = Concatenate(axis=-1)([lstm, entity_pred])
90 |         multi_head_selection_pred = Dense(self.num_classes_part2, activation='sigmoid')(h)  # [batch_size,sentence,num_classes_part2]
91 |         relation_model = Model([word_input, char_input], [multi_head_selection_pred])
92 |         train_model = Model([word_input, char_input, outputs_part1, outputs_part2], [multi_head_selection_pred])
93 |
94 | part1_loss = K.sparse_categorical_crossentropy(outputs_part1, entity_pred)
95 | part2_loss = K.binary_crossentropy(outputs_part2, multi_head_selection_pred)
96 | part1_loss = K.mean(part1_loss)
97 | part2_loss = K.mean(part2_loss)
98 |
99 | train_model.add_loss(part1_loss + part2_loss)
100 |         train_model.compile(keras.optimizers.adam(lr=self.learning_rate))
101 |
102 | return entity_model, relation_model, train_model
103 |
104 |
105 | class lstm_attention_model_ner_part():
106 | def __init__(self,embedding_martrix,hidden_size,
107 | nb_head,word_embed_size,char_embed_size,word_vocab_size,char_vocab_size,multi_layers,num_classes
108 | ,maxlen_sentence,maxlen_word,word_char_embed_mode='add',learning_rate = 5e-5,embedding_dropout_prob=0.1,nn_dropout_prob=0.1,optmizer='adam',
109 | is_use_char_embedding=False):
110 | """
111 |         Test the effect of self-attention on NER.
112 | """
113 | self.embedding_martrix = embedding_martrix
114 | self.num_classes = num_classes
115 | self.hidden_size = hidden_size
116 | self.nb_head = nb_head
117 | self.word_embed_size = word_embed_size
118 | self.char_embed_size = char_embed_size
119 | # self.pos_embed_size = pos_embed_size #use the add position_embedding
120 | self.word_vocab_size = word_vocab_size
121 | self.char_vocab_size = char_vocab_size
122 | # self.maxlen = maxlen
123 | self.multi_layers = multi_layers
124 | self.maxlen_sentence = maxlen_sentence
125 | self.maxlen_word = maxlen_word
126 | self.word_char_embed_mode= word_char_embed_mode
127 | self.learning_rate = learning_rate
128 | self.embedding_dropout_prob = embedding_dropout_prob
129 | self.nn_dropout_prob = nn_dropout_prob
130 | self.is_use_char_embedding = is_use_char_embedding
131 | print(multi_layers)
132 |
133 | #char_embedding_shape [batch,sentence,word,dim]
134 | def reshape_layer_1(self, char_embedding,char_embedding_shape):
135 | def reshape(char_embedding):
136 | return K.reshape(char_embedding, shape=(-1, char_embedding_shape[-2], self.char_embed_size)) #[batch*sentence,word,dim]
137 | return Lambda(reshape)(char_embedding)
138 |
139 | def reshape_layer_2(self, char_embedding,char_embedding_shape):
140 | def reshape(char_embedding):
141 | return K.reshape(char_embedding, shape=(-1, char_embedding_shape[1], self.char_embed_size)) #[batch,sentence,dim]
142 | return Lambda(reshape)(char_embedding)
143 |
144 | def model(self):
145 |         word_input = Input(shape=(self.maxlen_sentence,)) #[batch,sentence]
146 | char_input = Input(shape=(self.maxlen_sentence,self.maxlen_word,)) #[batch,word,char]
147 | ner_label = Input(shape=(self.maxlen_sentence,))
148 |
149 | mask = Lambda(lambda x: K.cast(K.greater(K.expand_dims(x, 2), 0), 'float32'))(word_input)
150 |
151 | word_embedding = Embedding(self.word_vocab_size, self.word_embed_size,weights=[self.embedding_martrix],name='word_embedding',trainable=True)(word_input) #[batch,word,embed]
152 | char_embedding = Embedding(self.char_vocab_size,self.char_embed_size,name='char_embedding',trainable=True)(char_input) #[batch,word,char,embedd]
153 |
154 | if self.embedding_dropout_prob:
155 | word_embedding = Dropout(self.embedding_dropout_prob)(word_embedding)
156 | char_embedding = Dropout(self.embedding_dropout_prob)(char_embedding)
157 |
158 | if self.is_use_char_embedding:
159 | # char_embedding maxpooling part
160 | char_embedding_shape = K.int_shape(char_embedding) # [batch,sentence,word,dim]
161 | # char_embedding_reshaped = K.reshape(char_embedding, shape=(-1, char_embedding_shape[-2],self.char_embed_size)) # [batch*sentence,word,dim of char embedding]
162 | char_embedding_reshaped = self.reshape_layer_1(char_embedding,char_embedding_shape)
163 | char_lstm = Bidirectional(MaskedLSTM(units=self.char_embed_size // 2, return_sequences=True, name='char_lstm_layer'))(
164 | char_embedding_reshaped)
165 |
166 | attention = TimeDistributed(Dense(1, activation='tanh'))(char_lstm)
167 | attention = MaskFlatten()(attention)
168 | attention = Activation('softmax')(attention)
169 | attention = MaskRepeatVector(self.char_embed_size)(attention)
170 | attention = MaskPermute([2, 1])(attention)
171 | sent_representation = multiply([char_lstm, attention])
172 | attention = Lambda(lambda xin: K.sum(xin, axis=1))(sent_representation)
173 |
174 | # char_maxpool = GlobalMaxPooling1D(char_lstm) # [batch*sentence,hidden_size]
175 | # char_att = Attention_Layer()(char_lstm) # [batch*sentence,hidden_size]
176 | # char_embedding = K.reshape(char_maxpool, shape=[-1, char_embedding_shape[1],
177 | # self.hidden_size]) # [batch,sentence,hidden_size]
178 | # char_embedding = K.reshape(attention, shape=[-1, char_embedding_shape[-1], self.char_embed_size]) # [batch,sentence,hidden_size]
179 | char_embedding = self.reshape_layer_2(attention,char_embedding_shape)
180 | if self.word_char_embed_mode == 'concate':
181 | embedding = Concatenate(axis=-1)([word_embedding,char_embedding])
182 | else :
183 | embedding = Gate_Add_Lyaer()([word_embedding,char_embedding])
184 | # pass
185 | else:
186 | embedding = word_embedding
187 | #multi-layers self-attention for ner pred
188 | if self.embedding_dropout_prob:
189 | embedding = Dropout(self.embedding_dropout_prob)(embedding)
190 |
191 | # part1 , multi-self-attentionblock, (CNN/LSTM/FNN+self-attention)
192 | lstm = Bidirectional(MaskedLSTM(units=self.hidden_size // 2, return_sequences=True), name='lstm_layer0')(embedding)
193 | if self.nn_dropout_prob:
194 | lstm = Dropout(self.nn_dropout_prob)(lstm)
195 | # # multi_lstm_layers
196 | # if self.multi_layers >= 2:
197 | # for i in range(self.multi_layers - 1):
198 | # i+=1
199 | # lstm = Bidirectional(CuDNNLSTM(self.hidden_size // 2, return_sequences=True), name='lstm_layer{}'.format(i))(lstm)
200 | # if self.nn_dropout_prob:
201 | # lstm = Dropout(self.nn_dropout_prob)(lstm)
202 |
203 | attention = TimeDistributed(Dense(1, activation='tanh'))(lstm)
204 | #
205 | attention = MaskFlatten()(attention)
206 | attention = Activation('softmax')(attention)
207 | attention = MaskRepeatVector(self.hidden_size)(attention)
208 | attention = MaskPermute([2, 1])(attention)
209 | sent_representation = multiply([lstm, attention])
210 | attention = Lambda(lambda xin: K.sum(xin, axis=1))(sent_representation)
211 | lstm_attention = Lambda(seq_and_vec, output_shape=(None, self.hidden_size * 2))(
212 |             [lstm, attention])  # [consider plain addition here, or gated addition]
213 | lstm_attention = MaskedConv1D(filters=self.hidden_size,kernel_size=3,activation='relu',padding='same')(lstm_attention)
214 |
215 | bio_pred = Dense(self.num_classes, activation='softmax')(lstm_attention)
216 | pred_model =Model([word_input, char_input], bio_pred)
217 | train_model = Model([word_input, char_input, ner_label], bio_pred)
218 |
219 |         loss = K.sparse_categorical_crossentropy(ner_label, bio_pred)
220 |         # mask out the padded time steps before averaging
221 |         loss = K.sum(loss * mask[:, :, 0]) / K.sum(mask)
222 | 
223 | train_model.summary()
224 | train_model.add_loss(loss)
225 | train_model.compile(keras.optimizers.adam(lr=self.learning_rate))
226 |
227 | return train_model,pred_model
228 |
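229 | if __name__ == '__main__':
230 |     # Smoke-test sketch (added for illustration): build the attention NER model with a
231 |     # random embedding matrix. The sizes below are arbitrary stand-ins rather than the
232 |     # values in config/CoNLL04/bio_config, and actually running the model needs a GPU
233 |     # because MaskedLSTM wraps CuDNNLSTM.
234 |     dummy_embeddings = np.random.randn(1000, 100)
235 |     builder = lstm_attention_model_ner_part(
236 |         dummy_embeddings, hidden_size=128, nb_head=8, word_embed_size=100, char_embed_size=30,
237 |         word_vocab_size=1000, char_vocab_size=80, multi_layers=1, num_classes=10,
238 |         maxlen_sentence=100, maxlen_word=25, word_char_embed_mode='concate',
239 |         is_use_char_embedding=True)
240 |     train_model, pred_model = builder.model()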
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
1 | import json
2 | import numpy as np
3 | from utils import read_properties,data_generator,load_data,get_embedding_matrix
4 | from comparative_model import lstm_model_ner_part,lstm_attention_model_ner_part
5 | import keras
6 | import keras.backend as K
7 | from keras.callbacks import LearningRateScheduler
8 | from eval import NER_result_Evaluator
9 | import os
10 | os.environ["CUDA_VISIBLE_DEVICES"] = "0"
11 | config_file= read_properties('config/CoNLL04/bio_config')
12 | #datasets
13 | filename_train_me = config_file.getProperty("filename_train_me")
14 | filename_test_me = config_file.getProperty("filename_test_me")
15 | filename_dev_me = config_file.getProperty("filename_dev_me")
16 |
17 | filename_char2id = config_file.getProperty("filename_char2id")
18 | filename_word2id = config_file.getProperty("filename_word2id")
19 | filename_BIO2id = config_file.getProperty("filename_BIO2id")
20 | filename_relation2id = config_file.getProperty("filename_relation2id")
21 |
22 | #training
23 | epochs = int(config_file.getProperty("epochs"))
24 | batch_size = int(config_file.getProperty('batch_size'))
25 | model_save_file = config_file.getProperty('save_model_file')
26 | is_use_n_char = config_file.getProperty('is_use_n_char') == 'True'  # bool('False') would be True
27 |
28 | #hyperparameters
29 | is_use_char_embedding = config_file.getProperty('is_use_char_embedding') == 'True'
30 | hidden_size = int(config_file.getProperty('hidden_size'))
31 | word_embed_size = int(config_file.getProperty('word_embed_size'))
32 | char_embed_size = int(config_file.getProperty('char_embed_size'))
33 | embedding_dropout_prob = float(config_file.getProperty('embedding_dropout_prob'))
34 | nn_dropout_prob = float(config_file.getProperty('nn_dropout_prob'))
35 | multi_layers = int(config_file.getProperty('multi_layers'))
36 | nb_head = int(config_file.getProperty('nb_head'))
37 | learning_rate = float(config_file.getProperty('learning_rate'))
38 | maxlen_sentence = int(config_file.getProperty('maxlen_sentence'))
39 | maxlen_word = int(config_file.getProperty('maxlen_word'))
40 |
41 | train_data = json.load(open(filename_train_me,encoding='utf-8'))
42 | dev_data = json.load(open(filename_dev_me,encoding='utf-8'))
43 | id2char, char2id = json.load(open(filename_char2id,encoding='utf-8'))
44 | id2n_char, n_char2id = json.load(open(filename_char2id,encoding='utf-8'))
45 | id2word, word2id = json.load(open(filename_word2id,encoding='utf-8'))
46 | id2BIO,BIO2id = json.load(open(filename_BIO2id,encoding='utf-8'))
47 | # id2relation,relation2id = json.load(open(filename_relation2id,encoding='utf-8'))
48 | char_vocab_size = len(char2id) +1 # 0,padding
49 | word_vocab_size = len(word2id) +1 # 0 ,padding
50 | ner_classes_num = len(BIO2id)
51 | embedding_martrix = get_embedding_matrix(word2id)
52 |
53 | # lstm_model = lstm_model_ner_part(hidden_size, nb_head, word_embed_size, char_embed_size, word_vocab_size, char_vocab_size, multi_layers,
54 | # ner_classes_num, learning_rate, embedding_dropout_prob, nn_dropout_prob, is_use_char_embedding)
55 | #
56 | # self_att_model = self_attention_model_ner_part(hidden_size, nb_head, word_embed_size, char_embed_size, word_vocab_size, char_vocab_size, multi_layers,
57 | # ner_classes_num, learning_rate, embedding_dropout_prob, nn_dropout_prob, is_use_char_embedding)
58 | # self_att_model = self_attention_model_ner_part(embedding_martrix,hidden_size, 5, word_embed_size, char_embed_size, word_vocab_size, char_vocab_size, 5,
59 | # ner_classes_num, maxlen_sentence,maxlen_word,learning_rate, embedding_dropout_prob, nn_dropout_prob, 'adam',False)
60 |
61 | word_char_embed_mode = 'concate'
62 | # lstm_model = lstm_model_ner_part(embedding_martrix,hidden_size, nb_head, word_embed_size, char_embed_size, word_vocab_size, char_vocab_size, multi_layers,
63 | # ner_classes_num, maxlen_sentence,maxlen_word,word_char_embed_mode,learning_rate,embedding_dropout_prob,nn_dropout_prob,'adam',True)
64 | lstm_model = lstm_attention_model_ner_part(embedding_martrix,hidden_size, nb_head, word_embed_size, char_embed_size, word_vocab_size, char_vocab_size, multi_layers,
65 | ner_classes_num, maxlen_sentence,maxlen_word,word_char_embed_mode,learning_rate,embedding_dropout_prob,nn_dropout_prob,'adam',True)
66 |
67 | train_model,pred_model = lstm_model.model()
68 |
69 | #TODO only ner part now, then complete it
70 |
71 | def pred_op(mode):
72 | eval_TEXT_WORD, eval_TEXT_CHAR, true_bio = load_data(mode)
73 | ner_pred = pred_model.predict([eval_TEXT_WORD, eval_TEXT_CHAR],batch_size=800,verbose=1)#[batch,sentence,num_classe]
74 | ner_pred = np.argmax(ner_pred,axis=-1) #[batch,sentence]
75 | return ner_pred,true_bio
76 |
77 | #
78 | # def scheduler(epoch):
79 | #     # halve the learning rate every epoch
80 | #     # if epoch % 100 == 0 and epoch != 0:
81 | #     # once epoch > 3, start decaying the learning rate, halving it each time, with a floor of 2e-6
82 | # if (epoch+1) % 50 == 0:
83 | # lr = K.get_value(train_model.optimizer.lr)
84 | # lr = lr*0.5
85 | # if lr < 2e-6:
86 | # return 2e-6
87 | # else:
88 | # return lr
89 |
90 | def train_op():
91 | # reduce_lr = LearningRateScheduler(scheduler, verbose=1)
92 |     train_D = data_generator(train_data,char2id,n_char2id,word2id,BIO2id,maxlen_sentence,maxlen_word,is_use_n_char,batch_size)
93 | best_f1 = 0
94 |     for i in range(1, epochs + 1):  # epochs from the config file
95 | print(i)
96 | train_model.fit_generator(train_D.__iter__(),
97 | steps_per_epoch=len(train_D),
98 | epochs=1,
99 | # callbacks=[reduce_lr]
100 | )
101 |         # if (i) % 2 == 0:  # evaluate on dev every two epochs and save the dev results
102 | ner_pred,true_bio = pred_op('dev')
103 | P, R, F = NER_result_Evaluator(ner_pred,true_bio)
104 | if F > best_f1 :
105 | train_model.save_weights(model_save_file)
106 | best_f1 = F
107 |         print('epoch {}, dev set: precision {}, recall {}, f1 {}'.format(i, P, R, F))
108 | 
109 |         ner_pred, true_bio = pred_op('test')
110 |         P, R, F = NER_result_Evaluator(ner_pred, true_bio)
111 |         print('epoch {}, test set: precision {}, recall {}, f1 {}'.format(i, P, R, F))
112 | 
113 |         if i % 50 == 0:
114 |             ner_pred, true_bio = pred_op('train')
115 |             P, R, F = NER_result_Evaluator(ner_pred, true_bio)
116 |             print('train set: precision {}, recall {}, f1 {}'.format(P, R, F))
117 |
118 | print(best_f1)
119 | train_op()
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | import json
4 | import csv
5 | import codecs
6 | from keras.preprocessing.sequence import pad_sequences
7 | def readFile(file_name):
8 | head_id_col_vector = ['token_id', 'token', "BIO", "relation", 'head']
9 | file = pd.read_csv(file_name, names=head_id_col_vector, encoding="utf-8",
10 | engine='python', sep="\t", quoting=csv.QUOTE_NONE).as_matrix()
11 | return file
12 |
13 | def collect_data_set(file,save_file):
14 | datas = []
15 | text = []
16 | BIOS = []
17 | relations = []
18 | heads = []
19 | for i in range(file.shape[0]):
20 | if '#doc' not in file[i][0]:
21 | text.append(file[i][1])
22 | BIOS.append(file[i][2])
23 | relations.append(file[i][3])
24 | heads.append(file[i][4])
25 | else:
26 | dic = {}
27 | dic['text'] = text
28 | dic['BIOS'] = BIOS
29 | dic['relations'] = relations
30 | dic['heads'] = heads
31 | datas.append(dic)
32 |
33 | text = []
34 | BIOS = []
35 | relations = []
36 | heads = []
37 |
38 | with codecs.open(save_file, 'w', encoding='utf-8') as f:
39 | json.dump(datas, f, indent=4, ensure_ascii=False)
40 | return datas
41 |
42 | def collect_char2id(datasets,save_file):
43 | chars = {}
44 | for data in datasets:
45 | for word in data['text']:
46 | for char in word:
47 | chars[char] = chars.get(char, 0) + 1
48 | id2char = {i+1:j for i,j in enumerate(chars)} # padding: 0
49 | char2id = {j:i for i,j in id2char.items()}
50 | with codecs.open(save_file, 'w', encoding='utf-8') as f:
51 | json.dump([id2char, char2id], f, indent=4, ensure_ascii=False)
52 |
53 | def collect_n_char2id(datasets,save_file,n):
54 | chars = {}
55 | for data in datasets:
56 | for word in data['text']:
57 | n_chars = n_char(word,n)
58 | for _n_char in n_chars:
59 | chars[_n_char] = chars.get(_n_char, 0) + 1
60 | id2char = {i+1:j for i,j in enumerate(chars)} # padding: 0
61 | char2id = {j:i for i,j in id2char.items()}
62 | with codecs.open(save_file, 'w', encoding='utf-8') as f:
63 | json.dump([id2char, char2id], f, indent=4, ensure_ascii=False)
64 |
65 | def collect_word2id(datasets,save_file):
66 | words = {}
67 | for data in datasets:
68 | for word in data['text']:
69 | words[word] = words.get(word,0)+1
70 | id2word = {i+1:j for i,j in enumerate(words)} #padding:0
71 | word2id = {j:i for i,j in id2word.items()}
72 | with codecs.open(save_file, 'w', encoding='utf-8') as f:
73 | json.dump([id2word, word2id], f, indent=4, ensure_ascii=False)
74 |
75 | def collect_BIO2id(datasets,save_file):
76 | BIOs = {}
77 | for data in datasets:
78 | for bio in data['BIOS']:
79 | if bio != 'O':
80 | BIOs[bio] = BIOs.get(bio,0) +1
81 |
82 | id2BIO = {i+1:j for i,j in enumerate(BIOs)} #padding:0
83 | id2BIO[0] = 'O'
84 | BIO2id = {j:i for i,j in id2BIO.items()}
85 | with codecs.open(save_file, 'w', encoding='utf-8') as f:
86 | json.dump([id2BIO, BIO2id], f, indent=4, ensure_ascii=False)
87 |
88 | # def collect_relations2id(datasets,save_file):
89 | # BIOs = {}
90 | # for data in datasets:
91 | # for bio in data['BIOS']:
92 | # BIOs[bio] = BIOs.get(bio,0) +1
93 | # id2BIO = {i+1:j for i,j in enumerate(BIOs)} #padding:0
94 | # BIO2id = {j:i for i,j in id2BIO.items()}
95 | # with codecs.open(save_file, 'w', encoding='utf-8') as f:
96 | # json.dump([id2BIO, BIO2id], f, indent=4, ensure_ascii=False)
97 |
98 |
99 | # bug: this only returns values as strings; rewrite it later
100 | class read_properties:
101 | def __init__(self,filepath, sep='=', comment_char='#'):
102 | """Read the file passed as parameter as a properties file."""
103 | self.props = {}
104 | #print filepath
105 | with open(filepath, "rt") as f:
106 | for line in f:
107 | #print line
108 | l = line.strip()
109 | if l and not l.startswith(comment_char):
110 | key_value = l.split(sep)
111 | self.props[key_value[0].strip()] = key_value[1].split("#")[0].strip('" \t')
112 |
113 |
114 | def getProperty(self,propertyName):
115 | return self.props.get(propertyName)
116 |
117 |
118 | def sentence_pad(X,maxlen_sentence):
119 | #sentence_level pad for word input and bio tagging
120 | #use the maxlen of batch datas to pad the sentence level inputs
121 | """
122 |
123 | :param datas: [batch_size,None]
124 | :return: datas : [batch_size,maxlen of sentence]
125 | """
126 | # L = [len(x) for x in X]
127 | # ML = max(L)
128 | ML = maxlen_sentence
129 | return [x + [0] * (ML - len(x)) for x in X]
130 |
131 |
132 | def n_char(word,n):
133 | """
134 |     split the word into character n-grams (as produced by the code below)
135 |     n = 2
136 |     word = 'love'
137 |     ==> ['l', 'lo', 'ov', 've', 'e']
138 |     n = 3
139 |     word = 'love'
140 |     ==> ['l', 'lov', 'ove', 've', 'e']
141 | :param word:
142 | :return:
143 | """
144 | word = str(word)
145 | n_char = []
146 | n_char.append(''*(n-1) + word[0])
147 | temp = ''
148 | for index,char in enumerate(word):
149 | if index+n < len(word):
150 | temp += word[index:index+n]
151 | n_char.append(temp)
152 | temp = ''
153 | else:
154 | temp += word[index:]
155 | temp += '' * (n - len(temp))
156 | n_char.append(temp)
157 | temp = ''
158 | return n_char
159 |
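# Sanity check for n_char (matches the docstring above):
#   >>> n_char('love', 3)
#   ['l', 'lov', 'ove', 've', 'e']
#   >>> n_char('love', 2)
#   ['l', 'lo', 'ov', 've', 'e']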
160 | def char_pad(datas, maxlen_sentence, maxlen_word):
161 |     # word-level padding for the char input: every word is padded/truncated to
162 |     # maxlen_word char ids, then every sentence to maxlen_sentence words
163 |     """
164 |     :param datas: [batch_size, None, None] char ids per word per sentence
165 |     :return: [batch_size, maxlen_sentence, maxlen_word]
166 |     """
167 | new_data = []
168 | for sentence in datas:
169 | _sentence = []
170 | for word in sentence:
171 | if len(word) < maxlen_word:
172 | word+=[0]*(maxlen_word - len(word))
173 | else:
174 | word = word[:maxlen_word]
175 | _sentence.append(word)
176 |
177 | pad_word = [0]*maxlen_word
178 | if len(_sentence) < maxlen_sentence:
179 | for i in range(maxlen_sentence - len(_sentence)):
180 | _sentence.append(pad_word)
181 | else:
182 | _sentence = _sentence[:maxlen_sentence]
183 | new_data.append(_sentence)
184 | return new_data
185 |
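# Hypothetical illustration of char_pad with maxlen_sentence=3, maxlen_word=4:
#   char_pad([[[5, 2], [9, 9, 9, 9, 9]]], 3, 4)
#   ==> [[[5, 2, 0, 0],        # word padded to maxlen_word
#         [9, 9, 9, 9],        # word truncated to maxlen_word
#         [0, 0, 0, 0]]]       # sentence padded to maxlen_sentence with all-zero words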
186 | # TODO: complete the function for joint extraction
187 | def load_data(mode):
188 |     # only NER prediction for now; loads the data used at evaluation/prediction time
189 |     config_file = read_properties('config/CoNLL04/bio_config')
190 |     # property values are plain strings, so bool() over them would always be True
191 |     is_use_n_char = config_file.getProperty('is_use_n_char') in ('True', 'true', '1')
192 |     filename_char2id = config_file.getProperty("filename_char2id")
193 |     filename_n_char2id = config_file.getProperty("filename_n_char2id")
194 |     filename_word2id = config_file.getProperty("filename_word2id")
195 |     filename_BIO2id = config_file.getProperty("filename_BIO2id")
196 |     filename_relation2id = config_file.getProperty("filename_relation2id")
197 |     id2char, char2id = json.load(open(filename_char2id, encoding='utf-8'))
198 |     id2n_char, n_char2id = json.load(open(filename_n_char2id, encoding='utf-8'))
199 |     id2word, word2id = json.load(open(filename_word2id, encoding='utf-8'))
200 |     id2BIO, BIO2id = json.load(open(filename_BIO2id, encoding='utf-8'))
201 |     filename_train_me = config_file.getProperty("filename_train_me")
202 |     filename_dev_me = config_file.getProperty("filename_dev_me")
203 |     filename_test_me = config_file.getProperty("filename_test_me")
204 |     maxlen_sentence = int(config_file.getProperty('maxlen_sentence'))
205 |     maxlen_word = int(config_file.getProperty('maxlen_word'))
206 |     eval_data = []
207 |     if mode == 'dev':
208 |         eval_data = json.load(open(filename_dev_me, encoding='utf-8'))
209 |     if mode == 'test':
210 |         eval_data = json.load(open(filename_test_me, encoding='utf-8'))
211 |     if mode == 'train':
212 |         eval_data = json.load(open(filename_train_me, encoding='utf-8'))
213 | 
214 |     TEXT_WORD, TEXT_CHAR, BIO = [], [], []
215 |     for data in eval_data:
216 |         text = data['text']
217 |         bio = data['BIOS']
218 |         _text_word = [word2id.get(word, 0) for word in text]
219 |         _text_char = []  # 2 dimensions: [word, char]
220 |         if is_use_n_char:
221 |             # char features are built from the surface words, not from the word ids
222 |             for word in text:
223 |                 n_chars = n_char(word, 3)
224 |                 chars = [n_char2id.get(_char, 0) for _char in n_chars]
225 |                 _text_char.append(chars)
226 |         else:
227 |             for word in text:
228 |                 chars = [char2id.get(_char, 0) for _char in word]
229 |                 _text_char.append(chars)
230 |         _bio = [BIO2id.get(b, 0) for b in bio]
231 |         TEXT_WORD.append(_text_word)
232 |         TEXT_CHAR.append(_text_char)  # [batch, word, char]: padded twice below, first in the
233 |         # word dimension to maxlen_sentence, then in the char dimension to maxlen_word
234 |         BIO.append(_bio)
235 |     TEXT_WORD = pad_sequences(TEXT_WORD, maxlen=maxlen_sentence, padding='post', value=0)
236 |     TEXT_CHAR = np.array(char_pad(TEXT_CHAR, maxlen_sentence, maxlen_word))
237 |     # BIO = pad_sequences(BIO, maxlen=30, padding='post', value=0)
238 |     return TEXT_WORD, TEXT_CHAR, BIO
239 |
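# Hedged usage sketch for load_data (shapes depend on the config values):
#   TEXT_WORD, TEXT_CHAR, BIO = load_data('dev')
#   # TEXT_WORD: (num_sentences, maxlen_sentence) array of word ids
#   # TEXT_CHAR: (num_sentences, maxlen_sentence, maxlen_word) array of char/n-gram ids
#   # BIO: list of unpadded per-sentence tag-id lists, e.g. for eval.py to compare against
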
240 | #TODO
241 | class data_generator():
242 | def __init__(self,data,char2id,n_char2id,word2id,BIO2id,maxlen_sentence,maxlen_word,is_use_n_char,batch_size=128):
243 | self.data = data
244 | self.batch_size = batch_size
245 | self.char2id = char2id
246 | self.n_char2id = n_char2id
247 | self.word2id = word2id
248 | self.BIO2id = BIO2id
249 | self.maxlen_sentence = maxlen_sentence
250 | self.maxlen_word = maxlen_word
251 | self.is_use_n_char = is_use_n_char
252 | self.steps = len(self.data)//self.batch_size
253 | if len(self.data) % self.batch_size != 0:
254 | self.steps += 1
255 | def __len__(self):
256 | return self.steps
257 | def __iter__(self):
258 | while True :
259 | index = list(range(len(self.data)))
260 | np.random.shuffle(index)
261 | TEXT_WORD,TEXT_CHAR,BIO = [],[],[]
262 | for idx in index:
263 | _data = self.data[idx]
264 | text = _data['text']
265 | bio = _data['BIOS']
266 |                 _text_word = [self.word2id.get(word, 0) for word in text]
267 |                 _text_char = []  # 2 dimensions: [word, char]
268 |                 if self.is_use_n_char:
269 |                     # char features are built from the surface words, not from the word ids
270 |                     for word in text:
271 |                         n_chars = n_char(word, 3)
272 |                         chars = [self.n_char2id.get(_char, 0) for _char in n_chars]
273 |                         _text_char.append(chars)
274 |                 else:
275 |                     for word in text:
276 |                         chars = [self.char2id.get(_char, 0) for _char in word]
277 |                         _text_char.append(chars)
278 |                 _bio = [self.BIO2id.get(b, 0) for b in bio]
279 |                 TEXT_WORD.append(_text_word)
280 |                 TEXT_CHAR.append(_text_char)  # [batch, word, char]: padded below, first to maxlen_sentence, then to maxlen_word
281 | BIO.append(_bio)
282 | if len(TEXT_WORD) == self.batch_size or idx == index[-1]:
283 | TEXT_WORD = pad_sequences(TEXT_WORD,maxlen=self.maxlen_sentence,padding='post',value=0)
284 | TEXT_CHAR = np.array(char_pad(TEXT_CHAR,self.maxlen_sentence,self.maxlen_word))
285 | BIO = pad_sequences(BIO,maxlen=self.maxlen_sentence,padding='post',value=0)
286 | yield [TEXT_WORD,TEXT_CHAR,BIO ],None
287 | TEXT_WORD,TEXT_CHAR,BIO =[],[],[]
288 |
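# Hedged sketch of how the generator is meant to be consumed (the model object and the
# exact hyper-parameters are assumptions, not taken from train.py):
#   train_gen = data_generator(train_data, char2id, n_char2id, word2id, BIO2id,
#                              maxlen_sentence, maxlen_word, is_use_n_char, batch_size=128)
#   model.fit_generator(iter(train_gen), steps_per_epoch=len(train_gen), epochs=10)
# iter(train_gen) works because __iter__ yields ([TEXT_WORD, TEXT_CHAR, BIO], None)
# batches forever, and len(train_gen) returns the number of batches per epoch.
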
289 | def _load_embed(file):
290 |     def get_coefs(word, *arr):
291 |         # keep at most the first 100 dims and cast them to float
292 |         return word, np.asarray(arr, dtype='float32')[:100]
293 |     embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(file, encoding='utf-8'))
294 |     return embeddings_index
295 |
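# _load_embed expects one whitespace-separated "token v1 v2 ... vN" line per word,
# i.e. the plain-text GloVe format, for example (values illustrative):
#   the 0.41 -0.25 0.12 ...
# Only the first 100 dimensions are kept, matching the 100-dim matrix built below.
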
296 | def _load_embedding_matrix(word_index, embedding):
297 |     embed_word_count = 0
298 |     nb_words = len(word_index)
299 |     # row 0 is the padding id; out-of-vocabulary rows keep their random init
300 |     embedding_matrix = np.random.normal(size=(nb_words + 1, 100))
301 | 
302 |     for word, i in word_index.items():
303 |         # fall back to the lowercased / title-cased form when the raw token is missing
304 |         if word not in embedding:
305 |             word = word.lower()
306 |         if word.islower() and word not in embedding:
307 |             word = word.title()
308 |         embedding_vector = embedding.get(word)
309 |         if embedding_vector is not None:
310 |             embedding_matrix[i] = embedding_vector
311 |             embed_word_count += 1
312 |     print('embedding coverage: {}'.format(embed_word_count / len(word_index)))
313 |     return embedding_matrix
314 |
315 | def get_embedding_matrix(word_index):
316 | embedding_dir = 'data/CoNLL04/glove.6B.100d.txt'
317 | embedding = _load_embed(embedding_dir)
318 | embedding_matrix = _load_embedding_matrix(word_index, embedding)
319 |
320 | return embedding_matrix
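
# Hedged sketch of how the matrix would typically be wired into a Keras embedding layer
# (models.py presumably does something similar; this is not copied from it):
#   from keras.layers import Embedding
#   embedding_matrix = get_embedding_matrix(word2id)
#   word_embed = Embedding(input_dim=embedding_matrix.shape[0],
#                          output_dim=embedding_matrix.shape[1],
#                          weights=[embedding_matrix],
#                          trainable=False,
#                          mask_zero=True)   # id 0 is the padding index throughout this file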
--------------------------------------------------------------------------------