├── .idea
│   ├── Joint-Entity-and-Relation-Extraction.iml
│   ├── modules.xml
│   └── workspace.xml
├── README.md
├── build_data.py
├── comparative_model.py
├── config
│   └── CoNLL04
│       └── bio_config
├── data
│   └── CoNLL04
│       ├── BIO2id.json
│       ├── README.md
│       ├── char2id.json
│       ├── dev.txt
│       ├── dev_me.json
│       ├── relation2id.json
│       ├── test.txt
│       ├── test_me.json
│       ├── train.txt
│       ├── train_me.json
│       ├── vecs.lc.over100freq.zip
│       └── word2id.json
├── eval.py
├── layers.py
├── models.py
├── train.py
└── utils.py
/README.md:
--------------------------------------------------------------------------------
1 | # Joint-Entity-and-Relation-Extraction
2 | Keras implementation of the papers *Joint entity recognition and relation extraction as a multi-head selection problem* and *Adversarial training for multi-context joint entity and relation extraction*.
3 |
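4 | ## Quick start
5 | 
6 | A minimal sketch of the intended workflow (paths and property names come from `config/CoNLL04/bio_config`):
7 | 
8 | 1. Run `python build_data.py` to convert the CoNLL04 splits in `data/CoNLL04/` into the `*_me.json` files and to generate the id mappings (`word2id.json`, `char2id.json`, `BIO2id.json`, ...).
9 | 2. Run `python train.py` to train the NER part (a BiLSTM with attention); the weights with the best dev F1 are saved to `save_model/ner_model.weights`, and dev/test scores are printed every epoch.
10 | 
11 | The relation (multi-head selection) part is still a TODO in `train.py`.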
--------------------------------------------------------------------------------
/build_data.py:
--------------------------------------------------------------------------------
1 | import json
2 | import numpy as np
3 | from utils import read_properties,collect_BIO2id,collect_char2id,collect_data_set,collect_word2id,readFile,collect_n_char2id
4 |
5 | config_file= read_properties('config/CoNLL04/bio_config')
6 | filename_train = config_file.getProperty("filename_train")
7 | filename_test = config_file.getProperty("filename_test")
8 | filename_dev = config_file.getProperty("filename_dev")
9 |
10 | filename_train_me = config_file.getProperty("filename_train_me")
11 | filename_test_me = config_file.getProperty("filename_test_me")
12 | filename_dev_me = config_file.getProperty("filename_dev_me")
13 |
14 | filename_char2id = config_file.getProperty("filename_char2id")
15 | filename_n_char2id = config_file.getProperty("filename_n_char2id")
16 |
17 | filename_word2id = config_file.getProperty("filename_word2id")
18 | filename_BIO2id = config_file.getProperty("filename_BIO2id")
19 | filename_relation2id = config_file.getProperty("filename_relation2id")
20 |
21 | train_data_me = collect_data_set(readFile(filename_train),filename_train_me)
22 | dev_data_me = collect_data_set(readFile(filename_dev),filename_dev_me)
23 | test_data_me = collect_data_set(readFile(filename_test),filename_test_me)
24 |
25 | collect_char2id(train_data_me+dev_data_me+test_data_me,filename_char2id)
26 | collect_n_char2id(train_data_me+dev_data_me+test_data_me,filename_n_char2id,3)
27 | collect_word2id(train_data_me+dev_data_me+test_data_me,filename_word2id)
28 | collect_BIO2id(train_data_me+dev_data_me+test_data_me,filename_BIO2id)
29 | # collect_relations2id(train_data_me+dev_data_me+test_data_me,filename_relation2id)
30 |
31 |
32 |
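33 | # Sanity-check sketch (added for illustration): each *2id.json written above stores a
34 | # [id2x, x2id] pair, like the committed data/CoNLL04/BIO2id.json; the exact ids depend
35 | # on the regenerated files, so only the structure is inspected here.
36 | id2BIO, BIO2id = json.load(open(filename_BIO2id, encoding='utf-8'))
37 | print(len(BIO2id), sorted(BIO2id))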
--------------------------------------------------------------------------------
/comparative_model.py:
--------------------------------------------------------------------------------
1 | from layers import Position_Embedding,Attention_Layer,Self_Attention_Layer,Gate_Add_Lyaer,seq_and_vec,MaskedConv1D,MaskedLSTM,MaskFlatten,MaskPermute,MaskRepeatVector
2 | from keras.models import Model
3 | from keras.layers import *
4 | import keras
5 | from keras_contrib.layers import CRF
6 | from keras_multi_head import MultiHead,MultiHeadAttention
7 | from keras_self_attention import SeqSelfAttention as self_attention
8 | from keras_pos_embd import TrigPosEmbedding
9 | from keras_position_wise_feed_forward import FeedForward
10 |
11 | class lstm_attention_model_ner_part():
12 | def __init__(self,embedding_martrix,hidden_size,
13 | nb_head,word_embed_size,char_embed_size,word_vocab_size,char_vocab_size,multi_layers,num_classes
14 | ,maxlen_sentence,maxlen_word,word_char_embed_mode='add',learning_rate = 5e-5,embedding_dropout_prob=0.1,nn_dropout_prob=0.1,optmizer='adam',
15 | is_use_char_embedding=False):
16 | """
17 |         Test the effect of self-attention on NER.
18 | """
19 | self.embedding_martrix = embedding_martrix
20 | self.num_classes = num_classes
21 | self.hidden_size = hidden_size
22 | self.nb_head = nb_head
23 | self.word_embed_size = word_embed_size
24 | self.char_embed_size = char_embed_size
25 | # self.pos_embed_size = pos_embed_size #use the add position_embedding
26 | self.word_vocab_size = word_vocab_size
27 | self.char_vocab_size = char_vocab_size
28 | # self.maxlen = maxlen
29 | self.multi_layers = multi_layers
30 | self.maxlen_sentence = maxlen_sentence
31 | self.maxlen_word = maxlen_word
32 | self.word_char_embed_mode= word_char_embed_mode
33 | self.learning_rate = learning_rate
34 | self.embedding_dropout_prob = embedding_dropout_prob
35 | self.nn_dropout_prob = nn_dropout_prob
36 | self.is_use_char_embedding = is_use_char_embedding
37 | print(multi_layers)
38 |
39 | #char_embedding_shape [batch,sentence,word,dim]
40 | def reshape_layer_1(self, char_embedding,char_embedding_shape):
41 | def reshape(char_embedding):
42 | return K.reshape(char_embedding, shape=(-1, char_embedding_shape[-2], self.char_embed_size)) #[batch*sentence,word,dim]
43 | return Lambda(reshape)(char_embedding)
44 |
45 | def reshape_layer_2(self, char_embedding,char_embedding_shape):
46 | def reshape(char_embedding):
47 | return K.reshape(char_embedding, shape=(-1, char_embedding_shape[1], self.char_embed_size)) #[batch,sentence,dim]
48 | return Lambda(reshape)(char_embedding)
49 |
50 | def model(self):
51 | word_input = Input(shape=(self.maxlen_sentence,)) #[batch,sentence]
52 | char_input = Input(shape=(self.maxlen_sentence,self.maxlen_word,)) #[batch,word,char]
53 | ner_label = Input(shape=(self.maxlen_sentence,))
54 | # relation_label = Input(shape=(self.maxlen_sentence,))
55 |
56 | mask = Lambda(lambda x: K.cast(K.greater(K.expand_dims(x, 2), 0), 'float32'))(word_input)
57 |
58 | word_embedding = Embedding(self.word_vocab_size, self.word_embed_size,mask_zero=True,weights=[self.embedding_martrix],name='word_embedding',trainable=True)(word_input) #[batch,word,embed]
59 | char_embedding = Embedding(self.char_vocab_size,self.char_embed_size,mask_zero=True,name='char_embedding',trainable=True)(char_input) #[batch,word,char,embedd]
60 |
61 | if self.embedding_dropout_prob:
62 | word_embedding = Dropout(self.embedding_dropout_prob)(word_embedding)
63 | char_embedding = Dropout(self.embedding_dropout_prob)(char_embedding)
64 |
65 | if self.is_use_char_embedding:
66 | # char_embedding maxpooling part
67 | char_embedding_shape = K.int_shape(char_embedding) # [batch,sentence,word,dim]
68 | # char_embedding_reshaped = K.reshape(char_embedding, shape=(-1, char_embedding_shape[-2],self.char_embed_size)) # [batch*sentence,word,dim of char embedding]
69 | char_embedding_reshaped = self.reshape_layer_1(char_embedding,char_embedding_shape)
70 | char_lstm = Bidirectional(MaskedLSTM(units=self.char_embed_size // 2, return_sequences=True, name='char_lstm_layer'))(
71 | char_embedding_reshaped)
72 | attention = TimeDistributed(Dense(1, activation='tanh'))(char_lstm)
73 | attention = MaskFlatten()(attention)
74 | attention = Activation('softmax')(attention)
75 | attention = MaskRepeatVector(self.char_embed_size)(attention)
76 | attention = MaskPermute([2, 1])(attention)
77 | sent_representation = multiply([char_lstm, attention])
78 | attention = Lambda(lambda xin: K.sum(xin, axis=1))(sent_representation)
79 |
80 | # char_maxpool = GlobalMaxPooling1D(char_lstm) # [batch*sentence,hidden_size]
81 | # char_att = Attention_Layer()(char_lstm) # [batch*sentence,hidden_size]
82 | # char_embedding = K.reshape(char_maxpool, shape=[-1, char_embedding_shape[1],
83 | # self.hidden_size]) # [batch,sentence,hidden_size]
84 | # char_embedding = K.reshape(attention, shape=[-1, char_embedding_shape[-1], self.char_embed_size]) # [batch,sentence,hidden_size]
85 | char_embedding = self.reshape_layer_2(attention,char_embedding_shape)
86 | if self.word_char_embed_mode == 'concate':
87 | embedding = Concatenate(axis=-1)([word_embedding,char_embedding])
88 | else :
89 | embedding = Gate_Add_Lyaer()([word_embedding,char_embedding])
90 | # pass
91 | else:
92 | embedding = word_embedding
93 | #multi-layers self-attention for ner pred
94 | if self.embedding_dropout_prob:
95 | embedding = Dropout(self.embedding_dropout_prob)(embedding)
96 |
97 |
98 | # part1 , multi-self-attentionblock, (CNN/LSTM/FNN+self-attention)
99 | lstm = Bidirectional(MaskedLSTM(units=self.hidden_size // 2, return_sequences=True))(embedding)
100 | attention = TimeDistributed(Dense(1, activation='tanh'))(lstm)
101 | attention = MaskFlatten()(attention)
102 | attention = Activation('softmax')(attention)
103 | attention = MaskRepeatVector(self.hidden_size)(attention)
104 | attention = MaskPermute([2, 1])(attention)
105 | sent_representation = multiply([lstm, attention])
106 | attention = Lambda(lambda xin: K.sum(xin, axis=1))(sent_representation)
107 | # lstm_attention = Lambda(seq_and_vec, output_shape=(None, self.hidden_size * 2))(
108 | #     [lstm, attention]) # [consider plain addition here, or gated addition]
109 | attention = MaskRepeatVector(self.maxlen_sentence)(attention) #[batch,sentence,hidden_size]
110 | lstm = Gate_Add_Lyaer()([lstm,attention])
111 | if self.nn_dropout_prob:
112 | lstm = Dropout(self.nn_dropout_prob)(lstm)
113 |
114 | lstm_attention = MaskedConv1D(filters=self.hidden_size,kernel_size=3,activation='relu',padding='same')(lstm)
115 | bio_pred = Dense(self.num_classes, activation='softmax')(lstm_attention)
116 | pred_model =Model([word_input, char_input], bio_pred)
117 | #part2 multi-head selection for relation classification
118 | train_model = Model([word_input, char_input, ner_label], bio_pred)
119 |
120 | loss = K.sparse_categorical_crossentropy(ner_label, bio_pred)
121 | loss = K.sum(loss * mask[:, :, 0]) / K.sum(mask)
122 |
123 | train_model.summary()
124 | train_model.add_loss(loss)
125 | train_model.compile(keras.optimizers.adam(lr=self.learning_rate))
126 |
127 | return train_model,pred_model
128 |
129 |
130 | class lstm_model_ner_part():
131 | def __init__(self,embedding_martrix,hidden_size,
132 | nb_head,word_embed_size,char_embed_size,word_vocab_size,char_vocab_size,multi_layers,num_classes
133 | ,maxlen_sentence,maxlen_word,word_char_embed_mode='add',learning_rate = 5e-5,embedding_dropout_prob=0.1,nn_dropout_prob=0.1,optmizer='adam',
134 | is_use_char_embedding=False):
135 | """
136 |         A plain BiLSTM baseline for the NER part, for comparison with the self-attention variant.
137 | """
138 | self.embedding_martrix = embedding_martrix
139 | self.num_classes = num_classes
140 | self.hidden_size = hidden_size
141 | self.nb_head = nb_head
142 | self.word_embed_size = word_embed_size
143 | self.char_embed_size = char_embed_size
144 | # self.pos_embed_size = pos_embed_size #use the add position_embedding
145 | self.word_vocab_size = word_vocab_size
146 | self.char_vocab_size = char_vocab_size
147 | # self.maxlen = maxlen
148 | self.multi_layers = multi_layers
149 | self.maxlen_sentence = maxlen_sentence
150 | self.maxlen_word = maxlen_word
151 | self.word_char_embed_mode= word_char_embed_mode
152 | self.learning_rate = learning_rate
153 | self.embedding_dropout_prob = embedding_dropout_prob
154 | self.nn_dropout_prob = nn_dropout_prob
155 | self.is_use_char_embedding = is_use_char_embedding
156 | print(multi_layers)
157 |
158 | #char_embedding_shape [batch,sentence,word,dim]
159 | def reshape_layer_1(self, char_embedding,char_embedding_shape):
160 | def reshape(char_embedding):
161 | return K.reshape(char_embedding, shape=(-1, char_embedding_shape[-2], self.char_embed_size)) #[batch*sentence,word,dim]
162 | return Lambda(reshape)(char_embedding)
163 |
164 | def reshape_layer_2(self, char_embedding,char_embedding_shape):
165 | def reshape(char_embedding):
166 | return K.reshape(char_embedding, shape=(-1, char_embedding_shape[1], self.char_embed_size)) #[batch,sentence,dim]
167 | return Lambda(reshape)(char_embedding)
168 |
169 | def model(self):
170 | word_input = Input(shape=(self.maxlen_sentence,)) #[batch,sentence]
171 | char_input = Input(shape=(self.maxlen_sentence,self.maxlen_word,)) #[batch,word,char]
172 | ner_label = Input(shape=(self.maxlen_sentence,))
173 | # relation_label = Input(shape=self.maxlen_sentence,) #[batch,sentence,n_classes]
174 | mask = Lambda(lambda x: K.cast(K.greater(K.expand_dims(x, 2), 0), 'float32'))(word_input)
175 |
176 | word_embedding = Embedding(self.word_vocab_size, self.word_embed_size,mask_zero=True,weights=[self.embedding_martrix],name='word_embedding',trainable=True)(word_input) #[batch,word,embed]
177 | char_embedding = Embedding(self.char_vocab_size,self.char_embed_size,mask_zero=True,name='char_embedding',trainable=True)(char_input) #[batch,word,char,embedd]
178 |
179 | if self.embedding_dropout_prob:
180 | word_embedding = Dropout(self.embedding_dropout_prob)(word_embedding)
181 | char_embedding = Dropout(self.embedding_dropout_prob)(char_embedding)
182 |
183 | if self.is_use_char_embedding:
184 | # char_embedding maxpooling part
185 | char_embedding_shape = K.int_shape(char_embedding) # [batch,sentence,word,dim]
186 | # char_embedding_reshaped = K.reshape(char_embedding, shape=(-1, char_embedding_shape[-2],self.char_embed_size)) # [batch*sentence,word,dim of char embedding]
187 | char_embedding_reshaped = self.reshape_layer_1(char_embedding,char_embedding_shape)
188 | char_lstm = Bidirectional(MaskedLSTM(units=self.char_embed_size // 2, return_sequences=True, name='char_lstm_layer'))(
189 | char_embedding_reshaped)
190 |
191 | attention = TimeDistributed(Dense(1, activation='tanh'))(char_lstm)
192 | attention = MaskFlatten()(attention)
193 | attention = Activation('softmax')(attention)
194 | attention = MaskRepeatVector(self.char_embed_size)(attention)
195 | attention = MaskPermute([2, 1])(attention)
196 | sent_representation = multiply([char_lstm, attention])
197 | attention = Lambda(lambda xin: K.sum(xin, axis=1))(sent_representation)
198 |
199 | char_embedding = self.reshape_layer_2(attention,char_embedding_shape)
200 | if self.word_char_embed_mode == 'concate':
201 | embedding = Concatenate(axis=-1)([word_embedding,char_embedding])
202 | else :
203 | embedding = Gate_Add_Lyaer()([word_embedding,char_embedding])
204 | # pass
205 | else:
206 | embedding = word_embedding
207 | #multi-layers self-attention for ner pred
208 | if self.embedding_dropout_prob:
209 | embedding = Dropout(self.embedding_dropout_prob)(embedding)
210 |
211 | # part1 , multi-self-attentionblock, (CNN/LSTM/FNN+self-attention)
212 | lstm = Bidirectional(MaskedLSTM(units=self.hidden_size // 2, return_sequences=True), name='lstm_layer0')(embedding)
213 | if self.nn_dropout_prob:
214 | lstm = Dropout(self.nn_dropout_prob)(lstm)
215 | # # multi_lstm_layers
216 | # if self.multi_layers >= 2:
217 | # for i in range(self.multi_layers - 1):
218 | # i+=1
219 | # lstm = Bidirectional(CuDNNLSTM(self.hidden_size // 2, return_sequences=True), name='lstm_layer{}'.format(i))(lstm)
220 | # if self.nn_dropout_prob:
221 | # lstm = Dropout(self.nn_dropout_prob)(lstm)
222 | bio_pred = Dense(self.num_classes, activation='softmax')(lstm)
223 | pred_model =Model([word_input, char_input], bio_pred)
224 |
225 |
226 | train_model = Model([word_input, char_input, ner_label], bio_pred)
227 |
228 | loss = K.sparse_categorical_crossentropy(ner_label, bio_pred)
229 | # mask out the padded time steps before averaging
230 | loss = K.sum(loss * mask[:, :, 0]) / K.sum(mask)
231 | 
232 | train_model.summary()
233 | train_model.add_loss(loss)
234 | train_model.compile(keras.optimizers.adam(lr=self.learning_rate))
235 |
236 | return train_model,pred_model
237 |
238 |
239 |
--------------------------------------------------------------------------------
/config/CoNLL04/bio_config:
--------------------------------------------------------------------------------
1 | #dataset
2 | filename_dev = "data/CoNLL04/dev.txt"
3 | filename_test = "data/CoNLL04/test.txt"
4 | filename_train = "data/CoNLL04/train.txt"
5 |
6 | filename_train_me = "data/CoNLL04/train_me.json"
7 | filename_dev_me = "data/CoNLL04/dev_me.json"
8 | filename_test_me = "data/CoNLL04/test_me.json"
9 |
10 | filename_char2id = "data/CoNLL04/char2id.json"
11 | filename_word2id = "data/CoNLL04/word2id.json"
12 | filename_n_char2id = "data/CoNLL04/n_char2id.json"
13 | filename_BIO2id = "data/CoNLL04/BIO2id.json"
14 | filename_relation2id = "data/CoNLL04/relation2id.json"
15 |
16 | #training
17 | is_use_n_char=False
18 | epochs = 150
19 | batch_size = 128
20 | optimizer = Adam
21 | save_model_file = "save_model/ner_model.weights"
22 |
23 | #hyperparameters
24 | hidden_size = 128
25 | nb_head = 8
26 | word_embed_size = 100
27 | char_embed_size = 30
28 | maxlen_sentence = 100
29 | maxlen_word = 25
30 | multi_layers = 4
31 | embedding_dropout_prob = 0.25
32 | nn_dropout_prob = 0.25
33 | learning_rate = 1e-3
34 | is_use_char_embedding = True
35 |
36 |
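37 | # Note (added): utils.read_properties returns every value as a string, so the training
38 | # code casts numeric properties itself, e.g. in train.py:
39 | #   hidden_size   = int(config_file.getProperty('hidden_size'))
40 | #   learning_rate = float(config_file.getProperty('learning_rate'))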
--------------------------------------------------------------------------------
/data/CoNLL04/BIO2id.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "1": "O",
4 | "2": "B-Loc",
5 | "3": "I-Loc",
6 | "4": "B-Peop",
7 | "5": "I-Peop",
8 | "6": "B-Org",
9 | "7": "I-Org",
10 | "8": "B-Other",
11 | "9": "I-Other"
12 | },
13 | {
14 | "O": 1,
15 | "B-Loc": 2,
16 | "I-Loc": 3,
17 | "B-Peop": 4,
18 | "I-Peop": 5,
19 | "B-Org": 6,
20 | "I-Org": 7,
21 | "B-Other": 8,
22 | "I-Other": 9
23 | }
24 | ]
--------------------------------------------------------------------------------
/data/CoNLL04/README.md:
--------------------------------------------------------------------------------
1 | We use the splits defined in previous work for the CoNLL04 dataset.
2 |
3 | See our multi-head selection papers for more info.
4 |
5 | The format of the input files has been adapted to the input format of our head selection model.
6 |
7 | The original link to the dataset can be found [here](http://cogcomp.org/Data/ER/conll04.corp).
8 |
9 | The data was taken from https://github.com/bekou/multihead_joint_entity_relation_extraction/tree/master/data/CoNLL04.
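10 | 
11 | Each data line carries five tab-separated fields, read by `utils.readFile` in this repository as `token_id`, `token`, `BIO`, `relation`, `head`; sentences are separated by lines whose first field contains `#doc`.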
--------------------------------------------------------------------------------
/data/CoNLL04/char2id.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "1": "V",
4 | "2": "e",
5 | "3": "r",
6 | "4": "y",
7 | "5": "s",
8 | "6": "t",
9 | "7": "o",
10 | "8": "n",
11 | "9": "g",
12 | "10": "u",
13 | "11": "h",
14 | "12": "w",
15 | "13": "i",
16 | "14": "d",
17 | "15": "a",
18 | "16": "c",
19 | "17": "m",
20 | "18": "p",
21 | "19": ",",
22 | "20": "5",
23 | "21": "0",
24 | "22": "-",
25 | "23": "7",
26 | "24": "G",
27 | "25": "I",
28 | "26": "l",
29 | "27": "S",
30 | "28": ".",
31 | "29": "A",
32 | "30": "b",
33 | "31": "v",
34 | "32": "M",
35 | "33": "R",
36 | "34": "W",
37 | "35": "k",
38 | "36": "f",
39 | "37": "E",
40 | "38": "'",
41 | "39": "H",
42 | "40": "C",
43 | "41": "(",
44 | "42": "T",
45 | "43": "x",
46 | "44": ")",
47 | "45": "J",
48 | "46": "2",
49 | "47": "K",
50 | "48": "Y",
51 | "49": "O",
52 | "50": "D",
53 | "51": "N",
54 | "52": "F",
55 | "53": "j",
56 | "54": "z",
57 | "55": "B",
58 | "56": "1",
59 | "57": "6",
60 | "58": "3",
61 | "59": "9",
62 | "60": "4",
63 | "61": "P",
64 | "62": "`",
65 | "63": "L",
66 | "64": "q",
67 | "65": "U",
68 | "66": "$",
69 | "67": "8",
70 | "68": ":",
71 | "69": "X",
72 | "70": "\"",
73 | "71": "Q",
74 | "72": ";",
75 | "73": "Z",
76 | "74": "_",
77 | "75": "!",
78 | "76": "?",
79 | "77": "&"
80 | },
81 | {
82 | "V": 1,
83 | "e": 2,
84 | "r": 3,
85 | "y": 4,
86 | "s": 5,
87 | "t": 6,
88 | "o": 7,
89 | "n": 8,
90 | "g": 9,
91 | "u": 10,
92 | "h": 11,
93 | "w": 12,
94 | "i": 13,
95 | "d": 14,
96 | "a": 15,
97 | "c": 16,
98 | "m": 17,
99 | "p": 18,
100 | ",": 19,
101 | "5": 20,
102 | "0": 21,
103 | "-": 22,
104 | "7": 23,
105 | "G": 24,
106 | "I": 25,
107 | "l": 26,
108 | "S": 27,
109 | ".": 28,
110 | "A": 29,
111 | "b": 30,
112 | "v": 31,
113 | "M": 32,
114 | "R": 33,
115 | "W": 34,
116 | "k": 35,
117 | "f": 36,
118 | "E": 37,
119 | "'": 38,
120 | "H": 39,
121 | "C": 40,
122 | "(": 41,
123 | "T": 42,
124 | "x": 43,
125 | ")": 44,
126 | "J": 45,
127 | "2": 46,
128 | "K": 47,
129 | "Y": 48,
130 | "O": 49,
131 | "D": 50,
132 | "N": 51,
133 | "F": 52,
134 | "j": 53,
135 | "z": 54,
136 | "B": 55,
137 | "1": 56,
138 | "6": 57,
139 | "3": 58,
140 | "9": 59,
141 | "4": 60,
142 | "P": 61,
143 | "`": 62,
144 | "L": 63,
145 | "q": 64,
146 | "U": 65,
147 | "$": 66,
148 | "8": 67,
149 | ":": 68,
150 | "X": 69,
151 | "\"": 70,
152 | "Q": 71,
153 | ";": 72,
154 | "Z": 73,
155 | "_": 74,
156 | "!": 75,
157 | "?": 76,
158 | "&": 77
159 | }
160 | ]
--------------------------------------------------------------------------------
/data/CoNLL04/relation2id.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "1": "O",
4 | "2": "B-Loc",
5 | "3": "I-Loc",
6 | "4": "B-Peop",
7 | "5": "I-Peop",
8 | "6": "B-Org",
9 | "7": "I-Org",
10 | "8": "B-Other",
11 | "9": "I-Other"
12 | },
13 | {
14 | "O": 1,
15 | "B-Loc": 2,
16 | "I-Loc": 3,
17 | "B-Peop": 4,
18 | "I-Peop": 5,
19 | "B-Org": 6,
20 | "I-Org": 7,
21 | "B-Other": 8,
22 | "I-Other": 9
23 | }
24 | ]
--------------------------------------------------------------------------------
/data/CoNLL04/vecs.lc.over100freq.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cdjasonj/Joint-Entity-and-Relation-Extraction/e5eb78ae5b1b9730019706c5f415a6fba61ec777/data/CoNLL04/vecs.lc.over100freq.zip
--------------------------------------------------------------------------------
/eval.py:
--------------------------------------------------------------------------------
1 | import json
2 | import utils
3 | from utils import read_properties
4 | # def NER_result_Evaluator(outputs,targets):
5 | # """
6 | #     Evaluate the F1 score of the NER results here.
7 | # :return:
8 | # """
9 | #
10 | # right,true,pred = 1e-10, 1e-10, 1e-10
11 | #
12 | # for i in range(len(outputs)):
13 | # output = outputs[i]
14 | # target = targets[i]
15 | # output = output[:len(target)]
16 | # for j in range(len(output)):
17 | # if output[j] != 0:
18 | # pred += 1
19 | # if target[j] == output[j]:
20 | # right += 1
21 | # for j in range(len(target)):
22 | # if target[j] != 0 :
23 | # true+=1
24 | # R = right/pred
25 | # P = right/true
26 | # F = (2*P*R)/(P+R)
27 | # return P,R,F
28 |
29 |
30 | def NER_result_Evaluator(outputs,targets):
31 | config_file = read_properties('config/CoNLL04/bio_config')
32 | filename_BIO2id = config_file.getProperty("filename_BIO2id")
33 | id2BIO, BIO2id = json.load(open(filename_BIO2id, encoding='utf-8'))
34 |     right, true, pred = 1e-10, 1e-10, 1e-10
35 |     for idx in range(len(outputs)):
36 |         output = outputs[idx]
37 |         target = targets[idx]
38 |         output = output[:len(target)]
39 |         output_pred = []
40 |         target_pred = []
41 |         # record the first predicted entity span: the index of its B tag plus the indices of the following I tags
42 |         for i in range(len(output)):
43 |             bio = id2BIO[str(output[i])]
44 |             if bio[0] == 'B':
45 |                 output_pred.append(i)
46 |                 for j in range(i + 1, len(output)):
47 |                     bio = id2BIO[str(output[j])]
48 |                     if bio[0] == 'I':
49 |                         output_pred.append(j)
50 |                     else:
51 |                         break
52 |                 break
53 |         # record the first gold entity span in the same way
54 |         for i in range(len(target)):
55 |             bio = id2BIO[str(target[i])]
56 |             if bio[0] == 'B':
57 |                 target_pred.append(i)
58 |                 for j in range(i + 1, len(target)):
59 |                     bio = id2BIO[str(target[j])]
60 |                     if bio[0] == 'I':
61 |                         target_pred.append(j)
62 |                     else:
63 |                         break
64 |                 break
65 | 
66 |         if output_pred:
67 |             pred += 1
68 |         if target_pred:
69 |             true += 1
70 |         if output_pred and output_pred == target_pred:
71 |             right += 1
72 | 
73 |     P = right / pred
74 |     R = right / true
75 |     F = (2 * P * R) / (P + R)
76 |     return P, R, F
77 | 
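78 | if __name__ == '__main__':
79 |     # Tiny sanity-check sketch (added for illustration). The ids follow the committed
80 |     # data/CoNLL04/BIO2id.json mapping (O=1, B-Loc=2, I-Loc=3, B-Peop=4), and the script
81 |     # must be run from the repository root so config/CoNLL04/bio_config can be found.
82 |     outputs = [[1, 2, 3, 1], [4, 1, 1, 1]]
83 |     targets = [[1, 2, 3, 1], [1, 1, 4, 1]]
84 |     print(NER_result_Evaluator(outputs, targets))  # roughly (0.5, 0.5, 0.5)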
--------------------------------------------------------------------------------
/layers.py:
--------------------------------------------------------------------------------
1 | import keras
2 | from keras import backend as K
3 | from keras.layers import *
4 |
5 |
6 | def seq_and_vec(x):
7 | """seq has shape [None, seq_len, s_size] and vec has shape [None, v_size];
8 | repeat vec seq_len times and concatenate it onto seq,
9 | giving a [None, seq_len, s_size+v_size] tensor.
10 | """
11 | seq, vec = x
12 | vec = K.expand_dims(vec, 1)
13 | vec = K.zeros_like(seq[:, :, :1]) + vec
14 | return K.concatenate([seq, vec], 2)
15 |
16 | def attention_like_tensor(x):
17 | """
18 | Broadcast the attention vector to every time step.
19 | :param x: [batch,dim]
20 | :return: [batch,sentence,dim]
21 | """
22 |
23 | class Attention_Layer(keras.layers.Layer):
24 |
25 | """
26 | dot attention for word_char_embedding
27 | q, k, v define the attention
28 | score = softmax(dot(q, k))
29 | attention = sum(score * v)
30 | """
31 |     # A minimal completion of the dot-attention sketch that was left commented out here:
32 |     # score each time step with a learned vector, softmax over time, return the weighted sum
33 |     # ([batch, time, dim] -> [batch, dim]).
34 |     def build(self, input_shape):
35 |         self.W = self.add_weight(name='W', shape=(input_shape[-1], 1), initializer='glorot_normal')
36 |         super(Attention_Layer, self).build(input_shape)
37 | 
38 |     def call(self, inputs, mask=None):
39 |         score = K.softmax(K.squeeze(K.dot(inputs, self.W), axis=-1))
40 |         return K.sum(inputs * K.expand_dims(score, axis=-1), axis=1)
40 |
41 |
42 | class Gate_Add_Lyaer(keras.layers.Layer):
43 | """
44 | gate add mechanism for word_char embedding
45 | z = sigmoid(W3 * tanh(W1 * word_embedding + W2 * char_att))
46 | word_char_embedding = z * word_embedding + (1 - z) * char_att
47 |
48 | """
49 | def __init__(self,**kwargs):
50 | """
51 |
52 | :param word_embedding: shape [batch,sentence,dim of word_embedding]
53 | :param char_att: shape [batch,sentence,dim of char_embedding]
54 | :param kwargs:
55 | """
56 | super(Gate_Add_Lyaer,self).__init__(**kwargs)
57 | self.supports_masking = True
58 |
59 | def build(self, input_shape):
60 | assert input_shape[0][2] == input_shape[1][2]
61 |
62 | self.W1 = self.add_weight(name='W1',shape=(input_shape[0][-1],input_shape[0][-1]),initializer='glorot_normal') #[dim,dim]
63 | self.W2 = self.add_weight(name='W2',shape=(input_shape[0][-1],input_shape[0][-1]),initializer='glorot_normal')
64 | self.W3 = self.add_weight(name='W3',shape=(input_shape[0][-1],input_shape[0][-1]),initializer='glorot_normal')
65 |
66 | super(Gate_Add_Lyaer, self).build(input_shape)
67 |
68 | def call(self,inputs,mask=None):
69 | # inputs[0]:word_embedding ,inputs[1]:char_embedding
70 | word_embedding_shape = K.int_shape(inputs[0]) #[batch,sentence,dim of word embedding]
71 | char_embedding_shape = K.int_shape(inputs[1]) #[batch,sentence,dim of char embedding]
72 | # word_embedding_reshaped = K.reshape(inputs[0],shape=(-1,word_embedding_shape[-1])) #[batch*sentence,dim of word embedding]
73 | # char_embedding_reshaped = K.reshape(inputs[1],shape=(-1,char_embedding_shape[-1])) #[batch*sentence, dim of char embedding]
74 | word_embedding = K.dot(inputs[0],self.W1)
75 | char_embedding = K.dot(inputs[1],self.W2)
76 | wc_tanh = K.tanh(word_embedding+char_embedding)
77 | z = K.sigmoid(K.dot(wc_tanh,self.W3))
78 | embedding = z*inputs[0]+(1-z)*inputs[1]
79 | # z = K.sigmoid(K.dot(K.tanh(K.dot(word_embedding_reshaped,self.W1) + K.dot(char_embedding_shape,self.W2)),self.W3))
80 | # embedding = z*word_embedding_reshaped + (1-z)*char_embedding_reshaped #[batch*sentence,]
81 | # embedding = K.reshape(embedding,shape=(-1,word_embedding_reshaped[1],word_embedding_reshaped[-1]))# [batch,sentecen,dim]
82 | return embedding
83 |
84 | def compute_mask(self, inputs, mask=None):
85 | return mask
86 |
87 |
88 | def compute_output_shape(self, input_shape):
89 | return (input_shape[0][0],input_shape[0][1],input_shape[0][2])
90 |
91 |
92 | class Position_Embedding(Layer):
93 |
94 | def __init__(self, size=None, mode='sum', **kwargs):
95 | self.size = size  # must be an even number
96 | self.mode = mode
97 | super(Position_Embedding, self).__init__(**kwargs)
98 |
99 | def call(self, x,mask=None):
100 | if (self.size == None) or (self.mode == 'sum'):
101 | self.size = int(x.shape[-1])
102 | batch_size, seq_len = K.shape(x)[0], K.shape(x)[1]
103 | position_j = 1. / K.pow(10000., 2 * K.arange(self.size / 2, dtype='float32') / self.size)
104 | position_j = K.expand_dims(position_j, 0)
105 | position_i = K.cumsum(K.ones_like(x[:, :, 0]), 1) - 1  # K.arange does not support variable length, so positions are generated this way
106 | position_i = K.expand_dims(position_i, 2)
107 | position_ij = K.dot(position_i, position_j)
108 | position_ij = K.concatenate([K.cos(position_ij), K.sin(position_ij)], 2)
109 | if self.mode == 'sum':
110 | return position_ij + x
111 | elif self.mode == 'concat':
112 | return K.concatenate([position_ij, x], 2)
113 |
114 | def compute_output_shape(self, input_shape):
115 | if self.mode == 'sum':
116 | return input_shape
117 | elif self.mode == 'concat':
118 | return (input_shape[0], input_shape[1], input_shape[2] + self.size)
119 |
120 |
121 | class Self_Attention_Layer(Layer):
122 | """Multi-head attention mechanism
123 | """
124 |
125 | def __init__(self, nb_head, size_per_head, **kwargs):
126 | self.nb_head = nb_head
127 | self.size_per_head = size_per_head
128 | self.out_dim = nb_head * size_per_head
129 | super(Self_Attention_Layer, self).__init__(**kwargs)
130 |
131 | def build(self, input_shape):
132 | q_in_dim = input_shape[0][-1]
133 | k_in_dim = input_shape[1][-1]
134 | v_in_dim = input_shape[2][-1]
135 | self.q_kernel = self.add_weight(name='q_kernel',
136 | shape=(q_in_dim, self.out_dim),
137 | initializer='glorot_normal')
138 | self.k_kernel = self.add_weight(name='k_kernel',
139 | shape=(k_in_dim, self.out_dim),
140 | initializer='glorot_normal')
141 | self.v_kernel = self.add_weight(name='v_kernel',
142 | shape=(v_in_dim, self.out_dim),
143 | initializer='glorot_normal')
144 |
145 | def mask(self, x, mask, mode='mul'):
146 | if mask is None:
147 | return x
148 | else:
149 | for _ in range(K.ndim(x) - K.ndim(mask)):
150 | mask = K.expand_dims(mask, K.ndim(mask))
151 | if mode == 'mul':
152 | return x * mask
153 | else:
154 | return x - (1 - mask) * 1e10
155 |
156 | def call(self, inputs):
157 | q, k, v = inputs[:3]
158 | v_mask, q_mask = None, None
159 | if len(inputs) > 3:
160 | v_mask = inputs[3]
161 | if len(inputs) > 4:
162 | q_mask = inputs[4]
163 | # linear transformations
164 | qw = K.dot(q, self.q_kernel)
165 | kw = K.dot(k, self.k_kernel)
166 | vw = K.dot(v, self.v_kernel)
167 | # qw = Dense(self.out_dim,activation='relu')(q)
168 | # kw = Dense(self.out_dim, activation='relu')(k)
169 | # vw = Dense(self.out_dim, activation='relu')(v)
170 | # reshape to split the heads
171 | qw = K.reshape(qw, (-1, K.shape(qw)[1], self.nb_head, self.size_per_head))
172 | kw = K.reshape(kw, (-1, K.shape(kw)[1], self.nb_head, self.size_per_head))
173 | vw = K.reshape(vw, (-1, K.shape(vw)[1], self.nb_head, self.size_per_head))
174 | # permute dimensions
175 | qw = K.permute_dimensions(qw, (0, 2, 1, 3))
176 | kw = K.permute_dimensions(kw, (0, 2, 1, 3))
177 | vw = K.permute_dimensions(vw, (0, 2, 1, 3))
178 | # Attention
179 | a = K.batch_dot(qw, kw, [3, 3]) / self.size_per_head ** 0.5
180 | a = K.permute_dimensions(a, (0, 3, 2, 1))
181 | a = self.mask(a, v_mask, 'add')
182 | a = K.permute_dimensions(a, (0, 3, 2, 1))
183 | a = K.softmax(a)
184 | # produce the output
185 | o = K.batch_dot(a, vw, [3, 2])
186 | o = K.permute_dimensions(o, (0, 2, 1, 3))
187 | o = K.reshape(o, (-1, K.shape(o)[1], self.out_dim))
188 | o = self.mask(o, q_mask, 'mul')
189 | return o
190 |
191 | def compute_output_shape(self, input_shape):
192 | return (input_shape[0][0], input_shape[0][1], self.out_dim)
193 |
194 |
195 | class MaskedConv1D(keras.layers.Conv1D):
196 |
197 | def __init__(self, **kwargs):
198 | super(MaskedConv1D, self).__init__(**kwargs)
199 | self.supports_masking = True
200 |
201 | def compute_mask(self, inputs, mask=None):
202 | return mask
203 |
204 | def call(self, inputs, mask=None):
205 | if mask is not None:
206 | mask = K.cast(mask, K.floatx())
207 | inputs *= K.expand_dims(mask, axis=-1)
208 | return super(MaskedConv1D, self).call(inputs)
209 |
210 |
211 | class MaskedLSTM(keras.layers.CuDNNLSTM):
212 |
213 | def __init__(self, **kwargs):
214 | super(MaskedLSTM, self).__init__(**kwargs)
215 | self.supports_masking = True
216 |
217 | def compute_mask(self, inputs, mask=None):
218 | return mask
219 |
220 | def call(self, inputs, mask=None, training=None, initial_state=None):
221 | if mask is not None:
222 | mask = K.cast(mask, K.floatx())
223 | inputs *= K.expand_dims(mask, axis=-1)
224 | return super(MaskedLSTM, self).call(inputs)
225 |
226 |
227 | class MaskFlatten(keras.layers.Flatten):
228 |
229 | def __init__(self, **kwargs):
230 | super(MaskFlatten, self).__init__(**kwargs)
231 | self.supports_masking = True
232 |
233 | def compute_mask(self, inputs, mask=None):
234 | return mask
235 |
236 | def call(self, inputs, mask=None):
237 | # if mask is not None:
238 | # mask = K.cast(mask, K.floatx())
239 | # inputs *= K.expand_dims(mask, axis=-1)
240 | return super(MaskFlatten, self).call(inputs)  # call the parent class's call with the inputs
241 |
242 |
243 | class MaskRepeatVector(keras.layers.RepeatVector):
244 |
245 | def __init__(self, n,**kwargs):
246 | super(MaskRepeatVector, self).__init__(n,**kwargs)
247 | self.supports_masking = True
248 |
249 | def compute_mask(self, inputs, mask=None):
250 | return mask
251 |
252 | def call(self, inputs, mask=None):
253 | return super(MaskRepeatVector, self).call(inputs)
254 |
255 | class MaskPermute(keras.layers.Permute):
256 |
257 | def __init__(self, dims,**kwargs):
258 | super(MaskPermute, self).__init__(dims,**kwargs)
259 | self.supports_masking = True
260 |
261 | def compute_mask(self, inputs, mask=None):
262 | return mask
263 |
264 | def call(self, inputs, mask=None):
265 | return super(MaskPermute, self).call(inputs)
266 |
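267 | if __name__ == '__main__':
268 |     # Minimal usage sketch (added for illustration): Gate_Add_Lyaer fuses two embeddings
269 |     # of equal last dimension with the learned gate z described in its docstring.
270 |     # The shapes below are arbitrary examples, not values used elsewhere in the repository.
271 |     word_emb = Input(shape=(100, 128))              # [batch, sentence, dim]
272 |     char_emb = Input(shape=(100, 128))              # [batch, sentence, dim]
273 |     fused = Gate_Add_Lyaer()([word_emb, char_emb])  # same shape as the inputs
274 |     print(K.int_shape(fused))                       # (None, 100, 128)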
--------------------------------------------------------------------------------
/models.py:
--------------------------------------------------------------------------------
1 | import keras
2 | import numpy as np
3 | from keras.layers import *
4 | from keras.models import Model
5 | from layers import Position_Embedding,Attention_Layer,Self_Attention_Layer,Gate_Add_Lyaer,seq_and_vec,MaskedConv1D,MaskedLSTM,MaskFlatten,MaskPermute,MaskRepeatVector
6 |
7 | class word_char_lstm_model():
8 |
9 | def __init__(self, hidden_size, word_embed_size, char_embed_size, word_vocab_size,char_vocab_size, multi_layers, maxlen, maxlen_word,
10 | num_classes_part1,num_classes_part2, learning_rate=5e-5, embedding_dropout_prob=0.1,nn_dropout_prob = 0.1, optmizer='adam'):
11 | """
12 |         try the gated-add fusion for word_char_embedding; char_embedding comes from char-level attention over each word
13 |
14 | :param hidden_size:
15 | :param embed_size:
16 | :param vocab_size:
17 | :param dropout_prob:
18 | """
19 | self.num_classes_part1 = num_classes_part1
20 | self.num_classes_part2 = num_classes_part2
21 | self.hidden_size = hidden_size
22 | self.word_embed_size = word_embed_size
23 | self.char_embed_size = char_embed_size
24 | self.word_vocab_size = word_vocab_size
25 | self.char_vocab_size = char_vocab_size
26 | self.maxlen = maxlen
27 | self.maxlen_word = maxlen_word
28 | self.multi_layers = multi_layers
29 | self.learning_rate = learning_rate
30 | self.embedding_dropout_prob = embedding_dropout_prob
31 | self.nn_dropout_prob = nn_dropout_prob
32 |
33 | def model(self):
34 | """
35 |         TODO: add masking later
36 | part1 : word_embed=>LSTM*2=>attention=>dense=>outputs
37 | part2 : outputs_concat LSTM => dense=>outputs
38 | :return:
39 | """
40 |         word_input = Input(shape=(self.maxlen,))  # [batch_size,sentence]
41 |         char_input = Input(shape=(self.maxlen, self.maxlen_word,))  # [batch_size,sentence,word]
42 |         outputs_part1 = Input(shape=(self.maxlen,))
43 |         outputs_part2 = Input(shape=(self.maxlen, None,))  # [batch,sentence,sentence*rel_counts]
44 | # word_embedding_layer
45 | word_embedding = Embedding(self.word_vocab_size, self.word_embed_size, name='word_embedding')(word_input)
46 | char_embedding = Embedding(self.char_vocab_size, self.char_embed_size, name='char_embedding')(char_input) # [batch,sentence,word,dim of char embedding]
47 | if self.embedding_dropout_prob:
48 | word_embedding = Dropout(self.embedding_dropout_prob)(word_embedding)
49 | char_embedding = Dropout(self.embedding_dropout_prob)(char_embedding)
50 |
51 |         # char_embedding attention part
52 |         char_embedding_shape = K.int_shape(char_embedding)  # [batch,sentence,word,dim]
53 |         # wrap the backend reshapes in Lambda layers so the results stay Keras tensors
54 |         char_embedding_reshaped = Lambda(
55 |             lambda x: K.reshape(x, (-1, char_embedding_shape[-2], self.char_embed_size)))(char_embedding)  # [batch*sentence,word,dim of char embedding]
56 |         char_lstm = Bidirectional(CuDNNLSTM(self.hidden_size // 2, return_sequences=True, name='char_lstm_layer'))(
57 |             char_embedding_reshaped)
58 |         # char_maxpool = GlobalMaxPooling1D(char_lstm) # [batch*sentence,hidden_size]
59 |         char_att = Attention_Layer()(char_lstm)  # [batch*sentence,hidden_size]
60 |         char_embedding = Lambda(
61 |             lambda x: K.reshape(x, (-1, char_embedding_shape[1], self.hidden_size)))(char_att)  # [batch,sentence,hidden_size]
62 | # embedding = Concatenate(axis=-1)([word_embedding, char_embedding])
63 | embedding = Gate_Add_Lyaer()([word_embedding,char_embedding])
64 | # part1 , entity_pred
65 | lstm = Bidirectional(CuDNNLSTM(self.hidden_size // 2, return_sequences=True, name='lstm_layer0'))(embedding)
66 | if self.nn_dropout_prob:
67 | lstm = Dropout(self.nn_dropout_prob)(lstm)
68 | # multi_lstm_layers
69 | if self.multi_layers >= 2:
70 | for i in range(self.multi_layers - 1):
71 | lstm = Bidirectional(
72 | CuDNNLSTM(self.hidden_size // 2, return_sequences=True, name='lstm_layer{}'.format(i + 1)))(lstm)
73 | if self.nn_dropout_prob:
74 | lstm = Dropout(self.nn_dropout_prob)(lstm)
75 |
76 | attention = TimeDistributed(Dense(1, activation='tanh'))(lstm)
77 | attention = Flatten()(attention)
78 | attention = Activation('softmax')(attention)
79 | attention = RepeatVector(self.hidden_size)(attention)
80 | attention = Permute([2, 1])(attention)
81 | sent_representation = multiply([lstm, attention])
82 | attention = Lambda(lambda xin: K.sum(xin, axis=1))(sent_representation)
83 | lstm_attention = Lambda(seq_and_vec, output_shape=(None, self.hidden_size * 2))(
84 |             [lstm, attention])  # [consider plain addition here, or gated addition]
85 | entity_pred = Dense(self.num_classes_part1, activation='softmax')(lstm_attention)
86 |         entity_model = Model([word_input, char_input], [entity_pred])
87 | 
88 |         # part2 multi-head selection for relation classification
89 |         h = Concatenate(axis=-1)([lstm, entity_pred])
90 |         multi_head_selection_pred = Dense(self.num_classes_part2, activation='sigmoid')(h)  # [batch_size,sentence,num_classes_part2]
91 |         relation_model = Model([word_input, char_input], [multi_head_selection_pred])
92 |         train_model = Model([word_input, char_input, outputs_part1, outputs_part2], [multi_head_selection_pred])
93 |
94 | part1_loss = K.sparse_categorical_crossentropy(outputs_part1, entity_pred)
95 | part2_loss = K.binary_crossentropy(outputs_part2, multi_head_selection_pred)
96 | part1_loss = K.mean(part1_loss)
97 | part2_loss = K.mean(part2_loss)
98 |
99 | train_model.add_loss(part1_loss + part2_loss)
100 |         train_model.compile(keras.optimizers.adam(lr=self.learning_rate))
101 |
102 | return entity_model, relation_model, train_model
103 |
104 |
105 | class lstm_attention_model_ner_part():
106 | def __init__(self,embedding_martrix,hidden_size,
107 | nb_head,word_embed_size,char_embed_size,word_vocab_size,char_vocab_size,multi_layers,num_classes
108 | ,maxlen_sentence,maxlen_word,word_char_embed_mode='add',learning_rate = 5e-5,embedding_dropout_prob=0.1,nn_dropout_prob=0.1,optmizer='adam',
109 | is_use_char_embedding=False):
110 | """
111 |         Test the effect of self-attention on NER.
112 | """
113 | self.embedding_martrix = embedding_martrix
114 | self.num_classes = num_classes
115 | self.hidden_size = hidden_size
116 | self.nb_head = nb_head
117 | self.word_embed_size = word_embed_size
118 | self.char_embed_size = char_embed_size
119 | # self.pos_embed_size = pos_embed_size #use the add position_embedding
120 | self.word_vocab_size = word_vocab_size
121 | self.char_vocab_size = char_vocab_size
122 | # self.maxlen = maxlen
123 | self.multi_layers = multi_layers
124 | self.maxlen_sentence = maxlen_sentence
125 | self.maxlen_word = maxlen_word
126 | self.word_char_embed_mode= word_char_embed_mode
127 | self.learning_rate = learning_rate
128 | self.embedding_dropout_prob = embedding_dropout_prob
129 | self.nn_dropout_prob = nn_dropout_prob
130 | self.is_use_char_embedding = is_use_char_embedding
131 | print(multi_layers)
132 |
133 | #char_embedding_shape [batch,sentence,word,dim]
134 | def reshape_layer_1(self, char_embedding,char_embedding_shape):
135 | def reshape(char_embedding):
136 | return K.reshape(char_embedding, shape=(-1, char_embedding_shape[-2], self.char_embed_size)) #[batch*sentence,word,dim]
137 | return Lambda(reshape)(char_embedding)
138 |
139 | def reshape_layer_2(self, char_embedding,char_embedding_shape):
140 | def reshape(char_embedding):
141 | return K.reshape(char_embedding, shape=(-1, char_embedding_shape[1], self.char_embed_size)) #[batch,sentence,dim]
142 | return Lambda(reshape)(char_embedding)
143 |
144 | def model(self):
145 |         word_input = Input(shape=(self.maxlen_sentence,)) #[batch,sentence]
146 | char_input = Input(shape=(self.maxlen_sentence,self.maxlen_word,)) #[batch,word,char]
147 | ner_label = Input(shape=(self.maxlen_sentence,))
148 |
149 | mask = Lambda(lambda x: K.cast(K.greater(K.expand_dims(x, 2), 0), 'float32'))(word_input)
150 |
151 | word_embedding = Embedding(self.word_vocab_size, self.word_embed_size,weights=[self.embedding_martrix],name='word_embedding',trainable=True)(word_input) #[batch,word,embed]
152 | char_embedding = Embedding(self.char_vocab_size,self.char_embed_size,name='char_embedding',trainable=True)(char_input) #[batch,word,char,embedd]
153 |
154 | if self.embedding_dropout_prob:
155 | word_embedding = Dropout(self.embedding_dropout_prob)(word_embedding)
156 | char_embedding = Dropout(self.embedding_dropout_prob)(char_embedding)
157 |
158 | if self.is_use_char_embedding:
159 | # char_embedding maxpooling part
160 | char_embedding_shape = K.int_shape(char_embedding) # [batch,sentence,word,dim]
161 | # char_embedding_reshaped = K.reshape(char_embedding, shape=(-1, char_embedding_shape[-2],self.char_embed_size)) # [batch*sentence,word,dim of char embedding]
162 | char_embedding_reshaped = self.reshape_layer_1(char_embedding,char_embedding_shape)
163 | char_lstm = Bidirectional(MaskedLSTM(units=self.char_embed_size // 2, return_sequences=True, name='char_lstm_layer'))(
164 | char_embedding_reshaped)
165 |
166 | attention = TimeDistributed(Dense(1, activation='tanh'))(char_lstm)
167 | attention = MaskFlatten()(attention)
168 | attention = Activation('softmax')(attention)
169 | attention = MaskRepeatVector(self.char_embed_size)(attention)
170 | attention = MaskPermute([2, 1])(attention)
171 | sent_representation = multiply([char_lstm, attention])
172 | attention = Lambda(lambda xin: K.sum(xin, axis=1))(sent_representation)
173 |
174 | # char_maxpool = GlobalMaxPooling1D(char_lstm) # [batch*sentence,hidden_size]
175 | # char_att = Attention_Layer()(char_lstm) # [batch*sentence,hidden_size]
176 | # char_embedding = K.reshape(char_maxpool, shape=[-1, char_embedding_shape[1],
177 | # self.hidden_size]) # [batch,sentence,hidden_size]
178 | # char_embedding = K.reshape(attention, shape=[-1, char_embedding_shape[-1], self.char_embed_size]) # [batch,sentence,hidden_size]
179 | char_embedding = self.reshape_layer_2(attention,char_embedding_shape)
180 | if self.word_char_embed_mode == 'concate':
181 | embedding = Concatenate(axis=-1)([word_embedding,char_embedding])
182 | else :
183 | embedding = Gate_Add_Lyaer()([word_embedding,char_embedding])
184 | # pass
185 | else:
186 | embedding = word_embedding
187 | #multi-layers self-attention for ner pred
188 | if self.embedding_dropout_prob:
189 | embedding = Dropout(self.embedding_dropout_prob)(embedding)
190 |
191 | # part1 , multi-self-attentionblock, (CNN/LSTM/FNN+self-attention)
192 | lstm = Bidirectional(MaskedLSTM(units=self.hidden_size // 2, return_sequences=True), name='lstm_layer0')(embedding)
193 | if self.nn_dropout_prob:
194 | lstm = Dropout(self.nn_dropout_prob)(lstm)
195 | # # multi_lstm_layers
196 | # if self.multi_layers >= 2:
197 | # for i in range(self.multi_layers - 1):
198 | # i+=1
199 | # lstm = Bidirectional(CuDNNLSTM(self.hidden_size // 2, return_sequences=True), name='lstm_layer{}'.format(i))(lstm)
200 | # if self.nn_dropout_prob:
201 | # lstm = Dropout(self.nn_dropout_prob)(lstm)
202 |
203 | attention = TimeDistributed(Dense(1, activation='tanh'))(lstm)
204 | #
205 | attention = MaskFlatten()(attention)
206 | attention = Activation('softmax')(attention)
207 | attention = MaskRepeatVector(self.hidden_size)(attention)
208 | attention = MaskPermute([2, 1])(attention)
209 | sent_representation = multiply([lstm, attention])
210 | attention = Lambda(lambda xin: K.sum(xin, axis=1))(sent_representation)
211 | lstm_attention = Lambda(seq_and_vec, output_shape=(None, self.hidden_size * 2))(
212 |             [lstm, attention])  # [consider plain addition here, or gated addition]
213 | lstm_attention = MaskedConv1D(filters=self.hidden_size,kernel_size=3,activation='relu',padding='same')(lstm_attention)
214 |
215 | bio_pred = Dense(self.num_classes, activation='softmax')(lstm_attention)
216 | pred_model =Model([word_input, char_input], bio_pred)
217 | train_model = Model([word_input, char_input, ner_label], bio_pred)
218 |
219 |         loss = K.sparse_categorical_crossentropy(ner_label, bio_pred)
220 |         # mask out the padded time steps before averaging
221 |         loss = K.sum(loss * mask[:, :, 0]) / K.sum(mask)
222 | 
223 | train_model.summary()
224 | train_model.add_loss(loss)
225 | train_model.compile(keras.optimizers.adam(lr=self.learning_rate))
226 |
227 | return train_model,pred_model
228 |
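229 | if __name__ == '__main__':
230 |     # Smoke-test sketch (added for illustration): build the attention NER model with a
231 |     # random embedding matrix. The sizes below are arbitrary stand-ins rather than the
232 |     # values in config/CoNLL04/bio_config, and actually running the model needs a GPU
233 |     # because MaskedLSTM wraps CuDNNLSTM.
234 |     dummy_embeddings = np.random.randn(1000, 100)
235 |     builder = lstm_attention_model_ner_part(
236 |         dummy_embeddings, hidden_size=128, nb_head=8, word_embed_size=100, char_embed_size=30,
237 |         word_vocab_size=1000, char_vocab_size=80, multi_layers=1, num_classes=10,
238 |         maxlen_sentence=100, maxlen_word=25, word_char_embed_mode='concate',
239 |         is_use_char_embedding=True)
240 |     train_model, pred_model = builder.model()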
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
1 | import json
2 | import numpy as np
3 | from utils import read_properties,data_generator,load_data,get_embedding_matrix
4 | from comparative_model import lstm_model_ner_part,lstm_attention_model_ner_part
5 | import keras
6 | import keras.backend as K
7 | from keras.callbacks import LearningRateScheduler
8 | from eval import NER_result_Evaluator
9 | import os
10 | os.environ["CUDA_VISIBLE_DEVICES"] = "0"
11 | config_file= read_properties('config/CoNLL04/bio_config')
12 | #datasets
13 | filename_train_me = config_file.getProperty("filename_train_me")
14 | filename_test_me = config_file.getProperty("filename_test_me")
15 | filename_dev_me = config_file.getProperty("filename_dev_me")
16 |
17 | filename_char2id = config_file.getProperty("filename_char2id")
18 | filename_word2id = config_file.getProperty("filename_word2id")
19 | filename_BIO2id = config_file.getProperty("filename_BIO2id")
20 | filename_relation2id = config_file.getProperty("filename_relation2id")
21 |
22 | #training
23 | epochs = int(config_file.getProperty("epochs"))
24 | batch_size = int(config_file.getProperty('batch_size'))
25 | model_save_file = config_file.getProperty('save_model_file')
26 | is_use_n_char = config_file.getProperty('is_use_n_char') == 'True'  # bool('False') would be True
27 |
28 | #hyperparameters
29 | is_use_char_embedding = config_file.getProperty('is_use_char_embedding') == 'True'
30 | hidden_size = int(config_file.getProperty('hidden_size'))
31 | word_embed_size = int(config_file.getProperty('word_embed_size'))
32 | char_embed_size = int(config_file.getProperty('char_embed_size'))
33 | embedding_dropout_prob = float(config_file.getProperty('embedding_dropout_prob'))
34 | nn_dropout_prob = float(config_file.getProperty('nn_dropout_prob'))
35 | multi_layers = int(config_file.getProperty('multi_layers'))
36 | nb_head = int(config_file.getProperty('nb_head'))
37 | learning_rate = float(config_file.getProperty('learning_rate'))
38 | maxlen_sentence = int(config_file.getProperty('maxlen_sentence'))
39 | maxlen_word = int(config_file.getProperty('maxlen_word'))
40 |
41 | train_data = json.load(open(filename_train_me,encoding='utf-8'))
42 | dev_data = json.load(open(filename_dev_me,encoding='utf-8'))
43 | id2char, char2id = json.load(open(filename_char2id,encoding='utf-8'))
44 | id2n_char, n_char2id = json.load(open(filename_char2id,encoding='utf-8'))
45 | id2word, word2id = json.load(open(filename_word2id,encoding='utf-8'))
46 | id2BIO,BIO2id = json.load(open(filename_BIO2id,encoding='utf-8'))
47 | # id2relation,relation2id = json.load(open(filename_relation2id,encoding='utf-8'))
48 | char_vocab_size = len(char2id) +1 # 0,padding
49 | word_vocab_size = len(word2id) +1 # 0 ,padding
50 | ner_classes_num = len(BIO2id)
51 | embedding_martrix = get_embedding_matrix(word2id)
52 |
53 | # lstm_model = lstm_model_ner_part(hidden_size, nb_head, word_embed_size, char_embed_size, word_vocab_size, char_vocab_size, multi_layers,
54 | # ner_classes_num, learning_rate, embedding_dropout_prob, nn_dropout_prob, is_use_char_embedding)
55 | #
56 | # self_att_model = self_attention_model_ner_part(hidden_size, nb_head, word_embed_size, char_embed_size, word_vocab_size, char_vocab_size, multi_layers,
57 | # ner_classes_num, learning_rate, embedding_dropout_prob, nn_dropout_prob, is_use_char_embedding)
58 | # self_att_model = self_attention_model_ner_part(embedding_martrix,hidden_size, 5, word_embed_size, char_embed_size, word_vocab_size, char_vocab_size, 5,
59 | # ner_classes_num, maxlen_sentence,maxlen_word,learning_rate, embedding_dropout_prob, nn_dropout_prob, 'adam',False)
60 |
61 | word_char_embed_mode = 'concate'
62 | # lstm_model = lstm_model_ner_part(embedding_martrix,hidden_size, nb_head, word_embed_size, char_embed_size, word_vocab_size, char_vocab_size, multi_layers,
63 | # ner_classes_num, maxlen_sentence,maxlen_word,word_char_embed_mode,learning_rate,embedding_dropout_prob,nn_dropout_prob,'adam',True)
64 | lstm_model = lstm_attention_model_ner_part(embedding_martrix,hidden_size, nb_head, word_embed_size, char_embed_size, word_vocab_size, char_vocab_size, multi_layers,
65 | ner_classes_num, maxlen_sentence,maxlen_word,word_char_embed_mode,learning_rate,embedding_dropout_prob,nn_dropout_prob,'adam',True)
66 |
67 | train_model,pred_model = lstm_model.model()
68 |
69 | #TODO only ner part now, then complete it
70 |
71 | def pred_op(mode):
72 | eval_TEXT_WORD, eval_TEXT_CHAR, true_bio = load_data(mode)
73 | ner_pred = pred_model.predict([eval_TEXT_WORD, eval_TEXT_CHAR],batch_size=800,verbose=1)#[batch,sentence,num_classe]
74 | ner_pred = np.argmax(ner_pred,axis=-1) #[batch,sentence]
75 | return ner_pred,true_bio
76 |
77 | #
78 | # def scheduler(epoch):
79 | #     # halve the learning rate every epoch
80 | #     # if epoch % 100 == 0 and epoch != 0:
81 | #     # once epoch > 3, start decaying the learning rate, halving it each time, with a floor of 2e-6
82 | # if (epoch+1) % 50 == 0:
83 | # lr = K.get_value(train_model.optimizer.lr)
84 | # lr = lr*0.5
85 | # if lr < 2e-6:
86 | # return 2e-6
87 | # else:
88 | # return lr
89 |
90 | def train_op():
91 | # reduce_lr = LearningRateScheduler(scheduler, verbose=1)
92 |     train_D = data_generator(train_data,char2id,n_char2id,word2id,BIO2id,maxlen_sentence,maxlen_word,is_use_n_char,batch_size)
93 | best_f1 = 0
94 |     for i in range(1, epochs + 1):  # epochs from the config file
95 | print(i)
96 | train_model.fit_generator(train_D.__iter__(),
97 | steps_per_epoch=len(train_D),
98 | epochs=1,
99 | # callbacks=[reduce_lr]
100 | )
101 |         # if (i) % 2 == 0:  # evaluate on dev every two epochs and save the dev results
102 | ner_pred,true_bio = pred_op('dev')
103 | P, R, F = NER_result_Evaluator(ner_pred,true_bio)
104 | if F > best_f1 :
105 | train_model.save_weights(model_save_file)
106 | best_f1 = F
107 |         print('epoch {}, dev set: precision {}, recall {}, f1 {}'.format(i, P, R, F))
108 | 
109 |         ner_pred, true_bio = pred_op('test')
110 |         P, R, F = NER_result_Evaluator(ner_pred, true_bio)
111 |         print('epoch {}, test set: precision {}, recall {}, f1 {}'.format(i, P, R, F))
112 | 
113 |         if i % 50 == 0:
114 |             ner_pred, true_bio = pred_op('train')
115 |             P, R, F = NER_result_Evaluator(ner_pred, true_bio)
116 |             print('train set: precision {}, recall {}, f1 {}'.format(P, R, F))
117 |
118 | print(best_f1)
119 | train_op()
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | import json
4 | import csv
5 | import codecs
6 | from keras.preprocessing.sequence import pad_sequences
7 | def readFile(file_name):
8 | head_id_col_vector = ['token_id', 'token', "BIO", "relation", 'head']
9 | file = pd.read_csv(file_name, names=head_id_col_vector, encoding="utf-8",
10 | engine='python', sep="\t", quoting=csv.QUOTE_NONE).as_matrix()
11 | return file
12 |
13 | def collect_data_set(file,save_file):
14 | datas = []
15 | text = []
16 | BIOS = []
17 | relations = []
18 | heads = []
19 | for i in range(file.shape[0]):
20 | if '#doc' not in file[i][0]:
21 | text.append(file[i][1])
22 | BIOS.append(file[i][2])
23 | relations.append(file[i][3])
24 | heads.append(file[i][4])
25 | else:
26 | dic = {}
27 | dic['text'] = text
28 | dic['BIOS'] = BIOS
29 | dic['relations'] = relations
30 | dic['heads'] = heads
31 | datas.append(dic)
32 |
33 | text = []
34 | BIOS = []
35 | relations = []
36 | heads = []
37 |
38 | with codecs.open(save_file, 'w', encoding='utf-8') as f:
39 | json.dump(datas, f, indent=4, ensure_ascii=False)
40 | return datas
41 |
42 | def collect_char2id(datasets,save_file):
43 | chars = {}
44 | for data in datasets:
45 | for word in data['text']:
46 | for char in word:
47 | chars[char] = chars.get(char, 0) + 1
48 | id2char = {i+1:j for i,j in enumerate(chars)} # padding: 0
49 | char2id = {j:i for i,j in id2char.items()}
50 | with codecs.open(save_file, 'w', encoding='utf-8') as f:
51 | json.dump([id2char, char2id], f, indent=4, ensure_ascii=False)
52 |
53 | def collect_n_char2id(datasets,save_file,n):
54 | chars = {}
55 | for data in datasets:
56 | for word in data['text']:
57 | n_chars = n_char(word,n)
58 | for _n_char in n_chars:
59 | chars[_n_char] = chars.get(_n_char, 0) + 1
60 | id2char = {i+1:j for i,j in enumerate(chars)} # padding: 0
61 | char2id = {j:i for i,j in id2char.items()}
62 | with codecs.open(save_file, 'w', encoding='utf-8') as f:
63 | json.dump([id2char, char2id], f, indent=4, ensure_ascii=False)
64 |
65 | def collect_word2id(datasets,save_file):
66 | words = {}
67 | for data in datasets:
68 | for word in data['text']:
69 | words[word] = words.get(word,0)+1
70 | id2word = {i+1:j for i,j in enumerate(words)} #padding:0
71 | word2id = {j:i for i,j in id2word.items()}
72 | with codecs.open(save_file, 'w', encoding='utf-8') as f:
73 | json.dump([id2word, word2id], f, indent=4, ensure_ascii=False)
74 |
75 | def collect_BIO2id(datasets,save_file):
76 | BIOs = {}
77 | for data in datasets:
78 | for bio in data['BIOS']:
79 | if bio != 'O':
80 | BIOs[bio] = BIOs.get(bio,0) +1
81 |
82 | id2BIO = {i+1:j for i,j in enumerate(BIOs)} #padding:0
83 | id2BIO[0] = 'O'
84 | BIO2id = {j:i for i,j in id2BIO.items()}
85 | with codecs.open(save_file, 'w', encoding='utf-8') as f:
86 | json.dump([id2BIO, BIO2id], f, indent=4, ensure_ascii=False)
87 |
88 | # def collect_relations2id(datasets,save_file):
89 | # BIOs = {}
90 | # for data in datasets:
91 | # for bio in data['BIOS']:
92 | # BIOs[bio] = BIOs.get(bio,0) +1
93 | # id2BIO = {i+1:j for i,j in enumerate(BIOs)} #padding:0
94 | # BIO2id = {j:i for i,j in id2BIO.items()}
95 | # with codecs.open(save_file, 'w', encoding='utf-8') as f:
96 | # json.dump([id2BIO, BIO2id], f, indent=4, ensure_ascii=False)
97 |
98 |
99 | # bug: this only returns values as strings; rewrite it later
100 | class read_properties:
101 | def __init__(self,filepath, sep='=', comment_char='#'):
102 | """Read the file passed as parameter as a properties file."""
103 | self.props = {}
104 | #print filepath
105 | with open(filepath, "rt") as f:
106 | for line in f:
107 | #print line
108 | l = line.strip()
109 | if l and not l.startswith(comment_char):
110 | key_value = l.split(sep)
111 | self.props[key_value[0].strip()] = key_value[1].split("#")[0].strip('" \t')
112 |
113 |
114 | def getProperty(self,propertyName):
115 | return self.props.get(propertyName)
116 |
117 |
118 | def sentence_pad(X,maxlen_sentence):
119 | #sentence_level pad for word input and bio tagging
120 | #use the maxlen of batch datas to pad the sentence level inputs
121 | """
122 |
123 | :param datas: [batch_size,None]
124 | :return: datas : [batch_size,maxlen of sentence]
125 | """
126 | # L = [len(x) for x in X]
127 | # ML = max(L)
128 | ML = maxlen_sentence
129 | return [x + [0] * (ML - len(x)) for x in X]
130 |
131 |
132 | def n_char(word,n):
133 | """
134 |     split the word into character n-grams (as produced by the code below)
135 |     n = 2
136 |     word = 'love'
137 |     ==> ['l', 'lo', 'ov', 've', 'e']
138 |     n = 3
139 |     word = 'love'
140 |     ==> ['l', 'lov', 'ove', 've', 'e']
141 | :param word:
142 | :return:
143 | """
144 | word = str(word)
145 | n_char = []
146 | n_char.append(''*(n-1) + word[0])
147 | temp = ''
148 | for index,char in enumerate(word):
149 | if index+n < len(word):
150 | temp += word[index:index+n]
151 | n_char.append(temp)
152 | temp = ''
153 | else:
154 | temp += word[index:]
155 | temp += '' * (n - len(temp))
156 | n_char.append(temp)
157 | temp = ''
158 | return n_char
159 |
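# Sanity check for n_char (matches the docstring above):
#   >>> n_char('love', 3)
#   ['l', 'lov', 'ove', 've', 'e']
#   >>> n_char('love', 2)
#   ['l', 'lo', 'ov', 've', 'e']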
160 | def char_pad(datas, maxlen_sentence, maxlen_word):
161 |     # word-level padding for the char input: every word is padded/truncated to
162 |     # maxlen_word char ids, then every sentence to maxlen_sentence words
163 |     """
164 |     :param datas: [batch_size, None, None] char ids per word per sentence
165 |     :return: [batch_size, maxlen_sentence, maxlen_word]
166 |     """
167 | new_data = []
168 | for sentence in datas:
169 | _sentence = []
170 | for word in sentence:
171 | if len(word) < maxlen_word:
172 | word+=[0]*(maxlen_word - len(word))
173 | else:
174 | word = word[:maxlen_word]
175 | _sentence.append(word)
176 |
177 | pad_word = [0]*maxlen_word
178 | if len(_sentence) < maxlen_sentence:
179 | for i in range(maxlen_sentence - len(_sentence)):
180 | _sentence.append(pad_word)
181 | else:
182 | _sentence = _sentence[:maxlen_sentence]
183 | new_data.append(_sentence)
184 | return new_data
185 |
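# Hypothetical illustration of char_pad with maxlen_sentence=3, maxlen_word=4:
#   char_pad([[[5, 2], [9, 9, 9, 9, 9]]], 3, 4)
#   ==> [[[5, 2, 0, 0],        # word padded to maxlen_word
#         [9, 9, 9, 9],        # word truncated to maxlen_word
#         [0, 0, 0, 0]]]       # sentence padded to maxlen_sentence with all-zero words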
186 | # TODO: complete the function for joint extraction
187 | def load_data(mode):
188 |     # only NER prediction for now; loads the data used at evaluation/prediction time
189 |     config_file = read_properties('config/CoNLL04/bio_config')
190 |     # property values are plain strings, so bool() over them would always be True
191 |     is_use_n_char = config_file.getProperty('is_use_n_char') in ('True', 'true', '1')
192 |     filename_char2id = config_file.getProperty("filename_char2id")
193 |     filename_n_char2id = config_file.getProperty("filename_n_char2id")
194 |     filename_word2id = config_file.getProperty("filename_word2id")
195 |     filename_BIO2id = config_file.getProperty("filename_BIO2id")
196 |     filename_relation2id = config_file.getProperty("filename_relation2id")
197 |     id2char, char2id = json.load(open(filename_char2id, encoding='utf-8'))
198 |     id2n_char, n_char2id = json.load(open(filename_n_char2id, encoding='utf-8'))
199 |     id2word, word2id = json.load(open(filename_word2id, encoding='utf-8'))
200 |     id2BIO, BIO2id = json.load(open(filename_BIO2id, encoding='utf-8'))
201 |     filename_train_me = config_file.getProperty("filename_train_me")
202 |     filename_dev_me = config_file.getProperty("filename_dev_me")
203 |     filename_test_me = config_file.getProperty("filename_test_me")
204 |     maxlen_sentence = int(config_file.getProperty('maxlen_sentence'))
205 |     maxlen_word = int(config_file.getProperty('maxlen_word'))
206 |     eval_data = []
207 |     if mode == 'dev':
208 |         eval_data = json.load(open(filename_dev_me, encoding='utf-8'))
209 |     if mode == 'test':
210 |         eval_data = json.load(open(filename_test_me, encoding='utf-8'))
211 |     if mode == 'train':
212 |         eval_data = json.load(open(filename_train_me, encoding='utf-8'))
213 | 
214 |     TEXT_WORD, TEXT_CHAR, BIO = [], [], []
215 |     for data in eval_data:
216 |         text = data['text']
217 |         bio = data['BIOS']
218 |         _text_word = [word2id.get(word, 0) for word in text]
219 |         _text_char = []  # 2 dimensions: [word, char]
220 |         if is_use_n_char:
221 |             # char features are built from the surface words, not from the word ids
222 |             for word in text:
223 |                 n_chars = n_char(word, 3)
224 |                 chars = [n_char2id.get(_char, 0) for _char in n_chars]
225 |                 _text_char.append(chars)
226 |         else:
227 |             for word in text:
228 |                 chars = [char2id.get(_char, 0) for _char in word]
229 |                 _text_char.append(chars)
230 |         _bio = [BIO2id.get(b, 0) for b in bio]
231 |         TEXT_WORD.append(_text_word)
232 |         TEXT_CHAR.append(_text_char)  # [batch, word, char]: padded twice below, first in the
233 |         # word dimension to maxlen_sentence, then in the char dimension to maxlen_word
234 |         BIO.append(_bio)
235 |     TEXT_WORD = pad_sequences(TEXT_WORD, maxlen=maxlen_sentence, padding='post', value=0)
236 |     TEXT_CHAR = np.array(char_pad(TEXT_CHAR, maxlen_sentence, maxlen_word))
237 |     # BIO = pad_sequences(BIO, maxlen=30, padding='post', value=0)
238 |     return TEXT_WORD, TEXT_CHAR, BIO
239 |
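# Hedged usage sketch for load_data (shapes depend on the config values):
#   TEXT_WORD, TEXT_CHAR, BIO = load_data('dev')
#   # TEXT_WORD: (num_sentences, maxlen_sentence) array of word ids
#   # TEXT_CHAR: (num_sentences, maxlen_sentence, maxlen_word) array of char/n-gram ids
#   # BIO: list of unpadded per-sentence tag-id lists, e.g. for eval.py to compare against
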
240 | #TODO
241 | class data_generator():
242 | def __init__(self,data,char2id,n_char2id,word2id,BIO2id,maxlen_sentence,maxlen_word,is_use_n_char,batch_size=128):
243 | self.data = data
244 | self.batch_size = batch_size
245 | self.char2id = char2id
246 | self.n_char2id = n_char2id
247 | self.word2id = word2id
248 | self.BIO2id = BIO2id
249 | self.maxlen_sentence = maxlen_sentence
250 | self.maxlen_word = maxlen_word
251 | self.is_use_n_char = is_use_n_char
252 | self.steps = len(self.data)//self.batch_size
253 | if len(self.data) % self.batch_size != 0:
254 | self.steps += 1
255 | def __len__(self):
256 | return self.steps
257 | def __iter__(self):
258 | while True :
259 | index = list(range(len(self.data)))
260 | np.random.shuffle(index)
261 | TEXT_WORD,TEXT_CHAR,BIO = [],[],[]
262 | for idx in index:
263 | _data = self.data[idx]
264 | text = _data['text']
265 | bio = _data['BIOS']
266 |                 _text_word = [self.word2id.get(word, 0) for word in text]
267 |                 _text_char = []  # 2 dimensions: [word, char]
268 |                 if self.is_use_n_char:
269 |                     # char features are built from the surface words, not from the word ids
270 |                     for word in text:
271 |                         n_chars = n_char(word, 3)
272 |                         chars = [self.n_char2id.get(_char, 0) for _char in n_chars]
273 |                         _text_char.append(chars)
274 |                 else:
275 |                     for word in text:
276 |                         chars = [self.char2id.get(_char, 0) for _char in word]
277 |                         _text_char.append(chars)
278 |                 _bio = [self.BIO2id.get(b, 0) for b in bio]
279 |                 TEXT_WORD.append(_text_word)
280 |                 TEXT_CHAR.append(_text_char)  # [batch, word, char]: padded below, first to maxlen_sentence, then to maxlen_word
281 | BIO.append(_bio)
282 | if len(TEXT_WORD) == self.batch_size or idx == index[-1]:
283 | TEXT_WORD = pad_sequences(TEXT_WORD,maxlen=self.maxlen_sentence,padding='post',value=0)
284 | TEXT_CHAR = np.array(char_pad(TEXT_CHAR,self.maxlen_sentence,self.maxlen_word))
285 | BIO = pad_sequences(BIO,maxlen=self.maxlen_sentence,padding='post',value=0)
286 | yield [TEXT_WORD,TEXT_CHAR,BIO ],None
287 | TEXT_WORD,TEXT_CHAR,BIO =[],[],[]
288 |
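# Hedged sketch of how the generator is meant to be consumed (the model object and the
# exact hyper-parameters are assumptions, not taken from train.py):
#   train_gen = data_generator(train_data, char2id, n_char2id, word2id, BIO2id,
#                              maxlen_sentence, maxlen_word, is_use_n_char, batch_size=128)
#   model.fit_generator(iter(train_gen), steps_per_epoch=len(train_gen), epochs=10)
# iter(train_gen) works because __iter__ yields ([TEXT_WORD, TEXT_CHAR, BIO], None)
# batches forever, and len(train_gen) returns the number of batches per epoch.
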
289 | def _load_embed(file):
290 |     def get_coefs(word, *arr):
291 |         # keep at most the first 100 dims and cast them to float
292 |         return word, np.asarray(arr, dtype='float32')[:100]
293 |     embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(file, encoding='utf-8'))
294 |     return embeddings_index
295 |
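# _load_embed expects one whitespace-separated "token v1 v2 ... vN" line per word,
# i.e. the plain-text GloVe format, for example (values illustrative):
#   the 0.41 -0.25 0.12 ...
# Only the first 100 dimensions are kept, matching the 100-dim matrix built below.
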
296 | def _load_embedding_matrix(word_index, embedding):
297 |     embed_word_count = 0
298 |     nb_words = len(word_index)
299 |     # row 0 is the padding id; out-of-vocabulary rows keep their random init
300 |     embedding_matrix = np.random.normal(size=(nb_words + 1, 100))
301 | 
302 |     for word, i in word_index.items():
303 |         # fall back to the lowercased / title-cased form when the raw token is missing
304 |         if word not in embedding:
305 |             word = word.lower()
306 |         if word.islower() and word not in embedding:
307 |             word = word.title()
308 |         embedding_vector = embedding.get(word)
309 |         if embedding_vector is not None:
310 |             embedding_matrix[i] = embedding_vector
311 |             embed_word_count += 1
312 |     print('embedding coverage: {}'.format(embed_word_count / len(word_index)))
313 |     return embedding_matrix
314 |
315 | def get_embedding_matrix(word_index):
316 | embedding_dir = 'data/CoNLL04/glove.6B.100d.txt'
317 | embedding = _load_embed(embedding_dir)
318 | embedding_matrix = _load_embedding_matrix(word_index, embedding)
319 |
320 | return embedding_matrix
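
# Hedged sketch of how the matrix would typically be wired into a Keras embedding layer
# (models.py presumably does something similar; this is not copied from it):
#   from keras.layers import Embedding
#   embedding_matrix = get_embedding_matrix(word2id)
#   word_embed = Embedding(input_dim=embedding_matrix.shape[0],
#                          output_dim=embedding_matrix.shape[1],
#                          weights=[embedding_matrix],
#                          trainable=False,
#                          mask_zero=True)   # id 0 is the padding index throughout this file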
--------------------------------------------------------------------------------