├── .idea
│   ├── vcs.xml
│   └── workspace.xml
├── ModelLib.py
├── MyLayer.py
├── README.md
├── config.py
├── data
│   ├── Pos_test.txt
│   └── Pos_train.txt
├── generator.py
├── preprocess.py
├── test.py
└── train.py
/ModelLib.py:
--------------------------------------------------------------------------------
1 | import keras
2 | import numpy
3 | from keras import Model
4 | from keras.layers import Embedding, Bidirectional, LSTM, \
5 | BatchNormalization, Dropout, Reshape, Conv2D, \
6 | Masking, MaxPooling2D, MaxPooling1D
7 | from keras.layers import Input, Dense, Concatenate, TimeDistributed,Permute,RepeatVector, Multiply
8 | from keras_contrib.layers import CRF
9 | import MyLayer
10 | from keras.layers.core import *
11 |
12 | def BERT_MODEL(para):
13 | # for key in para:
14 | # print key,para[key]
15 |     bert_input = Input(shape=(para["max_len"], 768,), dtype='float32', name='bert_input')  # precomputed BERT token vectors
16 |     mask = Masking()(bert_input)  # mask padded (all-zero) timesteps
17 |     repre = Dropout(para["char_dropout"])(mask)
18 |     repre = Dense(300, activation="relu")(repre)  # project 768-dim BERT vectors down to 300
19 |     repre = Bidirectional(LSTM(para["lstm_unit"], return_sequences=True, dropout=para["rnn_dropout"]))(repre)
20 |     crf = CRF(para["tag_num"], sparse_target=True)  # CRF tagging layer; targets are integer tag ids
21 |     crf_output = crf(repre)
22 |     model = Model(inputs=bert_input, outputs=crf_output)
23 | model.summary()
24 | # adam_0 = keras.optimizers.Adam(lr=0.05, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
25 | model.compile("adam", loss=crf.loss_function, metrics=[crf.accuracy])
26 | return model
27 |
28 |
29 | if __name__ == "__main__":
30 | data = numpy.ones((10, 10, 10), dtype='float32')
31 | # data_2 = numpy.ones((10, 100),dtype='float32')
32 | #
33 | # data_input = Input(shape=(100,))
34 | # data_reshape = Reshape(target_shape=(10, 10))(data_input)
35 | # weight_input = Input(shape=(10, 10))
36 | # # model.add(Embedding(input_dim=3,output_dim=5,weights=[weight],mask_zero=True))
37 | # output = Multiply(output_dim=(10,10))([data_reshape, weight_input])
38 | #
39 | # model = Model(input=[data_input,weight_input], output=output)
40 | # result = model.predict([data_2, data], batch_size=2)
41 | # model.summary()
42 |
43 | # train_x, train_y, val_x, val_y, word2id, tags, img_voc = pickle.load(open(config.data_pk, 'rb'))
44 | #
45 | # img_embed = load_img_embed(word2id)
46 | # x = numpy.array([1.0, 2.0, 3.0])
47 | # # x = x.astype(numpy.int64)
48 | # # result = tf.nn.embedding_lookup(img_embed, x)
49 | # # sess = tf.Session()
50 | # # result = sess.run(result)
51 | # # print(result)
52 | #
53 | # x_input = Input(shape=(train_x[0].shape[1],), dtype="int64")
54 | # y = MyLayer.ImageEmbeding(img_weight=img_embed, output_dim=(50, 50, 1))(x_input)
55 | # model = Model(input=x_input, output=y)
56 | # model.summary()
57 | # result = model.predict(train_x[0], batch_size=64)
58 | # print(result.shape)
59 |
--------------------------------------------------------------------------------
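
BERT_MODEL consumes precomputed token vectors rather than raw text: its only input is a float tensor of shape (max_len, 768), and because the CRF is built with sparse_target=True its targets are integer tag ids of shape (max_len, 1). Below is a minimal shape check, sketched with random arrays in place of real BERT output and a made-up tag count; it assumes the dependencies listed in the README are installed.

```python
# Illustrative shape check for ModelLib.BERT_MODEL (random data, hypothetical tag_num).
import numpy
import config
import ModelLib

para = config.para
para["tag_num"] = 30  # stand-in value; the real one is len(tags) from the preprocessing pickle
model = ModelLib.BERT_MODEL(para)

x = numpy.random.rand(4, para["max_len"], 768).astype("float32")            # fake BERT vectors
y = numpy.random.randint(0, para["tag_num"], size=(4, para["max_len"], 1))  # sparse tag ids
model.train_on_batch(x, y)
print(model.predict(x).shape)  # (4, max_len, tag_num): per-token tag scores from the CRF layer
```
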
/MyLayer.py:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | import tensorflow as tf
3 | from keras.engine import Layer
4 | from keras import backend as K
5 |
6 |
7 | def dot_product(x, kernel):
8 | """
9 | Wrapper for dot product operation, in order to be compatible with both
10 | Theano and Tensorflow
11 | Args:
12 | x (): input
13 | kernel (): weights
14 | Returns:
15 | """
16 | if K.backend() == 'tensorflow':
17 | return K.squeeze(K.dot(x, K.expand_dims(kernel)), axis=-1)
18 | else:
19 | return K.dot(x, kernel)
20 |
21 |
22 | def squash(x, axis=-1):
23 | s_squared_norm = K.sum(K.square(x), axis, keepdims=True)
24 | scale = K.sqrt(s_squared_norm + K.epsilon())
25 | return x / scale
26 |
27 | from keras.layers import merge
28 | from keras.layers.core import *
29 |
30 | import numpy as np
31 |
32 |
33 | def get_activations(model, inputs, print_shape_only=False, layer_name=None):
34 | # Documentation is available online on Github at the address below.
35 | # From: https://github.com/philipperemy/keras-visualize-activations
36 | print('----- activations -----')
37 | activations = []
38 | inp = model.input
39 | if layer_name is None:
40 | outputs = [layer.output for layer in model.layers]
41 | else:
42 | outputs = [layer.output for layer in model.layers if layer.name == layer_name] # all layer outputs
43 | funcs = [K.function([inp] + [K.learning_phase()], [out]) for out in outputs] # evaluation functions
44 | layer_outputs = [func([inputs, 1.])[0] for func in funcs]
45 | for layer_activations in layer_outputs:
46 | activations.append(layer_activations)
47 | if print_shape_only:
48 | print(layer_activations.shape)
49 | else:
50 | print(layer_activations)
51 | return activations
52 |
53 |
54 | def get_data(n, input_dim, attention_column=1):
55 | """
56 |     Data generation. x is purely random except that its value at attention_column equals the target y.
57 | In practice, the network should learn that the target = x[attention_column].
58 | Therefore, most of its attention should be focused on the value addressed by attention_column.
59 | :param n: the number of samples to retrieve.
60 | :param input_dim: the number of dimensions of each element in the series.
61 | :param attention_column: the column linked to the target. Everything else is purely random.
62 | :return: x: model inputs, y: model targets
63 | """
64 | x = np.random.standard_normal(size=(n, input_dim))
65 | y = np.random.randint(low=0, high=2, size=(n, 1))
66 | x[:, attention_column] = y[:, 0]
67 | return x, y
68 |
69 |
70 | def get_data_recurrent(n, time_steps, input_dim, attention_column=10):
71 | """
72 |     Data generation. x is purely random except that its value at attention_column equals the target y.
73 | In practice, the network should learn that the target = x[attention_column].
74 | Therefore, most of its attention should be focused on the value addressed by attention_column.
75 | :param n: the number of samples to retrieve.
76 | :param time_steps: the number of time steps of your series.
77 | :param input_dim: the number of dimensions of each element in the series.
78 | :param attention_column: the column linked to the target. Everything else is purely random.
79 | :return: x: model inputs, y: model targets
80 | """
81 | x = np.random.standard_normal(size=(n, time_steps, input_dim))
82 | y = np.random.randint(low=0, high=2, size=(n, 1))
83 | x[:, attention_column, :] = np.tile(y[:], (1, input_dim))
84 | return x, y
85 |
86 | class Multiply(Layer):  # element-wise product of two equal-shaped inputs: call([a, b]) returns a * b
87 | def __init__(self, output_dim, **kwargs):
88 | self.output_dim = output_dim
89 | super(Multiply, self).__init__(**kwargs)
90 |
91 | def call(self, x):
92 | return tf.multiply(x[0], x[1])
93 |
94 | def compute_output_shape(self, input_shape):
95 | return input_shape[0]
96 |
97 | class ImageEmbeding(Layer):  # looks up rows of a fixed image-feature table (img_weight) by token id
98 |
99 | def __init__(self, output_dim,img_weight, **kwargs):
100 | self.output_dim = output_dim
101 | self.img_weight = img_weight
102 | super(ImageEmbeding, self).__init__(**kwargs)
103 |
104 | def call(self, x):
105 | return tf.nn.embedding_lookup(self.img_weight, x)
106 |
107 | def compute_output_shape(self, input_shape):
108 | return (input_shape[0],)+self.output_dim
109 |
110 |
111 |
112 |
113 |
114 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # BERT-BiLSTM-CRF
2 | A Keras implementation of BERT-BiLSTM-CRF
3 |
4 | ## BERT setup
5 | 1. Download a pre-trained BERT model. This project uses Google's open-source Chinese BERT model:
6 |    - https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip
7 | 2. Install the BERT server and client: `pip install bert-serving-server` and `pip install bert-serving-client`. The upstream project is:
8 |    - https://github.com/hanxiao/bert-as-service
9 | 3. Start the server: open a terminal in the BERT root directory and run:
10 |    - bert-serving-start -pooling_strategy NONE -max_seq_len 144 -mask_cls_sep -model_dir chinese_L-12_H-768_A-12/ -num_worker 1
11 |
12 | ## Demo data
13 | - 2015 part-of-speech tagging dataset
14 |
15 | ## File description
16 | - preprocess.py: data preprocessing; produces the pickle file used as model input
17 | - train.py: trains the model on the training set
18 | - test.py: computes the model's F1 score on the test set
19 | - ModelLib.py: model definitions
20 | - config.py: parameter configuration
21 | ## Model training
22 | Set up BERT -> run preprocess.py -> run train.py
23 |
24 | ## Requirements
25 | - python 2.7
26 | - tensorflow-gpu 1.10.0
27 | - Keras 2.2.4
28 | - keras-contrib 2.0.8
29 |
--------------------------------------------------------------------------------
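
Before running preprocess.py it is worth checking the bert-serving setup from the README. The snippet below is an illustrative check (not part of the repository) and assumes the server from step 3 is running locally with -pooling_strategy NONE and -max_seq_len 144, so that encode() returns one 768-dim vector per token position.

```python
# Quick check that bert-as-service returns token-level vectors in the shape the model expects.
from bert_serving.client import BertClient

bc = BertClient()  # connects to a local server by default
vecs = bc.encode([["今", "天", "天", "气", "不", "错"]], is_tokenized=True)  # arbitrary example tokens
print(vecs.shape)                # (1, 144, 768) with -max_seq_len 144 and -pooling_strategy NONE

# preprocess.py and generator.py drop the [CLS] position and keep max_len = 142 token positions:
print(vecs[:, 1:142 + 1].shape)  # (1, 142, 768), the bert_input shape used by ModelLib.BERT_MODEL
```
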
/config.py:
--------------------------------------------------------------------------------
1 |
2 | para = {}
3 |
4 | para["data_pk_path"] = "./data/pku-seg.pk"
5 | para["fea_pk_path"] = "./data/pku-seg-fea.pk"
6 |
7 | para["data_pk_path"] = "./cache/nlpcc-pos.pk"
8 | para["train_path"] = "./data/Pos_train.txt"
9 | para["test_path"] = "./data/Pos_test.txt"
10 |
11 | para["model_path"] = "./model/pku/lstm-crf-embed-bert"
12 |
13 |
14 | para["img_w"] = 50
15 | para["img_h"] = 50
16 | para["embed_dim"] = 200
17 | para["unit_num"] = 200
18 | para["split_seed"] = 2018
19 | para["max_len"] = 142
20 | para["EPOCHS"] = 40
21 | para["batch_size"] = 20
22 |
23 | para["traditional_chinese"] = False
24 | para["sep"] = "\t"
25 | para["char_dropout"] = 0.5
26 | para["rnn_dropout"] = 0.5
27 | para["lstm_unit"] = 300
28 | para["REPRE_NUM"] = 128
29 |
30 | para["fea_dropout"] = 0.3
31 | para["fea_lstm_unit"] = 32
32 | para["fea_dim"] = 20
33 | para["radical_max"] = 7
34 | para["pinyin_max"] = 8
35 | para["rad_max"] = 1
36 |
37 |
38 |
39 |
--------------------------------------------------------------------------------
/generator.py:
--------------------------------------------------------------------------------
1 | from keras_preprocessing.sequence import pad_sequences
2 | import pickle
3 | from preprocess import _parse_data,get_tag,Counter
4 | import codecs
5 | import config
6 | from bert_serving.client import BertClient
7 | import numpy as np
8 |
9 | para = config.para
10 |
11 | def make_batches( size, batch_size):
12 |
13 | nb_batch = int(np.ceil(size/float(batch_size)))
14 | return [(i*batch_size, min(size, (i+1) * batch_size)) for i in range(0, nb_batch)]
15 |
16 | def bert_generator(batch_size,train_path,sep,y,Shuffle = True):
17 | index_array = np.arange(y.shape[0])
18 | if Shuffle:
19 | np.random.shuffle(index_array)
20 |
21 | data = _parse_data(codecs.open(train_path, 'r'), sep=sep)
22 | data = [[items[0] for items in sent] for sent in data]
23 | bc = BertClient() # ip address of the GPU machine
24 | # step = int(data.__len__() / batch_size) + 1
25 | batches = make_batches(y.shape[0] - 1, batch_size)
26 | while 1:
27 | for batch_index, (batch_start, batch_end) in enumerate(batches):
28 | batch_ids = index_array[batch_start:batch_end]
29 | data_batch = [data[id] for id in batch_ids]
30 | # print(data_batch)
31 |             x_batch = bc.encode(data_batch, is_tokenized=True)  # (batch, max_seq_len, 768) token-level BERT vectors
32 |             x_batch = x_batch[:, 1:para["max_len"] + 1]  # drop the [CLS] position and keep max_len tokens
33 | y_batch = y[batch_ids]
34 | yield (x_batch,y_batch)
35 |
36 | if __name__ == "__main__":
37 | # train_path = config.fold_path + "PKU/train.txt"
38 | # test_path = config.fold_path + "PKU/test.txt"
39 | # sep = " "
40 | # train_x, train_y, val_x, val_y, word2id, tags, img_embed = pickle.load(open("./data/pku-seg.pk", 'rb'))
41 | # for x,y in bert_generator(64, train_path, sep, train_y):
42 | # print x.shape,y.shape
43 | data = make_batches(10000,64)
44 | print(data)
--------------------------------------------------------------------------------
/preprocess.py:
--------------------------------------------------------------------------------
1 | # encoding:utf-8
2 |
3 | import numpy
4 | from collections import Counter
5 |
6 | from keras import Sequential
7 | from keras.layers import Embedding
8 | from keras.preprocessing.sequence import pad_sequences
9 | import config
10 | import random
11 | import pickle
12 | from skimage import io, transform
13 | import codecs
14 | from skimage.viewer import ImageViewer
15 | from bert_serving.client import BertClient
16 |
17 | para = config.para
18 |
19 | def get_char2id(train_x, id2id, maxlen):
20 | char_l = []
21 | lose = [0]*maxlen
22 | for sentence in train_x:
23 | sent_l = []
24 | for word_id in sentence:
25 | try:
26 | # print(id2id[word_id])
27 | sent_l.append(id2id[word_id])
28 |             except Exception as e:  # word_id missing from id2id; fall back to the zero vector
29 | sent_l.append(lose)
30 | char_l.append(sent_l)
31 | return char_l
32 |
33 | def cross_validation(X,Y,fold):
34 | val_X = []
35 | val_Y = []
36 | train_X = []
37 | train_Y = []
38 | step = int(X.__len__() / fold)
39 | for i in range(fold):
40 | if i != fold - 1:
41 | val_X.append(X[step * i:step * (i + 1)])
42 | val_Y.append(Y[step * i:step * (i + 1)])
43 | else:
44 | val_X.append(X[step * i:])
45 | val_Y.append(Y[step * i:])
46 | for i in range(fold):
47 | X_list = []
48 | Y_list = []
49 | for j in range(val_X.__len__()):
50 | if j != i:
51 | X_list.append(val_X[j])
52 | Y_list.append(val_Y[j])
53 | train_X.append(numpy.concatenate(X_list, axis=0))
54 | train_Y.append(numpy.concatenate(Y_list, axis=0))
55 | return train_X, train_Y, val_X, val_Y
56 |
57 | def train_test_dev_preprocess():
58 | train = _parse_data(codecs.open(para["train_path"], 'r'), sep=para["sep"])
59 | test = _parse_data(codecs.open(para["test_path"], 'r'), sep=para["sep"])
60 |     dev = _parse_data(codecs.open(para["dev_path"], 'r'), sep=para["sep"])  # requires para["dev_path"], which config.py does not define
61 | # train_len = train.__len__()
62 | print("Load dataset finish!!")
63 | dataset = train+test+dev
64 | tags = get_tag(dataset)
65 | print(tags)
66 | print(train.__len__(), test.__len__(), dev.__len__(), dataset.__len__())
67 | word_counts = Counter(row[0].lower() for sample in dataset for row in sample)
68 | vocab = [w for w, f in iter(word_counts.items()) if f >= 1]
69 | word2id = dict((w, i + 1) for i, w in enumerate(vocab))
70 |
71 | train_X, train_Y = process_data(train, word2id, tags)
72 | dev_X, dev_Y = process_data(dev, word2id, tags)
73 | test_X, test_Y = process_data(test, word2id, tags)
74 | pickle.dump((train_X, train_Y, test_X, test_Y, dev_X,dev_Y, word2id, tags), open(para["data_pk_path"], "wb"))
75 |
76 | def train_test_set_preprocess():
77 | train = _parse_data(codecs.open(para["train_path"], 'r'), sep=para["sep"])
78 | test = _parse_data(codecs.open(para["test_path"], 'r'), sep=para["sep"])
79 |
80 | # train = dic+train
81 | print("Load trainset,dataset finish!!")
82 | dataset = train+test
83 | tags = get_tag(dataset)
84 | print(tags)
85 | print(train.__len__(),test.__len__(),dataset.__len__())
86 | word_counts = Counter(row[0].lower() for sample in dataset for row in sample)
87 | vocab = [w for w, f in iter(word_counts.items()) if f >= 1]
88 | word2id = dict((w, i + 1) for i, w in enumerate(vocab))
89 | # print(word2id)
90 | train_X, train_Y = process_data(train, word2id, tags)
91 | print(tags)
92 | test_X, test_Y = process_data(test, word2id, tags)
93 | print(train_X.shape,train_Y.shape)
94 | pickle.dump((train_X, train_Y, test_X, test_Y, word2id, tags), open(para["data_pk_path"], "wb"))
95 |
96 | def load_bert_repre():
97 | train = _parse_data(codecs.open(para["train_path"], 'r'), sep=para["sep"])
98 | test = _parse_data(codecs.open(para["test_path"], 'r'), sep=para["sep"])
99 | train = [[items[0] for items in sent] for sent in train]
100 | test = [[items[0] for items in sent] for sent in test]
101 |
102 | train_x = numpy.zeros(shape=(train.__len__(),para["max_len"],768),dtype="float32")
103 | test_x = numpy.zeros(shape=(test.__len__(),para["max_len"],768),dtype="float32")
104 |
105 | bc = BertClient()
106 |
107 | step = int(train.__len__()/256)+1
108 | for i in range(step):
109 | if i != step-1:
110 | x = bc.encode(train[i*256:(i+1)*256], is_tokenized=True)
111 | x = x[:,1:para["max_len"]+1]
112 | train_x[i*256:((i+1)*256)] = x
113 | # print(train_x[i*256:(i+1)*256])
114 | else:
115 | x = bc.encode(train[i*256:],is_tokenized=True)
116 | x = x[:,1:para["max_len"]+1]
117 | train_x[i*256:] = x
118 | # print(train_x[i*256:])
119 |
120 | step = int(test.__len__() / 256) + 1
121 | # print(step)
122 | for i in range(step):
123 | if i != step - 1:
124 | x = bc.encode(test[i * 256:(i + 1) * 256], is_tokenized=True)
125 | x = x[:, 1:para["max_len"]+1]
126 | test_x[i * 256:((i + 1) * 256)] = x
127 | print(test_x[i * 256:(i + 1) * 256])
128 | else:
129 | x = bc.encode(test[i * 256:], is_tokenized=True)
130 | x = x[:, 1:para["max_len"]+1]
131 | test_x[i * 256:] = x
132 | # print(test_x[i * 256:])
133 | return train_x, test_x
134 |
135 | def load_path_bert(path,sep="\t"):
136 |
137 | test = _parse_data(codecs.open(path, 'r'), sep=sep)
138 | test = [[items[0] for items in sent] for sent in test]
139 | test_x = numpy.zeros(shape=(test.__len__(), para["max_len"], 768),dtype="float32")
140 | bc = BertClient()
141 |
142 | step = int(test.__len__() / 256) + 1
143 | print(step)
144 | for i in range(step):
145 | if i != step - 1:
146 | x = bc.encode(test[i * 256:(i + 1) * 256], is_tokenized=True)
147 | x = x[:, 1:para["max_len"]+1]
148 | test_x[i * 256:((i + 1) * 256)] = x
149 | # print(test_x[i * 256:(i + 1) * 256])
150 | else:
151 | x = bc.encode(test[i * 256:], is_tokenized=True)
152 | x = x[:, 1:para["max_len"]+1]
153 | test_x[i * 256:] = x
154 | # print(test_x[i * 256:])
155 | # pickle.dump(test_x, open("./data/bert-pku-seg.pk", "wb"))
156 | return test_x
157 |
158 | def get_tag(data):
159 | tag = []
160 | for words in data:
161 | for word_tag in words:
162 | if word_tag[1] not in tag:
163 | tag.append(word_tag[1])
164 | return tag
165 |
166 |
167 | def _parse_data(file_input,sep="\t"):
168 | rows = file_input.readlines()
169 |     rows[0] = rows[0].replace('\xef\xbb\xbf', '')  # strip a UTF-8 BOM if present
170 | items = [row.strip().split(sep) for row in rows]
171 | # print(items)
172 | max_len = 0
173 | sents = []
174 | sent = []
175 | n = 0
176 | for item in items:
177 |
178 | if item.__len__() != 1:
179 | sent.append(item)
180 | else:
181 | if sent.__len__() > para["max_len"]:
182 | n += 1
183 | split_sent = []
184 | for i, item in enumerate(sent):
185 | if item[0] in ["。",",",",","!","!","?","?", "、", ";"] and split_sent.__len__()>50:
186 | split_sent.append(item)
187 | if split_sent.__len__() < para["max_len"]:
188 | # for item in split_sent:
189 | # if item[1] != "O":
190 | # sents.append(split_sent[:])
191 | # break
192 | # print(" ".join([item[0] for item in split_sent]))
193 | sents.append(split_sent[:])
194 | # else:
195 | # for item in split_sent:
196 | # print item[0],
197 | # print ""
198 | split_sent = []
199 | else:
200 | split_sent.append(item)
201 |
202 | # if i == sent.__len__()-1 and split_sent.__len__() < config.max_len:
203 | # for item in split_sent:
204 | # sents[sents.__len__()-1].append(item)
205 | # split_sent = []
206 | # continue
207 | else:
208 | if sent.__len__() > 1:
209 | sents.append(sent[:])
210 | sent = []
211 | print ("over_maxlen_sentence_num:", n)
212 | return sents
213 |
214 |
215 | def _process_data(data, vocab, chunk_tags, maxlen=None, onehot=False):
216 | if maxlen is None:
217 | maxlen = max(len(s) for s in data)
218 | word2idx = dict((w, i+1) for i, w in enumerate(vocab))
219 | x = [[word2idx.get(w[0].lower(), 1) for w in s] for s in data] # set to (index 1) if not in vocab
220 | y_chunk = [[chunk_tags.index(w[1]) for w in s] for s in data]
221 |     x = pad_sequences(x, maxlen, padding='post', truncating='post')  # post (right) padding
222 | y_chunk = pad_sequences(y_chunk, maxlen, value=-1, padding='post', truncating='post')
223 | if onehot:
224 | y_chunk = numpy.eye(len(chunk_tags), dtype='float32')[y_chunk]
225 | # print(y_chunk)
226 | else:
227 | y_chunk = numpy.expand_dims(y_chunk, 2)
228 | return x, y_chunk, word2idx
229 |
230 | def process_data(data,word2idx,chunk_tags,onehot=False):
231 | x = [[word2idx.get(w[0].lower(), 1) for w in s] for s in data] # set to (index 1) if not in vocab
232 | y_chunk = [[chunk_tags.index(w[1]) for w in s] for s in data]
233 |     x = pad_sequences(x, para["max_len"], padding='post', truncating='post')  # post (right) padding
234 | y_chunk = pad_sequences(y_chunk, para["max_len"], value=-1,padding='post', truncating='post')
235 |
236 | if onehot:
237 | y_chunk = numpy.eye(len(chunk_tags), dtype='float32')[y_chunk]
238 | # print(y_chunk)
239 | else:
240 | y_chunk = numpy.expand_dims(y_chunk, 2)
241 | return x, y_chunk
242 |
243 |
244 | def get_lengths(X):
245 | lengths = []
246 | for i in range(len(X)):
247 | length = 0
248 | for dim in X[i]:
249 | # print(dim)
250 | if dim != 0:
251 | length += 1
252 | else:
253 | break
254 | # print(length)
255 | lengths.append(length)
256 |
257 | return lengths
258 |
259 | def create_bool_matrex(repre_dim,x):
260 | bool_x = numpy.zeros(shape=(x.shape[0], x.shape[1],repre_dim))
261 | for i in range(x.shape[0]):
262 | for j in range(x.shape[1]):
263 | if x[i][j] != 0:
264 | bool_x[i,j,:] = 1.
265 | return bool_x
266 |
267 | def load_embed_weight(word2id):
268 | embed_weight = numpy.zeros(shape=(len(word2id.keys())+1, para["embed_dim"]))
269 | char2vec = {}
270 | with open(para["embed_path"], "r") as f:
271 | rows = f.readlines()
272 | for row in rows:
273 | item = row.strip().split(" ", 1)
274 | char = item[0]
275 | # print(item)
276 | vec_str = item[1].split(" ")
277 | vec = [float(i) for i in vec_str]
278 | char2vec[char] = vec
279 | for word in word2id.keys():
280 | # print(word)
281 | vec = char2vec[word]
282 | embed_weight[word2id[word]] = numpy.array(vec)
283 | print(embed_weight)
284 | return embed_weight
285 |
286 | def get_simple2traditional():
287 | simple2traditional = {}
288 | with open(config.traditional_dict_path,"r") as f:
289 | rows = f.readlines()
290 | for row in rows:
291 | item = row.strip().split(" ")
292 | simple2traditional[item[0]] = item[1]
293 | return simple2traditional
294 |
295 |
296 |
297 | if __name__ == "__main__":
298 | para["data_pk_path"] = "./cache/nlpcc-pos.pk"
299 | para["train_path"] = "./data/Pos_train.txt"
300 | para["test_path"] = "./data/Pos_test.txt"
301 | train_test_set_preprocess()
302 |
303 |
--------------------------------------------------------------------------------
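
preprocess.py expects one token and tag per line, separated by para["sep"] ("\t" for the Pos_*.txt demo data), with a blank line between sentences; _parse_data drops one-token sentences and re-splits sentences longer than max_len at punctuation. A small illustrative round trip with made-up tokens and tags is sketched below (it assumes the project's dependencies are installed, since importing preprocess also pulls in bert_serving and skimage).

```python
# Illustrative round trip through preprocess._parse_data (made-up tokens and tags;
# the real inputs are ./data/Pos_train.txt and ./data/Pos_test.txt).
import codecs
from preprocess import _parse_data

sample = u"今\tB-NT\n天\tE-NT\n好\tS-A\n\n天\tB-N\n气\tE-N\n好\tS-A\n\n"
with codecs.open("tiny.txt", "w", "utf-8") as f:
    f.write(sample)

sents = _parse_data(codecs.open("tiny.txt", "r"), sep="\t")
print(len(sents))     # 2 sentences
print(len(sents[0]))  # 3 [token, tag] pairs in the first sentence
```
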
/test.py:
--------------------------------------------------------------------------------
1 |
2 | import numpy
3 | from preprocess import *
4 | from preprocess import get_lengths
5 | import ModelLib
6 | import config
7 | import pickle
8 | import datetime
9 |
10 | para = config.para
11 | train_x, train_y, val_x, val_y, word2id, tags = pickle.load(open(para["data_pk_path"], 'rb'))
12 |
13 | def predict_bert(para):
14 | para['tag_num'] = len(tags)
15 | model = ModelLib.BERT_MODEL(para)
16 | model.load_weights(filepath=para["model_path"])
17 | bert_val =load_path_bert(para["test_path"],sep=para["sep"])
18 | lengths = get_lengths(val_x)
19 |
20 | pred_y = model.predict(bert_val)
21 |
22 | tag_pred_y = []
23 | tag_val_y = []
24 | for i, y in enumerate(pred_y):
25 | y = [numpy.argmax(dim) for dim in y]
26 | print(lengths[i])
27 | p_y = y[:lengths[i]]
28 | print(p_y)
29 | v_y = val_y[i][:lengths[i]].flatten()
30 | print(v_y)
31 | p_y = [tags[dim] for dim in p_y]
32 | v_y = [tags[dim] for dim in v_y]
33 | tag_pred_y.append(p_y)
34 | tag_val_y.append(v_y)
35 | return tag_pred_y,tag_val_y
36 |
37 | def char_seg_acc(tag_pred_y, tag_val_y):
38 | acc = 0.0
39 | num = 0.0
40 | for j in range(len(tag_pred_y)):
41 | for z in range(len(tag_pred_y[j])):
42 | if tag_pred_y[j][z] == tag_val_y[j][z]:
43 | acc+=1
44 | num += 1
45 | print("test acc:"+str(acc/num))
46 |
47 | def word_seg_F1(y_pred,y):
48 | c = 0
49 | true = 0
50 | pos = 0
51 | for i in xrange(len(y)):
52 | start = 0
53 | for j in xrange(len(y[i])):
54 | if y_pred[i][j] == 'E' or y_pred[i][j] == 'S':
55 | pos += 1
56 | if y[i][j] == 'E' or y[i][j] == 'S':
57 | flag = True
58 | if y_pred[i][j] != y[i][j]:
59 | flag = False
60 | if flag:
61 | for k in range(start, j):
62 | if y_pred[i][k] != y[i][k]:
63 | flag = False
64 | break
65 | if flag:
66 | c += 1
67 | true += 1
68 | start = j+1
69 |
70 | P = c/float(pos)
71 | R = c/float(true)
72 | F = 2*P*R/(P+R)
73 | return P,R,F
74 |
75 | def pos_F1(y_pred, y):
76 | c = 0
77 | true = 0
78 | pos = 0
79 | for i in xrange(len(y)):
80 | start = 0
81 | for j in xrange(len(y[i])):
82 | # print y_pred[i][j]
83 | if y_pred[i][j][0] == 'E' or y_pred[i][j][0] == 'S':
84 | pos += 1
85 | if y[i][j][0] == 'E' or y[i][j][0] == 'S':
86 | flag = True
87 | if y_pred[i][j] != y[i][j]:
88 | flag = False
89 | if flag:
90 | for k in range(start, j):
91 | if y_pred[i][k] != y[i][k]:
92 | flag = False
93 | break
94 | if flag:
95 | c += 1
96 | true += 1
97 | start = j+1
98 | try:
99 | P = c/float(pos)
100 | # print pos
101 | R = c/float(true)
102 | # print true
103 | F = 2*P*R/(P+R)
104 |     except Exception as e:
105 |         print(e)
106 | return P, R, F
107 |
108 | if __name__ == "__main__":
109 | para["char_dropout"] = 0.5
110 | para["rnn_dropout"] = 0.5
111 |
112 | para["model_path"] = "./model/lstm-crf-bert"
113 | pred_y, val_y = predict_bert(para)
114 | # pred_y, val_y = predict_normal(para, use_embed=False,feature="")
115 | P,R,F = pos_F1(pred_y,val_y)
116 | # P, R, F = word_seg_F1(pred_y,val_y)
117 | print("P:"+str(P))
118 | print("R:"+str(R))
119 | print("F1:"+str(F))
120 |
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
1 | # encoding:utf-8
2 | from keras.callbacks import ModelCheckpoint
3 | from preprocess import *
4 | from generator import bert_generator
5 | import ModelLib
6 | import config
7 |
8 | para = config.para
9 | train_x, train_y, val_x, val_y, word2id, tags = pickle.load(open(para["data_pk_path"], 'rb'))
10 |
11 | def train_bert_model(para, use_generator = False):
12 | para['tag_num'] = len(tags)
13 | model = ModelLib.BERT_MODEL(para)
14 | checkpoint = ModelCheckpoint(para["model_path"], monitor='val_viterbi_acc', verbose=1,
15 | save_best_only=True, mode='max')
16 |
17 | if use_generator:
18 | val_bert = load_path_bert(para["test_path"], para["sep"])
19 | model.fit_generator(bert_generator(para["batch_size"], para["train_path"], para["sep"], train_y,Shuffle=True), steps_per_epoch=int(train_y.shape[0]/para["batch_size"])+1, callbacks=[checkpoint],
20 | validation_data=(val_bert, val_y), epochs=para["EPOCHS"],verbose=1)
21 | else:
22 | train_bert, val_bert = load_bert_repre()
23 | model.fit(train_bert, train_y, batch_size=para["batch_size"], epochs=para["EPOCHS"], callbacks=[checkpoint],
24 | validation_data=(val_bert, val_y), shuffle=True,verbose=1)
25 |
26 |
27 | if __name__ == "__main__":
28 |
29 | para["char_dropout"] = 0.5
30 | para["rnn_dropout"] = 0.5
31 | para["model_path"] = "./model/bert-lstm-crf"
32 | train_bert_model(para, use_generator=False)
33 |
--------------------------------------------------------------------------------
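
train_bert_model offers two paths: with use_generator=False it precomputes BERT vectors for the entire training and test sets in memory via load_bert_repre(), while with use_generator=True it streams training batches through bert_generator and only precomputes the validation set. A sketch of switching to the streaming path follows (illustrative only; it assumes the bert-serving server is running and that preprocess.py has already produced the pickle, since train.py loads it at import time).

```python
# Streaming variant of train.py's __main__ block (illustrative only).
import config
from train import train_bert_model

para = config.para
para["char_dropout"] = 0.5
para["rnn_dropout"] = 0.5
para["model_path"] = "./model/bert-lstm-crf"
train_bert_model(para, use_generator=True)  # encodes batch_size sentences per step instead of all up front
```
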