├── README.md └── src ├── afqmc.py ├── c3.py ├── chid.py ├── cluener.py ├── cmnli.py ├── cmrc2018.py ├── csl.py ├── iflytek.py ├── ocnli.py ├── snippets.py ├── tnews.py └── wsc.py /README.md: -------------------------------------------------------------------------------- 1 | # 基于bert4keras的CLUE基准代码 2 | 真·“Deep Learning for Humans” 3 | 4 | ## 简介 5 | - 博客:https://kexue.fm/archives/8739 6 | 7 | (说实话我也不知道要补充点啥,我觉得代码本身够清晰了,如果还有什么疑问,欢迎提issue~) 8 | 9 | ## 使用 10 | 模型和优化器定义在`snippets.py`里边,如果要更换模型,修改`snippets.py`即可。 11 | 12 | 优化器使用了AdaFactor,这是因为它对参数范围具有较好的适应性,是一个较优(但不一定是最好)的默认选择。 13 | 14 | ## 环境 15 | - 软件:bert4keras>=0.10.8 16 | - 硬件:博客中的成绩是用一张Titan RTX(24G)跑出来的,如果你显存不够,可以适当降低batch_size,并启用梯度累积。 17 | 18 | ## 其他 19 | - 英文GLUE榜单的bert4keras基准:https://github.com/nishiwen1214/GLUE-bert4keras 20 | 21 | ## 交流 22 | QQ交流群:808623966,微信群请加机器人微信号spaces_ac_cn 23 | -------------------------------------------------------------------------------- /src/afqmc.py: -------------------------------------------------------------------------------- 1 | #! -*- coding:utf-8 -*- 2 | # CLUE评测 3 | # afqmc文本相似度 4 | # 思路:文本拼接后取[CLS]然后接Dense+Softmax分类 5 | 6 | import json 7 | import numpy as np 8 | from snippets import * 9 | from bert4keras.snippets import sequence_padding, DataGenerator 10 | from bert4keras.snippets import open 11 | from tqdm import tqdm 12 | 13 | # 基本参数 14 | num_classes = 2 15 | maxlen = 128 16 | batch_size = 32 17 | epochs = 10 18 | 19 | 20 | def load_data(filename): 21 | """加载数据 22 | 格式:[(文本1, 文本2, 标签id)] 23 | """ 24 | D = [] 25 | with open(filename) as f: 26 | for i, l in enumerate(f): 27 | l = json.loads(l) 28 | text1, text2 = l['sentence1'], l['sentence2'] 29 | label = l.get('label', 0) 30 | D.append((text1, text2, int(label))) 31 | return D 32 | 33 | 34 | # 加载数据集 35 | train_data = load_data(data_path + 'afqmc/train.json') 36 | valid_data = load_data(data_path + 'afqmc/dev.json') 37 | 38 | 39 | class data_generator(DataGenerator): 40 | """数据生成器 41 | """ 42 | def __iter__(self, random=False): 43 | batch_token_ids, batch_segment_ids, batch_labels = [], [], [] 44 | for is_end, (text1, text2, label) in self.sample(random): 45 | token_ids, segment_ids = tokenizer.encode( 46 | text1, text2, maxlen=maxlen 47 | ) 48 | batch_token_ids.append(token_ids) 49 | batch_segment_ids.append([0] * len(segment_ids)) 50 | batch_labels.append([label]) 51 | if len(batch_token_ids) == self.batch_size or is_end: 52 | batch_token_ids = sequence_padding(batch_token_ids) 53 | batch_segment_ids = sequence_padding(batch_segment_ids) 54 | batch_labels = sequence_padding(batch_labels) 55 | yield [batch_token_ids, batch_segment_ids], batch_labels 56 | batch_token_ids, batch_segment_ids, batch_labels = [], [], [] 57 | 58 | 59 | # 转换数据集 60 | train_generator = data_generator(train_data, batch_size) 61 | valid_generator = data_generator(valid_data, batch_size) 62 | 63 | # 构建模型 64 | output = base.model.get_layer(last_layer).output 65 | output = pooling_layer(output) 66 | output = keras.layers.Dense( 67 | units=num_classes, 68 | activation='softmax', 69 | kernel_initializer=base.initializer 70 | )(output) 71 | 72 | model = keras.models.Model(base.model.input, output) 73 | model.summary() 74 | 75 | model.compile( 76 | loss='sparse_categorical_crossentropy', 77 | optimizer=optimizer, 78 | metrics=['accuracy'] 79 | ) 80 | 81 | 82 | class Evaluator(keras.callbacks.Callback): 83 | """保存验证集acc最好的模型 84 | """ 85 | def __init__(self): 86 | self.best_val_acc = 0.
87 | 88 | def on_epoch_end(self, epoch, logs=None): 89 | val_acc = self.evaluate(valid_generator) 90 | if val_acc > self.best_val_acc: 91 | self.best_val_acc = val_acc 92 | model.save_weights('weights/afqmc.weights') 93 | print( 94 | u'val_acc: %.5f, best_val_acc: %.5f\n' % 95 | (val_acc, self.best_val_acc) 96 | ) 97 | 98 | def evaluate(self, data): 99 | total, right = 0., 0. 100 | for x_true, y_true in data: 101 | y_pred = model.predict(x_true).argmax(axis=1) 102 | y_true = y_true[:, 0] 103 | total += len(y_true) 104 | right += (y_true == y_pred).sum() 105 | return right / total 106 | 107 | 108 | def test_predict(in_file, out_file): 109 | """输出测试结果到文件 110 | 结果文件可以提交到 https://www.cluebenchmarks.com 评测。 111 | """ 112 | test_data = load_data(in_file) 113 | test_generator = data_generator(test_data, batch_size) 114 | 115 | results = [] 116 | for x_true, _ in tqdm(test_generator, ncols=0): 117 | y_pred = model.predict(x_true).argmax(axis=1) 118 | results.extend(y_pred) 119 | 120 | fw = open(out_file, 'w') 121 | with open(in_file) as fr: 122 | for l, r in zip(fr, results): 123 | l = json.loads(l) 124 | l = json.dumps({'id': str(l['id']), 'label': str(r)}) 125 | fw.write(l + '\n') 126 | fw.close() 127 | 128 | 129 | if __name__ == '__main__': 130 | 131 | evaluator = Evaluator() 132 | 133 | model.fit_generator( 134 | train_generator.forfit(), 135 | steps_per_epoch=len(train_generator), 136 | epochs=epochs, 137 | callbacks=[evaluator] 138 | ) 139 | 140 | model.load_weights('weights/afqmc.weights') 141 | test_predict( 142 | in_file=data_path + 'afqmc/test.json', 143 | out_file='results/afqmc_predict.json' 144 | ) 145 | 146 | else: 147 | 148 | model.load_weights('weights/afqmc.weights') 149 | -------------------------------------------------------------------------------- /src/c3.py: -------------------------------------------------------------------------------- 1 | #! 
-*- coding:utf-8 -*- 2 | # CLUE评测 3 | # c3多项选择阅读理解 4 | # 思路:每个选项分别与问题、篇章拼接后打分排序 5 | 6 | import json 7 | import numpy as np 8 | from snippets import * 9 | from bert4keras.snippets import sequence_padding, DataGenerator 10 | from bert4keras.snippets import open 11 | from bert4keras.snippets import truncate_sequences 12 | from tqdm import tqdm 13 | 14 | # 基本参数 15 | num_classes = 4 16 | maxlen = 512 17 | batch_size = 4 18 | epochs = 10 19 | 20 | 21 | def load_data(filename): 22 | """加载数据 23 | 格式:[(篇章, 问题, 选项, 答案id)] 24 | """ 25 | D = [] 26 | with open(filename) as f: 27 | data = json.load(f) 28 | for d in data: 29 | p = u'||'.join(d[0]) 30 | for qa in d[1]: 31 | q = qa['question'] 32 | while len(qa['choice']) < num_classes: 33 | qa['choice'].append(u'无效答案') 34 | c = qa['choice'][:num_classes] 35 | if 'answer' in qa: 36 | a = qa['choice'].index(qa['answer']) 37 | else: 38 | a = 0 39 | D.append((p, q, c, a)) 40 | return D 41 | 42 | 43 | # 加载数据集 44 | train_data = load_data(data_path + 'c3/m-train.json') 45 | train_data += load_data(data_path + 'c3/d-train.json') 46 | valid_data = load_data(data_path + 'c3/m-dev.json') 47 | valid_data += load_data(data_path + 'c3/d-dev.json') 48 | 49 | 50 | class data_generator(DataGenerator): 51 | """数据生成器 52 | """ 53 | def __iter__(self, random=False): 54 | batch_token_ids, batch_segment_ids, batch_labels = [], [], [] 55 | for is_end, (p, q, cs, a) in self.sample(random): 56 | for c in cs: 57 | p_ids = tokenizer.encode(p)[0] 58 | q_ids = tokenizer.encode(q)[0][1:] 59 | c_ids = tokenizer.encode(c)[0][1:] 60 | truncate_sequences(maxlen, -2, c_ids, q_ids, p_ids) 61 | token_ids = p_ids + q_ids + c_ids 62 | batch_token_ids.append(token_ids) 63 | batch_segment_ids.append([0] * len(token_ids)) 64 | batch_labels.append([a]) 65 | if len(batch_token_ids) == self.batch_size * num_classes or is_end: 66 | batch_token_ids = sequence_padding(batch_token_ids) 67 | batch_segment_ids = sequence_padding(batch_segment_ids) 68 | batch_labels = sequence_padding(batch_labels) 69 | yield [batch_token_ids, batch_segment_ids], batch_labels 70 | batch_token_ids, batch_segment_ids, batch_labels = [], [], [] 71 | 72 | 73 | # 转换数据集 74 | train_generator = data_generator(train_data, batch_size) 75 | valid_generator = data_generator(valid_data, batch_size) 76 | 77 | 78 | def multichoice_crossentropy(y_true, y_pred): 79 | """多项选择的交叉熵 80 | """ 81 | y_true = K.cast(y_true, 'int32')[::num_classes] 82 | y_pred = K.reshape(y_pred, (-1, num_classes)) 83 | return K.mean( 84 | K.sparse_categorical_crossentropy(y_true, y_pred, from_logits=True) 85 | ) 86 | 87 | 88 | def multichoice_accuracy(y_true, y_pred): 89 | """多项选择的准确率 90 | """ 91 | y_true = K.cast(y_true, 'int32')[::num_classes, 0] 92 | y_pred = K.reshape(y_pred, (-1, num_classes)) 93 | y_pred = K.cast(K.argmax(y_pred, axis=1), 'int32') 94 | return K.mean(K.cast(K.equal(y_true, y_pred), K.floatx())) 95 | 96 | 97 | # 构建模型 98 | output = base.model.get_layer(last_layer).output 99 | output = pooling_layer(output) 100 | output = keras.layers.Dense(units=1, 101 | kernel_initializer=base.initializer)(output) 102 | 103 | model = keras.models.Model(base.model.input, output) 104 | model.summary() 105 | 106 | model.compile( 107 | loss=multichoice_crossentropy, 108 | optimizer=optimizer4, 109 | metrics=[multichoice_accuracy] 110 | ) 111 | 112 | 113 | class Evaluator(keras.callbacks.Callback): 114 | """保存验证集acc最好的模型 115 | """ 116 | def __init__(self): 117 | self.best_val_acc = 0. 
118 | 119 | def on_epoch_end(self, epoch, logs=None): 120 | val_acc = self.evaluate(valid_generator) 121 | if val_acc > self.best_val_acc: 122 | self.best_val_acc = val_acc 123 | model.save_weights('weights/c3.weights') 124 | print( 125 | u'val_acc: %.5f, best_val_acc: %.5f\n' % 126 | (val_acc, self.best_val_acc) 127 | ) 128 | 129 | def evaluate(self, data): 130 | total, right = 0., 0. 131 | for x_true, y_true in data: 132 | y_pred = model.predict(x_true).reshape((-1, num_classes)) 133 | y_pred = y_pred.argmax(axis=1) 134 | y_true = y_true[::num_classes, 0] 135 | total += len(y_true) 136 | right += (y_true == y_pred).sum() 137 | return right / total 138 | 139 | 140 | def test_predict(in_file, out_file): 141 | """输出测试结果到文件 142 | 结果文件可以提交到 https://www.cluebenchmarks.com 评测。 143 | """ 144 | test_data = load_data(in_file) 145 | test_generator = data_generator(test_data, batch_size) 146 | 147 | results = [] 148 | for x_true, _ in tqdm(test_generator, ncols=0): 149 | y_pred = model.predict(x_true).reshape((-1, num_classes)) 150 | y_pred = y_pred.argmax(axis=1) 151 | results.extend(y_pred) 152 | 153 | fw = open(out_file, 'w') 154 | with open(in_file) as fr: 155 | data = json.load(fr) 156 | i = 0 157 | for d in data: 158 | for qa in d[1]: 159 | l = json.dumps({'id': str(qa['id']), 'label': str(results[i])}) 160 | fw.write(l + '\n') 161 | i += 1 162 | fw.close() 163 | 164 | 165 | if __name__ == '__main__': 166 | 167 | evaluator = Evaluator() 168 | 169 | model.fit_generator( 170 | train_generator.forfit(), 171 | steps_per_epoch=len(train_generator), 172 | epochs=epochs, 173 | callbacks=[evaluator] 174 | ) 175 | 176 | model.load_weights('weights/c3.weights') 177 | test_predict( 178 | in_file=data_path + 'c3/test1.0.json', 179 | out_file='results/c310_predict.json' 180 | ) 181 | test_predict( 182 | in_file=data_path + 'c3/test1.1.json', 183 | out_file='results/c311_predict.json' 184 | ) 185 | 186 | else: 187 | 188 | model.load_weights('weights/c3.weights') 189 | -------------------------------------------------------------------------------- /src/chid.py: -------------------------------------------------------------------------------- 1 | #! 
-*- coding:utf-8 -*- 2 | # CLUE评测 3 | # chid成语阅读理解(多项选择) 4 | # 思路:每个选项分别与问题、篇章拼接后打分排序 5 | 6 | import json, re 7 | import numpy as np 8 | from snippets import * 9 | from bert4keras.snippets import sequence_padding, DataGenerator 10 | from bert4keras.snippets import open 11 | from scipy.optimize import linear_sum_assignment 12 | from itertools import groupby 13 | from tqdm import tqdm 14 | 15 | # 基本参数 16 | num_classes = 10 17 | maxlen = 64 18 | batch_size = 12 19 | epochs = 5 20 | 21 | 22 | def sample_split(texts, answers, candidates): 23 | """将样本分隔为只有一个答案的样本,并截断长度 24 | """ 25 | results = [] 26 | for i, a in enumerate(answers): 27 | texts_a, texts_b = texts[:i + 1], texts[i + 1:] 28 | offset = 3 + 4 + 1 + 4 * (len(texts_a) + len(texts_b) - 2) 29 | while True: 30 | l_a = sum([len(t) for t in texts_a]) 31 | l_b = sum([len(t) for t in texts_b]) 32 | if l_a + l_b > maxlen - offset: 33 | if l_a > l_b: 34 | if len(texts_a[0]) > 1: 35 | texts_a[0] = texts_a[0][1:] 36 | else: 37 | texts_a = texts_a[1:] 38 | offset -= 4 39 | else: 40 | if len(texts_b[-1]) > 1: 41 | texts_b[-1] = texts_b[-1][:-1] 42 | else: 43 | texts_b = texts_b[:-1] 44 | offset -= 4 45 | else: 46 | break 47 | results.append((texts_a, texts_b, a, candidates)) 48 | return results 49 | 50 | 51 | def load_data(q_file, a_file=None): 52 | """加载数据 53 | 格式:[(左文本, 右文本, 答案id, 候选词集)] 54 | """ 55 | D = [] 56 | with open(q_file) as fq: 57 | if a_file is not None: 58 | A = json.load(open(a_file)) 59 | for i, l in enumerate(fq): 60 | l = json.loads(l) 61 | assert len(l['candidates']) == num_classes 62 | for c in l['content']: 63 | texts = re.split('#idiom\d{6}#', c) 64 | keys = re.findall('#idiom\d{6}#', c) 65 | if a_file is None: 66 | answers = [(i, k, 0) for k in keys] 67 | else: 68 | answers = [(i, k, A[k]) for k in keys] 69 | D.extend(sample_split(texts, answers, l['candidates'])) 70 | return D 71 | 72 | 73 | # 加载数据集 74 | train_data = load_data( 75 | data_path + 'chid/train.json', data_path + 'chid/train_answer.json' 76 | ) 77 | valid_data = load_data( 78 | data_path + 'chid/dev.json', data_path + 'chid/dev_answer.json' 79 | ) 80 | 81 | 82 | class data_generator(DataGenerator): 83 | """数据生成器 84 | """ 85 | def __iter__(self, random=False): 86 | mark_ids = tokenizer.tokens_to_ids([u'[unused1]']) 87 | mask_ids = tokenizer.tokens_to_ids([u'[MASK]']) * 4 88 | batch_token_ids, batch_segment_ids, batch_labels = [], [], [] 89 | for is_end, (ta, tb, (_, _, a), cs) in self.sample(random): 90 | token_ids = [] 91 | for i, t in enumerate(ta): 92 | token_ids.extend(tokenizer.encode(t)[0][1:-1]) 93 | if i != len(ta) - 1: 94 | token_ids.extend(mask_ids) 95 | token_ids.extend(mark_ids) 96 | for i, t in enumerate(tb): 97 | token_ids.extend(tokenizer.encode(t)[0][1:-1]) 98 | if i != len(tb) - 1: 99 | token_ids.extend(mask_ids) 100 | token_ids.append(tokenizer._token_end_id) 101 | for c in cs: 102 | batch_token_ids.append(tokenizer.encode(c)[0] + token_ids) 103 | batch_segment_ids.append([0] * len(batch_token_ids[-1])) 104 | batch_labels.append([a]) 105 | if len(batch_token_ids) == self.batch_size * num_classes or is_end: 106 | batch_token_ids = sequence_padding(batch_token_ids) 107 | batch_segment_ids = sequence_padding(batch_segment_ids) 108 | batch_labels = sequence_padding(batch_labels) 109 | yield [batch_token_ids, batch_segment_ids], batch_labels 110 | batch_token_ids, batch_segment_ids, batch_labels = [], [], [] 111 | 112 | 113 | # 转换数据集 114 | train_generator = data_generator(train_data, batch_size) 115 | valid_generator = data_generator(valid_data, 
batch_size) 116 | 117 | 118 | def multichoice_crossentropy(y_true, y_pred): 119 | """多项选择的交叉熵 120 | """ 121 | y_true = K.cast(y_true, 'int32')[::num_classes] 122 | y_pred = K.reshape(y_pred, (-1, num_classes)) 123 | return K.mean( 124 | K.sparse_categorical_crossentropy(y_true, y_pred, from_logits=True) 125 | ) 126 | 127 | 128 | def multichoice_accuracy(y_true, y_pred): 129 | """多项选择的准确率 130 | """ 131 | y_true = K.cast(y_true, 'int32')[::num_classes, 0] 132 | y_pred = K.reshape(y_pred, (-1, num_classes)) 133 | y_pred = K.cast(K.argmax(y_pred, axis=1), 'int32') 134 | return K.mean(K.cast(K.equal(y_true, y_pred), K.floatx())) 135 | 136 | 137 | # 构建模型 138 | output = base.model.get_layer(last_layer).output 139 | output = pooling_layer(output) 140 | output = keras.layers.Dense(units=1, 141 | kernel_initializer=base.initializer)(output) 142 | 143 | model = keras.models.Model(base.model.input, output) 144 | model.summary() 145 | 146 | model.compile( 147 | loss=multichoice_crossentropy, 148 | optimizer=optimizer2, 149 | metrics=[multichoice_accuracy] 150 | ) 151 | 152 | 153 | class Evaluator(keras.callbacks.Callback): 154 | """保存验证集acc最好的模型 155 | """ 156 | def __init__(self): 157 | self.best_val_acc = 0. 158 | 159 | def on_epoch_end(self, epoch, logs=None): 160 | val_acc = self.evaluate(valid_data, valid_generator) 161 | if val_acc > self.best_val_acc: 162 | self.best_val_acc = val_acc 163 | model.save_weights('weights/chid.weights') 164 | print( 165 | u'val_acc: %.5f, best_val_acc: %.5f\n' % 166 | (val_acc, self.best_val_acc) 167 | ) 168 | 169 | def evaluate(self, data, generator): 170 | total, right = 0, 0. 171 | logits = np.empty((0, num_classes)) 172 | for x_true, y_true in tqdm(generator, ncols=0): 173 | y_pred = model.predict(x_true).reshape((-1, num_classes)) 174 | logits = np.concatenate([logits, y_pred], axis=0) 175 | for _, g in groupby(data, key=lambda d: d[2][0]): 176 | y_true = np.array([d[2][2] for d in g]) 177 | costs = -logits[total:total + len(y_true)] 178 | y_pred = linear_sum_assignment(costs)[1] 179 | total += len(y_true) 180 | right += (y_true == y_pred).sum() 181 | return right / total 182 | 183 | 184 | def test_predict(in_file, out_file): 185 | """输出测试结果到文件 186 | 结果文件可以提交到 https://www.cluebenchmarks.com 评测。 187 | """ 188 | test_data = load_data(in_file) 189 | test_generator = data_generator(test_data, batch_size) 190 | 191 | logits = np.empty((0, num_classes)) 192 | for x_true, _ in tqdm(test_generator, ncols=0): 193 | y_pred = model.predict(x_true).reshape((-1, num_classes)) 194 | logits = np.concatenate([logits, y_pred], axis=0) 195 | 196 | results, total = {}, 0 197 | for _, g in groupby(test_data, key=lambda d: d[2][0]): 198 | keys = [d[2][1] for d in g] 199 | costs = -logits[total:total + len(keys)] 200 | y_pred = linear_sum_assignment(costs)[1] 201 | for k, r in zip(keys, y_pred): 202 | results[k] = int(r) 203 | total += len(keys) 204 | 205 | with open(out_file, 'w', encoding='utf-8') as f: 206 | json.dump(results, f, ensure_ascii=False, indent=4) 207 | 208 | 209 | if __name__ == '__main__': 210 | 211 | evaluator = Evaluator() 212 | 213 | model.fit_generator( 214 | train_generator.forfit(), 215 | steps_per_epoch=len(train_generator), 216 | epochs=epochs, 217 | callbacks=[evaluator] 218 | ) 219 | 220 | model.load_weights('weights/chid.weights') 221 | test_predict( 222 | in_file=data_path + 'chid/test1.0.json', 223 | out_file='results/chid10_predict.json' 224 | ) 225 | test_predict( 226 | in_file=data_path + 'chid/test1.1.json', 227 | 
out_file='results/chid11_predict.json' 228 | ) 229 | 230 | else: 231 | 232 | model.load_weights('weights/chid.weights') 233 | -------------------------------------------------------------------------------- /src/cluener.py: -------------------------------------------------------------------------------- 1 | #! -*- coding: utf-8 -*- 2 | # 用GlobalPointer做中文命名实体识别 3 | # 数据集 https://github.com/CLUEbenchmark/CLUENER2020 4 | 5 | import json 6 | import numpy as np 7 | from snippets import * 8 | from bert4keras.backend import multilabel_categorical_crossentropy 9 | from bert4keras.layers import GlobalPointer 10 | from bert4keras.snippets import sequence_padding, DataGenerator 11 | from bert4keras.snippets import open 12 | from tqdm import tqdm 13 | 14 | maxlen = 256 15 | epochs = 10 16 | batch_size = 32 17 | categories = set() 18 | 19 | 20 | def load_data(filename): 21 | """加载数据 22 | 单条格式:[text, (start, end, label), (start, end, label), ...], 23 | 意味着text[start:end + 1]是类型为label的实体。 24 | """ 25 | D = [] 26 | with open(filename, encoding='utf-8') as f: 27 | for l in f: 28 | l = json.loads(l) 29 | d = [l['text']] 30 | for k, v in l.get('label', {}).items(): 31 | categories.add(k) 32 | for spans in v.values(): 33 | for start, end in spans: 34 | d.append((start, end, k)) 35 | D.append(d) 36 | return D 37 | 38 | 39 | # 标注数据 40 | train_data = load_data(data_path + 'cluener/train.json') 41 | valid_data = load_data(data_path + 'cluener/dev.json') 42 | categories = list(sorted(categories)) 43 | num_classes = len(categories) 44 | 45 | 46 | class data_generator(DataGenerator): 47 | """数据生成器 48 | """ 49 | def __iter__(self, random=False): 50 | batch_token_ids, batch_segment_ids, batch_labels = [], [], [] 51 | for is_end, d in self.sample(random): 52 | tokens = tokenizer.tokenize(d[0], maxlen=maxlen) 53 | mapping = tokenizer.rematch(d[0], tokens) 54 | start_mapping = {j[0]: i for i, j in enumerate(mapping) if j} 55 | end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j} 56 | token_ids = tokenizer.tokens_to_ids(tokens) 57 | segment_ids = [0] * len(token_ids) 58 | labels = np.zeros((len(categories), maxlen, maxlen)) 59 | for start, end, label in d[1:]: 60 | if start in start_mapping and end in end_mapping: 61 | start = start_mapping[start] 62 | end = end_mapping[end] 63 | label = categories.index(label) 64 | labels[label, start, end] = 1 65 | batch_token_ids.append(token_ids) 66 | batch_segment_ids.append(segment_ids) 67 | batch_labels.append(labels[:, :len(token_ids), :len(token_ids)]) 68 | if len(batch_token_ids) == self.batch_size or is_end: 69 | batch_token_ids = sequence_padding(batch_token_ids) 70 | batch_segment_ids = sequence_padding(batch_segment_ids) 71 | batch_labels = sequence_padding(batch_labels, seq_dims=3) 72 | yield [batch_token_ids, batch_segment_ids], batch_labels 73 | batch_token_ids, batch_segment_ids, batch_labels = [], [], [] 74 | 75 | 76 | # 转换数据集 77 | train_generator = data_generator(train_data, batch_size) 78 | valid_generator = data_generator(valid_data, batch_size) 79 | 80 | 81 | def globalpointer_crossentropy(y_true, y_pred): 82 | """给GlobalPointer设计的交叉熵 83 | """ 84 | bh = K.prod(K.shape(y_pred)[:2]) 85 | y_true = K.reshape(y_true, (bh, -1)) 86 | y_pred = K.reshape(y_pred, (bh, -1)) 87 | return K.mean(multilabel_categorical_crossentropy(y_true, y_pred)) 88 | 89 | 90 | def globalpointer_f1score(y_true, y_pred): 91 | """给GlobalPointer设计的F1 92 | """ 93 | y_pred = K.cast(K.greater(y_pred, 0), K.floatx()) 94 | return 2 * K.sum(y_true * y_pred) / K.sum(y_true + y_pred) 95 | 96 | 97 
| # 构建模型 98 | output = base.model.get_layer(last_layer).output 99 | output = GlobalPointer( 100 | heads=num_classes, 101 | head_size=base.attention_head_size, 102 | use_bias=False, 103 | kernel_initializer=base.initializer 104 | )(output) 105 | 106 | model = keras.models.Model(base.model.input, output) 107 | model.summary() 108 | 109 | model.compile( 110 | loss=globalpointer_crossentropy, 111 | optimizer=optimizer, 112 | metrics=[globalpointer_f1score] 113 | ) 114 | 115 | 116 | class Evaluator(keras.callbacks.Callback): 117 | """保存验证集f1最好的模型 118 | """ 119 | def __init__(self): 120 | self.best_val_f1 = 0 121 | 122 | def on_epoch_end(self, epoch, logs=None): 123 | f1, precision, recall = self.evaluate(valid_generator) 124 | # 保存最优 125 | if f1 >= self.best_val_f1: 126 | self.best_val_f1 = f1 127 | model.save_weights('weights/cluener.weights') 128 | print( 129 | 'valid: f1: %.5f, precision: %.5f, recall: %.5f, best f1: %.5f\n' % 130 | (f1, precision, recall, self.best_val_f1) 131 | ) 132 | 133 | def evaluate(self, data): 134 | X, Y, Z = 1e-10, 1e-10, 1e-10 135 | for x_true, y_true in data: 136 | y_pred = (model.predict(x_true) > 0).astype(int) 137 | X += (y_pred * y_true).sum() 138 | Y += y_pred.sum() 139 | Z += y_true.sum() 140 | f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z 141 | return f1, precision, recall 142 | 143 | 144 | def test_predict(in_file, out_file): 145 | """输出测试结果到文件 146 | 结果文件可以提交到 https://www.cluebenchmarks.com 评测。 147 | """ 148 | test_data = load_data(in_file) 149 | test_generator = data_generator(test_data, batch_size) 150 | 151 | results = [] 152 | for x_true, _ in tqdm(test_generator, ncols=0): 153 | y_pred = model.predict(x_true) 154 | for y in y_pred: 155 | results.append(np.where(y > 0)) 156 | 157 | fw = open(out_file, 'w', encoding='utf-8') 158 | with open(in_file) as fr: 159 | for l, r in zip(fr, results): 160 | l = json.loads(l) 161 | l['label'] = {} 162 | tokens = tokenizer.tokenize(l['text'], maxlen=maxlen) 163 | mapping = tokenizer.rematch(l['text'], tokens) 164 | for label, start, end in zip(*r): 165 | label = categories[label] 166 | start, end = mapping[start][0], mapping[end][-1] 167 | if label not in l['label']: 168 | l['label'][label] = {} 169 | entity = l['text'][start:end + 1] 170 | if entity not in l['label'][label]: 171 | l['label'][label][entity] = [] 172 | l['label'][label][entity].append([start, end]) 173 | l = json.dumps(l, ensure_ascii=False) 174 | fw.write(l + '\n') 175 | fw.close() 176 | 177 | 178 | if __name__ == '__main__': 179 | 180 | evaluator = Evaluator() 181 | 182 | model.fit_generator( 183 | train_generator.forfit(), 184 | steps_per_epoch=len(train_generator), 185 | epochs=epochs, 186 | callbacks=[evaluator] 187 | ) 188 | 189 | model.load_weights('weights/cluener.weights') 190 | test_predict( 191 | in_file=data_path + 'cluener/test.json', 192 | out_file='results/cluener_predict.json' 193 | ) 194 | 195 | else: 196 | 197 | model.load_weights('weights/cluener.weights') 198 | -------------------------------------------------------------------------------- /src/cmnli.py: -------------------------------------------------------------------------------- 1 | #! 
-*- coding:utf-8 -*- 2 | # CLUE评测 3 | # cmnli自然语言推理 4 | # 思路:文本拼接后取[CLS]然后接Dense+Softmax分类 5 | 6 | import json 7 | import numpy as np 8 | from snippets import * 9 | from bert4keras.snippets import sequence_padding, DataGenerator 10 | from bert4keras.snippets import open 11 | from tqdm import tqdm 12 | 13 | # 基本参数 14 | labels = ['entailment', 'neutral', 'contradiction'] 15 | num_classes = len(labels) 16 | maxlen = 128 17 | batch_size = 32 18 | epochs = 10 19 | 20 | 21 | def load_data(filename): 22 | """加载数据 23 | 格式:[(文本1, 文本2, 标签id)] 24 | """ 25 | D = [] 26 | with open(filename) as f: 27 | for i, l in enumerate(f): 28 | l = json.loads(l) 29 | text1, text2 = l['sentence1'], l['sentence2'] 30 | label = l.get('label', 'neutral') 31 | if label in labels: 32 | D.append((text1, text2, labels.index(label))) 33 | return D 34 | 35 | 36 | # 加载数据集 37 | train_data = load_data(data_path + 'cmnli/train.json') 38 | valid_data = load_data(data_path + 'cmnli/dev.json') 39 | 40 | 41 | class data_generator(DataGenerator): 42 | """数据生成器 43 | """ 44 | def __iter__(self, random=False): 45 | batch_token_ids, batch_segment_ids, batch_labels = [], [], [] 46 | for is_end, (text1, text2, label) in self.sample(random): 47 | token_ids, segment_ids = tokenizer.encode( 48 | text1, text2, maxlen=maxlen 49 | ) 50 | batch_token_ids.append(token_ids) 51 | batch_segment_ids.append([0] * len(segment_ids)) 52 | batch_labels.append([label]) 53 | if len(batch_token_ids) == self.batch_size or is_end: 54 | batch_token_ids = sequence_padding(batch_token_ids) 55 | batch_segment_ids = sequence_padding(batch_segment_ids) 56 | batch_labels = sequence_padding(batch_labels) 57 | yield [batch_token_ids, batch_segment_ids], batch_labels 58 | batch_token_ids, batch_segment_ids, batch_labels = [], [], [] 59 | 60 | 61 | # 转换数据集 62 | train_generator = data_generator(train_data, batch_size) 63 | valid_generator = data_generator(valid_data, batch_size) 64 | 65 | # 构建模型 66 | output = base.model.get_layer(last_layer).output 67 | output = pooling_layer(output) 68 | output = keras.layers.Dense( 69 | units=num_classes, 70 | activation='softmax', 71 | kernel_initializer=base.initializer 72 | )(output) 73 | 74 | model = keras.models.Model(base.model.input, output) 75 | model.summary() 76 | 77 | model.compile( 78 | loss='sparse_categorical_crossentropy', 79 | optimizer=optimizer, 80 | metrics=['accuracy'] 81 | ) 82 | 83 | 84 | class Evaluator(keras.callbacks.Callback): 85 | """保存验证集acc最好的模型 86 | """ 87 | def __init__(self): 88 | self.best_val_acc = 0. 89 | 90 | def on_epoch_end(self, epoch, logs=None): 91 | val_acc = self.evaluate(valid_generator) 92 | if val_acc > self.best_val_acc: 93 | self.best_val_acc = val_acc 94 | model.save_weights('weights/cmnli.weights') 95 | print( 96 | u'val_acc: %.5f, best_val_acc: %.5f\n' % 97 | (val_acc, self.best_val_acc) 98 | ) 99 | 100 | def evaluate(self, data): 101 | total, right = 0., 0. 
102 | for x_true, y_true in data: 103 | y_pred = model.predict(x_true).argmax(axis=1) 104 | y_true = y_true[:, 0] 105 | total += len(y_true) 106 | right += (y_true == y_pred).sum() 107 | return right / total 108 | 109 | 110 | def test_predict(in_file, out_file): 111 | """输出测试结果到文件 112 | 结果文件可以提交到 https://www.cluebenchmarks.com 评测。 113 | """ 114 | test_data = load_data(in_file) 115 | test_generator = data_generator(test_data, batch_size) 116 | 117 | results = [] 118 | for x_true, _ in tqdm(test_generator, ncols=0): 119 | y_pred = model.predict(x_true).argmax(axis=1) 120 | results.extend(y_pred) 121 | 122 | fw = open(out_file, 'w') 123 | with open(in_file) as fr: 124 | for l, r in zip(fr, results): 125 | l = json.loads(l) 126 | l = json.dumps({'id': str(l['id']), 'label': labels[r]}) 127 | fw.write(l + '\n') 128 | fw.close() 129 | 130 | 131 | if __name__ == '__main__': 132 | 133 | evaluator = Evaluator() 134 | 135 | model.fit_generator( 136 | train_generator.forfit(), 137 | steps_per_epoch=len(train_generator), 138 | epochs=epochs, 139 | callbacks=[evaluator] 140 | ) 141 | 142 | model.load_weights('weights/cmnli.weights') 143 | test_predict( 144 | in_file=data_path + 'cmnli/test.json', 145 | out_file='results/cmnli_predict.json' 146 | ) 147 | 148 | else: 149 | 150 | model.load_weights('weights/cmnli.weights') 151 | -------------------------------------------------------------------------------- /src/cmrc2018.py: -------------------------------------------------------------------------------- 1 | #! -*- coding:utf-8 -*- 2 | # CLUE评测 3 | # cmrc2018阅读理解 4 | # 思路:基于滑动窗口和GlobalPointer 5 | 6 | import json 7 | import numpy as np 8 | from snippets import * 9 | from bert4keras.layers import GlobalPointer 10 | from bert4keras.snippets import sequence_padding, DataGenerator 11 | from bert4keras.snippets import open 12 | from bert4keras.snippets import lowercase_and_normalize 13 | from tqdm import tqdm 14 | from itertools import groupby 15 | 16 | # 基本参数 17 | maxlen = 512 18 | stride = 128 19 | batch_size = 16 20 | epochs = 10 21 | 22 | 23 | def stride_split(i, q, c, a, s): 24 | """滑动窗口分割context 25 | """ 26 | # 标准转换 27 | q = lowercase_and_normalize(q) 28 | c = lowercase_and_normalize(c) 29 | a = lowercase_and_normalize(a) 30 | e = s + len(a) 31 | # 滑窗分割 32 | results, n = [], 0 33 | max_c_len = maxlen - len(q) - 3 34 | while True: 35 | l, r = n * stride, n * stride + max_c_len 36 | if l <= s < e <= r: 37 | results.append((i, q, c[l:r], a, s - l, e - l)) 38 | else: 39 | results.append((i, q, c[l:r], '', -1, -1)) 40 | if r >= len(c): 41 | return results 42 | n += 1 43 | 44 | 45 | def load_data(filename): 46 | """加载数据 47 | 格式:[(id, 问题, 篇章, 答案, start, end)] 48 | """ 49 | D = [] 50 | data = json.load(open(filename))['data'] 51 | for d in data: 52 | for p in d['paragraphs']: 53 | for qa in p['qas']: 54 | for a in qa['answers']: 55 | D.extend( 56 | stride_split( 57 | qa['id'], qa['question'], p['context'], a['text'], 58 | a['answer_start'] 59 | ) 60 | ) 61 | if a['answer_start'] == -1: 62 | break 63 | return D 64 | 65 | 66 | # 加载数据集 67 | train_data = load_data(data_path + 'cmrc2018/train.json') 68 | valid_data = load_data(data_path + 'cmrc2018/dev.json') 69 | 70 | 71 | class data_generator(DataGenerator): 72 | """数据生成器 73 | """ 74 | def __iter__(self, random=False): 75 | batch_token_ids, batch_segment_ids = [], [] 76 | batch_masks, batch_labels = [], [] 77 | for is_end, (i, q, c, a, s, e) in self.sample(random): 78 | token_ids = tokenizer.encode(q)[0] 79 | mask = [1] + [0] * len(token_ids[:-1]) 80 | if s == -1: 
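# 注:s == -1 表示当前滑窗内不包含完整答案,此时整段context原样拼接,
# 标签记为 (0, 0)(起止位置都指向[CLS]),作为“该窗口无答案”的训练目标。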
81 | token_ids.extend(tokenizer.encode(c)[0][1:]) 82 | batch_labels.append([0, 0]) 83 | else: 84 | cl_ids = tokenizer.encode(c[:s])[0][1:-1] 85 | a_ids = tokenizer.encode(c[s:e])[0][1:-1] 86 | cr_ids = tokenizer.encode(c[e:])[0][1:] 87 | start = len(token_ids) + len(cl_ids) 88 | end = start + len(a_ids) - 1 89 | batch_labels.append([start, end]) 90 | token_ids.extend(cl_ids + a_ids + cr_ids) 91 | mask.extend([1] * (len(token_ids[:-1]) - len(mask)) + [0]) 92 | batch_token_ids.append(token_ids) 93 | batch_segment_ids.append([0] * len(token_ids)) 94 | batch_masks.append(mask) 95 | if len(batch_token_ids) == self.batch_size or is_end: 96 | batch_token_ids = sequence_padding(batch_token_ids) 97 | batch_segment_ids = sequence_padding(batch_segment_ids) 98 | batch_masks = sequence_padding(batch_masks) 99 | batch_labels = sequence_padding(batch_labels) 100 | yield [ 101 | batch_token_ids, batch_segment_ids, batch_masks 102 | ], batch_labels 103 | batch_token_ids, batch_segment_ids = [], [] 104 | batch_masks, batch_labels = [], [] 105 | 106 | 107 | # 转换数据集 108 | train_generator = data_generator(train_data, batch_size) 109 | valid_generator = data_generator(valid_data, batch_size) 110 | 111 | 112 | class CustomMasking(keras.layers.Layer): 113 | """自定义mask(主要用于mask掉question部分) 114 | """ 115 | def compute_mask(self, inputs, mask=None): 116 | return K.greater(inputs[1], 0.5) 117 | 118 | def call(self, inputs, mask=None): 119 | return inputs[0] 120 | 121 | def compute_output_shape(self, input_shape): 122 | return input_shape[0] 123 | 124 | 125 | def globalpointer_crossentropy(y_true, y_pred): 126 | """给GlobalPointer设计的交叉熵 127 | """ 128 | b, l = K.shape(y_pred)[0], K.shape(y_pred)[1] 129 | # y_true需要重新明确一下shape和dtype 130 | y_true = K.reshape(y_true, (b, 2)) 131 | y_true = K.cast(y_true, 'int32') 132 | y_true = y_true[:, 0] * l + y_true[:, 1] 133 | # 计算交叉熵 134 | y_pred = K.reshape(y_pred, (b, -1)) 135 | return K.mean( 136 | K.sparse_categorical_crossentropy(y_true, y_pred, from_logits=True) 137 | ) 138 | 139 | 140 | def globalpointer_accuracy(y_true, y_pred): 141 | """给GlobalPointer设计的准确率 142 | """ 143 | b, l = K.shape(y_pred)[0], K.shape(y_pred)[1] 144 | # y_true需要重新明确一下shape和dtype 145 | y_true = K.reshape(y_true, (b, 2)) 146 | y_true = K.cast(y_true, 'int32') 147 | y_true = y_true[:, 0] * l + y_true[:, 1] 148 | # 计算准确率 149 | y_pred = K.reshape(y_pred, (b, -1)) 150 | y_pred = K.cast(K.argmax(y_pred, axis=1), 'int32') 151 | return K.mean(K.cast(K.equal(y_true, y_pred), K.floatx())) 152 | 153 | 154 | # 构建模型 155 | masks_in = keras.layers.Input(shape=(None,)) 156 | output = base.model.get_layer(last_layer).output 157 | output = CustomMasking()([output, masks_in]) 158 | output = GlobalPointer( 159 | heads=1, 160 | head_size=base.attention_head_size, 161 | use_bias=False, 162 | kernel_initializer=base.initializer 163 | )(output) 164 | output = keras.layers.Lambda(lambda x: x[:, 0])(output) 165 | 166 | model = keras.models.Model(base.model.inputs + [masks_in], output) 167 | model.summary() 168 | 169 | model.compile( 170 | loss=globalpointer_crossentropy, 171 | optimizer=optimizer2, 172 | metrics=[globalpointer_accuracy] 173 | ) 174 | 175 | 176 | class Evaluator(keras.callbacks.Callback): 177 | """保存验证集acc最好的模型 178 | """ 179 | def __init__(self): 180 | self.best_val_acc = 0. 
181 | 182 | def on_epoch_end(self, epoch, logs=None): 183 | val_acc = self.evaluate(valid_data, valid_generator) 184 | if val_acc > self.best_val_acc: 185 | self.best_val_acc = val_acc 186 | model.save_weights('weights/cmrc2018.weights') 187 | print( 188 | u'val_acc: %.5f, best_val_acc: %.5f\n' % 189 | (val_acc, self.best_val_acc) 190 | ) 191 | 192 | def evaluate(self, data, generator): 193 | Y_scores = np.empty((0, 1)) 194 | Y_start_end = np.empty((0, 2), dtype=int) 195 | Y_true = np.empty((0, 2), dtype=int) 196 | for x_true, y_true in tqdm(generator, ncols=0): 197 | y_pred = model.predict(x_true) 198 | y_pred[:, 0] -= np.inf 199 | y_pred[:, :, 0] -= np.inf 200 | y_pred = y_pred.reshape((x_true[0].shape[0], -1)) 201 | y_start_end = y_pred.argmax(axis=1)[:, None] 202 | y_scores = np.take_along_axis(y_pred, y_start_end, axis=1) 203 | y_start = y_start_end // x_true[0].shape[1] 204 | y_end = y_start_end % x_true[0].shape[1] 205 | y_start_end = np.concatenate([y_start, y_end], axis=1) 206 | Y_scores = np.concatenate([Y_scores, y_scores], axis=0) 207 | Y_start_end = np.concatenate([Y_start_end, y_start_end], axis=0) 208 | Y_true = np.concatenate([Y_true, y_true], axis=0) 209 | 210 | total, right, n = 0., 0., 0 211 | for k, g in groupby(data, key=lambda d: d[0]): # 按qid分组 212 | g = len(list(g)) 213 | i = Y_scores[n:n + g].argmax() + n # 取组内最高分答案 214 | y_true, y_pred = Y_true[i], Y_start_end[i] 215 | if (y_pred == y_true).all(): 216 | right += 1 217 | total += 1 218 | n += g 219 | 220 | return right / total 221 | 222 | 223 | def test_predict(in_file, out_file): 224 | """输出测试结果到文件 225 | 结果文件可以提交到 https://www.cluebenchmarks.com 评测。 226 | """ 227 | test_data = load_data(in_file) 228 | test_generator = data_generator(test_data, batch_size) 229 | 230 | Y_scores = np.empty((0, 1)) 231 | Y_start_end = np.empty((0, 2), dtype=int) 232 | for x_true, _ in tqdm(test_generator, ncols=0): 233 | y_pred = model.predict(x_true) 234 | y_pred[:, 0] -= np.inf 235 | y_pred[:, :, 0] -= np.inf 236 | y_pred = y_pred.reshape((x_true[0].shape[0], -1)) 237 | y_start_end = y_pred.argmax(axis=1)[:, None] 238 | y_scores = np.take_along_axis(y_pred, y_start_end, axis=1) 239 | y_start = y_start_end // x_true[0].shape[1] 240 | y_end = y_start_end % x_true[0].shape[1] 241 | y_start_end = np.concatenate([y_start, y_end], axis=1) 242 | Y_scores = np.concatenate([Y_scores, y_scores], axis=0) 243 | Y_start_end = np.concatenate([Y_start_end, y_start_end], axis=0) 244 | 245 | results, n = {}, 0 246 | for k, g in groupby(test_data, key=lambda d: d[0]): # 按qid分组 247 | g = len(list(g)) 248 | i = Y_scores[n:n + g].argmax() + n # 取组内最高分答案 249 | start, end = Y_start_end[i] 250 | q, c = test_data[i][1:3] 251 | q_tokens = tokenizer.tokenize(q) 252 | c_tokens = tokenizer.tokenize(c)[1:-1] 253 | mapping = tokenizer.rematch(c, c_tokens) # 重匹配,直接在context取片段 254 | start, end = start - len(q_tokens), end - len(q_tokens) 255 | results[k] = c[mapping[start][0]:mapping[end][-1] + 1] 256 | n += g 257 | 258 | with open(out_file, 'w', encoding='utf-8') as f: 259 | json.dump(results, f, ensure_ascii=False, indent=4) 260 | 261 | 262 | if __name__ == '__main__': 263 | 264 | evaluator = Evaluator() 265 | 266 | model.fit_generator( 267 | train_generator.forfit(), 268 | steps_per_epoch=len(train_generator), 269 | epochs=epochs, 270 | callbacks=[evaluator] 271 | ) 272 | 273 | model.load_weights('weights/cmrc2018.weights') 274 | test_predict( 275 | in_file=data_path + 'cmrc2018/test.json', 276 | out_file='results/cmrc2018_predict.json' 277 | ) 278 | 279 | else: 
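# 注:作为模块被import时不重新训练,直接加载已保存的最优权重,便于其他脚本复用模型做预测。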
280 | 281 | model.load_weights('weights/cmrc2018.weights') 282 | -------------------------------------------------------------------------------- /src/csl.py: -------------------------------------------------------------------------------- 1 | #! -*- coding:utf-8 -*- 2 | # CLUE评测 3 | # csl关键词匹配 4 | # 思路:将关键词拼接成句子后,当成双文本分类 5 | 6 | import json 7 | import numpy as np 8 | from snippets import * 9 | from bert4keras.snippets import sequence_padding, DataGenerator 10 | from bert4keras.snippets import open 11 | from tqdm import tqdm 12 | 13 | # 基本参数 14 | num_classes = 2 15 | maxlen = 128 16 | batch_size = 32 17 | epochs = 10 18 | 19 | 20 | def load_data(filename): 21 | """加载数据 22 | 格式:[(关键词, 文本, 标签id)] 23 | """ 24 | D = [] 25 | with open(filename) as f: 26 | for i, l in enumerate(f): 27 | l = json.loads(l) 28 | text1, text2 = ';'.join(l['keyword']), l['abst'] 29 | label = l.get('label', 0) 30 | D.append((text1, text2, int(label))) 31 | return D 32 | 33 | 34 | # 加载数据集 35 | train_data = load_data(data_path + 'csl/train.json') 36 | valid_data = load_data(data_path + 'csl/dev.json') 37 | 38 | 39 | class data_generator(DataGenerator): 40 | """数据生成器 41 | """ 42 | def __iter__(self, random=False): 43 | batch_token_ids, batch_segment_ids, batch_labels = [], [], [] 44 | for is_end, (text1, text2, label) in self.sample(random): 45 | token_ids, segment_ids = tokenizer.encode( 46 | text1, text2, maxlen=maxlen 47 | ) 48 | batch_token_ids.append(token_ids) 49 | batch_segment_ids.append([0] * len(segment_ids)) 50 | batch_labels.append([label]) 51 | if len(batch_token_ids) == self.batch_size or is_end: 52 | batch_token_ids = sequence_padding(batch_token_ids) 53 | batch_segment_ids = sequence_padding(batch_segment_ids) 54 | batch_labels = sequence_padding(batch_labels) 55 | yield [batch_token_ids, batch_segment_ids], batch_labels 56 | batch_token_ids, batch_segment_ids, batch_labels = [], [], [] 57 | 58 | 59 | # 转换数据集 60 | train_generator = data_generator(train_data, batch_size) 61 | valid_generator = data_generator(valid_data, batch_size) 62 | 63 | # 构建模型 64 | output = base.model.get_layer(last_layer).output 65 | output = pooling_layer(output) 66 | output = keras.layers.Dense( 67 | units=num_classes, 68 | activation='softmax', 69 | kernel_initializer=base.initializer 70 | )(output) 71 | 72 | model = keras.models.Model(base.model.input, output) 73 | model.summary() 74 | 75 | model.compile( 76 | loss='sparse_categorical_crossentropy', 77 | optimizer=optimizer, 78 | metrics=['accuracy'] 79 | ) 80 | 81 | 82 | class Evaluator(keras.callbacks.Callback): 83 | """保存验证集acc最好的模型 84 | """ 85 | def __init__(self): 86 | self.best_val_acc = 0. 87 | 88 | def on_epoch_end(self, epoch, logs=None): 89 | val_acc = self.evaluate(valid_generator) 90 | if val_acc > self.best_val_acc: 91 | self.best_val_acc = val_acc 92 | model.save_weights('weights/csl.weights') 93 | print( 94 | u'val_acc: %.5f, best_val_acc: %.5f\n' % 95 | (val_acc, self.best_val_acc) 96 | ) 97 | 98 | def evaluate(self, data): 99 | total, right = 0., 0. 
100 | for x_true, y_true in data: 101 | y_pred = model.predict(x_true).argmax(axis=1) 102 | y_true = y_true[:, 0] 103 | total += len(y_true) 104 | right += (y_true == y_pred).sum() 105 | return right / total 106 | 107 | 108 | def test_predict(in_file, out_file): 109 | """输出测试结果到文件 110 | 结果文件可以提交到 https://www.cluebenchmarks.com 评测。 111 | """ 112 | test_data = load_data(in_file) 113 | test_generator = data_generator(test_data, batch_size) 114 | 115 | results = [] 116 | for x_true, _ in tqdm(test_generator, ncols=0): 117 | y_pred = model.predict(x_true).argmax(axis=1) 118 | results.extend(y_pred) 119 | 120 | fw = open(out_file, 'w') 121 | with open(in_file) as fr: 122 | for l, r in zip(fr, results): 123 | l = json.loads(l) 124 | l = json.dumps({'id': str(l['id']), 'label': str(r)}) 125 | fw.write(l + '\n') 126 | fw.close() 127 | 128 | 129 | if __name__ == '__main__': 130 | 131 | evaluator = Evaluator() 132 | 133 | model.fit_generator( 134 | train_generator.forfit(), 135 | steps_per_epoch=len(train_generator), 136 | epochs=epochs, 137 | callbacks=[evaluator] 138 | ) 139 | 140 | model.load_weights('weights/csl.weights') 141 | test_predict( 142 | in_file=data_path + 'csl/test.json', 143 | out_file='results/csl_predict.json' 144 | ) 145 | 146 | else: 147 | 148 | model.load_weights('weights/csl.weights') 149 | -------------------------------------------------------------------------------- /src/iflytek.py: -------------------------------------------------------------------------------- 1 | #! -*- coding:utf-8 -*- 2 | # CLUE评测 3 | # iflytek文本分类 4 | # 思路:取[CLS]然后接Dense+Softmax分类 5 | 6 | import json 7 | import numpy as np 8 | from snippets import * 9 | from bert4keras.snippets import sequence_padding, DataGenerator 10 | from bert4keras.snippets import open 11 | from tqdm import tqdm 12 | 13 | # 基本参数 14 | num_classes = 119 15 | maxlen = 128 16 | batch_size = 32 17 | epochs = 10 18 | 19 | 20 | def load_data(filename): 21 | """加载数据 22 | 格式:[(文本, 标签id)] 23 | """ 24 | D = [] 25 | with open(filename) as f: 26 | for i, l in enumerate(f): 27 | l = json.loads(l) 28 | text, label = l['sentence'], l.get('label', 0) 29 | D.append((text, int(label))) 30 | return D 31 | 32 | 33 | # 加载数据集 34 | train_data = load_data(data_path + 'iflytek/train.json') 35 | valid_data = load_data(data_path + 'iflytek/dev.json') 36 | 37 | 38 | class data_generator(DataGenerator): 39 | """数据生成器 40 | """ 41 | def __iter__(self, random=False): 42 | batch_token_ids, batch_segment_ids, batch_labels = [], [], [] 43 | for is_end, (text, label) in self.sample(random): 44 | token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen) 45 | batch_token_ids.append(token_ids) 46 | batch_segment_ids.append(segment_ids) 47 | batch_labels.append([label]) 48 | if len(batch_token_ids) == self.batch_size or is_end: 49 | batch_token_ids = sequence_padding(batch_token_ids) 50 | batch_segment_ids = sequence_padding(batch_segment_ids) 51 | batch_labels = sequence_padding(batch_labels) 52 | yield [batch_token_ids, batch_segment_ids], batch_labels 53 | batch_token_ids, batch_segment_ids, batch_labels = [], [], [] 54 | 55 | 56 | # 转换数据集 57 | train_generator = data_generator(train_data, batch_size) 58 | valid_generator = data_generator(valid_data, batch_size) 59 | 60 | # 构建模型 61 | output = base.model.get_layer(last_layer).output 62 | output = pooling_layer(output) 63 | output = keras.layers.Dense( 64 | units=num_classes, 65 | activation='softmax', 66 | kernel_initializer=base.initializer 67 | )(output) 68 | 69 | model = keras.models.Model(base.model.input, 
output) 70 | model.summary() 71 | 72 | model.compile( 73 | loss='sparse_categorical_crossentropy', 74 | optimizer=optimizer, 75 | metrics=['accuracy'] 76 | ) 77 | 78 | 79 | class Evaluator(keras.callbacks.Callback): 80 | """保存验证集acc最好的模型 81 | """ 82 | def __init__(self): 83 | self.best_val_acc = 0. 84 | 85 | def on_epoch_end(self, epoch, logs=None): 86 | val_acc = self.evaluate(valid_generator) 87 | if val_acc > self.best_val_acc: 88 | self.best_val_acc = val_acc 89 | model.save_weights('weights/iflytek.weights') 90 | print( 91 | u'val_acc: %.5f, best_val_acc: %.5f\n' % 92 | (val_acc, self.best_val_acc) 93 | ) 94 | 95 | def evaluate(self, data): 96 | total, right = 0., 0. 97 | for x_true, y_true in data: 98 | y_pred = model.predict(x_true).argmax(axis=1) 99 | y_true = y_true[:, 0] 100 | total += len(y_true) 101 | right += (y_true == y_pred).sum() 102 | return right / total 103 | 104 | 105 | def test_predict(in_file, out_file): 106 | """输出测试结果到文件 107 | 结果文件可以提交到 https://www.cluebenchmarks.com 评测。 108 | """ 109 | test_data = load_data(in_file) 110 | test_generator = data_generator(test_data, batch_size) 111 | 112 | results = [] 113 | for x_true, _ in tqdm(test_generator, ncols=0): 114 | y_pred = model.predict(x_true).argmax(axis=1) 115 | results.extend(y_pred) 116 | 117 | fw = open(out_file, 'w') 118 | with open(in_file) as fr: 119 | for l, r in zip(fr, results): 120 | l = json.loads(l) 121 | l = json.dumps({'id': str(l['id']), 'label': str(r)}) 122 | fw.write(l + '\n') 123 | fw.close() 124 | 125 | 126 | if __name__ == '__main__': 127 | 128 | evaluator = Evaluator() 129 | 130 | model.fit_generator( 131 | train_generator.forfit(), 132 | steps_per_epoch=len(train_generator), 133 | epochs=epochs, 134 | callbacks=[evaluator] 135 | ) 136 | 137 | model.load_weights('weights/iflytek.weights') 138 | test_predict( 139 | in_file=data_path + 'iflytek/test.json', 140 | out_file='results/iflytek_predict.json' 141 | ) 142 | 143 | else: 144 | 145 | model.load_weights('weights/iflytek.weights') 146 | -------------------------------------------------------------------------------- /src/ocnli.py: -------------------------------------------------------------------------------- 1 | #! 
-*- coding:utf-8 -*- 2 | # CLUE评测 3 | # ocnli自然语言推理 4 | # 思路:文本拼接后取[CLS]然后接Dense+Softmax分类 5 | 6 | import json 7 | import numpy as np 8 | from snippets import * 9 | from bert4keras.snippets import sequence_padding, DataGenerator 10 | from bert4keras.snippets import open 11 | from tqdm import tqdm 12 | 13 | # 基本参数 14 | labels = ['entailment', 'neutral', 'contradiction'] 15 | num_classes = len(labels) 16 | maxlen = 128 17 | batch_size = 32 18 | epochs = 10 19 | 20 | 21 | def load_data(filename): 22 | """加载数据 23 | 格式:[(文本1, 文本2, 标签id)] 24 | """ 25 | D = [] 26 | with open(filename) as f: 27 | for i, l in enumerate(f): 28 | l = json.loads(l) 29 | text1, text2 = l['sentence1'], l['sentence2'] 30 | label = l.get('label', 'neutral') 31 | if label in labels: 32 | D.append((text1, text2, labels.index(label))) 33 | return D 34 | 35 | 36 | # 加载数据集 37 | train_data = load_data(data_path + 'ocnli/train.50k.json') 38 | valid_data = load_data(data_path + 'ocnli/dev.json') 39 | 40 | 41 | class data_generator(DataGenerator): 42 | """数据生成器 43 | """ 44 | def __iter__(self, random=False): 45 | batch_token_ids, batch_segment_ids, batch_labels = [], [], [] 46 | for is_end, (text1, text2, label) in self.sample(random): 47 | token_ids, segment_ids = tokenizer.encode( 48 | text1, text2, maxlen=maxlen 49 | ) 50 | batch_token_ids.append(token_ids) 51 | batch_segment_ids.append([0] * len(segment_ids)) 52 | batch_labels.append([label]) 53 | if len(batch_token_ids) == self.batch_size or is_end: 54 | batch_token_ids = sequence_padding(batch_token_ids) 55 | batch_segment_ids = sequence_padding(batch_segment_ids) 56 | batch_labels = sequence_padding(batch_labels) 57 | yield [batch_token_ids, batch_segment_ids], batch_labels 58 | batch_token_ids, batch_segment_ids, batch_labels = [], [], [] 59 | 60 | 61 | # 转换数据集 62 | train_generator = data_generator(train_data, batch_size) 63 | valid_generator = data_generator(valid_data, batch_size) 64 | 65 | # 构建模型 66 | output = base.model.get_layer(last_layer).output 67 | output = pooling_layer(output) 68 | output = keras.layers.Dense( 69 | units=num_classes, 70 | activation='softmax', 71 | kernel_initializer=base.initializer 72 | )(output) 73 | 74 | model = keras.models.Model(base.model.input, output) 75 | model.summary() 76 | 77 | model.compile( 78 | loss='sparse_categorical_crossentropy', 79 | optimizer=optimizer, 80 | metrics=['accuracy'] 81 | ) 82 | 83 | 84 | class Evaluator(keras.callbacks.Callback): 85 | """保存验证集acc最好的模型 86 | """ 87 | def __init__(self): 88 | self.best_val_acc = 0. 89 | 90 | def on_epoch_end(self, epoch, logs=None): 91 | val_acc = self.evaluate(valid_generator) 92 | if val_acc > self.best_val_acc: 93 | self.best_val_acc = val_acc 94 | model.save_weights('weights/ocnli.weights') 95 | print( 96 | u'val_acc: %.5f, best_val_acc: %.5f\n' % 97 | (val_acc, self.best_val_acc) 98 | ) 99 | 100 | def evaluate(self, data): 101 | total, right = 0., 0. 
102 | for x_true, y_true in data: 103 | y_pred = model.predict(x_true).argmax(axis=1) 104 | y_true = y_true[:, 0] 105 | total += len(y_true) 106 | right += (y_true == y_pred).sum() 107 | return right / total 108 | 109 | 110 | def test_predict(in_file, out_file): 111 | """输出测试结果到文件 112 | 结果文件可以提交到 https://www.cluebenchmarks.com 评测。 113 | """ 114 | test_data = load_data(in_file) 115 | test_generator = data_generator(test_data, batch_size) 116 | 117 | results = [] 118 | for x_true, _ in tqdm(test_generator, ncols=0): 119 | y_pred = model.predict(x_true).argmax(axis=1) 120 | results.extend(y_pred) 121 | 122 | fw = open(out_file, 'w') 123 | with open(in_file) as fr: 124 | for l, r in zip(fr, results): 125 | l = json.loads(l) 126 | l = json.dumps({'id': str(l['id']), 'label': labels[r]}) 127 | fw.write(l + '\n') 128 | fw.close() 129 | 130 | 131 | if __name__ == '__main__': 132 | 133 | evaluator = Evaluator() 134 | 135 | model.fit_generator( 136 | train_generator.forfit(), 137 | steps_per_epoch=len(train_generator), 138 | epochs=epochs, 139 | callbacks=[evaluator] 140 | ) 141 | 142 | model.load_weights('weights/ocnli.weights') 143 | test_predict( 144 | in_file=data_path + 'ocnli/test.json', 145 | out_file='results/ocnli_50k_predict.json' 146 | ) 147 | 148 | else: 149 | 150 | model.load_weights('weights/ocnli.weights') 151 | -------------------------------------------------------------------------------- /src/snippets.py: -------------------------------------------------------------------------------- 1 | #! -*- coding: utf-8 -*- 2 | # CLUE评测 3 | # 模型配置文件 4 | 5 | import os 6 | from bert4keras.backend import keras, K 7 | from bert4keras.tokenizers import Tokenizer 8 | from bert4keras.models import build_transformer_model 9 | from bert4keras.optimizers import AdaFactor 10 | from bert4keras.optimizers import extend_with_gradient_accumulation 11 | 12 | # 通用参数 13 | data_path = '/root/clue/datasets/' 14 | learning_rate = 5e-4 15 | pooling = 'first' 16 | 17 | # 权重目录 18 | if not os.path.exists('weights'): 19 | os.mkdir('weights') 20 | 21 | # 输出目录 22 | if not os.path.exists('results'): 23 | os.mkdir('results') 24 | 25 | # 模型路径 26 | config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json' 27 | checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt' 28 | dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt' 29 | 30 | # 建立分词器 31 | tokenizer = Tokenizer(dict_path, do_lower_case=True) 32 | 33 | # 预训练模型 34 | base = build_transformer_model( 35 | config_path, checkpoint_path, application='unilm', return_keras_model=False 36 | ) 37 | 38 | # 模型参数 39 | last_layer = 'Transformer-%s-FeedForward-Norm' % (base.num_hidden_layers - 1) 40 | 41 | if pooling == 'first': 42 | pooling_layer = keras.layers.Lambda(lambda x: x[:, 0]) 43 | elif pooling == 'avg': 44 | pooling_layer = keras.layers.GlobalAveragePooling1D() 45 | elif pooling == 'max': 46 | pooling_layer = keras.layers.GlobalMaxPooling1D() 47 | 48 | # 优化器 49 | AdaFactorG = extend_with_gradient_accumulation(AdaFactor, name='AdaFactorG') 50 | 51 | optimizer = AdaFactor( 52 | learning_rate=learning_rate, beta1=0.9, min_dim_size_to_factor=10**6 53 | ) 54 | 55 | optimizer2 = AdaFactorG( 56 | learning_rate=learning_rate, 57 | beta1=0.9, 58 | min_dim_size_to_factor=10**6, 59 | grad_accum_steps=2 60 | ) 61 | 62 | optimizer4 = AdaFactorG( 63 | learning_rate=learning_rate, 64 | beta1=0.9, 65 | min_dim_size_to_factor=10**6, 66 | grad_accum_steps=4 67 | ) 68 | -------------------------------------------------------------------------------- 
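补充示例(非仓库文件,仅作示意):README 提到显存不足时可以适当降低batch_size并启用梯度累积,snippets.py 中已定义了累积2步、4步的 optimizer2、optimizer4(optimizer2 被 chid.py/cmrc2018.py 使用,optimizer4 被 c3.py 使用)。若需要其他累积步数,可以仿照 snippets.py 自行扩展,下面的 optimizer8 只是示例中新起的名字:

from bert4keras.optimizers import AdaFactor
from bert4keras.optimizers import extend_with_gradient_accumulation

AdaFactorG = extend_with_gradient_accumulation(AdaFactor, name='AdaFactorG')

# 示意:累积8步梯度,用时间换显存(optimizer8 为示例用的新名字,非仓库原有变量)
optimizer8 = AdaFactorG(
    learning_rate=5e-4,
    beta1=0.9,
    min_dim_size_to_factor=10**6,
    grad_accum_steps=8
)

在任务脚本中把 model.compile(...) 里的 optimizer 换成上面定义的优化器即可。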
/src/tnews.py: -------------------------------------------------------------------------------- 1 | #! -*- coding:utf-8 -*- 2 | # CLUE评测 3 | # tnews文本分类 4 | # 思路:取[CLS]然后接Dense+Softmax分类 5 | 6 | import json 7 | import numpy as np 8 | from snippets import * 9 | from bert4keras.snippets import sequence_padding, DataGenerator 10 | from bert4keras.snippets import open 11 | from tqdm import tqdm 12 | 13 | # 基本参数 14 | labels = [ 15 | '100', '101', '102', '103', '104', '106', '107', '108', '109', '110', '112', 16 | '113', '114', '115', '116' 17 | ] 18 | num_classes = len(labels) 19 | maxlen = 128 20 | batch_size = 32 21 | epochs = 10 22 | 23 | 24 | def load_data(filename): 25 | """加载数据 26 | 格式:[(文本, 标签id)] 27 | """ 28 | D = [] 29 | with open(filename) as f: 30 | for i, l in enumerate(f): 31 | l = json.loads(l) 32 | text, label = l['sentence'], l.get('label', '100') 33 | D.append((text, labels.index(label))) 34 | return D 35 | 36 | 37 | # 加载数据集 38 | train_data = load_data(data_path + 'tnews/train.json') 39 | valid_data = load_data(data_path + 'tnews/dev.json') 40 | 41 | 42 | class data_generator(DataGenerator): 43 | """数据生成器 44 | """ 45 | def __iter__(self, random=False): 46 | batch_token_ids, batch_segment_ids, batch_labels = [], [], [] 47 | for is_end, (text, label) in self.sample(random): 48 | token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen) 49 | batch_token_ids.append(token_ids) 50 | batch_segment_ids.append(segment_ids) 51 | batch_labels.append([label]) 52 | if len(batch_token_ids) == self.batch_size or is_end: 53 | batch_token_ids = sequence_padding(batch_token_ids) 54 | batch_segment_ids = sequence_padding(batch_segment_ids) 55 | batch_labels = sequence_padding(batch_labels) 56 | yield [batch_token_ids, batch_segment_ids], batch_labels 57 | batch_token_ids, batch_segment_ids, batch_labels = [], [], [] 58 | 59 | 60 | # 转换数据集 61 | train_generator = data_generator(train_data, batch_size) 62 | valid_generator = data_generator(valid_data, batch_size) 63 | 64 | # 构建模型 65 | output = base.model.get_layer(last_layer).output 66 | output = pooling_layer(output) 67 | output = keras.layers.Dense( 68 | units=num_classes, 69 | activation='softmax', 70 | kernel_initializer=base.initializer 71 | )(output) 72 | 73 | model = keras.models.Model(base.model.input, output) 74 | model.summary() 75 | 76 | model.compile( 77 | loss='sparse_categorical_crossentropy', 78 | optimizer=optimizer, 79 | metrics=['accuracy'] 80 | ) 81 | 82 | 83 | class Evaluator(keras.callbacks.Callback): 84 | """保存验证集acc最好的模型 85 | """ 86 | def __init__(self): 87 | self.best_val_acc = 0. 88 | 89 | def on_epoch_end(self, epoch, logs=None): 90 | val_acc = self.evaluate(valid_generator) 91 | if val_acc > self.best_val_acc: 92 | self.best_val_acc = val_acc 93 | model.save_weights('weights/tnews.weights') 94 | print( 95 | u'val_acc: %.5f, best_val_acc: %.5f\n' % 96 | (val_acc, self.best_val_acc) 97 | ) 98 | 99 | def evaluate(self, data): 100 | total, right = 0., 0. 
101 | for x_true, y_true in data: 102 | y_pred = model.predict(x_true).argmax(axis=1) 103 | y_true = y_true[:, 0] 104 | total += len(y_true) 105 | right += (y_true == y_pred).sum() 106 | return right / total 107 | 108 | 109 | def test_predict(in_file, out_file): 110 | """输出测试结果到文件 111 | 结果文件可以提交到 https://www.cluebenchmarks.com 评测。 112 | """ 113 | test_data = load_data(in_file) 114 | test_generator = data_generator(test_data, batch_size) 115 | 116 | results = [] 117 | for x_true, _ in tqdm(test_generator, ncols=0): 118 | y_pred = model.predict(x_true).argmax(axis=1) 119 | results.extend(y_pred) 120 | 121 | fw = open(out_file, 'w') 122 | with open(in_file) as fr: 123 | for l, r in zip(fr, results): 124 | l = json.loads(l) 125 | l = json.dumps({'id': str(l['id']), 'label': labels[r]}) 126 | fw.write(l + '\n') 127 | fw.close() 128 | 129 | 130 | if __name__ == '__main__': 131 | 132 | evaluator = Evaluator() 133 | 134 | model.fit_generator( 135 | train_generator.forfit(), 136 | steps_per_epoch=len(train_generator), 137 | epochs=epochs, 138 | callbacks=[evaluator] 139 | ) 140 | 141 | model.load_weights('weights/tnews.weights') 142 | test_predict( 143 | in_file=data_path + 'tnews/test1.0.json', 144 | out_file='results/tnews10_predict.json' 145 | ) 146 | test_predict( 147 | in_file=data_path + 'tnews/test.json', 148 | out_file='results/tnews11_predict.json' 149 | ) 150 | 151 | else: 152 | 153 | model.load_weights('weights/tnews.weights') 154 | -------------------------------------------------------------------------------- /src/wsc.py: -------------------------------------------------------------------------------- 1 | #! -*- coding:utf-8 -*- 2 | # CLUE评测 3 | # wsc代词消歧 4 | # 思路:给span1、span2加上特殊标记,然后当成文本分类 5 | 6 | import json 7 | import numpy as np 8 | from snippets import * 9 | from bert4keras.snippets import sequence_padding, DataGenerator 10 | from bert4keras.snippets import open 11 | from tqdm import tqdm 12 | 13 | # 基本参数 14 | labels = ['false', 'true'] 15 | num_classes = len(labels) 16 | maxlen = 128 17 | batch_size = 32 18 | epochs = 30 19 | 20 | 21 | def load_data(filename): 22 | """加载数据 23 | 格式:[(文本, 标签id)] 24 | """ 25 | D = [] 26 | with open(filename) as f: 27 | for i, l in enumerate(f): 28 | l = json.loads(l) 29 | text, label = l['text'], labels.index(l.get('label', 'false')) 30 | s1 = l['target']['span1_index'] 31 | e1 = s1 + len(l['target']['span1_text']) 32 | s2 = l['target']['span2_index'] 33 | e2 = s2 + len(l['target']['span2_text']) 34 | if s1 < s2: 35 | text = ( 36 | text[:s1] + '_' + text[s1:e1] + '_' + text[e1:s2] + '[' + 37 | text[s2:e2] + ']' + text[e2:] 38 | ) 39 | else: 40 | text = ( 41 | text[:s2] + '[' + text[s2:e2] + ']' + text[e2:s1] + '_' + 42 | text[s1:e1] + '_' + text[e1:] 43 | ) 44 | D.append((text, label)) 45 | return D 46 | 47 | 48 | # 加载数据集 49 | train_data = load_data(data_path + 'wsc/train.json') 50 | valid_data = load_data(data_path + 'wsc/dev.json') 51 | 52 | 53 | class data_generator(DataGenerator): 54 | """数据生成器 55 | """ 56 | def __iter__(self, random=False): 57 | batch_token_ids, batch_segment_ids, batch_labels = [], [], [] 58 | for is_end, (text, label) in self.sample(random): 59 | token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen) 60 | batch_token_ids.append(token_ids) 61 | batch_segment_ids.append(segment_ids) 62 | batch_labels.append([label]) 63 | if len(batch_token_ids) == self.batch_size or is_end: 64 | batch_token_ids = sequence_padding(batch_token_ids) 65 | batch_segment_ids = sequence_padding(batch_segment_ids) 66 | batch_labels = 
sequence_padding(batch_labels) 67 | yield [batch_token_ids, batch_segment_ids], batch_labels 68 | batch_token_ids, batch_segment_ids, batch_labels = [], [], [] 69 | 70 | 71 | # 转换数据集 72 | train_generator = data_generator(train_data, batch_size) 73 | valid_generator = data_generator(valid_data, batch_size) 74 | 75 | # 构建模型 76 | output = base.model.get_layer(last_layer).output 77 | output = pooling_layer(output) 78 | output = keras.layers.Dense( 79 | units=num_classes, 80 | activation='softmax', 81 | kernel_initializer=base.initializer 82 | )(output) 83 | 84 | model = keras.models.Model(base.model.input, output) 85 | model.summary() 86 | 87 | model.compile( 88 | loss='sparse_categorical_crossentropy', 89 | optimizer=optimizer, 90 | metrics=['accuracy'] 91 | ) 92 | 93 | 94 | class Evaluator(keras.callbacks.Callback): 95 | """保存验证集acc最好的模型 96 | """ 97 | def __init__(self): 98 | self.best_val_acc = 0. 99 | 100 | def on_epoch_end(self, epoch, logs=None): 101 | val_acc = self.evaluate(valid_generator) 102 | if val_acc > self.best_val_acc: 103 | self.best_val_acc = val_acc 104 | model.save_weights('weights/wsc.weights') 105 | print( 106 | u'val_acc: %.5f, best_val_acc: %.5f\n' % 107 | (val_acc, self.best_val_acc) 108 | ) 109 | 110 | def evaluate(self, data): 111 | total, right = 0., 0. 112 | for x_true, y_true in data: 113 | y_pred = model.predict(x_true).argmax(axis=1) 114 | y_true = y_true[:, 0] 115 | total += len(y_true) 116 | right += (y_true == y_pred).sum() 117 | return right / total 118 | 119 | 120 | def test_predict(in_file, out_file): 121 | """输出测试结果到文件 122 | 结果文件可以提交到 https://www.cluebenchmarks.com 评测。 123 | """ 124 | test_data = load_data(in_file) 125 | test_generator = data_generator(test_data, batch_size) 126 | 127 | results = [] 128 | for x_true, _ in tqdm(test_generator, ncols=0): 129 | y_pred = model.predict(x_true).argmax(axis=1) 130 | results.extend(y_pred) 131 | 132 | fw = open(out_file, 'w') 133 | with open(in_file) as fr: 134 | for l, r in zip(fr, results): 135 | l = json.loads(l) 136 | l = json.dumps({'id': str(l['id']), 'label': labels[r]}) 137 | fw.write(l + '\n') 138 | fw.close() 139 | 140 | 141 | if __name__ == '__main__': 142 | 143 | evaluator = Evaluator() 144 | 145 | model.fit_generator( 146 | train_generator.forfit(), 147 | steps_per_epoch=len(train_generator), 148 | epochs=epochs, 149 | callbacks=[evaluator] 150 | ) 151 | 152 | model.load_weights('weights/wsc.weights') 153 | test_predict( 154 | in_file=data_path + 'wsc/test1.0.json', 155 | out_file='results/cluewsc10_predict.json' 156 | ) 157 | test_predict( 158 | in_file=data_path + 'wsc/test.json', 159 | out_file='results/cluewsc11_predict.json' 160 | ) 161 | 162 | else: 163 | 164 | model.load_weights('weights/wsc.weights') 165 | --------------------------------------------------------------------------------
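附:使用示例(非仓库文件,仅作示意)。各任务脚本末尾的 if/else 结构意味着:直接运行脚本会训练并输出预测结果;作为模块import则只加载已训练好的权重,便于复用。下面以 afqmc 为例(假设 weights/afqmc.weights 已训练好,且 data_path 下有 afqmc 数据集;import 时会在模块级读取数据并构建模型):

# 示意:导入afqmc模块,复用已训练模型做单条预测
import numpy as np
import afqmc  # import时走else分支,自动加载 weights/afqmc.weights

text1, text2 = u'花呗怎么还款', u'花呗如何还款'  # 示意输入
token_ids, segment_ids = afqmc.tokenizer.encode(text1, text2, maxlen=afqmc.maxlen)
# afqmc.py 的 data_generator 中 segment_ids 全部置0,这里保持一致
x = [np.array([token_ids]), np.array([[0] * len(segment_ids)])]
print(afqmc.model.predict(x).argmax(axis=1))  # 输出预测的标签id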