├── README.md
├── sentiment.py
└── tnews.py

/README.md:
--------------------------------------------------------------------------------
# Pattern-Exploiting Training

Simple experiments with PET on Chinese tasks

## Write-up
https://kexue.fm/archives/7764

## Environment
The experiments use tensorflow 1.14 + keras 2.3.1 + bert4keras 0.8.8

## Data
- **Sentiment classification**: https://github.com/bojone/bert4keras/blob/master/examples/datasets/sentiment.zip
- **Short news classification**: https://pan.baidu.com/s/1G6MM9K42OTijqiOX0qOcUQ (extraction code: m5bq)

## Contact
QQ group: 67729435; for the WeChat group, add the bot WeChat account spaces_ac_cn
--------------------------------------------------------------------------------
/sentiment.py:
--------------------------------------------------------------------------------
#! -*- coding:utf-8 -*-
# Sentiment analysis example: Zero-Shot/Few-Shot/Semi-Supervised Learning via MLM

import numpy as np
from bert4keras.backend import keras, K
from bert4keras.layers import Loss
from bert4keras.tokenizers import Tokenizer
from bert4keras.models import build_transformer_model
from bert4keras.optimizers import Adam
from bert4keras.snippets import sequence_padding, DataGenerator
from bert4keras.snippets import open

num_classes = 2
maxlen = 128
batch_size = 32
config_path = '/root/kg/bert/chinese_roberta_wwm_ext_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '/root/kg/bert/chinese_roberta_wwm_ext_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '/root/kg/bert/chinese_roberta_wwm_ext_L-12_H-768_A-12/vocab.txt'


def load_data(filename):
    D = []
    with open(filename, encoding='utf-8') as f:
        for l in f:
            text, label = l.strip().split('\t')
            D.append((text, int(label)))
    return D


# Load the datasets
train_data = load_data('datasets/sentiment/sentiment.train.data')
valid_data = load_data('datasets/sentiment/sentiment.valid.data')
test_data = load_data('datasets/sentiment/sentiment.test.data')

# Simulate labeled and unlabeled data
train_frac = 0.01  # fraction of the data kept as labeled
num_labeled = int(len(train_data) * train_frac)
unlabeled_data = [(t, 2) for t, l in train_data[num_labeled:]]  # label 2 = unlabeled
train_data = train_data[:num_labeled]
train_data = train_data + unlabeled_data

# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

# The task description (pattern); u'很满意。' reads "Very satisfied."
prefix = u'很满意。'
mask_idx = 1
pos_id = tokenizer.token_to_id(u'很')  # verbalizer for the positive class
neg_id = tokenizer.token_to_id(u'不')  # verbalizer for the negative class
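# How the pattern works (a sketch; positions assume the tokenizer above puts
# [CLS] at index 0): with the prefix prepended, an encoded text begins
#   [CLS] 很 满 意 。 ...
# so mask_idx = 1 is the sentiment slot. During training that position is
# replaced with [MASK] and the MLM target is u'很' ("very satisfied", positive)
# or u'不' ("not satisfied", negative), turning binary classification into a
# cloze task over the two verbalizer tokens.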
def random_masking(token_ids):
    """Randomly mask the input, BERT-style: of the 15% selected positions,
    80% become [MASK], 10% are kept as-is, and 10% get a random token;
    unselected positions get target 0, which the loss later masks out.
    """
    rands = np.random.random(len(token_ids))
    source, target = [], []
    for r, t in zip(rands, token_ids):
        if r < 0.15 * 0.8:
            source.append(tokenizer._token_mask_id)
            target.append(t)
        elif r < 0.15 * 0.9:
            source.append(t)
            target.append(t)
        elif r < 0.15:
            source.append(np.random.choice(tokenizer._vocab_size - 1) + 1)
            target.append(t)
        else:
            source.append(t)
            target.append(0)
    return source, target


class data_generator(DataGenerator):
    """Data generator
    """
    def __iter__(self, random=False):
        batch_token_ids, batch_segment_ids, batch_output_ids = [], [], []
        for is_end, (text, label) in self.sample(random):
            if label != 2:  # labeled sample: prepend the pattern
                text = prefix + text
            token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
            if random:
                source_ids, target_ids = random_masking(token_ids)
            else:
                source_ids, target_ids = token_ids[:], token_ids[:]
            if label == 0:
                source_ids[mask_idx] = tokenizer._token_mask_id
                target_ids[mask_idx] = neg_id
            elif label == 1:
                source_ids[mask_idx] = tokenizer._token_mask_id
                target_ids[mask_idx] = pos_id
            batch_token_ids.append(source_ids)
            batch_segment_ids.append(segment_ids)
            batch_output_ids.append(target_ids)
            if len(batch_token_ids) == self.batch_size or is_end:
                batch_token_ids = sequence_padding(batch_token_ids)
                batch_segment_ids = sequence_padding(batch_segment_ids)
                batch_output_ids = sequence_padding(batch_output_ids)
                yield [
                    batch_token_ids, batch_segment_ids, batch_output_ids
                ], None
                batch_token_ids, batch_segment_ids, batch_output_ids = [], [], []


class CrossEntropy(Loss):
    """Cross entropy as the loss, with the input (non-target) positions masked out
    """
    def compute_loss(self, inputs, mask=None):
        y_true, y_pred = inputs
        y_mask = K.cast(K.not_equal(y_true, 0), K.floatx())
        accuracy = keras.metrics.sparse_categorical_accuracy(y_true, y_pred)
        accuracy = K.sum(accuracy * y_mask) / K.sum(y_mask)
        self.add_metric(accuracy, name='accuracy')
        loss = K.sparse_categorical_crossentropy(y_true, y_pred)
        loss = K.sum(loss * y_mask) / K.sum(y_mask)
        return loss


# Load the pretrained model
model = build_transformer_model(
    config_path=config_path, checkpoint_path=checkpoint_path, with_mlm=True
)

# Model used for training
y_in = keras.layers.Input(shape=(None,))
outputs = CrossEntropy(1)([y_in, model.output])

train_model = keras.models.Model(model.inputs + [y_in], outputs)
train_model.compile(optimizer=Adam(1e-5))
train_model.summary()

# Wrap the datasets
train_generator = data_generator(train_data, batch_size)
valid_generator = data_generator(valid_data, batch_size)
test_generator = data_generator(test_data, batch_size)


class Evaluator(keras.callbacks.Callback):
    def __init__(self):
        self.best_val_acc = 0.

    def on_epoch_end(self, epoch, logs=None):
        model.save_weights('mlm_model.weights')  # checkpoint every epoch
        val_acc = evaluate(valid_generator)
        if val_acc > self.best_val_acc:
            self.best_val_acc = val_acc
            model.save_weights('best_model.weights')
        test_acc = evaluate(test_generator)
        print(
            u'val_acc: %.5f, best_val_acc: %.5f, test_acc: %.5f\n' %
            (val_acc, self.best_val_acc, test_acc)
        )


def evaluate(data):
    total, right = 0., 0.
    for x_true, _ in data:
        x_true, y_true = x_true[:2], x_true[2]
        y_pred = model.predict(x_true)
        # Compare the MLM probabilities of the two verbalizers at the mask slot
        y_pred = y_pred[:, mask_idx, [neg_id, pos_id]].argmax(axis=1)
        y_true = (y_true[:, mask_idx] == pos_id).astype(int)
        total += len(y_true)
        right += (y_true == y_pred).sum()
    return right / total
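# A minimal single-text prediction helper. This is a sketch rather than part
# of the original script: the name predict_sentiment is hypothetical, and it
# assumes the tokenizer/model defined above. It mirrors the scoring used in
# evaluate().
def predict_sentiment(text):
    token_ids, segment_ids = tokenizer.encode(prefix + text, maxlen=maxlen)
    token_ids[mask_idx] = tokenizer._token_mask_id  # blank out the pattern slot
    probas = model.predict([np.array([token_ids]), np.array([segment_ids])])
    return int(probas[0, mask_idx, [neg_id, pos_id]].argmax())  # 0=neg, 1=pos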
if __name__ == '__main__':

    evaluator = Evaluator()

    train_model.fit_generator(
        train_generator.forfit(),
        steps_per_epoch=len(train_generator),
        epochs=1000,
        callbacks=[evaluator]
    )

else:

    model.load_weights('best_model.weights')
--------------------------------------------------------------------------------
/tnews.py:
--------------------------------------------------------------------------------
#! -*- coding:utf-8 -*-
# News classification example: Zero-Shot/Few-Shot/Semi-Supervised Learning via MLM

import json
import numpy as np
from bert4keras.backend import keras, K
from bert4keras.layers import Loss
from bert4keras.tokenizers import Tokenizer
from bert4keras.models import build_transformer_model
from bert4keras.optimizers import Adam
from bert4keras.snippets import sequence_padding, DataGenerator
from bert4keras.snippets import open

# Category names; each is exactly two characters, matching the two mask slots below
labels = [
    u'文化', u'娱乐', u'体育', u'财经', u'房产', u'汽车', u'教育', u'科技', u'军事', u'旅游', u'国际',
    u'证券', u'农业', u'电竞', u'民生'
]
num_classes = len(labels)
maxlen = 128
batch_size = 32
config_path = '/root/kg/bert/chinese_roberta_wwm_ext_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '/root/kg/bert/chinese_roberta_wwm_ext_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '/root/kg/bert/chinese_roberta_wwm_ext_L-12_H-768_A-12/vocab.txt'


def load_data(filename):
    D = []
    with open(filename, encoding='utf-8') as f:
        for l in f:
            l = json.loads(l)
            D.append((l['text'], l['label_name']))
    return D


# Load the datasets, keeping only a slice of each to simulate a small dataset
train_data = load_data('/root/short_news/train.json')[:20000]
valid_data = load_data('/root/short_news/val.json')[:2000]
test_data = load_data('/root/short_news/test.json')[:2000]

# Simulate labeled and unlabeled data
train_frac = 0.0  # fraction of the data kept as labeled (0.0 = zero-shot)
num_labeled = int(len(train_data) * train_frac)
unlabeled_data = [(t, u'无标签') for t, l in train_data[num_labeled:]]  # u'无标签' = "unlabeled"
train_data = train_data[:num_labeled]
train_data = train_data + unlabeled_data

# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

# The task description (pattern); u'下面报导一则体育新闻。' reads
# "The following reports a piece of sports news."
prefix = u'下面报导一则体育新闻。'
mask_idxs = [7, 8]
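# Why [7, 8]: with [CLS] at index 0, the prefix occupies positions 1-11:
#   [CLS] 下 面 报 导 一 则 体 育 新 闻 。
#     0   1  2  3  4  5  6  7  8  9  10 11
# so the two-character category slot (u'体育' in the template) sits at
# positions 7 and 8. Both positions are masked and the MLM predicts the two
# characters of the label, which is why every entry in `labels` above is
# exactly two characters long.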
def random_masking(token_ids):
    """Randomly mask the input, BERT-style: of the 15% selected positions,
    80% become [MASK], 10% are kept as-is, and 10% get a random token;
    unselected positions get target 0, which the loss later masks out.
    """
    rands = np.random.random(len(token_ids))
    source, target = [], []
    for r, t in zip(rands, token_ids):
        if r < 0.15 * 0.8:
            source.append(tokenizer._token_mask_id)
            target.append(t)
        elif r < 0.15 * 0.9:
            source.append(t)
            target.append(t)
        elif r < 0.15:
            source.append(np.random.choice(tokenizer._vocab_size - 1) + 1)
            target.append(t)
        else:
            source.append(t)
            target.append(0)
    return source, target


class data_generator(DataGenerator):
    """Data generator
    """
    def __iter__(self, random=False):
        batch_token_ids, batch_segment_ids, batch_output_ids = [], [], []
        for is_end, (text, label) in self.sample(random):
            if len(label) == 2:  # labeled sample (two-character category name)
                text = prefix + text
            token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
            if random:
                source_ids, target_ids = random_masking(token_ids)
            else:
                source_ids, target_ids = token_ids[:], token_ids[:]
            if len(label) == 2:
                label_ids = tokenizer.encode(label)[0][1:-1]  # drop [CLS]/[SEP]
                for i, j in zip(mask_idxs, label_ids):
                    source_ids[i] = tokenizer._token_mask_id
                    target_ids[i] = j
            batch_token_ids.append(source_ids)
            batch_segment_ids.append(segment_ids)
            batch_output_ids.append(target_ids)
            if len(batch_token_ids) == self.batch_size or is_end:
                batch_token_ids = sequence_padding(batch_token_ids)
                batch_segment_ids = sequence_padding(batch_segment_ids)
                batch_output_ids = sequence_padding(batch_output_ids)
                yield [
                    batch_token_ids, batch_segment_ids, batch_output_ids
                ], None
                batch_token_ids, batch_segment_ids, batch_output_ids = [], [], []


class CrossEntropy(Loss):
    """Cross entropy as the loss, with the input (non-target) positions masked out
    """
    def compute_loss(self, inputs, mask=None):
        y_true, y_pred = inputs
        y_mask = K.cast(K.not_equal(y_true, 0), K.floatx())
        accuracy = keras.metrics.sparse_categorical_accuracy(y_true, y_pred)
        accuracy = K.sum(accuracy * y_mask) / K.sum(y_mask)
        self.add_metric(accuracy, name='accuracy')
        loss = K.sparse_categorical_crossentropy(y_true, y_pred)
        loss = K.sum(loss * y_mask) / K.sum(y_mask)
        return loss


# Load the pretrained model
model = build_transformer_model(
    config_path=config_path, checkpoint_path=checkpoint_path, with_mlm=True
)

# Model used for training
y_in = keras.layers.Input(shape=(None,))
outputs = CrossEntropy(1)([y_in, model.output])

train_model = keras.models.Model(model.inputs + [y_in], outputs)
train_model.compile(optimizer=Adam(1e-5))
train_model.summary()

# Wrap the datasets
train_generator = data_generator(train_data, batch_size)
valid_generator = data_generator(valid_data, batch_size)
test_generator = data_generator(test_data, batch_size)


class Evaluator(keras.callbacks.Callback):
    def __init__(self):
        self.best_val_acc = 0.

    def on_epoch_end(self, epoch, logs=None):
        model.save_weights('mlm_model.weights')  # checkpoint every epoch
        val_acc = evaluate(valid_generator)
        if val_acc > self.best_val_acc:
            self.best_val_acc = val_acc
            model.save_weights('best_model.weights')
        test_acc = evaluate(test_generator)
        print(
            u'val_acc: %.5f, best_val_acc: %.5f, test_acc: %.5f\n' %
            (val_acc, self.best_val_acc, test_acc)
        )


def evaluate(data):
    # Token ids of the two characters of every label, shape (num_classes, 2)
    label_ids = np.array([tokenizer.encode(l)[0][1:-1] for l in labels])
    total, right = 0., 0.
    for x_true, _ in data:
        x_true, y_true = x_true[:2], x_true[2]
        y_pred = model.predict(x_true)[:, mask_idxs]
        # Joint probability of each label's two characters at the two mask slots
        y_pred = y_pred[:, 0, label_ids[:, 0]] * y_pred[:, 1, label_ids[:, 1]]
        y_pred = y_pred.argmax(axis=1)
        y_true = np.array([
            labels.index(tokenizer.decode(y)) for y in y_true[:, mask_idxs]
        ])
        total += len(y_true)
        right += (y_true == y_pred).sum()
    return right / total


if __name__ == '__main__':

    evaluator = Evaluator()

    train_model.fit_generator(
        train_generator.forfit(),
        steps_per_epoch=len(train_generator),
        epochs=1000,
        callbacks=[evaluator]
    )

else:

    model.load_weights('best_model.weights')
--------------------------------------------------------------------------------