├── README.md
├── gpt.py
└── bert.py

/README.md:
--------------------------------------------------------------------------------
# P-tuning
Simple experiments with the P-tuning method on Chinese tasks.

## Method
- https://kexue.fm/archives/8295

## Dependencies
bert4keras 0.10.4

## Contact
QQ group: 808623966; for the WeChat group, add the bot's WeChat ID spaces_ac_cn.
--------------------------------------------------------------------------------
/gpt.py:
--------------------------------------------------------------------------------
#! -*- coding:utf-8 -*-
# Sentiment analysis example using LM + P-tuning

import numpy as np
from bert4keras.backend import keras, K
from bert4keras.layers import Loss, Embedding
from bert4keras.tokenizers import Tokenizer
from bert4keras.models import build_transformer_model, BERT
from bert4keras.optimizers import Adam
from bert4keras.snippets import sequence_padding, DataGenerator
from bert4keras.snippets import open
from keras.layers import Lambda, Dense

maxlen = 128
batch_size = 32
config_path = '/root/kg/bert/chinese_nezha_gpt_L-12_H-768_A-12/config.json'
checkpoint_path = '/root/kg/bert/chinese_nezha_gpt_L-12_H-768_A-12/gpt.ckpt'
dict_path = '/root/kg/bert/chinese_nezha_gpt_L-12_H-768_A-12/vocab.txt'


def load_data(filename):
    D = []
    with open(filename, encoding='utf-8') as f:
        for l in f:
            text, label = l.strip().split('\t')
            D.append((text, int(label)))
    return D


# Load the datasets
train_data = load_data('datasets/sentiment/sentiment.train.data')
valid_data = load_data('datasets/sentiment/sentiment.valid.data')
test_data = load_data('datasets/sentiment/sentiment.test.data')

# Simulate labeled and unlabeled data
train_frac = 0.01  # fraction of labeled data
num_labeled = int(len(train_data) * train_frac)
unlabeled_data = [(t, 2) for t, l in train_data[num_labeled:]]
train_data = train_data[:num_labeled]
# train_data = train_data + unlabeled_data

# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

# Task description (prompt) tokens
desc = ['[unused%s]' % i for i in range(1, 9)]
desc_ids = [tokenizer.token_to_id(t) for t in desc]
pos_id = tokenizer.token_to_id(u'很')
neg_id = tokenizer.token_to_id(u'不')


class data_generator(DataGenerator):
    """Data generator
    """
    def __iter__(self, random=False):
        batch_token_ids = []
        for is_end, (text, label) in self.sample(random):
            token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
            # Prompt layout: [CLS] + desc[:4] + text + desc[4:] (+ label word)
            token_ids = token_ids[:1] + desc_ids[:4] + token_ids[1:-1]
            token_ids = token_ids + desc_ids[4:]
            if label == 0:
                token_ids = token_ids + [neg_id]
            elif label == 1:
                token_ids = token_ids + [pos_id]
            batch_token_ids.append(token_ids)
            if len(batch_token_ids) == self.batch_size or is_end:
                batch_token_ids = sequence_padding(batch_token_ids)
                yield batch_token_ids, None
                batch_token_ids = []
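
# For reference, a labeled training sequence therefore looks like this
# (a sketch for illustration, not part of the original script):
#
#   [CLS] [unused1..4] <text tokens> [unused5..8] <label word>
#
# where the label word is '很' (positive) or '不' (negative). The LM is
# trained to predict it from the left context, and the [unused*] prompt
# embeddings are the only trainable parameters.
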
class CrossEntropy(Loss):
    """Cross entropy as loss, masking out the padding part
    """
    def compute_loss(self, inputs, mask=None):
        y_true, y_pred = inputs
        if mask[1] is None:
            y_mask = 1.0
        else:
            y_mask = K.cast(mask[1], K.floatx())[:, 1:]
        y_true = y_true[:, 1:]  # target token ids
        y_pred = y_pred[:, :-1]  # predictions, shifted left by one position
        accuracy = keras.metrics.sparse_categorical_accuracy(y_true, y_pred)
        accuracy = K.sum(accuracy * y_mask) / K.sum(y_mask)
        self.add_metric(accuracy, name='accuracy')
        loss = K.sparse_categorical_crossentropy(y_true, y_pred)
        loss = K.sum(loss * y_mask) / K.sum(y_mask)
        return loss


class PtuningEmbedding(Embedding):
    """Custom Embedding layer that only optimizes part of the tokens
    """
    def call(self, inputs, mode='embedding'):
        embeddings = self.embeddings
        embeddings_sg = K.stop_gradient(embeddings)
        mask = np.zeros((K.int_shape(embeddings)[0], 1))
        mask[1:9] += 1  # only optimize the tokens with ids 1-8
        # Gradients flow only through the prompt rows; all other rows
        # take the stop-gradient path and stay frozen.
        self.embeddings = embeddings * mask + embeddings_sg * (1 - mask)
        outputs = super(PtuningEmbedding, self).call(inputs, mode)
        self.embeddings = embeddings
        return outputs


class PtuningBERT(BERT):
    """Replace the original Embedding layer
    """
    def apply(self, inputs=None, layer=None, arguments=None, **kwargs):
        if layer is Embedding:
            layer = PtuningEmbedding
        return super(PtuningBERT,
                     self).apply(inputs, layer, arguments, **kwargs)


# Load the pretrained model
model = build_transformer_model(
    config_path=config_path,
    checkpoint_path=checkpoint_path,
    model=PtuningBERT,
    segment_vocab_size=0,  # drop the segment_ids input
    application='lm',
)  # build the model and load the weights

for layer in model.layers:
    if layer.name != 'Embedding-Token':
        layer.trainable = False

output = CrossEntropy(1)([model.input, model.output])

model = keras.models.Model(model.input, output)
model.compile(optimizer=Adam(6e-4))
model.summary()

# Convert the datasets
train_generator = data_generator(train_data, batch_size)
valid_generator = data_generator(valid_data, batch_size)
test_generator = data_generator(test_data, batch_size)


class Evaluator(keras.callbacks.Callback):
    def __init__(self):
        self.best_val_acc = 0.

    def on_epoch_end(self, epoch, logs=None):
        val_acc = evaluate(valid_generator)
        if val_acc > self.best_val_acc:
            self.best_val_acc = val_acc
            model.save_weights('best_model_gpt.weights')
        test_acc = evaluate(test_generator)
        print(
            u'val_acc: %.5f, best_val_acc: %.5f, test_acc: %.5f\n' %
            (val_acc, self.best_val_acc, test_acc)
        )


def evaluate(data):
    total, right = 0., 0.
    for x_true, _ in data:
        y_pred = model.predict(x_true)
        for x, y in zip(x_true, y_pred):
            x = np.trim_zeros(x)
            # distribution at the position that predicts the label token
            y = y[:len(x)][-2, [neg_id, pos_id]].argmax()
            y = [neg_id, pos_id][y]
            if y == x[-1]:
                right += 1
            total += 1
    return right / total


if __name__ == '__main__':

    evaluator = Evaluator()

    model.fit_generator(
        train_generator.forfit(),
        steps_per_epoch=len(train_generator) * 50,
        epochs=1000,
        callbacks=[evaluator]
    )

else:

    model.load_weights('best_model_gpt.weights')
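
# A minimal single-sentence prediction sketch (an assumption, not part of the
# original script): build the same prompted sequence without the label token
# and compare the LM probabilities of the two label words at the final position.
def predict_sentiment(text):
    token_ids, _ = tokenizer.encode(text, maxlen=maxlen)
    token_ids = token_ids[:1] + desc_ids[:4] + token_ids[1:-1] + desc_ids[4:]
    probas = model.predict(np.array([token_ids]))[0]
    # the last input position predicts the token that would follow it
    return int(probas[-1, pos_id] > probas[-1, neg_id])  # 1 = positive, 0 = negative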
--------------------------------------------------------------------------------
/bert.py:
--------------------------------------------------------------------------------
#! -*- coding:utf-8 -*-
# Sentiment analysis example using MLM + P-tuning

import numpy as np
from bert4keras.backend import keras, K
from bert4keras.layers import Loss, Embedding
from bert4keras.tokenizers import Tokenizer
from bert4keras.models import build_transformer_model, BERT
from bert4keras.optimizers import Adam
from bert4keras.snippets import sequence_padding, DataGenerator
from bert4keras.snippets import open
from keras.layers import Lambda, Dense

maxlen = 128
batch_size = 32
config_path = '/root/kg/bert/chinese_roberta_wwm_ext_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '/root/kg/bert/chinese_roberta_wwm_ext_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '/root/kg/bert/chinese_roberta_wwm_ext_L-12_H-768_A-12/vocab.txt'


def load_data(filename):
    D = []
    with open(filename, encoding='utf-8') as f:
        for l in f:
            text, label = l.strip().split('\t')
            D.append((text, int(label)))
    return D


# Load the datasets
train_data = load_data('datasets/sentiment/sentiment.train.data')
valid_data = load_data('datasets/sentiment/sentiment.valid.data')
test_data = load_data('datasets/sentiment/sentiment.test.data')

# Simulate labeled and unlabeled data
train_frac = 0.01  # fraction of labeled data
num_labeled = int(len(train_data) * train_frac)
unlabeled_data = [(t, 2) for t, l in train_data[num_labeled:]]
train_data = train_data[:num_labeled]
# train_data = train_data + unlabeled_data

# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

# Task description (prompt) tokens, with a [MASK] slot for the label word
mask_idx = 5
desc = ['[unused%s]' % i for i in range(1, 9)]
desc.insert(mask_idx - 1, '[MASK]')
desc_ids = [tokenizer.token_to_id(t) for t in desc]
pos_id = tokenizer.token_to_id(u'很')
neg_id = tokenizer.token_to_id(u'不')


def random_masking(token_ids):
    """Apply BERT-style random masking to the input
    """
    rands = np.random.random(len(token_ids))
    source, target = [], []
    for r, t in zip(rands, token_ids):
        if r < 0.15 * 0.8:
            # 15% * 80%: replace with [MASK]
            source.append(tokenizer._token_mask_id)
            target.append(t)
        elif r < 0.15 * 0.9:
            # 15% * 10%: keep the token unchanged
            source.append(t)
            target.append(t)
        elif r < 0.15:
            # 15% * 10%: replace with a random token
            source.append(np.random.choice(tokenizer._vocab_size - 1) + 1)
            target.append(t)
        else:
            # the remaining 85% are not predicted (target id 0)
            source.append(t)
            target.append(0)
    return source, target
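
# For reference, one labeled sample is laid out as follows (a sketch for
# illustration, not part of the original script):
#
#   source: [CLS] [unused1..4] [MASK] [unused5..8] <text tokens> [SEP]
#   target:   0       0...  <label word id>  0...      0...        0
#
# so mask_idx = 5 points at the [MASK] slot, and the loss is computed only
# at positions with a nonzero target id.
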
class data_generator(DataGenerator):
    """Data generator
    """
    def __iter__(self, random=False):
        batch_token_ids, batch_segment_ids, batch_output_ids = [], [], []
        for is_end, (text, label) in self.sample(random):
            token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
            if label != 2:
                # labeled data: insert the prompt (with its [MASK] slot)
                # right after [CLS]; label 2 marks unlabeled data, which is
                # used for plain MLM only
                token_ids = token_ids[:1] + desc_ids + token_ids[1:]
                segment_ids = [0] * len(desc_ids) + segment_ids
            if random:
                source_ids, target_ids = random_masking(token_ids)
            else:
                source_ids, target_ids = token_ids[:], token_ids[:]
            if label == 0:
                source_ids[mask_idx] = tokenizer._token_mask_id
                target_ids[mask_idx] = neg_id
            elif label == 1:
                source_ids[mask_idx] = tokenizer._token_mask_id
                target_ids[mask_idx] = pos_id
            batch_token_ids.append(source_ids)
            batch_segment_ids.append(segment_ids)
            batch_output_ids.append(target_ids)
            if len(batch_token_ids) == self.batch_size or is_end:
                batch_token_ids = sequence_padding(batch_token_ids)
                batch_segment_ids = sequence_padding(batch_segment_ids)
                batch_output_ids = sequence_padding(batch_output_ids)
                yield [
                    batch_token_ids, batch_segment_ids, batch_output_ids
                ], None
                batch_token_ids, batch_segment_ids, batch_output_ids = [], [], []


class CrossEntropy(Loss):
    """Cross entropy as loss, masking out the input part
    """
    def compute_loss(self, inputs, mask=None):
        y_true, y_pred = inputs
        y_mask = K.cast(K.not_equal(y_true, 0), K.floatx())
        accuracy = keras.metrics.sparse_categorical_accuracy(y_true, y_pred)
        accuracy = K.sum(accuracy * y_mask) / K.sum(y_mask)
        self.add_metric(accuracy, name='accuracy')
        loss = K.sparse_categorical_crossentropy(y_true, y_pred)
        loss = K.sum(loss * y_mask) / K.sum(y_mask)
        return loss


class PtuningEmbedding(Embedding):
    """Custom Embedding layer that only optimizes part of the tokens
    """
    def call(self, inputs, mode='embedding'):
        embeddings = self.embeddings
        embeddings_sg = K.stop_gradient(embeddings)
        mask = np.zeros((K.int_shape(embeddings)[0], 1))
        mask[1:9] += 1  # only optimize the tokens with ids 1-8
        self.embeddings = embeddings * mask + embeddings_sg * (1 - mask)
        outputs = super(PtuningEmbedding, self).call(inputs, mode)
        self.embeddings = embeddings
        return outputs


class PtuningBERT(BERT):
    """Replace the original Embedding layer
    """
    def apply(self, inputs=None, layer=None, arguments=None, **kwargs):
        if layer is Embedding:
            layer = PtuningEmbedding
        return super(PtuningBERT,
                     self).apply(inputs, layer, arguments, **kwargs)


# Load the pretrained model
model = build_transformer_model(
    config_path=config_path,
    checkpoint_path=checkpoint_path,
    model=PtuningBERT,
    with_mlm=True
)

for layer in model.layers:
    if layer.name != 'Embedding-Token':
        layer.trainable = False

# Training model
y_in = keras.layers.Input(shape=(None,))
output = keras.layers.Lambda(lambda x: x[:, :10])(model.output)
outputs = CrossEntropy(1)([y_in, model.output])

train_model = keras.models.Model(model.inputs + [y_in], outputs)
train_model.compile(optimizer=Adam(6e-4))
train_model.summary()

# Prediction model (only the first 10 positions are kept, since the
# [MASK] slot sits inside the prompt)
model = keras.models.Model(model.inputs, output)

# Convert the datasets
train_generator = data_generator(train_data, batch_size)
valid_generator = data_generator(valid_data, batch_size)
test_generator = data_generator(test_data, batch_size)


class Evaluator(keras.callbacks.Callback):
    def __init__(self):
        self.best_val_acc = 0.

    def on_epoch_end(self, epoch, logs=None):
        val_acc = evaluate(valid_generator)
        if val_acc > self.best_val_acc:
            self.best_val_acc = val_acc
            model.save_weights('best_model_bert.weights')
        test_acc = evaluate(test_generator)
        print(
            u'val_acc: %.5f, best_val_acc: %.5f, test_acc: %.5f\n' %
            (val_acc, self.best_val_acc, test_acc)
        )
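
# Evaluation reduces the full-vocabulary MLM prediction at the [MASK] slot
# to a binary choice between the two label words ('不' vs '很').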
def evaluate(data):
    total, right = 0., 0.
    for x_true, _ in data:
        x_true, y_true = x_true[:2], x_true[2]
        y_pred = model.predict(x_true)
        y_pred = y_pred[:, mask_idx, [neg_id, pos_id]].argmax(axis=1)
        y_true = (y_true[:, mask_idx] == pos_id).astype(int)
        total += len(y_true)
        right += (y_true == y_pred).sum()
    return right / total


if __name__ == '__main__':

    evaluator = Evaluator()

    train_model.fit_generator(
        train_generator.forfit(),
        steps_per_epoch=len(train_generator) * 50,
        epochs=1000,
        callbacks=[evaluator]
    )

else:

    model.load_weights('best_model_bert.weights')
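
# A minimal single-sentence prediction sketch for the MLM variant (an
# assumption, not part of the original script): insert the prompt with its
# [MASK] slot and compare the label-word probabilities at mask_idx.
def predict_sentiment(text):
    token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
    token_ids = token_ids[:1] + desc_ids + token_ids[1:]
    segment_ids = [0] * len(desc_ids) + segment_ids
    probas = model.predict([np.array([token_ids]), np.array([segment_ids])])[0]
    return int(probas[mask_idx, pos_id] > probas[mask_idx, neg_id])  # 1 = positive
--------------------------------------------------------------------------------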