├── bert4keras
│   ├── __init__.py
│   └── backend.py
├── examples
│   ├── datasets
│   │   ├── lcqmc.zip
│   │   ├── sentiment.zip
│   │   └── README.md
│   ├── basic_masked_language_model.py
│   ├── basic_extract_features.py
│   ├── basic_language_model_gpt2_ml.py
│   ├── basic_make_uncased_model_cased.py
│   ├── basic_language_model_nezha_gen_gpt.py
│   ├── task_sentiment_integrated_gradients.py
│   ├── basic_simple_web_serving_simbert.py
│   ├── basic_language_model_cpm_lm.py
│   ├── task_sentence_similarity_lcqmc.py
│   ├── task_sentiment_albert.py
│   ├── task_seq2seq_autotitle.py
│   ├── README.md
│   ├── task_iflytek_gradient_penalty.py
│   ├── task_seq2seq_autotitle_csl.py
│   ├── task_conditional_language_model.py
│   ├── task_question_answer_generation_by_seq2seq.py
│   ├── task_seq2seq_autotitle_csl_mt5.py
│   ├── task_language_model.py
│   ├── task_iflytek_adversarial_training.py
│   ├── task_sequence_labeling_cws_crf.py
│   ├── task_sentiment_virtual_adversarial_training.py
│   ├── task_iflytek_bert_of_theseus.py
│   ├── task_language_model_chinese_chess.py
│   ├── task_sequence_labeling_ner_crf.py
│   ├── task_reading_comprehension_by_mlm.py
│   ├── task_image_caption.py
│   ├── task_reading_comprehension_by_seq2seq.py
│   ├── task_seq2seq_ape210k_math_word_problem.py
│   └── task_relation_extraction.py
├── .github
│   └── issue_template.md
├── setup.py
├── pretraining
│   ├── README.md
│   └── pretraining.py
├── README.md
└── LICENSE

/bert4keras/__init__.py:
--------------------------------------------------------------------------------
1 | #! -*- coding: utf-8 -*-
2 | 
3 | __version__ = '0.9.6'
4 | 
--------------------------------------------------------------------------------
/examples/datasets/lcqmc.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/binhetech/bert4keras/master/examples/datasets/lcqmc.zip
--------------------------------------------------------------------------------
/examples/datasets/sentiment.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/binhetech/bert4keras/master/examples/datasets/sentiment.zip
--------------------------------------------------------------------------------
/examples/datasets/README.md:
--------------------------------------------------------------------------------
1 | ## 一些数据集
2 | 
3 | - sentiment.zip: 个人整理的情感二分类数据集(train: 16883, valid: 2111, test: 2111)
4 | - lcqmc.zip: 哈工大整理的句子对二分类(train: 238766, valid: 8802, test: 12500),此处给出的仅是前10条demo,完整数据需要到这里申请
5 | 
--------------------------------------------------------------------------------
/.github/issue_template.md:
--------------------------------------------------------------------------------
1 | 提问时请尽可能提供如下信息:
2 | 
3 | ### 基本信息
4 | - 你使用的**操作系统**:
5 | - 你使用的**Python**版本:
6 | - 你使用的**Tensorflow**版本:
7 | - 你使用的**Keras**版本:
8 | - 你使用的**bert4keras**版本:
9 | - 你使用纯**keras**还是**tf.keras**:
10 | - 你加载的**预训练模型**:
11 | 
12 | ### 核心代码
13 | ```python
14 | # 请在此处贴上你的核心代码。
15 | # 请尽量只保留关键部分,不要无脑贴全部代码。
16 | ```
17 | 
18 | ### 输出信息
19 | ```shell
20 | # 请在此处贴上你的调试输出
21 | ```
22 | 
23 | ### 自我尝试
24 | 不管什么问题,请先尝试自行解决,“万般努力”之下仍然无法解决再来提问。此处请贴上你的努力过程。
25 | 
26 | 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #! -*- coding: utf-8 -*-
2 | 
3 | from setuptools import setup, find_packages
4 | 
5 | setup(
6 |     name='bert4keras',
7 |     version='0.9.6',
8 |     description='an elegant bert4keras',
9 |     long_description='bert4keras: https://github.com/bojone/bert4keras',
10 |     license='Apache License 2.0',
11 |     url='https://github.com/bojone/bert4keras',
12 |     author='bojone',
13 |     author_email='bojone@spaces.ac.cn',
14 |     install_requires=['keras<=2.3.1'],
15 |     packages=find_packages()
16 | )
17 | 
--------------------------------------------------------------------------------
/pretraining/README.md:
--------------------------------------------------------------------------------
1 | # 预训练相关代码
2 | 
3 | 目前支持RoBERTa和GPT模式的预训练。请在tensorflow 1.14或1.15下运行。
4 | 
5 | ## 使用
6 | ```
7 | python data_utils.py # 生成tfrecord
8 | python pretraining.py # 启动预训练过程
9 | ```
10 | 
11 | 请阅读`data_utils.py`和`pretraining.py`修改相应的配置和参数,以适配自己的语料和设备。
12 | 
13 | ## 背景
14 | 
15 | keras是一个友好的框架,通常我们都是基于tf后端使用,另外还有tf.keras可以使用,基本上跟keras 2.3.x的接口一致了。
16 | 
17 | 这种一致性意味着使用keras几乎就相当于使用tf,这意味着tf的一切优势keras也有,但tf没有的优势(比如使用简便)keras也有。
18 | 
19 | 因此,作者参考原训练过程地实现了基于keras的预训练脚本,而有了这个keras版之后,因为前面所述的一致性,所以我们可以很轻松地迁移到多GPU上训练,也可以很轻松地迁移到TPU上训练。
20 | 
--------------------------------------------------------------------------------
/examples/basic_masked_language_model.py:
--------------------------------------------------------------------------------
1 | #! -*- coding: utf-8 -*-
2 | # 测试代码可用性: MLM
3 | 
4 | from bert4keras.models import build_transformer_model
5 | from bert4keras.tokenizers import Tokenizer
6 | from bert4keras.snippets import to_array
7 | import numpy as np
8 | 
9 | config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json'
10 | checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt'
11 | dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt'
12 | 
13 | tokenizer = Tokenizer(dict_path, do_lower_case=True) # 建立分词器
14 | model = build_transformer_model(
15 |     config_path=config_path, checkpoint_path=checkpoint_path, with_mlm=True
16 | ) # 建立模型,加载权重
17 | 
18 | token_ids, segment_ids = tokenizer.encode(u'科学技术是第一生产力')
19 | 
20 | # mask掉“技术”
21 | token_ids[3] = token_ids[4] = tokenizer._token_dict['[MASK]']
22 | token_ids, segment_ids = to_array([token_ids], [segment_ids])
23 | 
24 | # 用mlm模型预测被mask掉的部分
25 | probas = model.predict([token_ids, segment_ids])[0]
26 | print(tokenizer.decode(probas[3:5].argmax(axis=1))) # 结果正是“技术”
27 | 
--------------------------------------------------------------------------------
/examples/basic_extract_features.py:
--------------------------------------------------------------------------------
1 | #!
-*- coding: utf-8 -*- 2 | # 测试代码可用性: 提取特征 3 | 4 | from bert4keras.backend import keras 5 | from bert4keras.models import build_transformer_model 6 | from bert4keras.tokenizers import Tokenizer 7 | from bert4keras.snippets import to_array 8 | import numpy as np 9 | 10 | config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json' 11 | checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt' 12 | dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt' 13 | 14 | tokenizer = Tokenizer(dict_path, do_lower_case=True) # 建立分词器 15 | model = build_transformer_model(config_path, checkpoint_path) # 建立模型,加载权重 16 | 17 | # 编码测试 18 | token_ids, segment_ids = tokenizer.encode(u'语言模型') 19 | token_ids, segment_ids = to_array([token_ids], [segment_ids]) 20 | 21 | print('\n ===== predicting =====\n') 22 | print(model.predict([token_ids, segment_ids])) 23 | """ 24 | 输出: 25 | [[[-0.63251007 0.2030236 0.07936534 ... 0.49122632 -0.20493352 26 | 0.2575253 ] 27 | [-0.7588351 0.09651865 1.0718756 ... -0.6109694 0.04312154 28 | 0.03881441] 29 | [ 0.5477043 -0.792117 0.44435206 ... 0.42449304 0.41105673 30 | 0.08222899] 31 | [-0.2924238 0.6052722 0.49968526 ... 0.8604137 -0.6533166 32 | 0.5369075 ] 33 | [-0.7473459 0.49431565 0.7185162 ... 0.3848612 -0.74090636 34 | 0.39056838] 35 | [-0.8741375 -0.21650358 1.338839 ... 0.5816864 -0.4373226 36 | 0.56181806]]] 37 | """ 38 | 39 | print('\n ===== reloading and predicting =====\n') 40 | model.save('test.model') 41 | del model 42 | model = keras.models.load_model('test.model') 43 | print(model.predict([token_ids, segment_ids])) 44 | -------------------------------------------------------------------------------- /examples/basic_language_model_gpt2_ml.py: -------------------------------------------------------------------------------- 1 | #! 
-*- coding: utf-8 -*- 2 | # 基本测试:中文GPT2_ML模型 3 | # 介绍链接:https://kexue.fm/archives/7292 4 | 5 | import numpy as np 6 | from bert4keras.models import build_transformer_model 7 | from bert4keras.tokenizers import Tokenizer 8 | from bert4keras.snippets import AutoRegressiveDecoder 9 | from bert4keras.snippets import uniout 10 | 11 | config_path = '/root/kg/bert/gpt2_ml/config.json' 12 | checkpoint_path = '/root/kg/bert/gpt2_ml/model.ckpt-100000' 13 | dict_path = '/root/kg/bert/gpt2_ml/vocab.txt' 14 | 15 | tokenizer = Tokenizer( 16 | dict_path, token_start=None, token_end=None, do_lower_case=True 17 | ) # 建立分词器 18 | 19 | model = build_transformer_model( 20 | config_path=config_path, checkpoint_path=checkpoint_path, model='gpt2_ml' 21 | ) # 建立模型,加载权重 22 | 23 | 24 | class ArticleCompletion(AutoRegressiveDecoder): 25 | """基于随机采样的文章续写 26 | """ 27 | @AutoRegressiveDecoder.wraps(default_rtype='probas') 28 | def predict(self, inputs, output_ids, states): 29 | token_ids = np.concatenate([inputs[0], output_ids], 1) 30 | return model.predict(token_ids)[:, -1] 31 | 32 | def generate(self, text, n=1, topp=0.95): 33 | token_ids, _ = tokenizer.encode(text) 34 | results = self.random_sample([token_ids], n, topp=topp) # 基于随机采样 35 | return [text + tokenizer.decode(ids) for ids in results] 36 | 37 | 38 | article_completion = ArticleCompletion( 39 | start_id=None, 40 | end_id=511, # 511是中文句号 41 | maxlen=256, 42 | minlen=128 43 | ) 44 | 45 | print(article_completion.generate(u'今天天气不错')) 46 | """ 47 | 部分结果: 48 | 49 | >>> article_completion.generate(u'今天天气不错') 50 | [u'今天天气不错,可以去跑步。昨晚看了一个关于跑步的纪录片,里面的女主讲述的是一个女孩子的成长,很励志,也很美丽。我也想跑,但是我不知道跑步要穿运动鞋,所以就买了一双运动鞋。这个纪录片是关于运动鞋的,有一 集讲了一个女孩子,从小学开始就没有穿过运动鞋,到了高中才开始尝试跑步。'] 51 | 52 | >>> article_completion.generate(u'双十一') 53 | [u'双十一马上就要到了!你还在为双11的物流配送而担心吗?你还在为没时间去仓库取货而发愁吗?你还在为不知道怎么买到便宜货而发愁吗?你还在为买不到心仪的产品而懊恼吗?那么,双十一就来了!今天小编带你来看看这些 快递,都是怎么送货的!1. 物流配送快递公司的配送,主要是由快递公司负责,快递公司负责派件,物流服务。'] 54 | 55 | >>> article_completion.generate(u'科学空间') 56 | [u'科学空间站科学空间站(英文:science space station),是中华人民共和国的一个空间站。该空间站是中国科学院大连物理研究所研制,主要研发和使用中国科学院大连物理研究所的核能动力空间站。科学空间站位于北京市海淀区,距离地面393米,总建筑面积约为1万平方米,总投资约为5亿元人民币。科学空间站于2018年12月26日开始动工,2021年6月建成并投入使用。'] 57 | """ 58 | -------------------------------------------------------------------------------- /examples/basic_make_uncased_model_cased.py: -------------------------------------------------------------------------------- 1 | #! 
-*- coding: utf-8 -*- 2 | # 通过简单修改词表,使得不区分大小写的模型有区分大小写的能力 3 | # 基本思路:将英文单词大写化后添加到词表中,并修改模型Embedding层 4 | 5 | from bert4keras.models import build_transformer_model 6 | from bert4keras.tokenizers import Tokenizer, load_vocab 7 | from bert4keras.snippets import to_array 8 | import numpy as np 9 | 10 | config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json' 11 | checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt' 12 | dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt' 13 | 14 | token_dict = load_vocab(dict_path) 15 | new_token_dict = token_dict.copy() 16 | compound_tokens = [] 17 | 18 | for t, i in sorted(token_dict.items(), key=lambda s: s[1]): 19 | # 这里主要考虑两种情况:1、首字母大写;2、整个单词大写。 20 | # Python2下,新增了5594个token;Python3下,新增了5596个token。 21 | tokens = [] 22 | if t.isalpha(): 23 | tokens.extend([t[:1].upper() + t[1:], t.upper()]) 24 | elif t[:2] == '##' and t[2:].isalpha(): 25 | tokens.append(t.upper()) 26 | for token in tokens: 27 | if token not in new_token_dict: 28 | compound_tokens.append([i]) 29 | new_token_dict[token] = len(new_token_dict) 30 | 31 | tokenizer = Tokenizer(new_token_dict, do_lower_case=False) 32 | 33 | model = build_transformer_model( 34 | config_path, 35 | checkpoint_path, 36 | compound_tokens=compound_tokens, # 增加新token,用旧token平均来初始化 37 | ) 38 | 39 | text = u'Welcome to BEIJING.' 40 | tokens = tokenizer.tokenize(text) 41 | print(tokens) 42 | """ 43 | 输出:['[CLS]', u'Welcome', u'to', u'BE', u'##I', u'##JING', u'.', '[SEP]'] 44 | """ 45 | 46 | token_ids, segment_ids = tokenizer.encode(text) 47 | token_ids, segment_ids = to_array([token_ids], [segment_ids]) 48 | print(model.predict([token_ids, segment_ids])) 49 | """ 50 | 输出: 51 | [[[-1.4999904e-01 1.9651388e-01 -1.7924258e-01 ... 7.8269649e-01 52 | 2.2241375e-01 1.1325148e-01] 53 | [-4.5268752e-02 5.5090344e-01 7.4699545e-01 ... -4.7773960e-01 54 | -1.7562288e-01 4.1265407e-01] 55 | [ 7.0158571e-02 1.7816302e-01 3.6949167e-01 ... 9.6258509e-01 56 | -8.4678203e-01 6.3776302e-01] 57 | ... 58 | [ 9.3637377e-01 3.0232478e-02 8.1411439e-01 ... 7.9186147e-01 59 | 7.5704646e-01 -8.3475001e-04] 60 | [ 2.3699696e-01 2.9953337e-01 8.1962071e-02 ... -1.3776925e-01 61 | 3.8681498e-01 3.2553676e-01] 62 | [ 1.9728680e-01 7.7782705e-02 5.2951699e-01 ... 8.9622810e-02 63 | -2.3932748e-02 6.9600858e-02]]] 64 | """ 65 | -------------------------------------------------------------------------------- /examples/basic_language_model_nezha_gen_gpt.py: -------------------------------------------------------------------------------- 1 | #! 
-*- coding: utf-8 -*- 2 | # 基本测试:中文GPT模型,base版本,华为开源的 3 | # 权重链接: https://pan.baidu.com/s/1-FB0yl1uxYDCGIRvU1XNzQ 提取码: xynn 4 | # 参考项目:https://github.com/bojone/chinese-gen 5 | 6 | import numpy as np 7 | from bert4keras.models import build_transformer_model 8 | from bert4keras.tokenizers import Tokenizer 9 | from bert4keras.snippets import AutoRegressiveDecoder 10 | from bert4keras.snippets import uniout 11 | 12 | config_path = '/root/kg/bert/chinese_nezha_gpt_L-12_H-768_A-12/config.json' 13 | checkpoint_path = '/root/kg/bert/chinese_nezha_gpt_L-12_H-768_A-12/gpt.ckpt' 14 | dict_path = '/root/kg/bert/chinese_nezha_gpt_L-12_H-768_A-12/vocab.txt' 15 | 16 | tokenizer = Tokenizer(dict_path, do_lower_case=True) # 建立分词器 17 | 18 | model = build_transformer_model( 19 | config_path=config_path, 20 | checkpoint_path=checkpoint_path, 21 | segment_vocab_size=0, # 去掉segmeng_ids输入 22 | application='lm', 23 | ) # 建立模型,加载权重 24 | 25 | 26 | class ArticleCompletion(AutoRegressiveDecoder): 27 | """基于随机采样的文章续写 28 | """ 29 | @AutoRegressiveDecoder.wraps(default_rtype='probas') 30 | def predict(self, inputs, output_ids, states): 31 | token_ids = np.concatenate([inputs[0], output_ids], 1) 32 | return model.predict(token_ids)[:, -1] 33 | 34 | def generate(self, text, n=1, topp=0.95): 35 | token_ids = tokenizer.encode(text)[0][:-1] 36 | results = self.random_sample([token_ids], n, topp=topp) # 基于随机采样 37 | return [text + tokenizer.decode(ids) for ids in results] 38 | 39 | 40 | article_completion = ArticleCompletion( 41 | start_id=None, 42 | end_id=511, # 511是中文句号 43 | maxlen=256, 44 | minlen=128 45 | ) 46 | 47 | print(article_completion.generate(u'今天天气不错')) 48 | """ 49 | 部分结果: 50 | >>> article_completion.generate(u'今天天气不错') 51 | [u'今天天气不错。昨天的天气是多云到晴的天气,今天的天气还不错,不会太冷。明后两天天气还是比较好的。不过今天的天气比较闷热,最高温度在30℃左右,明后两天天气会更加热。预计今天的最高温度为30℃,明后两天的最 高温度为32℃左右,今天的最高气温将在30℃左右。(记者李莉)。新华网重庆频道诚邀广大网友投稿,您可以用相机或手机记录下身边的感人故事,精彩瞬间。请将作者、拍摄时间、地点和简要说明连同照片发给我们,我们将精选其中的好图、美图在页面上展示,让所有新华网友共赏。[投稿] 。本报讯(记者陈敏华) 今年上半年,重庆市各级公安机关在全力抓好'] 52 | 53 | >>> article_completion.generate(u'双十一') 54 | [u'双十一大是中国共产党在新的历史起点上召开的一次十分重要的代表大会, 是全面落实科学发展观、推进中国特色社会主义伟大事业的一次重要会议。会议的召开, 是党和政府对新世纪新阶段我国改革开放和社会主义现代化建设 事业的新的历史任务的一次重要总动员, 必将对我们党全面推进党的建'] 55 | 56 | >>> article_completion.generate(u'科学空间') 57 | [u'科学空间站上的两个机器人在进入轨道后,一边在轨道上工作,一边用它们的身体和心脏在空间站上的一个大气层进行活动,以确保它们在进入地球之后不会因太阳风暴而受到影响;而另外一个机器人则在进入轨道的过程中,通 过机器人与地球上的大气层相互作用,使地球的大气层不断地向地球的大气层中转移,以使其能够在空间站上工作,并且使用它们的身体和心脏来完成它们的各种任务。'] 58 | """ 59 | -------------------------------------------------------------------------------- /examples/task_sentiment_integrated_gradients.py: -------------------------------------------------------------------------------- 1 | #! 
-*- coding: utf-8 -*- 2 | # 通过积分梯度(Integrated Gradients)来给输入进行重要性排序 3 | # 接 task_sentiment_albert.py 4 | # 原始论文:https://arxiv.org/abs/1703.01365 5 | # 博客介绍:https://kexue.fm/archives/7533 6 | # 请读者务必先弄懂原理再看代码,下述代码仅是交互式演示代码,并非成品API 7 | 8 | from task_sentiment_albert import * 9 | from keras.layers import Layer, Input 10 | from bert4keras.backend import K, batch_gather 11 | from keras.models import Model 12 | from bert4keras.snippets import uniout 13 | 14 | 15 | class Gradient(Layer): 16 | """获取梯度的层 17 | """ 18 | def __init__(self, **kwargs): 19 | super(Gradient, self).__init__(**kwargs) 20 | self.supports_masking = True 21 | 22 | def call(self, input): 23 | input, output, label = input 24 | output = batch_gather(output, label) 25 | return K.gradients(output, [input])[0] * input 26 | 27 | def compute_output_shape(self, input_shape): 28 | return input_shape[0] 29 | 30 | 31 | label_in = Input(shape=(1,)) # 指定标签 32 | input = model.get_layer('Embedding-Token').output 33 | output = model.output 34 | grads = Gradient()([input, output, label_in]) 35 | grad_model = Model(model.inputs + [label_in], grads) 36 | 37 | # 获取原始embedding层 38 | embeddings = model.get_layer('Embedding-Token').embeddings 39 | values = K.eval(embeddings) 40 | 41 | text = u'这家店真黑心' 42 | text = u'图太乱了 有点看不懂重点 讲故事的时候很难让孩子集中' 43 | text = u'这是一本很好看的书' 44 | text = u'这是一本很糟糕的书' 45 | token_ids, segment_ids = tokenizer.encode(text) 46 | preds = model.predict([[token_ids], [segment_ids]]) 47 | label = np.argmax(preds[0]) 48 | 49 | pred_grads = [] 50 | n = 20 51 | for i in range(n): 52 | # nlp任务中参照背景通常直接选零向量,所以这里 53 | # 让embedding层从零渐变到原始值,以实现路径变换。 54 | alpha = 1.0 * i / (n - 1) 55 | K.set_value(embeddings, alpha * values) 56 | pred_grad = grad_model.predict([[token_ids], [segment_ids], [[label]]])[0] 57 | pred_grads.append(pred_grad) 58 | 59 | # 然后求平均 60 | pred_grads = np.mean(pred_grads, 0) 61 | 62 | # 这时候我们得到形状为(seq_len, hidden_dim)的矩阵,我们要将它变换成(seq_len,) 63 | # 这时候有两种方案:1、直接求模长;2、取绝对值后再取最大。两者效果差不多。 64 | scores = np.sqrt((pred_grads**2).sum(axis=1)) 65 | scores = (scores - scores.min()) / (scores.max() - scores.min()) 66 | scores = scores.round(4) 67 | results = [(tokenizer.decode([t]), s) for t, s in zip(token_ids, scores)] 68 | print(results[1:-1]) 69 | 70 | scores = np.abs(pred_grads).max(axis=1) 71 | scores = (scores - scores.min()) / (scores.max() - scores.min()) 72 | scores = scores.round(4) 73 | results = [(tokenizer.decode([t]), s) for t, s in zip(token_ids, scores)] 74 | print(results[1:-1]) 75 | -------------------------------------------------------------------------------- /examples/basic_simple_web_serving_simbert.py: -------------------------------------------------------------------------------- 1 | #! 
-*- coding: utf-8 -*-
2 | # 利用自带的接口,将SimBERT的同义句生成搭建成Web服务。
3 | # 基于bottlepy简单封装,仅作为临时测试使用,不保证性能。
4 | # 目前仅保证支持 Tensorflow 1.x + Keras <= 2.3.1。
5 | # 具体用法请看 https://github.com/bojone/bert4keras/blob/8ffb46a16a79f87aa8cdf045df7994036b4be47d/bert4keras/snippets.py#L580
6 | 
7 | import numpy as np
8 | from collections import Counter
9 | from bert4keras.backend import keras, K
10 | from bert4keras.models import build_transformer_model
11 | from bert4keras.tokenizers import Tokenizer
12 | from bert4keras.snippets import sequence_padding, AutoRegressiveDecoder
13 | from bert4keras.snippets import WebServing
14 | 
15 | maxlen = 32
16 | 
17 | # bert配置
18 | config_path = '/root/kg/bert/chinese_simbert_L-12_H-768_A-12/bert_config.json'
19 | checkpoint_path = '/root/kg/bert/chinese_simbert_L-12_H-768_A-12/bert_model.ckpt'
20 | dict_path = '/root/kg/bert/chinese_simbert_L-12_H-768_A-12/vocab.txt'
21 | 
22 | # 建立分词器
23 | tokenizer = Tokenizer(dict_path, do_lower_case=True) # 建立分词器
24 | 
25 | # 建立加载模型
26 | bert = build_transformer_model(
27 |     config_path,
28 |     checkpoint_path,
29 |     with_pool='linear',
30 |     application='unilm',
31 |     return_keras_model=False,
32 | )
33 | 
34 | encoder = keras.models.Model(bert.model.inputs, bert.model.outputs[0])
35 | seq2seq = keras.models.Model(bert.model.inputs, bert.model.outputs[1])
36 | 
37 | 
38 | class SynonymsGenerator(AutoRegressiveDecoder):
39 |     """seq2seq解码器
40 |     """
41 |     @AutoRegressiveDecoder.wraps(default_rtype='probas')
42 |     def predict(self, inputs, output_ids, states):
43 |         token_ids, segment_ids = inputs
44 |         token_ids = np.concatenate([token_ids, output_ids], 1)
45 |         segment_ids = np.concatenate([segment_ids, np.ones_like(output_ids)], 1)
46 |         return seq2seq.predict([token_ids, segment_ids])[:, -1]
47 | 
48 |     def generate(self, text, n=1, topp=0.95):
49 |         token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
50 |         output_ids = self.random_sample([token_ids, segment_ids], n,
51 |                                         topp=topp) # 基于随机采样
52 |         return [tokenizer.decode(ids) for ids in output_ids]
53 | 
54 | 
55 | synonyms_generator = SynonymsGenerator(
56 |     start_id=None, end_id=tokenizer._token_end_id, maxlen=maxlen
57 | )
58 | 
59 | 
60 | def gen_synonyms(text, n=100, k=20):
61 |     """"含义: 产生sent的n个相似句,然后返回最相似的k个。
62 |     做法:用seq2seq生成,并用encoder算相似度并排序。
63 |     """
64 |     r = synonyms_generator.generate(text, n)
65 |     r = [i for i in set(r) if i != text]
66 |     r = [text] + r
67 |     X, S = [], []
68 |     for t in r:
69 |         x, s = tokenizer.encode(t)
70 |         X.append(x)
71 |         S.append(s)
72 |     X = sequence_padding(X)
73 |     S = sequence_padding(S)
74 |     Z = encoder.predict([X, S])
75 |     Z /= (Z**2).sum(axis=1, keepdims=True)**0.5
76 |     argsort = np.dot(Z[1:], -Z[0]).argsort()
77 |     return [r[i + 1] for i in argsort[:k]]
78 | 
79 | 
80 | if __name__ == '__main__':
81 | 
82 |     arguments = {'text': (None, True), 'n': (int, False), 'k': (int, False)}
83 |     web = WebServing(port=8864)
84 |     web.route('/gen_synonyms', gen_synonyms, arguments)
85 |     web.start()
86 |     # 现在可以测试访问 http://127.0.0.1:8864/gen_synonyms?text=苹果多少钱一斤
87 | 
--------------------------------------------------------------------------------
/examples/basic_language_model_cpm_lm.py:
--------------------------------------------------------------------------------
1 | #!
-*- coding: utf-8 -*- 2 | # 基本测试:清华开源的中文GPT2模型(26亿参数) 3 | # 项目链接:https://github.com/TsinghuaAI/CPM-Generate 4 | # 博客介绍:https://kexue.fm/archives/7912 5 | 6 | import numpy as np 7 | from bert4keras.models import build_transformer_model 8 | from bert4keras.tokenizers import SpTokenizer 9 | from bert4keras.snippets import AutoRegressiveDecoder 10 | from bert4keras.snippets import uniout 11 | import jieba 12 | jieba.initialize() 13 | 14 | # 模型路径 15 | config_path = '/root/kg/bert/CPM_LM_2.6B_TF/config.json' 16 | checkpoint_path = '/root/kg/bert/CPM_LM_2.6B_TF/model.ckpt' 17 | spm_path = '/root/kg/bert/CPM_LM_2.6B_TF/chinese_vocab.model' 18 | 19 | 20 | def pre_tokenize(text): 21 | """分词前处理函数 22 | """ 23 | return [ 24 | w.replace(' ', u'\u2582').replace('\n', u'\u2583') 25 | for w in jieba.cut(text, cut_all=False) 26 | ] 27 | 28 | 29 | tokenizer = SpTokenizer( 30 | spm_path, 31 | token_start=None, 32 | token_end=None, 33 | pre_tokenize=pre_tokenize, 34 | token_translate={u'\u2583': ''} 35 | ) # 建立分词器 36 | 37 | model = build_transformer_model( 38 | config_path=config_path, checkpoint_path=checkpoint_path, model='gpt2' 39 | ) # 建立模型,加载权重 40 | 41 | 42 | class TextExpansion(AutoRegressiveDecoder): 43 | """基于随机采样的文本续写 44 | """ 45 | @AutoRegressiveDecoder.wraps(default_rtype='probas') 46 | def predict(self, inputs, output_ids, states): 47 | token_ids = np.concatenate([inputs[0], output_ids], 1) 48 | return model.predict(token_ids)[:, -1] 49 | 50 | def generate(self, text, n=1, topp=0.95, temperature=1): 51 | """输出结果会有一定的随机性,如果只关心Few Shot效果, 52 | 可以考虑将解码方式换为beam search。 53 | """ 54 | token_ids, _ = tokenizer.encode(text) 55 | results = self.random_sample([token_ids], 56 | n, 57 | topp=topp, 58 | temperature=temperature) # 基于随机采样 59 | results = [token_ids + [int(i) for i in ids] for ids in results] 60 | texts = [tokenizer.decode(ids) for ids in results] 61 | return [self.post_replace(text) for text in texts] 62 | 63 | def post_replace(self, text): 64 | for s, t in [(' ', ''), (u'\u2582', ' '), (u'\u2583', '\n')]: 65 | text = text.replace(s, t) 66 | return text 67 | 68 | 69 | text_expansion = TextExpansion( 70 | start_id=None, 71 | end_id=3, # 3是,也是换行符 72 | maxlen=16, 73 | ) 74 | 75 | # 常识推理 76 | # 本例输出:北京 77 | query = u""" 78 | 美国的首都是华盛顿 79 | 法国的首都是巴黎 80 | 日本的首都是东京 81 | 中国的首都是 82 | """ 83 | print(text_expansion.generate(query[1:-1], 1)[0]) 84 | 85 | # 单词翻译 86 | # 本例输出:bird 87 | query = u""" 88 | 狗 dog 89 | 猫 cat 90 | 猪 pig 91 | 鸟 92 | """ 93 | print(text_expansion.generate(query[1:-1], 1)[0]) 94 | 95 | # 主语抽取 96 | # 本例输出:杨振宁 97 | query = u""" 98 | 从1931年起,华罗庚在清华大学边学习边工作 华罗庚 99 | 在一间简陋的房间里,陈景润攻克了“哥德巴赫猜想” 陈景润 100 | 在这里,丘成桐得到IBM奖学金 丘成桐 101 | 杨振宁在粒子物理学、统计力学和凝聚态物理等领域作出里程碑性贡献 102 | """ 103 | print(text_expansion.generate(query[1:-1], 1)[0]) 104 | 105 | # 三元组抽取 106 | # 本例输出:张红,体重,140斤 107 | query = u""" 108 | 姚明的身高是211cm,是很多人心目中的偶像。 ->姚明,身高,211cm 109 | 毛泽东是绍兴人,早年在长沙读书。->毛泽东,出生地,绍兴 110 | 虽然周杰伦在欧洲办的婚礼,但是他是土生土长的中国人->周杰伦,国籍,中国 111 | 小明出生于武汉,但是却不喜欢在武汉生成,长大后去了北京。->小明,出生地,武汉 112 | 吴亦凡是很多人的偶像,但是他却是加拿大人,另很多人失望->吴亦凡,国籍,加拿大 113 | 武耀的生日在5月8号,这一天,大家都为他庆祝了生日->武耀,生日,5月8号 114 | 《青花瓷》是周杰伦最得意的一首歌。->周杰伦,作品,《青花瓷》 115 | 北京是中国的首都。->中国,首都,北京 116 | 蒋碧的家乡在盘龙城,毕业后去了深圳工作。->蒋碧,籍贯,盘龙城 117 | 上周我们和王立一起去了他的家乡云南玩昨天才回到了武汉。->王立,籍贯,云南 118 | 昨天11月17号,我和朋友一起去了海底捞,期间服务员为我的朋友刘章庆祝了生日。->刘章,生日,11月17号 119 | 张红的体重达到了140斤,她很苦恼。-> 120 | """ 121 | print(text_expansion.generate(query[1:-1], 1)[0]) 122 | -------------------------------------------------------------------------------- /examples/task_sentence_similarity_lcqmc.py: 
-------------------------------------------------------------------------------- 1 | #! -*- coding:utf-8 -*- 2 | # 句子对分类任务,LCQMC数据集 3 | # val_acc: 0.887071, test_acc: 0.870320 4 | 5 | import numpy as np 6 | from bert4keras.backend import keras, set_gelu, K 7 | from bert4keras.tokenizers import Tokenizer 8 | from bert4keras.models import build_transformer_model 9 | from bert4keras.optimizers import Adam 10 | from bert4keras.snippets import sequence_padding, DataGenerator 11 | from bert4keras.snippets import open 12 | from keras.layers import Dropout, Dense 13 | 14 | set_gelu('tanh') # 切换gelu版本 15 | 16 | maxlen = 128 17 | batch_size = 64 18 | config_path = '/root/kg/bert/chinese_wwm_L-12_H-768_A-12/bert_config.json' 19 | checkpoint_path = '/root/kg/bert/chinese_wwm_L-12_H-768_A-12/bert_model.ckpt' 20 | dict_path = '/root/kg/bert/chinese_wwm_L-12_H-768_A-12/vocab.txt' 21 | 22 | 23 | def load_data(filename): 24 | """加载数据 25 | 单条格式:(文本1, 文本2, 标签id) 26 | """ 27 | D = [] 28 | with open(filename, encoding='utf-8') as f: 29 | for l in f: 30 | text1, text2, label = l.strip().split('\t') 31 | D.append((text1, text2, int(label))) 32 | return D 33 | 34 | 35 | # 加载数据集 36 | train_data = load_data('datasets/lcqmc/lcqmc.train.data') 37 | valid_data = load_data('datasets/lcqmc/lcqmc.valid.data') 38 | test_data = load_data('datasets/lcqmc/lcqmc.test.data') 39 | 40 | # 建立分词器 41 | tokenizer = Tokenizer(dict_path, do_lower_case=True) 42 | 43 | 44 | class data_generator(DataGenerator): 45 | """数据生成器 46 | """ 47 | def __iter__(self, random=False): 48 | batch_token_ids, batch_segment_ids, batch_labels = [], [], [] 49 | for is_end, (text1, text2, label) in self.sample(random): 50 | token_ids, segment_ids = tokenizer.encode( 51 | text1, text2, maxlen=maxlen 52 | ) 53 | batch_token_ids.append(token_ids) 54 | batch_segment_ids.append(segment_ids) 55 | batch_labels.append([label]) 56 | if len(batch_token_ids) == self.batch_size or is_end: 57 | batch_token_ids = sequence_padding(batch_token_ids) 58 | batch_segment_ids = sequence_padding(batch_segment_ids) 59 | batch_labels = sequence_padding(batch_labels) 60 | yield [batch_token_ids, batch_segment_ids], batch_labels 61 | batch_token_ids, batch_segment_ids, batch_labels = [], [], [] 62 | 63 | 64 | # 加载预训练模型 65 | bert = build_transformer_model( 66 | config_path=config_path, 67 | checkpoint_path=checkpoint_path, 68 | with_pool=True, 69 | return_keras_model=False, 70 | ) 71 | 72 | output = Dropout(rate=0.1)(bert.model.output) 73 | output = Dense( 74 | units=2, activation='softmax', kernel_initializer=bert.initializer 75 | )(output) 76 | 77 | model = keras.models.Model(bert.model.input, output) 78 | model.summary() 79 | 80 | model.compile( 81 | loss='sparse_categorical_crossentropy', 82 | optimizer=Adam(2e-5), # 用足够小的学习率 83 | # optimizer=PiecewiseLinearLearningRate(Adam(5e-5), {10000: 1, 30000: 0.1}), 84 | metrics=['accuracy'], 85 | ) 86 | 87 | # 转换数据集 88 | train_generator = data_generator(train_data, batch_size) 89 | valid_generator = data_generator(valid_data, batch_size) 90 | test_generator = data_generator(test_data, batch_size) 91 | 92 | 93 | def evaluate(data): 94 | total, right = 0., 0. 95 | for x_true, y_true in data: 96 | y_pred = model.predict(x_true).argmax(axis=1) 97 | y_true = y_true[:, 0] 98 | total += len(y_true) 99 | right += (y_true == y_pred).sum() 100 | return right / total 101 | 102 | 103 | class Evaluator(keras.callbacks.Callback): 104 | """评估与保存 105 | """ 106 | def __init__(self): 107 | self.best_val_acc = 0. 
108 | 109 | def on_epoch_end(self, epoch, logs=None): 110 | val_acc = evaluate(valid_generator) 111 | if val_acc > self.best_val_acc: 112 | self.best_val_acc = val_acc 113 | model.save_weights('best_model.weights') 114 | test_acc = evaluate(test_generator) 115 | print( 116 | u'val_acc: %.5f, best_val_acc: %.5f, test_acc: %.5f\n' % 117 | (val_acc, self.best_val_acc, test_acc) 118 | ) 119 | 120 | 121 | if __name__ == '__main__': 122 | 123 | evaluator = Evaluator() 124 | 125 | model.fit( 126 | train_generator.forfit(), 127 | steps_per_epoch=len(train_generator), 128 | epochs=20, 129 | callbacks=[evaluator] 130 | ) 131 | 132 | model.load_weights('best_model.weights') 133 | print(u'final test acc: %05f\n' % (evaluate(test_generator))) 134 | 135 | else: 136 | 137 | model.load_weights('best_model.weights') 138 | -------------------------------------------------------------------------------- /examples/task_sentiment_albert.py: -------------------------------------------------------------------------------- 1 | #! -*- coding:utf-8 -*- 2 | # 情感分析例子,加载albert_zh权重(https://github.com/brightmart/albert_zh) 3 | 4 | import numpy as np 5 | from bert4keras.backend import keras, set_gelu 6 | from bert4keras.tokenizers import Tokenizer 7 | from bert4keras.models import build_transformer_model 8 | from bert4keras.optimizers import Adam, extend_with_piecewise_linear_lr 9 | from bert4keras.snippets import sequence_padding, DataGenerator 10 | from bert4keras.snippets import open 11 | from keras.layers import Lambda, Dense 12 | 13 | set_gelu('tanh') # 切换gelu版本 14 | 15 | num_classes = 2 16 | maxlen = 128 17 | batch_size = 32 18 | config_path = '/root/kg/bert/albert_small_zh_google/albert_config.json' 19 | checkpoint_path = '/root/kg/bert/albert_small_zh_google/albert_model.ckpt' 20 | dict_path = '/root/kg/bert/albert_small_zh_google/vocab.txt' 21 | 22 | 23 | def load_data(filename): 24 | """加载数据 25 | 单条格式:(文本, 标签id) 26 | """ 27 | D = [] 28 | with open(filename, encoding='utf-8') as f: 29 | for l in f: 30 | text, label = l.strip().split('\t') 31 | D.append((text, int(label))) 32 | return D 33 | 34 | 35 | # 加载数据集 36 | train_data = load_data('datasets/sentiment/sentiment.train.data') 37 | valid_data = load_data('datasets/sentiment/sentiment.valid.data') 38 | test_data = load_data('datasets/sentiment/sentiment.test.data') 39 | 40 | # 建立分词器 41 | tokenizer = Tokenizer(dict_path, do_lower_case=True) 42 | 43 | 44 | class data_generator(DataGenerator): 45 | """数据生成器 46 | """ 47 | def __iter__(self, random=False): 48 | batch_token_ids, batch_segment_ids, batch_labels = [], [], [] 49 | for is_end, (text, label) in self.sample(random): 50 | token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen) 51 | batch_token_ids.append(token_ids) 52 | batch_segment_ids.append(segment_ids) 53 | batch_labels.append([label]) 54 | if len(batch_token_ids) == self.batch_size or is_end: 55 | batch_token_ids = sequence_padding(batch_token_ids) 56 | batch_segment_ids = sequence_padding(batch_segment_ids) 57 | batch_labels = sequence_padding(batch_labels) 58 | yield [batch_token_ids, batch_segment_ids], batch_labels 59 | batch_token_ids, batch_segment_ids, batch_labels = [], [], [] 60 | 61 | 62 | # 加载预训练模型 63 | bert = build_transformer_model( 64 | config_path=config_path, 65 | checkpoint_path=checkpoint_path, 66 | model='albert', 67 | return_keras_model=False, 68 | ) 69 | 70 | output = Lambda(lambda x: x[:, 0], name='CLS-token')(bert.model.output) 71 | output = Dense( 72 | units=num_classes, 73 | activation='softmax', 74 | 
kernel_initializer=bert.initializer 75 | )(output) 76 | 77 | model = keras.models.Model(bert.model.input, output) 78 | model.summary() 79 | 80 | # 派生为带分段线性学习率的优化器。 81 | # 其中name参数可选,但最好填入,以区分不同的派生优化器。 82 | AdamLR = extend_with_piecewise_linear_lr(Adam, name='AdamLR') 83 | 84 | model.compile( 85 | loss='sparse_categorical_crossentropy', 86 | # optimizer=Adam(1e-5), # 用足够小的学习率 87 | optimizer=AdamLR(learning_rate=1e-4, lr_schedule={ 88 | 1000: 1, 89 | 2000: 0.1 90 | }), 91 | metrics=['accuracy'], 92 | ) 93 | 94 | # 转换数据集 95 | train_generator = data_generator(train_data, batch_size) 96 | valid_generator = data_generator(valid_data, batch_size) 97 | test_generator = data_generator(test_data, batch_size) 98 | 99 | 100 | def evaluate(data): 101 | total, right = 0., 0. 102 | for x_true, y_true in data: 103 | y_pred = model.predict(x_true).argmax(axis=1) 104 | y_true = y_true[:, 0] 105 | total += len(y_true) 106 | right += (y_true == y_pred).sum() 107 | return right / total 108 | 109 | 110 | class Evaluator(keras.callbacks.Callback): 111 | """评估与保存 112 | """ 113 | def __init__(self): 114 | self.best_val_acc = 0. 115 | 116 | def on_epoch_end(self, epoch, logs=None): 117 | val_acc = evaluate(valid_generator) 118 | if val_acc > self.best_val_acc: 119 | self.best_val_acc = val_acc 120 | model.save_weights('best_model.weights') 121 | test_acc = evaluate(test_generator) 122 | print( 123 | u'val_acc: %.5f, best_val_acc: %.5f, test_acc: %.5f\n' % 124 | (val_acc, self.best_val_acc, test_acc) 125 | ) 126 | 127 | 128 | if __name__ == '__main__': 129 | 130 | evaluator = Evaluator() 131 | 132 | model.fit( 133 | train_generator.forfit(), 134 | steps_per_epoch=len(train_generator), 135 | epochs=10, 136 | callbacks=[evaluator] 137 | ) 138 | 139 | model.load_weights('best_model.weights') 140 | print(u'final test acc: %05f\n' % (evaluate(test_generator))) 141 | 142 | else: 143 | 144 | model.load_weights('best_model.weights') 145 | -------------------------------------------------------------------------------- /examples/task_seq2seq_autotitle.py: -------------------------------------------------------------------------------- 1 | #! 
-*- coding: utf-8 -*- 2 | # bert做Seq2Seq任务,采用UNILM方案 3 | # 介绍链接:https://kexue.fm/archives/6933 4 | 5 | from __future__ import print_function 6 | import glob 7 | import numpy as np 8 | from bert4keras.backend import keras, K 9 | from bert4keras.layers import Loss 10 | from bert4keras.models import build_transformer_model 11 | from bert4keras.tokenizers import Tokenizer, load_vocab 12 | from bert4keras.optimizers import Adam 13 | from bert4keras.snippets import sequence_padding, open 14 | from bert4keras.snippets import DataGenerator, AutoRegressiveDecoder 15 | from keras.models import Model 16 | 17 | # 基本参数 18 | maxlen = 256 19 | batch_size = 16 20 | steps_per_epoch = 1000 21 | epochs = 10000 22 | 23 | # bert配置 24 | config_path = '/root/kg/bert/chinese_wwm_L-12_H-768_A-12/bert_config.json' 25 | checkpoint_path = '/root/kg/bert/chinese_wwm_L-12_H-768_A-12/bert_model.ckpt' 26 | dict_path = '/root/kg/bert/chinese_wwm_L-12_H-768_A-12/vocab.txt' 27 | 28 | # 训练样本。THUCNews数据集,每个样本保存为一个txt。 29 | txts = glob.glob('/root/thuctc/THUCNews/*/*.txt') 30 | 31 | # 加载并精简词表,建立分词器 32 | token_dict, keep_tokens = load_vocab( 33 | dict_path=dict_path, 34 | simplified=True, 35 | startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'], 36 | ) 37 | tokenizer = Tokenizer(token_dict, do_lower_case=True) 38 | 39 | 40 | class data_generator(DataGenerator): 41 | """数据生成器 42 | """ 43 | def __iter__(self, random=False): 44 | batch_token_ids, batch_segment_ids = [], [] 45 | for is_end, txt in self.sample(random): 46 | text = open(txt, encoding='utf-8').read() 47 | text = text.split('\n') 48 | if len(text) > 1: 49 | title = text[0] 50 | content = '\n'.join(text[1:]) 51 | token_ids, segment_ids = tokenizer.encode( 52 | content, title, maxlen=maxlen 53 | ) 54 | batch_token_ids.append(token_ids) 55 | batch_segment_ids.append(segment_ids) 56 | if len(batch_token_ids) == self.batch_size or is_end: 57 | batch_token_ids = sequence_padding(batch_token_ids) 58 | batch_segment_ids = sequence_padding(batch_segment_ids) 59 | yield [batch_token_ids, batch_segment_ids], None 60 | batch_token_ids, batch_segment_ids = [], [] 61 | 62 | 63 | class CrossEntropy(Loss): 64 | """交叉熵作为loss,并mask掉输入部分 65 | """ 66 | def compute_loss(self, inputs, mask=None): 67 | y_true, y_mask, y_pred = inputs 68 | y_true = y_true[:, 1:] # 目标token_ids 69 | y_mask = y_mask[:, 1:] # segment_ids,刚好指示了要预测的部分 70 | y_pred = y_pred[:, :-1] # 预测序列,错开一位 71 | loss = K.sparse_categorical_crossentropy(y_true, y_pred) 72 | loss = K.sum(loss * y_mask) / K.sum(y_mask) 73 | return loss 74 | 75 | 76 | model = build_transformer_model( 77 | config_path, 78 | checkpoint_path, 79 | application='unilm', 80 | keep_tokens=keep_tokens, # 只保留keep_tokens中的字,精简原字表 81 | ) 82 | 83 | output = CrossEntropy(2)(model.inputs + model.outputs) 84 | 85 | model = Model(model.inputs, output) 86 | model.compile(optimizer=Adam(1e-5)) 87 | model.summary() 88 | 89 | 90 | class AutoTitle(AutoRegressiveDecoder): 91 | """seq2seq解码器 92 | """ 93 | @AutoRegressiveDecoder.wraps(default_rtype='probas') 94 | def predict(self, inputs, output_ids, states): 95 | token_ids, segment_ids = inputs 96 | token_ids = np.concatenate([token_ids, output_ids], 1) 97 | segment_ids = np.concatenate([segment_ids, np.ones_like(output_ids)], 1) 98 | return model.predict([token_ids, segment_ids])[:, -1] 99 | 100 | def generate(self, text, topk=1): 101 | max_c_len = maxlen - self.maxlen 102 | token_ids, segment_ids = tokenizer.encode(text, maxlen=max_c_len) 103 | output_ids = self.beam_search([token_ids, segment_ids], 104 | topk=topk) # 基于beam 
search 105 | return tokenizer.decode(output_ids) 106 | 107 | 108 | autotitle = AutoTitle(start_id=None, end_id=tokenizer._token_end_id, maxlen=32) 109 | 110 | 111 | def just_show(): 112 | s1 = u'夏天来临,皮肤在强烈紫外线的照射下,晒伤不可避免,因此,晒后及时修复显得尤为重要,否则可能会造成长期伤害。专家表示,选择晒后护肤品要慎重,芦荟凝胶是最安全,有效的一种选择,晒伤严重者,还请及 时 就医 。' 113 | s2 = u'8月28日,网络爆料称,华住集团旗下连锁酒店用户数据疑似发生泄露。从卖家发布的内容看,数据包含华住旗下汉庭、禧玥、桔子、宜必思等10余个品牌酒店的住客信息。泄露的信息包括华住官网注册资料、酒店入住登记的身份信息及酒店开房记录,住客姓名、手机号、邮箱、身份证号、登录账号密码等。卖家对这个约5亿条数据打包出售。第三方安全平台威胁猎人对信息出售者提供的三万条数据进行验证,认为数据真实性非常高。当天下午 ,华 住集 团发声明称,已在内部迅速开展核查,并第一时间报警。当晚,上海警方消息称,接到华住集团报案,警方已经介入调查。' 114 | for s in [s1, s2]: 115 | print(u'生成标题:', autotitle.generate(s)) 116 | print() 117 | 118 | 119 | class Evaluator(keras.callbacks.Callback): 120 | """评估与保存 121 | """ 122 | def __init__(self): 123 | self.lowest = 1e10 124 | 125 | def on_epoch_end(self, epoch, logs=None): 126 | # 保存最优 127 | if logs['loss'] <= self.lowest: 128 | self.lowest = logs['loss'] 129 | model.save_weights('./best_model.weights') 130 | # 演示效果 131 | just_show() 132 | 133 | 134 | if __name__ == '__main__': 135 | 136 | evaluator = Evaluator() 137 | train_generator = data_generator(txts, batch_size) 138 | 139 | model.fit( 140 | train_generator.forfit(), 141 | steps_per_epoch=steps_per_epoch, 142 | epochs=epochs, 143 | callbacks=[evaluator] 144 | ) 145 | 146 | else: 147 | 148 | model.load_weights('./best_model.weights') 149 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # 例子合集 2 | 3 | 提示:Github上的examples只保证兼容Github上的最新版bert4keras,如果报错,请首先尝试升级bert4keras。 4 | 5 | ## 简介 6 | 7 | - [basic_extract_features.py](https://github.com/bojone/bert4keras/tree/master/examples/basic_extract_features.py): 基础测试,测试BERT对句子的编码序列。 8 | - [basic_language_model_cpm_lm.py](https://github.com/bojone/bert4keras/tree/master/examples/basic_language_model_cpm_lm.py): 基础测试,测试[CPM_LM](https://github.com/TsinghuaAI/CPM-Generate)的生成效果。 9 | - [basic_language_model_gpt2_ml.py](https://github.com/bojone/bert4keras/tree/master/examples/basic_language_model_gpt2_ml.py): 基础测试,测试[GPT2_ML](https://github.com/imcaspar/gpt2-ml)的生成效果。 10 | - [basic_language_model_nezha_gen_gpt.py](https://github.com/bojone/bert4keras/tree/master/examples/basic_language_model_nezha_gen_gpt.py): 基础测试,测试[GPT Base(又叫NEZHE-GEN)](https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/NEZHA-Gen-TensorFlow)的生成效果。 11 | - [basic_make_uncased_model_cased.py](https://github.com/bojone/bert4keras/tree/master/examples/basic_make_uncased_model_cased.py): 基础测试,通过简单修改词表,使得不区分大小写的模型有区分大小写的能力。 12 | - [basic_masked_language_model.py](https://github.com/bojone/bert4keras/tree/master/examples/basic_masked_language_model.py): 基础测试,测试BERT的MLM模型效果。 13 | - [basic_simple_web_serving_simbert.py](https://github.com/bojone/bert4keras/tree/master/examples/basic_simple_web_serving_simbert.py): 基础测试,测试自带的WebServing(将模型转化为Web接口)。 14 | - [task_conditional_language_model.py](https://github.com/bojone/bert4keras/tree/master/examples/task_conditional_language_model.py): 任务例子,结合 BERT + [Conditional Layer Normalization](https://kexue.fm/archives/7124) 做条件语言模型。 15 | - [task_iflytek_adversarial_training.py](https://github.com/bojone/bert4keras/tree/master/examples/task_iflytek_adversarial_training.py): 任务例子,通过[对抗训练](https://kexue.fm/archives/7234)提升分类效果。 16 | - [task_iflytek_bert_of_theseus.py](https://github.com/bojone/bert4keras/tree/master/examples/task_iflytek_bert_of_theseus.py): 
任务例子,通过[BERT-of-Theseus](https://kexue.fm/archives/7575)来进行模型压缩。 17 | - [task_iflytek_gradient_penalty.py](https://github.com/bojone/bert4keras/tree/master/examples/task_iflytek_gradient_penalty.py): 任务例子,通过[梯度惩罚](https://kexue.fm/archives/7234)提升分类效果,可以视为另一种对抗训练。 18 | - [task_image_caption.py](https://github.com/bojone/bert4keras/tree/master/examples/task_image_caption.py): 任务例子,BERT + [Conditional Layer Normalization](https://kexue.fm/archives/7124) + ImageNet预训练模型 来做图像描述生成。 19 | - [task_language_model.py](https://github.com/bojone/bert4keras/tree/master/examples/task_language_model.py): 任务例子,加载BERT的预训练权重做无条件语言模型,效果上等价于GPT。 20 | - [task_language_model_chinese_chess.py](https://github.com/bojone/bert4keras/tree/master/examples/task_language_model_chinese_chess.py): 任务例子,用GPT的方式下中国象棋,过程请参考[博客](https://kexue.fm/archives/7877)。 21 | - [task_question_answer_generation_by_seq2seq.py](https://github.com/bojone/bert4keras/tree/master/examples/task_question_answer_generation_by_seq2seq.py): 任务例子,通过[UniLM](https://kexue.fm/archives/6933)式的Seq2Seq模型来做[问答对自动构建](https://kexue.fm/archives/7630),属于自回归文本生成。 22 | - [task_reading_comprehension_by_mlm.py](https://github.com/bojone/bert4keras/tree/master/examples/task_reading_comprehension_by_mlm.py): 任务例子,通过MLM模型来做[阅读理解问答](https://kexue.fm/archives/7148),属于简单的非自回归文本生成。 23 | - [task_reading_comprehension_by_seq2seq.py](https://github.com/bojone/bert4keras/tree/master/examples/task_reading_comprehension_by_seq2seq.py): 任务例子,通过[UniLM](https://kexue.fm/archives/6933)式的Seq2Seq模型来做[阅读理解问答](https://kexue.fm/archives/7115),属于自回归文本生成。 24 | - [task_relation_extraction.py](https://github.com/bojone/bert4keras/tree/master/examples/task_relation_extraction.py): 任务例子,结合BERT以及自行设计的“半指针-半标注”结构来做[关系抽取](https://kexue.fm/archives/7161)。 25 | - [task_sentence_similarity_lcqmc.py](https://github.com/bojone/bert4keras/tree/master/examples/task_sentence_similarity_lcqmc.py): 任务例子,句子对分类任务。 26 | - [task_sentiment_albert.py](https://github.com/bojone/bert4keras/tree/master/examples/task_sentiment_albert.py): 任务例子,情感分类任务,加载ALBERT模型。 27 | - [task_sentiment_integrated_gradients.py](https://github.com/bojone/bert4keras/tree/master/examples/task_sentiment_integrated_gradients.py): 任务例子,通过[积分梯度](https://kexue.fm/archives/7533)的方式可视化情感分类任务。 28 | - [task_sentiment_virtual_adversarial_training.py](https://github.com/bojone/bert4keras/tree/master/examples/task_sentiment_virtual_adversarial_training.py): 任务例子,通过[虚拟对抗训练](https://kexue.fm/archives/7466)进行半监督学习,提升小样本下的情感分类性能。 29 | - [task_seq2seq_ape210k_math_word_problem.py](https://github.com/bojone/bert4keras/tree/master/examples/task_seq2seq_ape210k_math_word_problem.py): 任务例子,通过[UniLM](https://kexue.fm/archives/6933)式的Seq2Seq模型来做小学数学应用题(数学公式生成),详情请见[这里](https://kexue.fm/archives/7809)。 30 | - [task_seq2seq_autotitle.py](https://github.com/bojone/bert4keras/tree/master/examples/task_seq2seq_autotitle.py): 任务例子,通过[UniLM](https://kexue.fm/archives/6933)式的Seq2Seq模型来做新闻标题生成。 31 | - [task_seq2seq_autotitle_csl.py](https://github.com/bojone/bert4keras/tree/master/examples/task_seq2seq_autotitle_csl.py): 任务例子,通过[UniLM](https://kexue.fm/archives/6933)式的Seq2Seq模型来做论文标题生成,包含了评测代码。 32 | - [task_seq2seq_autotitle_csl_mt5.py](https://github.com/bojone/bert4keras/tree/master/examples/task_seq2seq_autotitle_csl_mt5.py): 任务例子,通过[多国语言版T5](https://kexue.fm/archives/7867)式的Seq2Seq模型来做论文标题生成,包含了评测代码。 33 | - [task_sequence_labeling_cws_crf.py](https://github.com/bojone/bert4keras/tree/master/examples/task_sequence_labeling_cws_crf.py): 任务例子,通过 BERT + 
[CRF](https://kexue.fm/archives/7196) 来做中文分词。 34 | - [task_sequence_labeling_ner_crf.py](https://github.com/bojone/bert4keras/tree/master/examples/task_sequence_labeling_ner_crf.py): 35 | 任务例子,通过 BERT + [CRF](https://kexue.fm/archives/7196) 来做中文NER。 36 | -------------------------------------------------------------------------------- /examples/task_iflytek_gradient_penalty.py: -------------------------------------------------------------------------------- 1 | #! -*- coding:utf-8 -*- 2 | # 通过梯度惩罚增强模型的泛化性能 3 | # 比CLUE榜单公开的同数据集上的BERT base的成绩高2% 4 | # 数据集:IFLYTEK' 长文本分类 (https://github.com/CLUEbenchmark/CLUE) 5 | # 博客:https://kexue.fm/archives/7234 6 | # 适用于Keras 2.3.1 7 | 8 | import json 9 | import numpy as np 10 | from bert4keras.backend import keras, search_layer, K 11 | from bert4keras.tokenizers import Tokenizer 12 | from bert4keras.models import build_transformer_model 13 | from bert4keras.optimizers import Adam 14 | from bert4keras.snippets import sequence_padding, DataGenerator 15 | from keras.layers import Lambda, Dense 16 | from tqdm import tqdm 17 | 18 | num_classes = 119 19 | maxlen = 128 20 | batch_size = 32 21 | 22 | # BERT base 23 | config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json' 24 | checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt' 25 | dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt' 26 | 27 | 28 | def load_data(filename): 29 | """加载数据 30 | 单条格式:(文本, 标签id) 31 | """ 32 | D = [] 33 | with open(filename) as f: 34 | for i, l in enumerate(f): 35 | l = json.loads(l) 36 | text, label = l['sentence'], l['label'] 37 | D.append((text, int(label))) 38 | return D 39 | 40 | 41 | # 加载数据集 42 | train_data = load_data( 43 | '/root/CLUE-master/baselines/CLUEdataset/iflytek/train.json' 44 | ) 45 | valid_data = load_data( 46 | '/root/CLUE-master/baselines/CLUEdataset/iflytek/dev.json' 47 | ) 48 | 49 | # 建立分词器 50 | tokenizer = Tokenizer(dict_path, do_lower_case=True) 51 | 52 | 53 | class data_generator(DataGenerator): 54 | """数据生成器 55 | """ 56 | def __iter__(self, random=False): 57 | batch_token_ids, batch_segment_ids, batch_labels = [], [], [] 58 | for is_end, (text, label) in self.sample(random): 59 | token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen) 60 | batch_token_ids.append(token_ids) 61 | batch_segment_ids.append(segment_ids) 62 | batch_labels.append([label]) 63 | if len(batch_token_ids) == self.batch_size or is_end: 64 | batch_token_ids = sequence_padding(batch_token_ids) 65 | batch_segment_ids = sequence_padding(batch_segment_ids) 66 | batch_labels = sequence_padding(batch_labels) 67 | yield [batch_token_ids, batch_segment_ids], batch_labels 68 | batch_token_ids, batch_segment_ids, batch_labels = [], [], [] 69 | 70 | 71 | # 转换数据集 72 | train_generator = data_generator(train_data, batch_size) 73 | valid_generator = data_generator(valid_data, batch_size) 74 | 75 | # 加载预训练模型 76 | bert = build_transformer_model( 77 | config_path=config_path, 78 | checkpoint_path=checkpoint_path, 79 | return_keras_model=False, 80 | ) 81 | 82 | output = Lambda(lambda x: x[:, 0])(bert.model.output) 83 | output = Dense( 84 | units=num_classes, 85 | activation='softmax', 86 | kernel_initializer=bert.initializer 87 | )(output) 88 | 89 | model = keras.models.Model(bert.model.input, output) 90 | model.summary() 91 | 92 | 93 | def sparse_categorical_crossentropy(y_true, y_pred): 94 | """自定义稀疏交叉熵 95 | 这主要是因为keras自带的sparse_categorical_crossentropy不支持求二阶梯度。 96 | """ 97 | y_true = K.reshape(y_true, K.shape(y_pred)[:-1]) 98 | y_true = K.cast(y_true, 
'int32') 99 | y_true = K.one_hot(y_true, K.shape(y_pred)[-1]) 100 | return K.categorical_crossentropy(y_true, y_pred) 101 | 102 | 103 | def loss_with_gradient_penalty(y_true, y_pred, epsilon=1): 104 | """带梯度惩罚的loss 105 | """ 106 | loss = K.mean(sparse_categorical_crossentropy(y_true, y_pred)) 107 | embeddings = search_layer(y_pred, 'Embedding-Token').embeddings 108 | gp = K.sum(K.gradients(loss, [embeddings])[0].values**2) 109 | return loss + 0.5 * epsilon * gp 110 | 111 | 112 | model.compile( 113 | loss=loss_with_gradient_penalty, 114 | optimizer=Adam(2e-5), 115 | metrics=['sparse_categorical_accuracy'], 116 | ) 117 | 118 | 119 | def evaluate(data): 120 | total, right = 0., 0. 121 | for x_true, y_true in data: 122 | y_pred = model.predict(x_true).argmax(axis=1) 123 | y_true = y_true[:, 0] 124 | total += len(y_true) 125 | right += (y_true == y_pred).sum() 126 | return right / total 127 | 128 | 129 | class Evaluator(keras.callbacks.Callback): 130 | """评估与保存 131 | """ 132 | def __init__(self): 133 | self.best_val_acc = 0. 134 | 135 | def on_epoch_end(self, epoch, logs=None): 136 | val_acc = evaluate(valid_generator) 137 | if val_acc > self.best_val_acc: 138 | self.best_val_acc = val_acc 139 | model.save_weights('best_model.weights') 140 | print( 141 | u'val_acc: %.5f, best_val_acc: %.5f\n' % 142 | (val_acc, self.best_val_acc) 143 | ) 144 | 145 | 146 | def predict_to_file(in_file, out_file): 147 | """输出预测结果到文件 148 | 结果文件可以提交到 https://www.cluebenchmarks.com 评测。 149 | """ 150 | fw = open(out_file, 'w') 151 | with open(in_file) as fr: 152 | for l in tqdm(fr): 153 | l = json.loads(l) 154 | text = l['sentence'] 155 | token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen) 156 | label = model.predict([[token_ids], [segment_ids]])[0].argmax() 157 | l = json.dumps({'id': str(l['id']), 'label': str(label)}) 158 | fw.write(l + '\n') 159 | fw.close() 160 | 161 | 162 | if __name__ == '__main__': 163 | 164 | evaluator = Evaluator() 165 | 166 | model.fit( 167 | train_generator.forfit(), 168 | steps_per_epoch=len(train_generator), 169 | epochs=50, 170 | callbacks=[evaluator] 171 | ) 172 | 173 | else: 174 | 175 | model.load_weights('best_model.weights') 176 | # predict_to_file('/root/CLUE-master/baselines/CLUEdataset/iflytek/test.json', 'iflytek_predict.json') 177 | -------------------------------------------------------------------------------- /examples/task_seq2seq_autotitle_csl.py: -------------------------------------------------------------------------------- 1 | #! 
-*- coding: utf-8 -*- 2 | # bert做Seq2Seq任务,采用UNILM方案 3 | # 介绍链接:https://kexue.fm/archives/6933 4 | # 数据集:https://github.com/CLUEbenchmark/CLGE 中的CSL数据集 5 | # 补充了评测指标bleu、rouge-1、rouge-2、rouge-l 6 | 7 | from __future__ import print_function 8 | import numpy as np 9 | from tqdm import tqdm 10 | from bert4keras.backend import keras, K 11 | from bert4keras.layers import Loss 12 | from bert4keras.models import build_transformer_model 13 | from bert4keras.tokenizers import Tokenizer, load_vocab 14 | from bert4keras.optimizers import Adam 15 | from bert4keras.snippets import sequence_padding, open 16 | from bert4keras.snippets import DataGenerator, AutoRegressiveDecoder 17 | from keras.models import Model 18 | from rouge import Rouge # pip install rouge 19 | from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction 20 | 21 | # 基本参数 22 | maxlen = 256 23 | batch_size = 16 24 | epochs = 20 25 | 26 | # bert配置 27 | config_path = '/root/kg/bert/chinese_wwm_L-12_H-768_A-12/bert_config.json' 28 | checkpoint_path = '/root/kg/bert/chinese_wwm_L-12_H-768_A-12/bert_model.ckpt' 29 | dict_path = '/root/kg/bert/chinese_wwm_L-12_H-768_A-12/vocab.txt' 30 | 31 | 32 | def load_data(filename): 33 | """加载数据 34 | 单条格式:(标题, 正文) 35 | """ 36 | D = [] 37 | with open(filename, encoding='utf-8') as f: 38 | for l in f: 39 | title, content = l.strip().split('\t') 40 | D.append((title, content)) 41 | return D 42 | 43 | 44 | # 加载数据集 45 | train_data = load_data('/root/csl/train.tsv') 46 | valid_data = load_data('/root/csl/val.tsv') 47 | test_data = load_data('/root/csl/test.tsv') 48 | 49 | # 加载并精简词表,建立分词器 50 | token_dict, keep_tokens = load_vocab( 51 | dict_path=dict_path, 52 | simplified=True, 53 | startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'], 54 | ) 55 | tokenizer = Tokenizer(token_dict, do_lower_case=True) 56 | 57 | 58 | class data_generator(DataGenerator): 59 | """数据生成器 60 | """ 61 | def __iter__(self, random=False): 62 | batch_token_ids, batch_segment_ids = [], [] 63 | for is_end, (title, content) in self.sample(random): 64 | token_ids, segment_ids = tokenizer.encode( 65 | content, title, maxlen=maxlen 66 | ) 67 | batch_token_ids.append(token_ids) 68 | batch_segment_ids.append(segment_ids) 69 | if len(batch_token_ids) == self.batch_size or is_end: 70 | batch_token_ids = sequence_padding(batch_token_ids) 71 | batch_segment_ids = sequence_padding(batch_segment_ids) 72 | yield [batch_token_ids, batch_segment_ids], None 73 | batch_token_ids, batch_segment_ids = [], [] 74 | 75 | 76 | class CrossEntropy(Loss): 77 | """交叉熵作为loss,并mask掉输入部分 78 | """ 79 | def compute_loss(self, inputs, mask=None): 80 | y_true, y_mask, y_pred = inputs 81 | y_true = y_true[:, 1:] # 目标token_ids 82 | y_mask = y_mask[:, 1:] # segment_ids,刚好指示了要预测的部分 83 | y_pred = y_pred[:, :-1] # 预测序列,错开一位 84 | loss = K.sparse_categorical_crossentropy(y_true, y_pred) 85 | loss = K.sum(loss * y_mask) / K.sum(y_mask) 86 | return loss 87 | 88 | 89 | model = build_transformer_model( 90 | config_path, 91 | checkpoint_path, 92 | application='unilm', 93 | keep_tokens=keep_tokens, # 只保留keep_tokens中的字,精简原字表 94 | ) 95 | 96 | output = CrossEntropy(2)(model.inputs + model.outputs) 97 | 98 | model = Model(model.inputs, output) 99 | model.compile(optimizer=Adam(1e-5)) 100 | model.summary() 101 | 102 | 103 | class AutoTitle(AutoRegressiveDecoder): 104 | """seq2seq解码器 105 | """ 106 | @AutoRegressiveDecoder.wraps(default_rtype='probas') 107 | def predict(self, inputs, output_ids, states): 108 | token_ids, segment_ids = inputs 109 | token_ids = np.concatenate([token_ids, 
output_ids], 1) 110 | segment_ids = np.concatenate([segment_ids, np.ones_like(output_ids)], 1) 111 | return model.predict([token_ids, segment_ids])[:, -1] 112 | 113 | def generate(self, text, topk=1): 114 | max_c_len = maxlen - self.maxlen 115 | token_ids, segment_ids = tokenizer.encode(text, maxlen=max_c_len) 116 | output_ids = self.beam_search([token_ids, segment_ids], 117 | topk=topk) # 基于beam search 118 | return tokenizer.decode(output_ids) 119 | 120 | 121 | autotitle = AutoTitle(start_id=None, end_id=tokenizer._token_end_id, maxlen=32) 122 | 123 | 124 | class Evaluator(keras.callbacks.Callback): 125 | """评估与保存 126 | """ 127 | def __init__(self): 128 | self.rouge = Rouge() 129 | self.smooth = SmoothingFunction().method1 130 | self.best_bleu = 0. 131 | 132 | def on_epoch_end(self, epoch, logs=None): 133 | metrics = self.evaluate(valid_data) # 评测模型 134 | if metrics['bleu'] > self.best_bleu: 135 | self.best_bleu = metrics['bleu'] 136 | model.save_weights('./best_model.weights') # 保存模型 137 | metrics['best_bleu'] = self.best_bleu 138 | print('valid_data:', metrics) 139 | 140 | def evaluate(self, data, topk=1): 141 | total = 0 142 | rouge_1, rouge_2, rouge_l, bleu = 0, 0, 0, 0 143 | for title, content in tqdm(data): 144 | total += 1 145 | title = ' '.join(title).lower() 146 | pred_title = ' '.join(autotitle.generate(content, topk)).lower() 147 | if pred_title.strip(): 148 | scores = self.rouge.get_scores(hyps=pred_title, refs=title) 149 | rouge_1 += scores[0]['rouge-1']['f'] 150 | rouge_2 += scores[0]['rouge-2']['f'] 151 | rouge_l += scores[0]['rouge-l']['f'] 152 | bleu += sentence_bleu( 153 | references=[title.split(' ')], 154 | hypothesis=pred_title.split(' '), 155 | smoothing_function=self.smooth 156 | ) 157 | rouge_1 /= total 158 | rouge_2 /= total 159 | rouge_l /= total 160 | bleu /= total 161 | return { 162 | 'rouge-1': rouge_1, 163 | 'rouge-2': rouge_2, 164 | 'rouge-l': rouge_l, 165 | 'bleu': bleu, 166 | } 167 | 168 | 169 | if __name__ == '__main__': 170 | 171 | evaluator = Evaluator() 172 | train_generator = data_generator(train_data, batch_size) 173 | 174 | model.fit( 175 | train_generator.forfit(), 176 | steps_per_epoch=len(train_generator), 177 | epochs=epochs, 178 | callbacks=[evaluator] 179 | ) 180 | 181 | else: 182 | 183 | model.load_weights('./best_model.weights') 184 | -------------------------------------------------------------------------------- /examples/task_conditional_language_model.py: -------------------------------------------------------------------------------- 1 | #! 
-*- coding: utf-8 -*- 2 | # bert做conditional language model任务 3 | # 按类随机生成文本,这个demo的类别是情感极性(正/负) 4 | # 请参考:https://kexue.fm/archives/7124 5 | 6 | from __future__ import print_function 7 | import re 8 | import numpy as np 9 | from bert4keras.backend import keras, K 10 | from bert4keras.layers import Loss 11 | from bert4keras.models import build_transformer_model 12 | from bert4keras.tokenizers import Tokenizer, load_vocab 13 | from bert4keras.optimizers import Adam 14 | from bert4keras.snippets import sequence_padding, open 15 | from bert4keras.snippets import text_segmentate 16 | from bert4keras.snippets import DataGenerator, AutoRegressiveDecoder 17 | from bert4keras.snippets import uniout # 打印中文 18 | from keras.layers import Input, Embedding, Reshape 19 | from keras.models import Model 20 | 21 | # 模型配置 22 | maxlen = 128 23 | batch_size = 32 24 | num_classes = 2 25 | epochs = 20 26 | 27 | # bert配置 28 | config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json' 29 | checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt' 30 | dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt' 31 | 32 | # 加载并精简词表,建立分词器 33 | token_dict, keep_tokens = load_vocab( 34 | dict_path=dict_path, 35 | simplified=True, 36 | startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'], 37 | ) 38 | tokenizer = Tokenizer(token_dict, do_lower_case=True) 39 | 40 | 41 | def load_data(filenames): 42 | """加载数据,并尽量划分为不超过maxlen的句子 43 | """ 44 | D = [] 45 | seps, strips = u'\n。!?!?;;,, ', u';;,, ' 46 | for filename in filenames: 47 | with open(filename, encoding='utf-8') as f: 48 | for l in f: 49 | text, label = l.strip().split('\t') 50 | for t in text_segmentate(text, maxlen - 2, seps, strips): 51 | D.append((t, int(label))) 52 | return D 53 | 54 | 55 | # 加载数据集 56 | data = load_data([ 57 | 'datasets/sentiment/sentiment.train.data', 58 | 'datasets/sentiment/sentiment.valid.data', 59 | 'datasets/sentiment/sentiment.test.data', 60 | ]) 61 | 62 | 63 | class data_generator(DataGenerator): 64 | """数据生成器 65 | """ 66 | def __iter__(self, random=False): 67 | batch_token_ids, batch_segment_ids, batch_labels = [], [], [] 68 | for is_end, (text, label) in self.sample(random): 69 | token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen) 70 | batch_token_ids.append(token_ids) 71 | batch_segment_ids.append(segment_ids) 72 | batch_labels.append([label]) 73 | if len(batch_token_ids) == self.batch_size or is_end: 74 | batch_token_ids = sequence_padding(batch_token_ids) 75 | batch_segment_ids = sequence_padding(batch_segment_ids) 76 | batch_labels = sequence_padding(batch_labels) 77 | yield [batch_token_ids, batch_segment_ids, batch_labels], None 78 | batch_token_ids, batch_segment_ids, batch_labels = [], [], [] 79 | 80 | 81 | class CrossEntropy(Loss): 82 | """交叉熵作为loss,并mask掉padding部分 83 | """ 84 | def compute_loss(self, inputs, mask=None): 85 | y_true, y_pred = inputs 86 | if mask[1] is None: 87 | y_mask = 1.0 88 | else: 89 | y_mask = K.cast(mask[1], K.floatx())[:, 1:] 90 | y_true = y_true[:, 1:] # 目标token_ids 91 | y_pred = y_pred[:, :-1] # 预测序列,错开一位 92 | loss = K.sparse_categorical_crossentropy(y_true, y_pred) 93 | loss = K.sum(loss * y_mask) / K.sum(y_mask) 94 | return loss 95 | 96 | 97 | c_in = Input(shape=(1,)) 98 | c = Embedding(num_classes, 128)(c_in) 99 | c = Reshape((128,))(c) 100 | 101 | # Bert模型 102 | model = build_transformer_model( 103 | config_path, 104 | checkpoint_path, 105 | application='lm', 106 | keep_tokens=keep_tokens, # 只保留keep_tokens中的字,精简原字表 107 | layer_norm_cond=c, 108 | 
additional_input_layers=c_in, 109 | ) 110 | 111 | output = CrossEntropy(1)([model.inputs[0], model.outputs[0]]) 112 | 113 | model = Model(model.inputs, output) 114 | model.compile(optimizer=Adam(1e-5)) 115 | model.summary() 116 | 117 | 118 | class RandomSentiment(AutoRegressiveDecoder): 119 | """根据情感标签(0:负,1:正)随机生成一批句子 120 | """ 121 | @AutoRegressiveDecoder.wraps(default_rtype='probas') 122 | def predict(self, inputs, output_ids, states): 123 | token_ids = output_ids 124 | segment_ids = np.zeros_like(token_ids) 125 | return model.predict([token_ids, segment_ids, inputs[0]])[:, -1] 126 | 127 | def generate(self, label, n=1, topp=0.95): 128 | results = self.random_sample([[label]], n, topp=topp) # 基于随机采样 129 | return [tokenizer.decode(ids) for ids in results] 130 | 131 | 132 | random_sentiment = RandomSentiment( 133 | start_id=tokenizer._token_start_id, 134 | end_id=tokenizer._token_end_id, 135 | maxlen=maxlen 136 | ) 137 | 138 | 139 | def just_show(): 140 | print(u'正面采样:') 141 | print(random_sentiment.generate(1, 5, 5), '\n') 142 | print(u'负面采样:') 143 | print(random_sentiment.generate(0, 5, 5), '\n') 144 | 145 | 146 | class Evaluator(keras.callbacks.Callback): 147 | """评估与保存 148 | """ 149 | def __init__(self): 150 | self.lowest = 1e10 151 | 152 | def on_epoch_end(self, epoch, logs=None): 153 | # 保存最优 154 | if logs['loss'] <= self.lowest: 155 | self.lowest = logs['loss'] 156 | model.save_weights('./best_model.weights') 157 | # 演示效果 158 | just_show() 159 | 160 | 161 | if __name__ == '__main__': 162 | 163 | evaluator = Evaluator() 164 | train_generator = data_generator(data, batch_size) 165 | 166 | model.fit( 167 | train_generator.forfit(), 168 | steps_per_epoch=len(train_generator), 169 | epochs=epochs, 170 | callbacks=[evaluator] 171 | ) 172 | 173 | else: 174 | 175 | model.load_weights('./best_model.weights') 176 | """ 177 | 正面采样: 178 | [ 179 | u'外观时尚、漂亮、性价比高。', 180 | u'外观漂亮,配置均衡,比较满意,性价比高,外观漂亮,性能较高。', 181 | u'我是在大学的时候看到这本书的,所以一直在买。书中的作者是林静蕾,她用自己的口吻写出了一个孩子成长中的心路历程,让我看到了她们成长中的不同之处,以及她们成长过程中的不同境界。让我很欣赏!', 182 | u'我想这是一本能够告诉读者什么是坏的,而不是教你怎样说话,告诉我什么是错。这里我推荐了《我要讲故事》,这本书是我很喜欢的一本书,我认为它的理由很多,但是,我相信我。如果你从中得到一些改进,或者你已经有了一个明智的决定。', 183 | u'我们一家五口住的是标间,大床房,大床的床很舒服;而我们在携程网上订了两套大床房,这个酒店的价格还是比较合理的;但是房间的隔音效果不太理想,有点响的声音;酒店门口的地铁在施工中,不方便;但是酒店的门口的出租车不知道是哪个车的,打车不是很方便;酒店外面的停' 184 | ] 185 | 186 | 负面采样: 187 | [ 188 | u'不知道是不是因为电池不太好,不是我不喜欢。', 189 | u'看了评论才买的. 结果发现不是那么便宜, 价格也不便宜.', 190 | u'1、外壳不容易沾手印,不容易洗洗2、屏幕有点旧, 不能下载铃声', 191 | u'我是7月6日订购了《杜拉拉升职记》并已通过银行付款,为什么订单下了两周多至今还未到货?是收货时间太快了,可能就这么过去了吧?', 192 | u'这本书我是在网上先看了一遍,后来我再看了一遍。感觉作者的文笔实在太烂了,特别是在写他的博客时特别别扭,写得很不专业,特别是他写股票时那个情绪调节的小男孩,简直就是自作聪明的样子,简直就是自作聪明的一种表现!' 193 | ] 194 | """ 195 | -------------------------------------------------------------------------------- /examples/task_question_answer_generation_by_seq2seq.py: -------------------------------------------------------------------------------- 1 | #! 
-*- coding: utf-8 -*- 2 | # 用Seq2Seq做阅读理解构建 3 | # 根据篇章先采样生成答案,然后采样生成问题 4 | # 数据集同 https://github.com/bojone/dgcnn_for_reading_comprehension 5 | 6 | import json, os 7 | import numpy as np 8 | from bert4keras.backend import keras, K 9 | from bert4keras.layers import Loss 10 | from bert4keras.models import build_transformer_model 11 | from bert4keras.tokenizers import Tokenizer, load_vocab 12 | from bert4keras.optimizers import Adam 13 | from bert4keras.snippets import sequence_padding, open 14 | from bert4keras.snippets import DataGenerator, AutoRegressiveDecoder 15 | from bert4keras.snippets import text_segmentate 16 | from keras.models import Model 17 | from tqdm import tqdm 18 | 19 | # 基本参数 20 | max_p_len = 128 21 | max_q_len = 64 22 | max_a_len = 16 23 | batch_size = 32 24 | epochs = 100 25 | 26 | # bert配置 27 | config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json' 28 | checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt' 29 | dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt' 30 | 31 | # 标注数据 32 | webqa_data = json.load(open('/root/qa_datasets/WebQA.json')) 33 | sogou_data = json.load(open('/root/qa_datasets/SogouQA.json')) 34 | 35 | # 筛选数据 36 | seps, strips = u'\n。!?!?;;,, ', u';;,, ' 37 | data = [] 38 | for d in webqa_data + sogou_data: 39 | for p in d['passages']: 40 | if p['answer']: 41 | for t in text_segmentate(p['passage'], max_p_len - 2, seps, strips): 42 | if p['answer'] in t: 43 | data.append((t, d['question'], p['answer'])) 44 | 45 | del webqa_data 46 | del sogou_data 47 | 48 | # 保存一个随机序(供划分valid用) 49 | if not os.path.exists('../random_order.json'): 50 | random_order = list(range(len(data))) 51 | np.random.shuffle(random_order) 52 | json.dump(random_order, open('../random_order.json', 'w'), indent=4) 53 | else: 54 | random_order = json.load(open('../random_order.json')) 55 | 56 | # 划分valid 57 | train_data = [data[j] for i, j in enumerate(random_order) if i % 10 != 0] 58 | valid_data = [data[j] for i, j in enumerate(random_order) if i % 10 == 0] 59 | 60 | # 加载并精简词表,建立分词器 61 | token_dict, keep_tokens = load_vocab( 62 | dict_path=dict_path, 63 | simplified=True, 64 | startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'], 65 | ) 66 | tokenizer = Tokenizer(token_dict, do_lower_case=True) 67 | 68 | 69 | class data_generator(DataGenerator): 70 | """数据生成器 71 | """ 72 | def __iter__(self, random=False): 73 | """单条样本格式:[CLS]篇章[SEP]答案[SEP]问题[SEP] 74 | """ 75 | batch_token_ids, batch_segment_ids = [], [] 76 | for is_end, (p, q, a) in self.sample(random): 77 | p_token_ids, _ = tokenizer.encode(p, maxlen=max_p_len + 1) 78 | a_token_ids, _ = tokenizer.encode(a, maxlen=max_a_len) 79 | q_token_ids, _ = tokenizer.encode(q, maxlen=max_q_len) 80 | token_ids = p_token_ids + a_token_ids[1:] + q_token_ids[1:] 81 | segment_ids = [0] * len(p_token_ids) 82 | segment_ids += [1] * (len(token_ids) - len(p_token_ids)) 83 | batch_token_ids.append(token_ids) 84 | batch_segment_ids.append(segment_ids) 85 | if len(batch_token_ids) == self.batch_size or is_end: 86 | batch_token_ids = sequence_padding(batch_token_ids) 87 | batch_segment_ids = sequence_padding(batch_segment_ids) 88 | yield [batch_token_ids, batch_segment_ids], None 89 | batch_token_ids, batch_segment_ids = [], [] 90 | 91 | 92 | class CrossEntropy(Loss): 93 | """交叉熵作为loss,并mask掉输入部分 94 | """ 95 | def compute_loss(self, inputs, mask=None): 96 | y_true, y_mask, y_pred = inputs 97 | y_true = y_true[:, 1:] # 目标token_ids 98 | y_mask = y_mask[:, 1:] # segment_ids,刚好指示了要预测的部分 99 | y_pred = y_pred[:, :-1] # 
预测序列,错开一位 100 | loss = K.sparse_categorical_crossentropy(y_true, y_pred) 101 | loss = K.sum(loss * y_mask) / K.sum(y_mask) 102 | return loss 103 | 104 | 105 | model = build_transformer_model( 106 | config_path, 107 | checkpoint_path, 108 | application='unilm', 109 | keep_tokens=keep_tokens, # 只保留keep_tokens中的字,精简原字表 110 | ) 111 | 112 | output = CrossEntropy(2)(model.inputs + model.outputs) 113 | 114 | model = Model(model.inputs, output) 115 | model.compile(optimizer=Adam(1e-5)) 116 | model.summary() 117 | 118 | 119 | class QuestionAnswerGeneration(AutoRegressiveDecoder): 120 | """随机生成答案,并且通过beam search来生成问题 121 | """ 122 | @AutoRegressiveDecoder.wraps(default_rtype='probas') 123 | def predict(self, inputs, output_ids, states): 124 | token_ids, segment_ids = inputs 125 | token_ids = np.concatenate([token_ids, output_ids], 1) 126 | segment_ids = np.concatenate([segment_ids, np.ones_like(output_ids)], 1) 127 | return model.predict([token_ids, segment_ids])[:, -1] 128 | 129 | def generate(self, passage, topk=1, topp=0.95): 130 | token_ids, segment_ids = tokenizer.encode(passage, maxlen=max_p_len) 131 | a_ids = self.random_sample([token_ids, segment_ids], 1, 132 | topp=topp)[0] # 基于随机采样 133 | token_ids += list(a_ids) 134 | segment_ids += [1] * len(a_ids) 135 | q_ids = self.beam_search([token_ids, segment_ids], 136 | topk=topk) # 基于beam search 137 | return (tokenizer.decode(q_ids), tokenizer.decode(a_ids)) 138 | 139 | 140 | qag = QuestionAnswerGeneration( 141 | start_id=None, end_id=tokenizer._token_end_id, maxlen=max_q_len 142 | ) 143 | 144 | 145 | def predict_to_file(data, filename, topk=1): 146 | """将预测结果输出到文件,方便评估 147 | """ 148 | with open(filename, 'w', encoding='utf-8') as f: 149 | for d in tqdm(iter(data), desc=u'正在预测(共%s条样本)' % len(data)): 150 | q, a = qag.generate(d[0]) 151 | s = '%s\t%s\t%s\n' % (q, a, d[0]) 152 | f.write(s) 153 | f.flush() 154 | 155 | 156 | class Evaluator(keras.callbacks.Callback): 157 | """评估与保存 158 | """ 159 | def __init__(self): 160 | self.lowest = 1e10 161 | 162 | def on_epoch_end(self, epoch, logs=None): 163 | # 保存最优 164 | if logs['loss'] <= self.lowest: 165 | self.lowest = logs['loss'] 166 | model.save_weights('./best_model.weights') 167 | 168 | 169 | if __name__ == '__main__': 170 | 171 | evaluator = Evaluator() 172 | train_generator = data_generator(train_data, batch_size) 173 | 174 | model.fit( 175 | train_generator.forfit(), 176 | steps_per_epoch=1000, 177 | epochs=epochs, 178 | callbacks=[evaluator] 179 | ) 180 | 181 | else: 182 | 183 | model.load_weights('./best_model.weights') 184 | # predict_to_file(valid_data, 'qa.csv') 185 | -------------------------------------------------------------------------------- /examples/task_seq2seq_autotitle_csl_mt5.py: -------------------------------------------------------------------------------- 1 | #! 
-*- coding: utf-8 -*- 2 | # 微调多国语言版T5做Seq2Seq任务 3 | # 介绍链接:kexue.fm/archives/7867 4 | # 细节请看:https://github.com/bojone/t5_in_bert4keras 5 | # 数据集:https://github.com/CLUEbenchmark/CLGE 中的CSL数据集 6 | # 补充了评测指标bleu、rouge-1、rouge-2、rouge-l 7 | 8 | from __future__ import print_function 9 | import json 10 | import numpy as np 11 | from tqdm import tqdm 12 | from bert4keras.backend import keras, K 13 | from bert4keras.layers import Loss 14 | from bert4keras.models import build_transformer_model 15 | from bert4keras.tokenizers import SpTokenizer 16 | from bert4keras.optimizers import Adam 17 | from bert4keras.snippets import sequence_padding, open 18 | from bert4keras.snippets import DataGenerator, AutoRegressiveDecoder 19 | from keras.models import Model 20 | from rouge import Rouge # pip install rouge 21 | from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction 22 | 23 | # 基本参数 24 | max_c_len = 256 25 | max_t_len = 32 26 | batch_size = 16 27 | epochs = 40 28 | 29 | # 模型路径 30 | config_path = '/root/kg/bert/mt5/mt5_base/mt5_base_config.json' 31 | checkpoint_path = '/root/kg/bert/mt5/mt5_base/model.ckpt-1000000' 32 | spm_path = '/root/kg/bert/mt5/sentencepiece_cn.model' 33 | keep_tokens_path = '/root/kg/bert/mt5/sentencepiece_cn_keep_tokens.json' 34 | 35 | 36 | def load_data(filename): 37 | """加载数据 38 | 单条格式:(标题, 正文) 39 | """ 40 | D = [] 41 | with open(filename, encoding='utf-8') as f: 42 | for l in f: 43 | title, content = l.strip().split('\t') 44 | D.append((title, content)) 45 | return D 46 | 47 | 48 | # 加载数据集 49 | train_data = load_data('/root/csl/train.tsv') 50 | valid_data = load_data('/root/csl/val.tsv') 51 | test_data = load_data('/root/csl/test.tsv') 52 | 53 | # 加载分词器 54 | tokenizer = SpTokenizer(spm_path, token_start=None, token_end='</s>') 55 | keep_tokens = json.load(open(keep_tokens_path)) 56 | 57 | 58 | class data_generator(DataGenerator): 59 | """数据生成器 60 | """ 61 | def __iter__(self, random=False): 62 | batch_c_token_ids, batch_t_token_ids = [], [] 63 | for is_end, (title, content) in self.sample(random): 64 | c_token_ids, _ = tokenizer.encode(content, maxlen=max_c_len) 65 | t_token_ids, _ = tokenizer.encode(title, maxlen=max_t_len) 66 | batch_c_token_ids.append(c_token_ids) 67 | batch_t_token_ids.append([0] + t_token_ids) 68 | if len(batch_c_token_ids) == self.batch_size or is_end: 69 | batch_c_token_ids = sequence_padding(batch_c_token_ids) 70 | batch_t_token_ids = sequence_padding(batch_t_token_ids) 71 | yield [batch_c_token_ids, batch_t_token_ids], None 72 | batch_c_token_ids, batch_t_token_ids = [], [] 73 | 74 | 75 | class CrossEntropy(Loss): 76 | """交叉熵作为loss,并mask掉输入部分 77 | """ 78 | def compute_loss(self, inputs, mask=None): 79 | y_true, y_pred = inputs 80 | y_true = y_true[:, 1:] # 目标token_ids 81 | y_mask = K.cast(mask[1], K.floatx())[:, :-1] # 解码器自带mask 82 | y_pred = y_pred[:, :-1] # 预测序列,错开一位 83 | loss = K.sparse_categorical_crossentropy(y_true, y_pred) 84 | loss = K.sum(loss * y_mask) / K.sum(y_mask) 85 | return loss 86 | 87 | 88 | t5 = build_transformer_model( 89 | config_path=config_path, 90 | checkpoint_path=checkpoint_path, 91 | keep_tokens=keep_tokens, 92 | model='t5.1.1', 93 | return_keras_model=False, 94 | name='T5', 95 | ) 96 | 97 | encoder = t5.encoder 98 | decoder = t5.decoder 99 | model = t5.model 100 | model.summary() 101 | 102 | output = CrossEntropy(1)([model.inputs[1], model.outputs[0]]) 103 | 104 | model = Model(model.inputs, output) 105 | model.compile(optimizer=Adam(2e-4)) 106 | 107 | 108 | class AutoTitle(AutoRegressiveDecoder): 109 | 
"""seq2seq解码器 110 | """ 111 | @AutoRegressiveDecoder.wraps(default_rtype='probas') 112 | def predict(self, inputs, output_ids, states): 113 | c_encoded = inputs[0] 114 | return decoder.predict([c_encoded, output_ids])[:, -1] 115 | 116 | def generate(self, text, topk=1): 117 | c_token_ids, _ = tokenizer.encode(text, maxlen=max_c_len) 118 | c_encoded = encoder.predict(np.array([c_token_ids]))[0] 119 | output_ids = self.beam_search([c_encoded], topk=topk) # 基于beam search 120 | return tokenizer.decode([int(i) for i in output_ids]) 121 | 122 | 123 | # 注:T5有一个很让人不解的设置,它的标记id是0,即其实都是0 124 | autotitle = AutoTitle(start_id=0, end_id=tokenizer._token_end_id, maxlen=32) 125 | 126 | 127 | class Evaluator(keras.callbacks.Callback): 128 | """评估与保存 129 | """ 130 | def __init__(self): 131 | self.rouge = Rouge() 132 | self.smooth = SmoothingFunction().method1 133 | self.best_bleu = 0. 134 | 135 | def on_epoch_end(self, epoch, logs=None): 136 | metrics = self.evaluate(valid_data) # 评测模型 137 | if metrics['bleu'] > self.best_bleu: 138 | self.best_bleu = metrics['bleu'] 139 | model.save_weights('./best_model.weights') # 保存模型 140 | metrics['best_bleu'] = self.best_bleu 141 | print('valid_data:', metrics) 142 | 143 | def evaluate(self, data, topk=1): 144 | total = 0 145 | rouge_1, rouge_2, rouge_l, bleu = 0, 0, 0, 0 146 | for title, content in tqdm(data): 147 | total += 1 148 | title = ' '.join(title).lower() 149 | pred_title = ' '.join(autotitle.generate(content, 150 | topk=topk)).lower() 151 | if pred_title.strip(): 152 | scores = self.rouge.get_scores(hyps=pred_title, refs=title) 153 | rouge_1 += scores[0]['rouge-1']['f'] 154 | rouge_2 += scores[0]['rouge-2']['f'] 155 | rouge_l += scores[0]['rouge-l']['f'] 156 | bleu += sentence_bleu( 157 | references=[title.split(' ')], 158 | hypothesis=pred_title.split(' '), 159 | smoothing_function=self.smooth 160 | ) 161 | rouge_1 /= total 162 | rouge_2 /= total 163 | rouge_l /= total 164 | bleu /= total 165 | return { 166 | 'rouge-1': rouge_1, 167 | 'rouge-2': rouge_2, 168 | 'rouge-l': rouge_l, 169 | 'bleu': bleu, 170 | } 171 | 172 | 173 | if __name__ == '__main__': 174 | 175 | evaluator = Evaluator() 176 | train_generator = data_generator(train_data, batch_size) 177 | 178 | model.fit( 179 | train_generator.forfit(), 180 | steps_per_epoch=len(train_generator), 181 | epochs=epochs, 182 | callbacks=[evaluator] 183 | ) 184 | 185 | else: 186 | 187 | model.load_weights('./best_model.weights') 188 | -------------------------------------------------------------------------------- /examples/task_language_model.py: -------------------------------------------------------------------------------- 1 | #! 
-*- coding: utf-8 -*- 2 | # bert做language model任务,小说生成 3 | 4 | from __future__ import print_function 5 | import glob, re 6 | import numpy as np 7 | from tqdm import tqdm 8 | from bert4keras.backend import keras, K 9 | from bert4keras.layers import Loss 10 | from bert4keras.models import build_transformer_model 11 | from bert4keras.tokenizers import Tokenizer, load_vocab 12 | from bert4keras.optimizers import Adam 13 | from bert4keras.snippets import sequence_padding, open 14 | from bert4keras.snippets import DataGenerator, AutoRegressiveDecoder 15 | from keras.models import Model 16 | 17 | maxlen = 256 18 | batch_size = 16 19 | steps_per_epoch = 1000 20 | epochs = 10000 21 | 22 | # bert配置 23 | config_path = '/root/kg/bert/chinese_roberta_wwm_ext_L-12_H-768_A-12/bert_config.json' 24 | checkpoint_path = '/root/kg/bert/chinese_roberta_wwm_ext_L-12_H-768_A-12/bert_model.ckpt' 25 | dict_path = '/root/kg/bert/chinese_roberta_wwm_ext_L-12_H-768_A-12/vocab.txt' 26 | 27 | novels = [] 28 | 29 | for txt in glob.glob('/root/金庸/*/*.txt'): 30 | txt = open(txt, encoding='gbk').read() 31 | txt = txt.replace('\r', '').replace('\n', '') 32 | txt = txt.replace(u'整理制作,并提供下载', '') 33 | txt = re.sub(u'www.*?com', '', txt) 34 | txt = txt.replace(u'\u3000', ' ') 35 | sents = [] 36 | for t in txt.split(' '): 37 | for s in re.findall(u'.*?。', t): 38 | if len(s) <= maxlen - 2: 39 | sents.append(s) 40 | novels.append(sents) 41 | 42 | # 加载并精简词表,建立分词器 43 | token_dict, keep_tokens = load_vocab( 44 | dict_path=dict_path, 45 | simplified=True, 46 | startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'], 47 | ) 48 | tokenizer = Tokenizer(token_dict, do_lower_case=True) 49 | 50 | data = [] 51 | pbar = tqdm(desc=u'构建语料中', total=sum(len(n) for n in novels)) 52 | 53 | for novel in novels: 54 | s = u'' 55 | for i in range(len(novel)): 56 | for j in range(len(novel) - i): 57 | if len(s) + len(novel[i + j]) > maxlen - 2: 58 | data.append(s) 59 | s = u'' 60 | break 61 | else: 62 | s += novel[i + j] 63 | pbar.update(1) 64 | if i + j >= len(novel): 65 | break 66 | if s: 67 | data.append(s) 68 | 69 | pbar.close() 70 | np.random.shuffle(data) 71 | 72 | 73 | class data_generator(DataGenerator): 74 | """数据生成器 75 | """ 76 | def __iter__(self, random=False): 77 | batch_token_ids, batch_segment_ids = [], [] 78 | for is_end, text in self.sample(random): 79 | token_ids, segment_ids = tokenizer.encode(text) 80 | batch_token_ids.append(token_ids) 81 | batch_segment_ids.append(segment_ids) 82 | if len(batch_token_ids) == self.batch_size or is_end: 83 | batch_token_ids = sequence_padding(batch_token_ids) 84 | batch_segment_ids = sequence_padding(batch_segment_ids) 85 | yield [batch_token_ids, batch_segment_ids], None 86 | batch_token_ids, batch_segment_ids = [], [] 87 | 88 | 89 | class CrossEntropy(Loss): 90 | """交叉熵作为loss,并mask掉padding部分 91 | """ 92 | def compute_loss(self, inputs, mask=None): 93 | y_true, y_pred = inputs 94 | if mask[1] is None: 95 | y_mask = 1.0 96 | else: 97 | y_mask = K.cast(mask[1], K.floatx())[:, 1:] 98 | y_true = y_true[:, 1:] # 目标token_ids 99 | y_pred = y_pred[:, :-1] # 预测序列,错开一位 100 | loss = K.sparse_categorical_crossentropy(y_true, y_pred) 101 | loss = K.sum(loss * y_mask) / K.sum(y_mask) 102 | return loss 103 | 104 | 105 | model = build_transformer_model( 106 | config_path, 107 | checkpoint_path, 108 | application='lm', 109 | keep_tokens=keep_tokens, # 只保留keep_tokens中的字,精简原字表 110 | ) 111 | 112 | output = CrossEntropy(1)([model.inputs[0], model.outputs[0]]) 113 | 114 | model = Model(model.inputs, output) 115 | 
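# ----------------------------------------------------------------------------
# [编辑者补充示例,非原仓库代码] A minimal, self-contained sketch of the
# "错开一位" (shift-by-one) alignment that the CrossEntropy loss above relies
# on: the prediction at step t is scored against the token at step t+1, so the
# targets drop the first token and the predictions drop the last one. The ids
# below are hypothetical placeholders, not real vocabulary ids.
import numpy as np  # re-imported here so the sketch stands alone

demo_token_ids = np.array([[101, 11, 12, 13, 102]])  # e.g. [CLS] w1 w2 w3 [SEP]
demo_targets = demo_token_ids[:, 1:]  # what y_true[:, 1:] keeps: w1 w2 w3 [SEP]
# a real y_pred has shape (batch, seq_len, vocab); only its first seq_len - 1
# steps are scored, i.e. y_pred[:, :-1], so it lines up with demo_targets
assert demo_targets.shape[1] == demo_token_ids.shape[1] - 1
# ----------------------------------------------------------------------------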
model.compile(optimizer=Adam(1e-5)) 116 | model.summary() 117 | 118 | 119 | class StoryCompletion(AutoRegressiveDecoder): 120 | """基于随机采样的故事续写 121 | """ 122 | @AutoRegressiveDecoder.wraps(default_rtype='probas') 123 | def predict(self, inputs, output_ids, states): 124 | token_ids = inputs[0] 125 | token_ids = np.concatenate([token_ids, output_ids], 1) 126 | segment_ids = np.zeros_like(token_ids) 127 | return model.predict([token_ids, segment_ids])[:, -1] 128 | 129 | def generate(self, text, n=1, topp=0.95): 130 | token_ids, _ = tokenizer.encode(text) 131 | results = self.random_sample([token_ids[:-1]], n, topp=topp) # 基于随机采样 132 | return [text + tokenizer.decode(ids) for ids in results] 133 | 134 | 135 | story_completion = StoryCompletion( 136 | start_id=None, end_id=tokenizer._token_end_id, maxlen=maxlen 137 | ) 138 | 139 | 140 | def just_show(): 141 | s1 = u'当晚两人在一家小客店中宿歇。张无忌躺在炕上,越想越是担心,走到赵敏窗外,但听她呼吸调匀,正自香梦沉酣。' 142 | s2 = u'虚竹飞身跃上松树的枝干,只见段延庆的钢杖深深嵌在树枝之中,全凭一股内力粘劲,挂住了下面四人,内力之深厚,实是非同小可。虚竹伸左手抓住钢杖,提将上来。' 143 | s3 = u'杨过居住在侠客岛,是令狐冲的弟子,武器是金蛇剑。' 144 | for s in [s1, s2, s3]: 145 | t = story_completion.generate(s) 146 | print(u'输入: %s' % s) 147 | print(u'结果: %s\n' % ('\n'.join(t))) 148 | 149 | 150 | class Evaluator(keras.callbacks.Callback): 151 | """评估与保存 152 | """ 153 | def __init__(self): 154 | self.lowest = 1e10 155 | 156 | def on_epoch_end(self, epoch, logs=None): 157 | # 保存最优 158 | if logs['loss'] <= self.lowest: 159 | self.lowest = logs['loss'] 160 | model.save_weights('./best_model.weights') 161 | # 演示效果 162 | just_show() 163 | 164 | 165 | if __name__ == '__main__': 166 | 167 | evaluator = Evaluator() 168 | train_generator = data_generator(data, batch_size) 169 | 170 | model.fit( 171 | train_generator.forfit(), 172 | steps_per_epoch=steps_per_epoch, 173 | epochs=epochs, 174 | callbacks=[evaluator] 175 | ) 176 | 177 | else: 178 | 179 | model.load_weights('./best_model.weights') 180 | """ 181 | 效果: 182 | 183 | 输入: 当晚两人在一家小客店中宿歇。张无忌躺在炕上,越想越是担心,走到赵敏窗外,但听她呼吸调匀,正自香梦沉酣。 184 | 结果: 当晚两人在一家小客店中宿歇。张无忌躺在炕上,越想越是担心,走到赵敏窗外,但听她呼吸调匀,正自香梦沉酣。次日清晨,张无忌便和赵敏去买了一匹高头大马,自己骑了随伴。那马甚有神骏,三十六斤重的身躯之中,竟无一头白马。他心中怦怦乱跳,暗想:若能将赵敏引出迷城,我决不致再和她相会,但若和赵姑娘相遇,我一生一世决计再难相见。何况我是她的私生女儿,这般亲热,岂不是好?我如何能和她相见?今后我要教训教训她才好?我教教她,教训她,要她心里快快活活的。他心如刀割,当即回到客店,将张无忌的所在说了。 185 | 186 | 输入: 虚竹飞身跃上松树的枝干,只见段延庆的钢杖深深嵌在树枝之中,全凭一股内力粘劲,挂住了下面四人,内力之深厚,实是非同小可。虚竹伸左手抓住钢杖,提将上来。 187 | 结果: 虚竹飞身跃上松树的枝干,只见段延庆的钢杖深深嵌在树枝之中,全凭一股内力粘劲,挂住了下面四人,内力之深厚,实是非同小可。虚竹伸左手抓住钢杖,提将上来。那矮子见他如此功力,大吃一惊,叫道:什么人?是谁?你干什么?我师父是谁?你们是谁?是谁?你们是谁?我师父是谁?你这矮子,便是段延庆。你们不知道我师父便是,是不是?快快说来。那矮子道:我师父便是延庆太子,他的徒弟也是段延庆。他老人家在唐朝做镇南王,你们便将他改名为延庆太子,叫做延庆太子!这名头倒怪,你们大伙儿听见了,也不知道他老人家是死是活。 188 | 189 | 输入: 杨过居住在侠客岛,是令狐冲的弟子,武器是金蛇剑。 190 | 结果: 杨过居住在侠客岛,是令狐冲的弟子,武器是金蛇剑。这时见他手中所握,竟是一柄特制的短剑,心中大喜,叫道::原来是金蛇郎君的剑!原来你便是金蛇郎君的弟子,这一下可要叫我失望了。那人哈哈一笑,说道:好啊!好啊,好啊!我的金蛇剑是我的,不过我是你的。这人道:我姓杨名过,名字叫过。你是我儿子,是我女儿,是不是?你这么大的年纪,怎地自称金刀驸马?我这就给你取个名字,叫作过儿。 191 | """ 192 | -------------------------------------------------------------------------------- /examples/task_iflytek_adversarial_training.py: -------------------------------------------------------------------------------- 1 | #! 
-*- coding:utf-8 -*- 2 | # 通过对抗训练增强模型的泛化性能 3 | # 比CLUE榜单公开的同数据集上的BERT base的成绩高2% 4 | # 数据集:IFLYTEK' 长文本分类 (https://github.com/CLUEbenchmark/CLUE) 5 | # 博客:https://kexue.fm/archives/7234 6 | # 适用于Keras 2.3.1 7 | 8 | import json 9 | import numpy as np 10 | from bert4keras.backend import keras, search_layer, K 11 | from bert4keras.tokenizers import Tokenizer 12 | from bert4keras.models import build_transformer_model 13 | from bert4keras.optimizers import Adam 14 | from bert4keras.snippets import sequence_padding, DataGenerator 15 | from keras.layers import Lambda, Dense 16 | from tqdm import tqdm 17 | 18 | num_classes = 119 19 | maxlen = 128 20 | batch_size = 32 21 | 22 | # BERT base 23 | config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json' 24 | checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt' 25 | dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt' 26 | 27 | 28 | def load_data(filename): 29 | """加载数据 30 | 单条格式:(文本, 标签id) 31 | """ 32 | D = [] 33 | with open(filename) as f: 34 | for i, l in enumerate(f): 35 | l = json.loads(l) 36 | text, label = l['sentence'], l['label'] 37 | D.append((text, int(label))) 38 | return D 39 | 40 | 41 | # 加载数据集 42 | train_data = load_data( 43 | '/root/CLUE-master/baselines/CLUEdataset/iflytek/train.json' 44 | ) 45 | valid_data = load_data( 46 | '/root/CLUE-master/baselines/CLUEdataset/iflytek/dev.json' 47 | ) 48 | 49 | # 建立分词器 50 | tokenizer = Tokenizer(dict_path, do_lower_case=True) 51 | 52 | 53 | class data_generator(DataGenerator): 54 | """数据生成器 55 | """ 56 | def __iter__(self, random=False): 57 | batch_token_ids, batch_segment_ids, batch_labels = [], [], [] 58 | for is_end, (text, label) in self.sample(random): 59 | token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen) 60 | batch_token_ids.append(token_ids) 61 | batch_segment_ids.append(segment_ids) 62 | batch_labels.append([label]) 63 | if len(batch_token_ids) == self.batch_size or is_end: 64 | batch_token_ids = sequence_padding(batch_token_ids) 65 | batch_segment_ids = sequence_padding(batch_segment_ids) 66 | batch_labels = sequence_padding(batch_labels) 67 | yield [batch_token_ids, batch_segment_ids], batch_labels 68 | batch_token_ids, batch_segment_ids, batch_labels = [], [], [] 69 | 70 | 71 | # 转换数据集 72 | train_generator = data_generator(train_data, batch_size) 73 | valid_generator = data_generator(valid_data, batch_size) 74 | 75 | # 加载预训练模型 76 | bert = build_transformer_model( 77 | config_path=config_path, 78 | checkpoint_path=checkpoint_path, 79 | return_keras_model=False, 80 | ) 81 | 82 | output = Lambda(lambda x: x[:, 0])(bert.model.output) 83 | output = Dense( 84 | units=num_classes, 85 | activation='softmax', 86 | kernel_initializer=bert.initializer 87 | )(output) 88 | 89 | model = keras.models.Model(bert.model.input, output) 90 | model.summary() 91 | 92 | model.compile( 93 | loss='sparse_categorical_crossentropy', 94 | optimizer=Adam(2e-5), 95 | metrics=['sparse_categorical_accuracy'], 96 | ) 97 | 98 | 99 | def adversarial_training(model, embedding_name, epsilon=1): 100 | """给模型添加对抗训练 101 | 其中model是需要添加对抗训练的keras模型,embedding_name 102 | 则是model里边Embedding层的名字。要在模型compile之后使用。 103 | """ 104 | if model.train_function is None: # 如果还没有训练函数 105 | model._make_train_function() # 手动make 106 | old_train_function = model.train_function # 备份旧的训练函数 107 | 108 | # 查找Embedding层 109 | for output in model.outputs: 110 | embedding_layer = search_layer(output, embedding_name) 111 | if embedding_layer is not None: 112 | break 113 | if embedding_layer is 
None: 114 | raise Exception('Embedding layer not found') 115 | 116 | # 求Embedding梯度 117 | embeddings = embedding_layer.embeddings # Embedding矩阵 118 | gradients = K.gradients(model.total_loss, [embeddings]) # Embedding梯度 119 | gradients = K.zeros_like(embeddings) + gradients[0] # 转为dense tensor 120 | 121 | # 封装为函数 122 | inputs = ( 123 | model._feed_inputs + model._feed_targets + model._feed_sample_weights 124 | ) # 所有输入层 125 | embedding_gradients = K.function( 126 | inputs=inputs, 127 | outputs=[gradients], 128 | name='embedding_gradients', 129 | ) # 封装为函数 130 | 131 | def train_function(inputs): # 重新定义训练函数 132 | grads = embedding_gradients(inputs)[0] # Embedding梯度 133 | delta = epsilon * grads / (np.sqrt((grads**2).sum()) + 1e-8) # 计算扰动 134 | K.set_value(embeddings, K.eval(embeddings) + delta) # 注入扰动 135 | outputs = old_train_function(inputs) # 梯度下降 136 | K.set_value(embeddings, K.eval(embeddings) - delta) # 删除扰动 137 | return outputs 138 | 139 | model.train_function = train_function # 覆盖原训练函数 140 | 141 | 142 | # 写好函数后,启用对抗训练只需要一行代码 143 | adversarial_training(model, 'Embedding-Token', 0.5) 144 | 145 | 146 | def evaluate(data): 147 | total, right = 0., 0. 148 | for x_true, y_true in data: 149 | y_pred = model.predict(x_true).argmax(axis=1) 150 | y_true = y_true[:, 0] 151 | total += len(y_true) 152 | right += (y_true == y_pred).sum() 153 | return right / total 154 | 155 | 156 | class Evaluator(keras.callbacks.Callback): 157 | """评估与保存 158 | """ 159 | def __init__(self): 160 | self.best_val_acc = 0. 161 | 162 | def on_epoch_end(self, epoch, logs=None): 163 | val_acc = evaluate(valid_generator) 164 | if val_acc > self.best_val_acc: 165 | self.best_val_acc = val_acc 166 | model.save_weights('best_model.weights') 167 | print( 168 | u'val_acc: %.5f, best_val_acc: %.5f\n' % 169 | (val_acc, self.best_val_acc) 170 | ) 171 | 172 | 173 | def predict_to_file(in_file, out_file): 174 | """输出预测结果到文件 175 | 结果文件可以提交到 https://www.cluebenchmarks.com 评测。 176 | """ 177 | fw = open(out_file, 'w') 178 | with open(in_file) as fr: 179 | for l in tqdm(fr): 180 | l = json.loads(l) 181 | text = l['sentence'] 182 | token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen) 183 | label = model.predict([[token_ids], [segment_ids]])[0].argmax() 184 | l = json.dumps({'id': str(l['id']), 'label': str(label)}) 185 | fw.write(l + '\n') 186 | fw.close() 187 | 188 | 189 | if __name__ == '__main__': 190 | 191 | evaluator = Evaluator() 192 | 193 | model.fit( 194 | train_generator.forfit(), 195 | steps_per_epoch=len(train_generator), 196 | epochs=50, 197 | callbacks=[evaluator] 198 | ) 199 | 200 | else: 201 | 202 | model.load_weights('best_model.weights') 203 | # predict_to_file('/root/CLUE-master/baselines/CLUEdataset/iflytek/test.json', 'iflytek_predict.json') 204 | -------------------------------------------------------------------------------- /examples/task_sequence_labeling_cws_crf.py: -------------------------------------------------------------------------------- 1 | #! 
-*- coding: utf-8 -*- 2 | # 用CRF做中文分词(CWS, Chinese Word Segmentation) 3 | # 数据集 http://sighan.cs.uchicago.edu/bakeoff2005/ 4 | # 最后测试集的F1约为96.1% 5 | 6 | import re, os, json 7 | import numpy as np 8 | from bert4keras.backend import keras, K 9 | from bert4keras.models import build_transformer_model 10 | from bert4keras.tokenizers import Tokenizer 11 | from bert4keras.optimizers import Adam 12 | from bert4keras.snippets import sequence_padding, DataGenerator 13 | from bert4keras.snippets import open, ViterbiDecoder, to_array 14 | from bert4keras.layers import ConditionalRandomField 15 | from keras.layers import Dense 16 | from keras.models import Model 17 | from tqdm import tqdm 18 | 19 | maxlen = 256 20 | epochs = 10 21 | num_labels = 4 22 | batch_size = 32 23 | bert_layers = 12 24 | learing_rate = 1e-5 # bert_layers越小,学习率应该要越大 25 | crf_lr_multiplier = 1 # 必要时扩大CRF层的学习率 26 | 27 | # bert配置 28 | config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json' 29 | checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt' 30 | dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt' 31 | 32 | 33 | def load_data(filename): 34 | """加载数据 35 | 单条格式:[词1, 词2, 词3, ...] 36 | """ 37 | D = [] 38 | with open(filename, encoding='utf-8') as f: 39 | for l in f: 40 | D.append(re.split(' +', l.strip())) 41 | return D 42 | 43 | 44 | # 标注数据 45 | data = load_data('/root/icwb2-data/training/pku_training.utf8') 46 | 47 | # 保存一个随机序(供划分valid用) 48 | if not os.path.exists('../random_order.json'): 49 | random_order = list(range(len(data))) 50 | np.random.shuffle(random_order) 51 | json.dump(random_order, open('../random_order.json', 'w'), indent=4) 52 | else: 53 | random_order = json.load(open('../random_order.json')) 54 | 55 | # 划分valid 56 | train_data = [data[j] for i, j in enumerate(random_order) if i % 10 != 0] 57 | valid_data = [data[j] for i, j in enumerate(random_order) if i % 10 == 0] 58 | 59 | # 建立分词器 60 | tokenizer = Tokenizer(dict_path, do_lower_case=True) 61 | 62 | 63 | class data_generator(DataGenerator): 64 | """数据生成器 65 | """ 66 | def __iter__(self, random=False): 67 | """标签含义 68 | 0: 单字词; 1: 多字词首字; 2: 多字词中间; 3: 多字词末字 69 | """ 70 | batch_token_ids, batch_segment_ids, batch_labels = [], [], [] 71 | for is_end, item in self.sample(random): 72 | token_ids, labels = [tokenizer._token_start_id], [0] 73 | for w in item: 74 | w_token_ids = tokenizer.encode(w)[0][1:-1] 75 | if len(token_ids) + len(w_token_ids) < maxlen: 76 | token_ids += w_token_ids 77 | if len(w_token_ids) == 1: 78 | labels += [0] 79 | else: 80 | labels += [1] + [2] * (len(w_token_ids) - 2) + [3] 81 | else: 82 | break 83 | token_ids += [tokenizer._token_end_id] 84 | labels += [0] 85 | segment_ids = [0] * len(token_ids) 86 | batch_token_ids.append(token_ids) 87 | batch_segment_ids.append(segment_ids) 88 | batch_labels.append(labels) 89 | if len(batch_token_ids) == self.batch_size or is_end: 90 | batch_token_ids = sequence_padding(batch_token_ids) 91 | batch_segment_ids = sequence_padding(batch_segment_ids) 92 | batch_labels = sequence_padding(batch_labels) 93 | yield [batch_token_ids, batch_segment_ids], batch_labels 94 | batch_token_ids, batch_segment_ids, batch_labels = [], [], [] 95 | 96 | 97 | """ 98 | 后面的代码使用的是bert类型的模型,如果你用的是albert,那么前几行请改为: 99 | 100 | model = build_transformer_model( 101 | config_path, 102 | checkpoint_path, 103 | model='albert', 104 | ) 105 | 106 | output_layer = 'Transformer-FeedForward-Norm' 107 | output = model.get_layer(output_layer).get_output_at(bert_layers - 1) 108 | """ 109 | 110 | model 
= build_transformer_model( 111 | config_path, 112 | checkpoint_path, 113 | ) 114 | 115 | output_layer = 'Transformer-%s-FeedForward-Norm' % (bert_layers - 1) 116 | output = model.get_layer(output_layer).output 117 | output = Dense(num_labels)(output) 118 | CRF = ConditionalRandomField(lr_multiplier=crf_lr_multiplier) 119 | output = CRF(output) 120 | 121 | model = Model(model.input, output) 122 | model.summary() 123 | 124 | model.compile( 125 | loss=CRF.sparse_loss, 126 | optimizer=Adam(learing_rate), 127 | metrics=[CRF.sparse_accuracy] 128 | ) 129 | 130 | 131 | class WordSegmenter(ViterbiDecoder): 132 | """基本分词器 133 | """ 134 | def tokenize(self, text): 135 | tokens = tokenizer.tokenize(text) 136 | while len(tokens) > 512: 137 | tokens.pop(-2) 138 | mapping = tokenizer.rematch(text, tokens) 139 | token_ids = tokenizer.tokens_to_ids(tokens) 140 | segment_ids = [0] * len(token_ids) 141 | token_ids, segment_ids = to_array([token_ids], [segment_ids]) 142 | nodes = model.predict([token_ids, segment_ids])[0] 143 | labels = self.decode(nodes) 144 | words = [] 145 | for i, label in enumerate(labels[1:-1]): 146 | if label < 2 or len(words) == 0: 147 | words.append([i + 1]) 148 | else: 149 | words[-1].append(i + 1) 150 | return [text[mapping[w[0]][0]:mapping[w[-1]][-1] + 1] for w in words] 151 | 152 | 153 | segmenter = WordSegmenter(trans=K.eval(CRF.trans), starts=[0], ends=[0]) 154 | 155 | 156 | def simple_evaluate(data): 157 | """简单的评测 158 | 该评测指标不等价于官方的评测指标,但基本呈正相关关系, 159 | 可以用来快速筛选模型。 160 | """ 161 | total, right = 0., 0. 162 | for w_true in tqdm(data): 163 | w_pred = segmenter.tokenize(''.join(w_true)) 164 | w_pred = set(w_pred) 165 | w_true = set(w_true) 166 | total += len(w_true) 167 | right += len(w_true & w_pred) 168 | return right / total 169 | 170 | 171 | def predict_to_file(in_file, out_file): 172 | """预测结果到文件,便于用官方脚本评测 173 | 使用示例: 174 | predict_to_file('/root/icwb2-data/testing/pku_test.utf8', 'myresult.txt') 175 | 官方评测代码示例: 176 | data_dir="/root/icwb2-data" 177 | $data_dir/scripts/score $data_dir/gold/pku_training_words.utf8 $data_dir/gold/pku_test_gold.utf8 myresult.txt > myscore.txt 178 | (执行完毕后查看myscore.txt的内容末尾) 179 | """ 180 | fw = open(out_file, 'w', encoding='utf-8') 181 | with open(in_file, encoding='utf-8') as fr: 182 | for l in tqdm(fr): 183 | l = l.strip() 184 | if l: 185 | l = ' '.join(segmenter.tokenize(l)) 186 | fw.write(l + '\n') 187 | fw.close() 188 | 189 | 190 | class Evaluator(keras.callbacks.Callback): 191 | """评估与保存 192 | """ 193 | def __init__(self): 194 | self.best_val_acc = 0 195 | 196 | def on_epoch_end(self, epoch, logs=None): 197 | trans = K.eval(CRF.trans) 198 | segmenter.trans = trans 199 | print(segmenter.trans) 200 | acc = simple_evaluate(valid_data) 201 | # 保存最优 202 | if acc >= self.best_val_acc: 203 | self.best_val_acc = acc 204 | model.save_weights('./best_model.weights') 205 | print('acc: %.5f, best acc: %.5f' % (acc, self.best_val_acc)) 206 | 207 | 208 | if __name__ == '__main__': 209 | 210 | evaluator = Evaluator() 211 | train_generator = data_generator(train_data, batch_size) 212 | 213 | model.fit( 214 | train_generator.forfit(), 215 | steps_per_epoch=len(train_generator), 216 | epochs=epochs, 217 | callbacks=[evaluator] 218 | ) 219 | 220 | else: 221 | 222 | model.load_weights('./best_model.weights') 223 | segmenter.trans = K.eval(CRF.trans) 224 | -------------------------------------------------------------------------------- /examples/task_sentiment_virtual_adversarial_training.py: 
-------------------------------------------------------------------------------- 1 | #! -*- coding:utf-8 -*- 2 | # 通过虚拟对抗训练进行半监督学习 3 | # use_vat=True比use_vat=False约有1%的提升 4 | # 数据集:情感分析数据集 5 | # 博客:https://kexue.fm/archives/7466 6 | # 适用于Keras 2.3.1 7 | 8 | import json 9 | import numpy as np 10 | from bert4keras.backend import keras, search_layer, K 11 | from bert4keras.tokenizers import Tokenizer 12 | from bert4keras.models import build_transformer_model 13 | from bert4keras.optimizers import Adam 14 | from bert4keras.snippets import sequence_padding, DataGenerator 15 | from bert4keras.snippets import open 16 | from keras.layers import Lambda, Dense 17 | from keras.utils import to_categorical 18 | from tqdm import tqdm 19 | 20 | # 配置信息 21 | num_classes = 2 22 | maxlen = 128 23 | batch_size = 32 24 | train_frac = 0.01 # 标注数据的比例 25 | use_vat = True # 可以比较True/False的效果 26 | 27 | # BERT base 28 | config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json' 29 | checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt' 30 | dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt' 31 | 32 | 33 | def load_data(filename): 34 | """加载数据 35 | 单条格式:(文本, 标签id) 36 | """ 37 | D = [] 38 | with open(filename, encoding='utf-8') as f: 39 | for l in f: 40 | text, label = l.strip().split('\t') 41 | D.append((text, int(label))) 42 | return D 43 | 44 | 45 | # 加载数据集 46 | train_data = load_data('datasets/sentiment/sentiment.train.data') 47 | valid_data = load_data('datasets/sentiment/sentiment.valid.data') 48 | test_data = load_data('datasets/sentiment/sentiment.test.data') 49 | 50 | # 模拟标注和非标注数据 51 | num_labeled = int(len(train_data) * train_frac) 52 | unlabeled_data = [(t, 0) for t, l in train_data[num_labeled:]] 53 | train_data = train_data[:num_labeled] 54 | 55 | # 建立分词器 56 | tokenizer = Tokenizer(dict_path, do_lower_case=True) 57 | 58 | 59 | class data_generator(DataGenerator): 60 | """数据生成器 61 | """ 62 | def __iter__(self, random=False): 63 | batch_token_ids, batch_segment_ids, batch_labels = [], [], [] 64 | for is_end, (text, label) in self.sample(random): 65 | token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen) 66 | batch_token_ids.append(token_ids) 67 | batch_segment_ids.append(segment_ids) 68 | batch_labels.append(label) 69 | if len(batch_token_ids) == self.batch_size or is_end: 70 | batch_token_ids = sequence_padding(batch_token_ids) 71 | batch_segment_ids = sequence_padding(batch_segment_ids) 72 | batch_labels = to_categorical(batch_labels, num_classes) 73 | yield [batch_token_ids, batch_segment_ids], batch_labels 74 | batch_token_ids, batch_segment_ids, batch_labels = [], [], [] 75 | 76 | 77 | # 转换数据集 78 | train_generator = data_generator(train_data, batch_size) 79 | valid_generator = data_generator(valid_data, batch_size) 80 | test_generator = data_generator(test_data, batch_size) 81 | 82 | # 加载预训练模型 83 | bert = build_transformer_model( 84 | config_path=config_path, 85 | checkpoint_path=checkpoint_path, 86 | return_keras_model=False, 87 | ) 88 | 89 | output = Lambda(lambda x: x[:, 0])(bert.model.output) 90 | output = Dense( 91 | units=num_classes, 92 | activation='softmax', 93 | kernel_initializer=bert.initializer 94 | )(output) 95 | 96 | # 用于正常训练的模型 97 | model = keras.models.Model(bert.model.input, output) 98 | model.summary() 99 | 100 | model.compile( 101 | loss='kld', 102 | optimizer=Adam(2e-5), 103 | metrics=['categorical_accuracy'], 104 | ) 105 | 106 | # 用于虚拟对抗训练的模型 107 | model_vat = keras.models.Model(bert.model.input, output) 108 | model_vat.compile( 
109 | loss='kld', 110 | optimizer=Adam(1e-5), 111 | metrics=['categorical_accuracy'], 112 | ) 113 | 114 | 115 | def virtual_adversarial_training( 116 | model, embedding_name, epsilon=1, xi=10, iters=1 117 | ): 118 | """给模型添加虚拟对抗训练 119 | 其中model是需要添加对抗训练的keras模型,embedding_name 120 | 则是model里边Embedding层的名字。要在模型compile之后使用。 121 | """ 122 | if model.train_function is None: # 如果还没有训练函数 123 | model._make_train_function() # 手动make 124 | old_train_function = model.train_function # 备份旧的训练函数 125 | 126 | # 查找Embedding层 127 | for output in model.outputs: 128 | embedding_layer = search_layer(output, embedding_name) 129 | if embedding_layer is not None: 130 | break 131 | if embedding_layer is None: 132 | raise Exception('Embedding layer not found') 133 | 134 | # 求Embedding梯度 135 | embeddings = embedding_layer.embeddings # Embedding矩阵 136 | gradients = K.gradients(model.total_loss, [embeddings]) # Embedding梯度 137 | gradients = K.zeros_like(embeddings) + gradients[0] # 转为dense tensor 138 | 139 | # 封装为函数 140 | inputs = ( 141 | model._feed_inputs + model._feed_targets + model._feed_sample_weights 142 | ) # 所有输入层 143 | model_outputs = K.function( 144 | inputs=inputs, 145 | outputs=model.outputs, 146 | name='model_outputs', 147 | ) # 模型输出函数 148 | embedding_gradients = K.function( 149 | inputs=inputs, 150 | outputs=[gradients], 151 | name='embedding_gradients', 152 | ) # 模型梯度函数 153 | 154 | def l2_normalize(x): 155 | return x / (np.sqrt((x**2).sum()) + 1e-8) 156 | 157 | def train_function(inputs): # 重新定义训练函数 158 | outputs = model_outputs(inputs) 159 | inputs = inputs[:2] + outputs + inputs[3:] 160 | delta1, delta2 = 0.0, np.random.randn(*K.int_shape(embeddings)) 161 | for _ in range(iters): # 迭代求扰动 162 | delta2 = xi * l2_normalize(delta2) 163 | K.set_value(embeddings, K.eval(embeddings) - delta1 + delta2) 164 | delta1 = delta2 165 | delta2 = embedding_gradients(inputs)[0] # Embedding梯度 166 | delta2 = epsilon * l2_normalize(delta2) 167 | K.set_value(embeddings, K.eval(embeddings) - delta1 + delta2) 168 | outputs = old_train_function(inputs) # 梯度下降 169 | K.set_value(embeddings, K.eval(embeddings) - delta2) # 删除扰动 170 | return outputs 171 | 172 | model.train_function = train_function # 覆盖原训练函数 173 | 174 | 175 | # 写好函数后,启用对抗训练只需要一行代码 176 | virtual_adversarial_training(model_vat, 'Embedding-Token') 177 | 178 | 179 | def evaluate(data): 180 | total, right = 0., 0. 181 | for x_true, y_true in data: 182 | y_pred = model.predict(x_true).argmax(axis=1) 183 | y_true = y_true.argmax(axis=1) 184 | total += len(y_true) 185 | right += (y_true == y_pred).sum() 186 | return right / total 187 | 188 | 189 | class Evaluator(keras.callbacks.Callback): 190 | """评估与保存 191 | """ 192 | def __init__(self): 193 | self.best_val_acc = 0. 
194 | self.data = data_generator(unlabeled_data, batch_size).forfit() 195 | 196 | def on_epoch_end(self, epoch, logs=None): 197 | val_acc = evaluate(valid_generator) 198 | if val_acc > self.best_val_acc: 199 | self.best_val_acc = val_acc 200 | model.save_weights('best_model.weights') 201 | test_acc = evaluate(test_generator) 202 | print( 203 | u'val_acc: %.5f, best_val_acc: %.5f, test_acc: %.5f\n' % 204 | (val_acc, self.best_val_acc, test_acc) 205 | ) 206 | 207 | def on_batch_end(self, batch, logs=None): 208 | if use_vat: 209 | dx, dy = next(self.data) 210 | model_vat.train_on_batch(dx, dy) 211 | 212 | 213 | if __name__ == '__main__': 214 | 215 | evaluator = Evaluator() 216 | 217 | model.fit( 218 | train_generator.forfit(), 219 | steps_per_epoch=30, 220 | epochs=100, 221 | callbacks=[evaluator] 222 | ) 223 | 224 | else: 225 | 226 | model.load_weights('best_model.weights') 227 | -------------------------------------------------------------------------------- /examples/task_iflytek_bert_of_theseus.py: -------------------------------------------------------------------------------- 1 | #! -*- coding:utf-8 -*- 2 | # 文本分类例子下的模型压缩 3 | # 方法为BERT-of-Theseus 4 | # 论文:https://arxiv.org/abs/2002.02925 5 | # 博客:https://kexue.fm/archives/7575 6 | 7 | import json 8 | import numpy as np 9 | from bert4keras.backend import keras, K 10 | from bert4keras.tokenizers import Tokenizer 11 | from bert4keras.models import build_transformer_model 12 | from bert4keras.optimizers import Adam, extend_with_piecewise_linear_lr 13 | from bert4keras.snippets import sequence_padding, DataGenerator 14 | from bert4keras.snippets import open 15 | from keras.layers import Input, Lambda, Dense, Layer 16 | from keras.models import Model 17 | 18 | num_classes = 119 19 | maxlen = 128 20 | batch_size = 32 21 | 22 | # BERT base 23 | config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json' 24 | checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt' 25 | dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt' 26 | 27 | 28 | def load_data(filename): 29 | """加载数据 30 | 单条格式:(文本, 标签id) 31 | """ 32 | D = [] 33 | with open(filename) as f: 34 | for i, l in enumerate(f): 35 | l = json.loads(l) 36 | text, label = l['sentence'], l['label'] 37 | D.append((text, int(label))) 38 | return D 39 | 40 | 41 | # 加载数据集 42 | train_data = load_data( 43 | '/root/CLUE-master/baselines/CLUEdataset/iflytek/train.json' 44 | ) 45 | valid_data = load_data( 46 | '/root/CLUE-master/baselines/CLUEdataset/iflytek/dev.json' 47 | ) 48 | 49 | # 建立分词器 50 | tokenizer = Tokenizer(dict_path, do_lower_case=True) 51 | 52 | 53 | class data_generator(DataGenerator): 54 | """数据生成器 55 | """ 56 | def __iter__(self, random=False): 57 | batch_token_ids, batch_segment_ids, batch_labels = [], [], [] 58 | for is_end, (text, label) in self.sample(random): 59 | token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen) 60 | batch_token_ids.append(token_ids) 61 | batch_segment_ids.append(segment_ids) 62 | batch_labels.append([label]) 63 | if len(batch_token_ids) == self.batch_size or is_end: 64 | batch_token_ids = sequence_padding(batch_token_ids) 65 | batch_segment_ids = sequence_padding(batch_segment_ids) 66 | batch_labels = sequence_padding(batch_labels) 67 | yield [batch_token_ids, batch_segment_ids], batch_labels 68 | batch_token_ids, batch_segment_ids, batch_labels = [], [], [] 69 | 70 | 71 | # 转换数据集 72 | train_generator = data_generator(train_data, batch_size) 73 | valid_generator = data_generator(valid_data, batch_size) 74 | 75 | 76 | 
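# ----------------------------------------------------------------------------
# [编辑者补充示例,非原仓库代码] A plain-numpy sketch (names are illustrative) of
# the BERT-of-Theseus replacement rule that the BinaryRandomChoice layer below
# implements with Keras ops: during training, each module's output is taken
# either from the frozen predecessor (teacher) or from the trainable successor
# (student) by a coin flip; at inference only the successor path is used.
import numpy as np

def theseus_choice(predecessor_out, successor_out, p=0.5, training=True):
    """Pick one of the two module outputs; successor only at inference."""
    if not training:
        return successor_out
    return predecessor_out if np.random.rand() < p else successor_out
# ----------------------------------------------------------------------------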
class BinaryRandomChoice(Layer): 77 | """随机二选一 78 | """ 79 | def __init__(self, **kwargs): 80 | super(BinaryRandomChoice, self).__init__(**kwargs) 81 | self.supports_masking = True 82 | 83 | def compute_mask(self, inputs, mask=None): 84 | if mask is not None: 85 | return mask[1] 86 | 87 | def call(self, inputs): 88 | source, target = inputs 89 | mask = K.random_binomial(shape=[1], p=0.5) 90 | output = mask * source + (1 - mask) * target 91 | return K.in_train_phase(output, target) 92 | 93 | def compute_output_shape(self, input_shape): 94 | return input_shape[1] 95 | 96 | 97 | def bert_of_theseus(predecessor, successor, classfier): 98 | """bert of theseus 99 | """ 100 | inputs = predecessor.inputs 101 | # 固定住已经训练好的层 102 | for layer in predecessor.model.layers: 103 | layer.trainable = False 104 | classfier.trainable = False 105 | # Embedding层替换 106 | predecessor_outputs = predecessor.apply_embeddings(inputs) 107 | successor_outputs = successor.apply_embeddings(inputs) 108 | outputs = BinaryRandomChoice()([predecessor_outputs, successor_outputs]) 109 | # Transformer层替换 110 | layers_per_module = predecessor.num_hidden_layers // successor.num_hidden_layers 111 | for index in range(successor.num_hidden_layers): 112 | predecessor_outputs = outputs 113 | for sub_index in range(layers_per_module): 114 | predecessor_outputs = predecessor.apply_main_layers( 115 | predecessor_outputs, layers_per_module * index + sub_index 116 | ) 117 | successor_outputs = successor.apply_main_layers(outputs, index) 118 | outputs = BinaryRandomChoice()([predecessor_outputs, successor_outputs]) 119 | # 返回模型 120 | outputs = classfier(outputs) 121 | model = Model(inputs, outputs) 122 | return model 123 | 124 | 125 | def evaluate(data, model): 126 | total, right = 0., 0. 127 | for x_true, y_true in data: 128 | y_pred = model.predict(x_true).argmax(axis=1) 129 | y_true = y_true[:, 0] 130 | total += len(y_true) 131 | right += (y_true == y_pred).sum() 132 | return right / total 133 | 134 | 135 | class Evaluator(keras.callbacks.Callback): 136 | """评估与保存 137 | """ 138 | def __init__(self, savename): 139 | self.best_val_acc = 0. 
140 | self.savename = savename 141 | 142 | def on_epoch_end(self, epoch, logs=None): 143 | val_acc = evaluate(valid_generator, self.model) 144 | if val_acc > self.best_val_acc: 145 | self.best_val_acc = val_acc 146 | self.model.save_weights(self.savename) 147 | print( 148 | u'val_acc: %.5f, best_val_acc: %.5f\n' % 149 | (val_acc, self.best_val_acc) 150 | ) 151 | 152 | 153 | # 加载预训练模型(12层) 154 | predecessor = build_transformer_model( 155 | config_path=config_path, 156 | checkpoint_path=checkpoint_path, 157 | return_keras_model=False, 158 | prefix='Predecessor-' 159 | ) 160 | 161 | # 加载预训练模型(3层) 162 | successor = build_transformer_model( 163 | config_path=config_path, 164 | checkpoint_path=checkpoint_path, 165 | return_keras_model=False, 166 | num_hidden_layers=3, 167 | prefix='Successor-' 168 | ) 169 | 170 | # 判别模型 171 | x_in = Input(shape=K.int_shape(predecessor.output)[1:]) 172 | x = Lambda(lambda x: x[:, 0])(x_in) 173 | x = Dense(units=num_classes, activation='softmax')(x) 174 | classfier = Model(x_in, x) 175 | 176 | predecessor_model = Model(predecessor.inputs, classfier(predecessor.output)) 177 | predecessor_model.compile( 178 | loss='sparse_categorical_crossentropy', 179 | optimizer=Adam(2e-5), # 用足够小的学习率 180 | metrics=['sparse_categorical_accuracy'], 181 | ) 182 | predecessor_model.summary() 183 | 184 | successor_model = Model(successor.inputs, classfier(successor.output)) 185 | successor_model.compile( 186 | loss='sparse_categorical_crossentropy', 187 | optimizer=Adam(2e-5), # 用足够小的学习率 188 | metrics=['sparse_categorical_accuracy'], 189 | ) 190 | successor_model.summary() 191 | 192 | theseus_model = bert_of_theseus(predecessor, successor, classfier) 193 | theseus_model.compile( 194 | loss='sparse_categorical_crossentropy', 195 | optimizer=Adam(2e-5), # 用足够小的学习率 196 | metrics=['sparse_categorical_accuracy'], 197 | ) 198 | theseus_model.summary() 199 | 200 | if __name__ == '__main__': 201 | 202 | # 训练predecessor 203 | predecessor_evaluator = Evaluator('best_predecessor.weights') 204 | predecessor_model.fit( 205 | train_generator.forfit(), 206 | steps_per_epoch=len(train_generator), 207 | epochs=5, 208 | callbacks=[predecessor_evaluator] 209 | ) 210 | 211 | # 训练theseus 212 | theseus_evaluator = Evaluator('best_theseus.weights') 213 | theseus_model.fit( 214 | train_generator.forfit(), 215 | steps_per_epoch=len(train_generator), 216 | epochs=10, 217 | callbacks=[theseus_evaluator] 218 | ) 219 | theseus_model.load_weights('best_theseus.weights') 220 | 221 | # 训练successor 222 | successor_evaluator = Evaluator('best_successor.weights') 223 | successor_model.fit( 224 | train_generator.forfit(), 225 | steps_per_epoch=len(train_generator), 226 | epochs=5, 227 | callbacks=[successor_evaluator] 228 | ) 229 | -------------------------------------------------------------------------------- /examples/task_language_model_chinese_chess.py: -------------------------------------------------------------------------------- 1 | #! 
-*- coding: utf-8 -*- 2 | # 用 语言模型+棋谱 的方式监督训练一个下中国象棋模型 3 | # 介绍:https://kexue.fm/archives/7877 4 | # 数据:https://github.com/bojone/gpt_cchess 5 | # 模型训练可以在python2/python3进行。但是cchess模块只支持python3, 6 | # 因此如果需要交互式体验模型棋力,那么需要在python3下进行。 7 | 8 | import json 9 | import numpy as np 10 | from bert4keras.backend import keras, K 11 | from bert4keras.layers import Loss 12 | from bert4keras.models import build_transformer_model 13 | from bert4keras.tokenizers import Tokenizer, load_vocab 14 | from bert4keras.optimizers import Adam 15 | from bert4keras.snippets import sequence_padding, open 16 | from bert4keras.snippets import DataGenerator, AutoRegressiveDecoder 17 | from keras.models import Model 18 | from cchess import * 19 | 20 | # 基本信息 21 | maxlen = 512 22 | steps_per_epoch = 1000 23 | epochs = 10000 24 | batch_size = 16 25 | 26 | # bert配置 27 | config_path = '/root/kg/bert/chinese_roberta_wwm_ext_L-12_H-768_A-12/bert_config.json' 28 | checkpoint_path = '/root/kg/bert/chinese_roberta_wwm_ext_L-12_H-768_A-12/bert_model.ckpt' 29 | dict_path = '/root/kg/bert/chinese_roberta_wwm_ext_L-12_H-768_A-12/vocab.txt' 30 | 31 | 32 | def load_data(filename): 33 | """读取全局棋谱 34 | 返回:[(棋谱, 结果)],其中结果等于2为红方赢棋,1为和棋, 35 | 0为黑方赢棋,-1则为无明确标注胜负。 36 | """ 37 | D = [] 38 | with open(filename) as f: 39 | for l in f: 40 | l = json.loads(l) 41 | if not l['fen']: 42 | result = int(l['items'].get(u'棋局结果', -1)) 43 | D.append((l['iccs'], result)) 44 | return D 45 | 46 | 47 | # 加载数据 48 | data = load_data('/root/qipu.json') 49 | 50 | # 建立分词器 51 | chars = [u'[PAD]'] + list(u'0123456789abcdefghi') 52 | token_dict = dict(zip(chars, range(len(chars)))) 53 | tokenizer = Tokenizer(token_dict) 54 | tokenizer._token_unk_id = 0 55 | bert_token_dict = load_vocab(dict_path) 56 | keep_tokens = [bert_token_dict[c] for c in chars] 57 | 58 | 59 | class data_generator(DataGenerator): 60 | """数据生成器 61 | """ 62 | def __iter__(self, random=False): 63 | batch_token_ids, batch_segment_ids = [], [] 64 | for is_end, (text, label) in self.sample(random): 65 | token_ids, segment_ids = tokenizer.encode( 66 | ' '.join(text), maxlen=maxlen // self.n + 1 67 | ) 68 | batch_token_ids.append([0] + token_ids[1:-1]) 69 | batch_segment_ids.append([0] + segment_ids[1:-1]) 70 | if len(batch_token_ids) == self.batch_size or is_end: 71 | batch_token_ids = sequence_padding(batch_token_ids) 72 | batch_segment_ids = sequence_padding(batch_segment_ids) 73 | yield [batch_token_ids, batch_segment_ids], None 74 | batch_token_ids, batch_segment_ids = [], [] 75 | self.count += 1 76 | 77 | @property 78 | def n(self): 79 | if not hasattr(self, 'count'): 80 | self.count = 0 81 | if self.count < 20000: 82 | n = 8 83 | elif self.count < 40000: 84 | n = 4 85 | elif self.count < 80000: 86 | n = 2 87 | else: 88 | n = 1 89 | return n 90 | 91 | 92 | class CrossEntropy(Loss): 93 | """交叉熵作为loss,并mask掉padding部分 94 | """ 95 | def compute_loss(self, inputs, mask=None): 96 | y_true, y_pred = inputs 97 | if mask[1] is None: 98 | y_mask = 1.0 99 | else: 100 | y_mask = K.cast(mask[1], K.floatx())[:, 1:] 101 | y_true = y_true[:, 1:] # 目标token_ids 102 | y_pred = y_pred[:, :-1] # 预测序列,错开一位 103 | loss = K.sparse_categorical_crossentropy(y_true, y_pred) 104 | loss = K.sum(loss * y_mask) / K.sum(y_mask) 105 | return loss 106 | 107 | 108 | model = build_transformer_model( 109 | config_path, 110 | checkpoint_path, 111 | application='lm', 112 | keep_tokens=keep_tokens, # 只保留keep_tokens中的字,精简原字表 113 | ) 114 | 115 | output = CrossEntropy(1)([model.inputs[0], model.outputs[0]]) 116 | 117 | model = 
Model(model.inputs, output) 118 | model.compile(optimizer=Adam(1e-5)) 119 | model.summary() 120 | 121 | 122 | class ChessPlayer(object): 123 | """交互式下棋程序 124 | """ 125 | def move_to_chinese(self, move): 126 | """将单步走法转为中文描述 127 | """ 128 | if not isinstance(move, Move): 129 | move = Move(self.board, move[0], move[1]) 130 | return move.to_chinese() 131 | 132 | def move_to_iccs(self, move): 133 | """将单步走法转为iccs表示 134 | """ 135 | if not isinstance(move, Move): 136 | move = Move(self.board, move[0], move[1]) 137 | return move.to_iccs() 138 | 139 | def print_board(self): 140 | """打印当前棋盘 141 | 直观起见,红方用红色表示,黑方用绿色表示。 142 | """ 143 | for l in self.board.dump_board(): 144 | for c in u'兵炮车马相仕帅': 145 | l = l.replace(c, u'\033[1;31;40m%s\033[0m' % c) 146 | for c in u'卒砲砗碼象士将': 147 | l = l.replace(c, u'\033[1;32;40m%s\033[0m' % c) 148 | print(l) 149 | 150 | def movable_steps(self): 151 | """给出当前局面所有候选走法 152 | """ 153 | return [self.move_to_iccs(m) for m in self.board.create_moves()] 154 | 155 | def human_input(self): 156 | """人类行棋 157 | """ 158 | while True: 159 | try: 160 | iccs = input(u'请输入iccs棋着: ') 161 | print(iccs) 162 | move = self.board.move_iccs(iccs) 163 | if move is not None: 164 | return iccs, move 165 | except KeyboardInterrupt: 166 | return None 167 | except: 168 | pass 169 | 170 | def record(self, iccs): 171 | """将局面往前推进一步 172 | """ 173 | self.history += iccs 174 | self.board.next_turn() 175 | self.print_board() 176 | self.current = (self.current + 1) % 2 177 | 178 | def new_game(self, current=0): 179 | """开新局 180 | """ 181 | self.board = ChessBoard() 182 | self.board.from_fen(FULL_INIT_FEN) 183 | self.print_board() 184 | self.history = '' 185 | self.current = current 186 | if self.current == 0: # 人类先手 187 | iccs, move = self.human_input() 188 | self.record(iccs) 189 | while True: 190 | # 机器走棋 191 | moves = self.movable_steps() 192 | iccses = [' '.join(self.history + m) for m in moves] 193 | token_ids = [[0] + tokenizer.encode(ic)[0][1:-1] for ic in iccses] 194 | token_ids = np.array(token_ids) 195 | segment_ids = np.zeros_like(token_ids) 196 | preds = model.predict([token_ids, segment_ids])[:, -5:-1] 197 | preds = np.take_along_axis(preds, token_ids[:, -4:, None], axis=2) 198 | preds = np.log(preds + 1e-8)[:, :, 0].sum(axis=1) 199 | iccs = moves[preds.argmax()] 200 | move = self.board.move_iccs(iccs) 201 | self.record(iccs) 202 | if self.board.is_win(): 203 | print(u'机器赢了') 204 | break 205 | # 人类走棋 206 | iccs, move = self.human_input() 207 | self.record(iccs) 208 | if self.board.is_win(): 209 | print(u'人类赢了') 210 | break 211 | 212 | 213 | chessplayer = ChessPlayer() 214 | """ 215 | chessplayer.new_game(0) # 启动新棋局,0为人类先手,1为机器先手 216 | """ 217 | 218 | 219 | class Evaluator(keras.callbacks.Callback): 220 | """评估与保存 221 | """ 222 | def on_epoch_end(self, epoch, logs=None): 223 | # 保存模型 224 | model.save_weights('./best_model.weights') 225 | 226 | 227 | if __name__ == '__main__': 228 | 229 | evaluator = Evaluator() 230 | train_generator = data_generator(data, batch_size) 231 | 232 | model.fit( 233 | train_generator.forfit(), 234 | steps_per_epoch=steps_per_epoch, 235 | epochs=epochs, 236 | callbacks=[evaluator] 237 | ) 238 | 239 | else: 240 | 241 | model.load_weights('./best_model.weights') 242 | -------------------------------------------------------------------------------- /examples/task_sequence_labeling_ner_crf.py: -------------------------------------------------------------------------------- 1 | #! 
-*- coding: utf-8 -*- 2 | # 用CRF做中文命名实体识别 3 | # 数据集 http://s3.bmio.net/kashgari/china-people-daily-ner-corpus.tar.gz 4 | # 实测验证集的F1可以到96.18%,测试集的F1可以到95.35% 5 | 6 | import numpy as np 7 | from bert4keras.backend import keras, K 8 | from bert4keras.models import build_transformer_model 9 | from bert4keras.tokenizers import Tokenizer 10 | from bert4keras.optimizers import Adam 11 | from bert4keras.snippets import sequence_padding, DataGenerator 12 | from bert4keras.snippets import open, ViterbiDecoder, to_array 13 | from bert4keras.layers import ConditionalRandomField 14 | from keras.layers import Dense 15 | from keras.models import Model 16 | from tqdm import tqdm 17 | 18 | maxlen = 256 19 | epochs = 10 20 | batch_size = 32 21 | bert_layers = 12 22 | learing_rate = 1e-5 # bert_layers越小,学习率应该要越大 23 | crf_lr_multiplier = 1000 # 必要时扩大CRF层的学习率 24 | 25 | # bert配置 26 | config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json' 27 | checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt' 28 | dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt' 29 | 30 | 31 | def load_data(filename): 32 | """加载数据 33 | 单条格式:[(片段1, 标签1), (片段2, 标签2), (片段3, 标签3), ...] 34 | """ 35 | D = [] 36 | with open(filename, encoding='utf-8') as f: 37 | f = f.read() 38 | for l in f.split('\n\n'): 39 | if not l: 40 | continue 41 | d, last_flag = [], '' 42 | for c in l.split('\n'): 43 | char, this_flag = c.split(' ') 44 | if this_flag == 'O' and last_flag == 'O': 45 | d[-1][0] += char 46 | elif this_flag == 'O' and last_flag != 'O': 47 | d.append([char, 'O']) 48 | elif this_flag[:1] == 'B': 49 | d.append([char, this_flag[2:]]) 50 | else: 51 | d[-1][0] += char 52 | last_flag = this_flag 53 | D.append(d) 54 | return D 55 | 56 | 57 | # 标注数据 58 | train_data = load_data('/root/ner/china-people-daily-ner-corpus/example.train') 59 | valid_data = load_data('/root/ner/china-people-daily-ner-corpus/example.dev') 60 | test_data = load_data('/root/ner/china-people-daily-ner-corpus/example.test') 61 | 62 | # 建立分词器 63 | tokenizer = Tokenizer(dict_path, do_lower_case=True) 64 | 65 | # 类别映射 66 | labels = ['PER', 'LOC', 'ORG'] 67 | id2label = dict(enumerate(labels)) 68 | label2id = {j: i for i, j in id2label.items()} 69 | num_labels = len(labels) * 2 + 1 70 | 71 | 72 | class data_generator(DataGenerator): 73 | """数据生成器 74 | """ 75 | def __iter__(self, random=False): 76 | batch_token_ids, batch_segment_ids, batch_labels = [], [], [] 77 | for is_end, item in self.sample(random): 78 | token_ids, labels = [tokenizer._token_start_id], [0] 79 | for w, l in item: 80 | w_token_ids = tokenizer.encode(w)[0][1:-1] 81 | if len(token_ids) + len(w_token_ids) < maxlen: 82 | token_ids += w_token_ids 83 | if l == 'O': 84 | labels += [0] * len(w_token_ids) 85 | else: 86 | B = label2id[l] * 2 + 1 87 | I = label2id[l] * 2 + 2 88 | labels += ([B] + [I] * (len(w_token_ids) - 1)) 89 | else: 90 | break 91 | token_ids += [tokenizer._token_end_id] 92 | labels += [0] 93 | segment_ids = [0] * len(token_ids) 94 | batch_token_ids.append(token_ids) 95 | batch_segment_ids.append(segment_ids) 96 | batch_labels.append(labels) 97 | if len(batch_token_ids) == self.batch_size or is_end: 98 | batch_token_ids = sequence_padding(batch_token_ids) 99 | batch_segment_ids = sequence_padding(batch_segment_ids) 100 | batch_labels = sequence_padding(batch_labels) 101 | yield [batch_token_ids, batch_segment_ids], batch_labels 102 | batch_token_ids, batch_segment_ids, batch_labels = [], [], [] 103 | 104 | 105 | """ 106 | 
后面的代码使用的是bert类型的模型,如果你用的是albert,那么前几行请改为: 107 | 108 | model = build_transformer_model( 109 | config_path, 110 | checkpoint_path, 111 | model='albert', 112 | ) 113 | 114 | output_layer = 'Transformer-FeedForward-Norm' 115 | output = model.get_layer(output_layer).get_output_at(bert_layers - 1) 116 | """ 117 | 118 | model = build_transformer_model( 119 | config_path, 120 | checkpoint_path, 121 | ) 122 | 123 | output_layer = 'Transformer-%s-FeedForward-Norm' % (bert_layers - 1) 124 | output = model.get_layer(output_layer).output 125 | output = Dense(num_labels)(output) 126 | CRF = ConditionalRandomField(lr_multiplier=crf_lr_multiplier) 127 | output = CRF(output) 128 | 129 | model = Model(model.input, output) 130 | model.summary() 131 | 132 | model.compile( 133 | loss=CRF.sparse_loss, 134 | optimizer=Adam(learing_rate), 135 | metrics=[CRF.sparse_accuracy] 136 | ) 137 | 138 | 139 | class NamedEntityRecognizer(ViterbiDecoder): 140 | """命名实体识别器 141 | """ 142 | def recognize(self, text): 143 | tokens = tokenizer.tokenize(text) 144 | while len(tokens) > 512: 145 | tokens.pop(-2) 146 | mapping = tokenizer.rematch(text, tokens) 147 | token_ids = tokenizer.tokens_to_ids(tokens) 148 | segment_ids = [0] * len(token_ids) 149 | token_ids, segment_ids = to_array([token_ids], [segment_ids]) 150 | nodes = model.predict([token_ids, segment_ids])[0] 151 | labels = self.decode(nodes) 152 | entities, starting = [], False 153 | for i, label in enumerate(labels): 154 | if label > 0: 155 | if label % 2 == 1: 156 | starting = True 157 | entities.append([[i], id2label[(label - 1) // 2]]) 158 | elif starting: 159 | entities[-1][0].append(i) 160 | else: 161 | starting = False 162 | else: 163 | starting = False 164 | 165 | return [(text[mapping[w[0]][0]:mapping[w[-1]][-1] + 1], l) 166 | for w, l in entities] 167 | 168 | 169 | NER = NamedEntityRecognizer(trans=K.eval(CRF.trans), starts=[0], ends=[0]) 170 | 171 | 172 | def evaluate(data): 173 | """评测函数 174 | """ 175 | X, Y, Z = 1e-10, 1e-10, 1e-10 176 | for d in tqdm(data): 177 | text = ''.join([i[0] for i in d]) 178 | R = set(NER.recognize(text)) 179 | T = set([tuple(i) for i in d if i[1] != 'O']) 180 | X += len(R & T) 181 | Y += len(R) 182 | Z += len(T) 183 | f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z 184 | return f1, precision, recall 185 | 186 | 187 | class Evaluator(keras.callbacks.Callback): 188 | """评估与保存 189 | """ 190 | def __init__(self): 191 | self.best_val_f1 = 0 192 | 193 | def on_epoch_end(self, epoch, logs=None): 194 | trans = K.eval(CRF.trans) 195 | NER.trans = trans 196 | print(NER.trans) 197 | f1, precision, recall = evaluate(valid_data) 198 | # 保存最优 199 | if f1 >= self.best_val_f1: 200 | self.best_val_f1 = f1 201 | model.save_weights('./best_model.weights') 202 | print( 203 | 'valid: f1: %.5f, precision: %.5f, recall: %.5f, best f1: %.5f\n' % 204 | (f1, precision, recall, self.best_val_f1) 205 | ) 206 | f1, precision, recall = evaluate(test_data) 207 | print( 208 | 'test: f1: %.5f, precision: %.5f, recall: %.5f\n' % 209 | (f1, precision, recall) 210 | ) 211 | 212 | 213 | if __name__ == '__main__': 214 | 215 | evaluator = Evaluator() 216 | train_generator = data_generator(train_data, batch_size) 217 | 218 | model.fit( 219 | train_generator.forfit(), 220 | steps_per_epoch=len(train_generator), 221 | epochs=epochs, 222 | callbacks=[evaluator] 223 | ) 224 | 225 | else: 226 | 227 | model.load_weights('./best_model.weights') 228 | NER.trans = K.eval(CRF.trans) 229 | -------------------------------------------------------------------------------- 
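A quick note on the label encoding in `task_sequence_labeling_ner_crf.py` above: `O` maps to id 0, and each entity type `l` gets a B id (`label2id[l] * 2 + 1`) and an I id (`label2id[l] * 2 + 2`), which is why `num_labels = len(labels) * 2 + 1`. The following self-contained sketch (the toy tag sequence is an assumption for illustration, not taken from the dataset) shows the resulting mapping:

```python
# Sketch of the BIO-tag -> integer-id packing used by the CRF head above.
labels = ['PER', 'LOC', 'ORG']
label2id = {label: i for i, label in enumerate(labels)}

def bio_to_id(tag):
    """Map a BIO tag such as 'B-LOC' to the id scheme used in the example."""
    if tag == 'O':
        return 0
    prefix, entity = tag.split('-')
    base = label2id[entity] * 2 + 1   # id of the B- tag
    return base if prefix == 'B' else base + 1  # the I- id is always the B- id + 1

toy_tags = ['B-PER', 'I-PER', 'O', 'O', 'B-LOC', 'I-LOC']
print([bio_to_id(t) for t in toy_tags])  # -> [1, 2, 0, 0, 3, 4]
```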
/examples/task_reading_comprehension_by_mlm.py: -------------------------------------------------------------------------------- 1 | #! -*- coding: utf-8 -*- 2 | # 用MLM的方式做阅读理解任务 3 | # 数据集和评测同 https://github.com/bojone/dgcnn_for_reading_comprehension 4 | # 10个epoch后在valid上能达到约0.77的分数 5 | # (Accuracy=0.7282149325820084 F1=0.8207266829447049 Final=0.7744708077633566) 6 | 7 | import json, os, re 8 | import numpy as np 9 | from bert4keras.backend import keras, K 10 | from bert4keras.models import build_transformer_model 11 | from bert4keras.tokenizers import Tokenizer, load_vocab 12 | from bert4keras.optimizers import Adam 13 | from bert4keras.snippets import sequence_padding, DataGenerator 14 | from bert4keras.snippets import open 15 | from keras.layers import Lambda 16 | from keras.models import Model 17 | from tqdm import tqdm 18 | 19 | max_p_len = 256 20 | max_q_len = 64 21 | max_a_len = 32 22 | batch_size = 32 23 | epochs = 10 24 | 25 | # bert配置 26 | config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json' 27 | checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt' 28 | dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt' 29 | 30 | # 标注数据 31 | webqa_data = json.load(open('/root/qa_datasets/WebQA.json')) 32 | sogou_data = json.load(open('/root/qa_datasets/SogouQA.json')) 33 | 34 | # 保存一个随机序(供划分valid用) 35 | if not os.path.exists('../random_order.json'): 36 | random_order = list(range(len(sogou_data))) 37 | np.random.shuffle(random_order) 38 | json.dump(random_order, open('../random_order.json', 'w'), indent=4) 39 | else: 40 | random_order = json.load(open('../random_order.json')) 41 | 42 | # 划分valid 43 | train_data = [sogou_data[j] for i, j in enumerate(random_order) if i % 3 != 0] 44 | valid_data = [sogou_data[j] for i, j in enumerate(random_order) if i % 3 == 0] 45 | train_data.extend(train_data) 46 | train_data.extend(webqa_data) # 将SogouQA和WebQA按2:1的比例混合 47 | 48 | # 加载并精简词表,建立分词器 49 | token_dict, keep_tokens = load_vocab( 50 | dict_path=dict_path, 51 | simplified=True, 52 | startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'], 53 | ) 54 | tokenizer = Tokenizer(token_dict, do_lower_case=True) 55 | 56 | 57 | class data_generator(DataGenerator): 58 | """数据生成器 59 | """ 60 | def __iter__(self, random=False): 61 | """单条样本格式为 62 | 输入:[CLS][MASK][MASK][SEP]问题[SEP]篇章[SEP] 63 | 输出:答案 64 | """ 65 | batch_token_ids, batch_segment_ids, batch_a_token_ids = [], [], [] 66 | for is_end, D in self.sample(random): 67 | question = D['question'] 68 | answers = [p['answer'] for p in D['passages'] if p['answer']] 69 | passage = np.random.choice(D['passages'])['passage'] 70 | passage = re.sub(u' |、|;|,', ',', passage) 71 | final_answer = '' 72 | for answer in answers: 73 | if all([ 74 | a in passage[:max_p_len - 2] for a in answer.split(' ') 75 | ]): 76 | final_answer = answer.replace(' ', ',') 77 | break 78 | a_token_ids, _ = tokenizer.encode( 79 | final_answer, maxlen=max_a_len + 1 80 | ) 81 | q_token_ids, _ = tokenizer.encode(question, maxlen=max_q_len + 1) 82 | p_token_ids, _ = tokenizer.encode(passage, maxlen=max_p_len + 1) 83 | token_ids = [tokenizer._token_start_id] 84 | token_ids += ([tokenizer._token_mask_id] * max_a_len) 85 | token_ids += [tokenizer._token_end_id] 86 | token_ids += (q_token_ids[1:] + p_token_ids[1:]) 87 | segment_ids = [0] * len(token_ids) 88 | batch_token_ids.append(token_ids) 89 | batch_segment_ids.append(segment_ids) 90 | batch_a_token_ids.append(a_token_ids[1:]) 91 | if len(batch_token_ids) == self.batch_size or is_end: 92 | 
batch_token_ids = sequence_padding(batch_token_ids) 93 | batch_segment_ids = sequence_padding(batch_segment_ids) 94 | batch_a_token_ids = sequence_padding( 95 | batch_a_token_ids, max_a_len 96 | ) 97 | yield [batch_token_ids, batch_segment_ids], batch_a_token_ids 98 | batch_token_ids, batch_segment_ids, batch_a_token_ids = [], [], [] 99 | 100 | 101 | model = build_transformer_model( 102 | config_path, 103 | checkpoint_path, 104 | with_mlm=True, 105 | keep_tokens=keep_tokens, # 只保留keep_tokens中的字,精简原字表 106 | ) 107 | output = Lambda(lambda x: x[:, 1:max_a_len + 1])(model.output) 108 | model = Model(model.input, output) 109 | model.summary() 110 | 111 | 112 | def masked_cross_entropy(y_true, y_pred): 113 | """交叉熵作为loss,并mask掉padding部分的预测 114 | """ 115 | y_true = K.reshape(y_true, [K.shape(y_true)[0], -1]) 116 | y_mask = K.cast(K.not_equal(y_true, 0), K.floatx()) 117 | cross_entropy = K.sparse_categorical_crossentropy(y_true, y_pred) 118 | cross_entropy = K.sum(cross_entropy * y_mask) / K.sum(y_mask) 119 | return cross_entropy 120 | 121 | 122 | model.compile(loss=masked_cross_entropy, optimizer=Adam(1e-5)) 123 | 124 | 125 | def get_ngram_set(x, n): 126 | """生成ngram合集,返回结果格式是: 127 | {(n-1)-gram: set([n-gram的第n个字集合])} 128 | """ 129 | result = {} 130 | for i in range(len(x) - n + 1): 131 | k = tuple(x[i:i + n]) 132 | if k[:-1] not in result: 133 | result[k[:-1]] = set() 134 | result[k[:-1]].add(k[-1]) 135 | return result 136 | 137 | 138 | def gen_answer(question, passages): 139 | """由于是MLM模型,所以可以直接argmax解码。 140 | """ 141 | all_p_token_ids, token_ids, segment_ids = [], [], [] 142 | for passage in passages: 143 | passage = re.sub(u' |、|;|,', ',', passage) 144 | p_token_ids, _ = tokenizer.encode(passage, maxlen=max_p_len + 1) 145 | q_token_ids, _ = tokenizer.encode(question, maxlen=max_q_len + 1) 146 | all_p_token_ids.append(p_token_ids[1:]) 147 | token_ids.append([tokenizer._token_start_id]) 148 | token_ids[-1] += ([tokenizer._token_mask_id] * max_a_len) 149 | token_ids[-1] += [tokenizer._token_end_id] 150 | token_ids[-1] += (q_token_ids[1:] + p_token_ids[1:]) 151 | segment_ids.append([0] * len(token_ids[-1])) 152 | token_ids = sequence_padding(token_ids) 153 | segment_ids = sequence_padding(segment_ids) 154 | probas = model.predict([token_ids, segment_ids]) 155 | results = {} 156 | for t, p in zip(all_p_token_ids, probas): 157 | a, score = tuple(), 0. 
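        # Added commentary (not part of the original script): the loop below is a
        # constrained greedy decoder. At step i the candidates are limited to tokens
        # that extend an (i+1)-gram actually occurring in the passage t, plus the
        # [SEP] id so decoding can stop early, which forces the predicted answer to
        # be a span copied from the passage; `score` sums the selected probabilities
        # and is averaged over the answer length before the per-passage aggregation.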
158 | for i in range(max_a_len): 159 | idxs = list(get_ngram_set(t, i + 1)[a]) 160 | if tokenizer._token_end_id not in idxs: 161 | idxs.append(tokenizer._token_end_id) 162 | # pi是将passage以外的token的概率置零 163 | pi = np.zeros_like(p[i]) 164 | pi[idxs] = p[i, idxs] 165 | a = a + (pi.argmax(),) 166 | score += pi.max() 167 | if a[-1] == tokenizer._token_end_id: 168 | break 169 | score = score / (i + 1) 170 | a = tokenizer.decode(a) 171 | if a: 172 | results[a] = results.get(a, []) + [score] 173 | results = { 174 | k: (np.array(v)**2).sum() / (sum(v) + 1) 175 | for k, v in results.items() 176 | } 177 | return results 178 | 179 | 180 | def max_in_dict(d): 181 | if d: 182 | return sorted(d.items(), key=lambda s: -s[1])[0][0] 183 | 184 | 185 | def predict_to_file(data, filename): 186 | """将预测结果输出到文件,方便评估 187 | """ 188 | with open(filename, 'w', encoding='utf-8') as f: 189 | for d in tqdm(iter(data), desc=u'正在预测(共%s条样本)' % len(data)): 190 | q_text = d['question'] 191 | p_texts = [p['passage'] for p in d['passages']] 192 | a = gen_answer(q_text, p_texts) 193 | a = max_in_dict(a) 194 | if a: 195 | s = u'%s\t%s\n' % (d['id'], a) 196 | else: 197 | s = u'%s\t\n' % (d['id']) 198 | f.write(s) 199 | f.flush() 200 | 201 | 202 | class Evaluator(keras.callbacks.Callback): 203 | """评估与保存 204 | """ 205 | def __init__(self): 206 | self.lowest = 1e10 207 | 208 | def on_epoch_end(self, epoch, logs=None): 209 | # 保存最优 210 | if logs['loss'] <= self.lowest: 211 | self.lowest = logs['loss'] 212 | model.save_weights('./best_model.weights') 213 | 214 | 215 | if __name__ == '__main__': 216 | 217 | evaluator = Evaluator() 218 | train_generator = data_generator(train_data, batch_size) 219 | 220 | model.fit( 221 | train_generator.forfit(), 222 | steps_per_epoch=len(train_generator), 223 | epochs=epochs, 224 | callbacks=[evaluator] 225 | ) 226 | 227 | else: 228 | 229 | model.load_weights('./best_model.weights') 230 | -------------------------------------------------------------------------------- /examples/task_image_caption.py: -------------------------------------------------------------------------------- 1 | #! 
-*- coding: utf-8 -*- 2 | # bert做image caption任务,coco数据集 3 | # 通过Conditional Layer Normalization融入条件信息 4 | # 请参考:https://kexue.fm/archives/7124 5 | 6 | from __future__ import print_function 7 | import json 8 | import numpy as np 9 | from bert4keras.backend import keras, K 10 | from bert4keras.layers import Loss 11 | from bert4keras.models import build_transformer_model 12 | from bert4keras.tokenizers import Tokenizer, load_vocab 13 | from bert4keras.optimizers import Adam 14 | from bert4keras.snippets import sequence_padding, is_string 15 | from bert4keras.snippets import DataGenerator, AutoRegressiveDecoder 16 | from keras.models import Model 17 | import cv2 18 | 19 | # 模型配置 20 | maxlen = 64 21 | batch_size = 32 22 | steps_per_epoch = 1000 23 | epochs = 10000 24 | 25 | # bert配置 26 | config_path = '/root/kg/bert/uncased_L-12_H-768_A-12/bert_config.json' 27 | checkpoint_path = '/root/kg/bert/uncased_L-12_H-768_A-12/bert_model.ckpt' 28 | dict_path = '/root/kg/bert/uncased_L-12_H-768_A-12/vocab.txt' 29 | 30 | # 加载并精简词表,建立分词器 31 | token_dict, keep_tokens = load_vocab( 32 | dict_path=dict_path, 33 | simplified=True, 34 | startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'], 35 | ) 36 | tokenizer = Tokenizer(token_dict, do_lower_case=True) 37 | 38 | 39 | def read_caption(f): 40 | """读取并整理COCO的Caption数据 41 | """ 42 | data = json.load(open(f)) 43 | images = {} 44 | for img in data['images']: 45 | images[img['id']] = { 46 | 'image_id': img['file_name'], 47 | 'caption': [], 48 | 'url': img['coco_url'] 49 | } 50 | for caption in data['annotations']: 51 | images[caption['image_id']]['caption'].append(caption['caption']) 52 | return list(images.values()) 53 | 54 | 55 | def read_image(f): 56 | """单图读取函数(对非方形的图片进行白色填充,使其变为方形) 57 | """ 58 | img = cv2.imread(f) 59 | height, width = img.shape[:2] 60 | if height > width: 61 | height, width = img_size, width * img_size // height 62 | img = cv2.resize(img, (width, height)) 63 | delta = (height - width) // 2 64 | img = cv2.copyMakeBorder( 65 | img, 66 | top=0, 67 | bottom=0, 68 | left=delta, 69 | right=height - width - delta, 70 | borderType=cv2.BORDER_CONSTANT, 71 | value=[255, 255, 255] 72 | ) 73 | else: 74 | height, width = height * img_size // width, img_size 75 | img = cv2.resize(img, (width, height)) 76 | delta = (width - height) // 2 77 | img = cv2.copyMakeBorder( 78 | img, 79 | top=delta, 80 | bottom=width - height - delta, 81 | left=0, 82 | right=0, 83 | borderType=cv2.BORDER_CONSTANT, 84 | value=[255, 255, 255] 85 | ) 86 | img = img.astype('float32') 87 | return img[..., ::-1] # cv2的读取模式为BGR,但keras的模型要求为RGB 88 | 89 | 90 | class data_generator(DataGenerator): 91 | """数据生成器 92 | """ 93 | def __iter__(self, random=False): 94 | batch_images, batch_token_ids, batch_segment_ids = [], [], [] 95 | for is_end, D in self.sample(random): 96 | img = '/root/caption/coco/train2014/%s' % D['image_id'] 97 | caption = np.random.choice(D['caption']) 98 | token_ids, segment_ids = tokenizer.encode(caption, maxlen=maxlen) 99 | batch_images.append(read_image(img)) 100 | batch_token_ids.append(token_ids) 101 | batch_segment_ids.append(segment_ids) 102 | if len(batch_token_ids) == self.batch_size or is_end: 103 | batch_images = np.array(batch_images) 104 | batch_images = preprocess_input(batch_images) 105 | batch_token_ids = sequence_padding(batch_token_ids) 106 | batch_segment_ids = sequence_padding(batch_segment_ids) 107 | yield [batch_token_ids, batch_segment_ids, batch_images], None 108 | batch_images, batch_token_ids, batch_segment_ids = [], [], [] 109 | 110 | 111 | # 加载数据 
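# Added note: the hard-coded paths below assume the standard COCO 2014 layout
# (annotations/captions_train2014.json and captions_val2014.json, plus the
# train2014/ and val2014/ image folders); adjust them to the local dataset path.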
112 | train_data = read_caption( 113 | '/root/caption/coco/annotations/captions_train2014.json' 114 | ) 115 | valid_data = read_caption( 116 | '/root/caption/coco/annotations/captions_val2014.json' 117 | ) 118 | 119 | 120 | class CrossEntropy(Loss): 121 | """交叉熵作为loss,并mask掉padding部分 122 | """ 123 | def compute_loss(self, inputs, mask=None): 124 | y_true, y_pred = inputs 125 | if mask[1] is None: 126 | y_mask = 1.0 127 | else: 128 | y_mask = K.cast(mask[1], K.floatx())[:, 1:] 129 | y_true = y_true[:, 1:] # 目标token_ids 130 | y_pred = y_pred[:, :-1] # 预测序列,错开一位 131 | loss = K.sparse_categorical_crossentropy(y_true, y_pred) 132 | loss = K.sum(loss * y_mask) / K.sum(y_mask) 133 | return loss 134 | 135 | 136 | # 图像模型 137 | MobileNetV2 = keras.applications.mobilenet_v2.MobileNetV2 138 | preprocess_input = keras.applications.mobilenet_v2.preprocess_input 139 | image_model = MobileNetV2(include_top=False, pooling='avg') 140 | img_size = 299 141 | 142 | # Bert模型 143 | model = build_transformer_model( 144 | config_path, 145 | checkpoint_path, 146 | application='lm', 147 | keep_tokens=keep_tokens, # 只保留keep_tokens中的字,精简原字表 148 | layer_norm_cond=image_model.output, 149 | layer_norm_cond_hidden_size=128, 150 | layer_norm_cond_hidden_act='swish', 151 | additional_input_layers=image_model.input, 152 | ) 153 | 154 | output = CrossEntropy(1)([model.inputs[0], model.outputs[0]]) 155 | 156 | model = Model(model.inputs, output) 157 | model.compile(optimizer=Adam(1e-5)) 158 | model.summary() 159 | 160 | 161 | class AutoCaption(AutoRegressiveDecoder): 162 | """img2seq解码器 163 | """ 164 | @AutoRegressiveDecoder.wraps(default_rtype='probas') 165 | def predict(self, inputs, output_ids, states): 166 | image = inputs[0] 167 | token_ids = output_ids 168 | segment_ids = np.zeros_like(token_ids) 169 | return model.predict([token_ids, segment_ids, image])[:, -1] 170 | 171 | def generate(self, image, topk=1): 172 | if is_string(image): 173 | image = read_image(image) 174 | image = preprocess_input(image) 175 | output_ids = self.beam_search([image], topk=topk) # 基于beam search 176 | return tokenizer.decode(output_ids) 177 | 178 | 179 | autocaption = AutoCaption( 180 | start_id=tokenizer._token_start_id, 181 | end_id=tokenizer._token_end_id, 182 | maxlen=maxlen 183 | ) 184 | 185 | 186 | def just_show(): 187 | samples = [valid_data[i] for i in np.random.choice(len(valid_data), 2)] 188 | for D in samples: 189 | img = '/root/caption/coco/val2014/%s' % D['image_id'] 190 | print(u'image_id:', D['image_id']) 191 | print(u'url:', D['url']) 192 | print(u'predict:', autocaption.generate(img)) 193 | print(u'references:', D['caption']) 194 | print() 195 | 196 | 197 | class Evaluator(keras.callbacks.Callback): 198 | """评估与保存 199 | """ 200 | def __init__(self): 201 | self.lowest = 1e10 202 | 203 | def on_epoch_end(self, epoch, logs=None): 204 | # 保存最优 205 | if logs['loss'] <= self.lowest: 206 | self.lowest = logs['loss'] 207 | model.save_weights('./best_model.weights') 208 | # 演示效果 209 | just_show() 210 | 211 | 212 | if __name__ == '__main__': 213 | 214 | evaluator = Evaluator() 215 | train_generator = data_generator(train_data, batch_size) 216 | 217 | model.fit( 218 | train_generator.forfit(), 219 | steps_per_epoch=steps_per_epoch, 220 | epochs=epochs, 221 | callbacks=[evaluator] 222 | ) 223 | 224 | else: 225 | 226 | model.load_weights('./best_model.weights') 227 | """ 228 | image_id: COCO_val2014_000000524611.jpg 229 | url: http://images.cocodataset.org/val2014/COCO_val2014_000000524611.jpg 230 | predict: a train that is sitting on the 
tracks. 231 | references: [u'A train carrying chemical tanks traveling past a water tower.', u'Dual train tracks with a train on one of them and a water tower in the background.', u'a train some trees and a water tower ', u'Train on tracks with water tower for Davis Junction in the rear.', u'A train on a train track going through a bunch of trees.'] 232 | 233 | image_id: COCO_val2014_000000202923.jpg 234 | url: http://images.cocodataset.org/val2014/COCO_val2014_000000202923.jpg 235 | predict: a baseball game in progress with the batter up to plate. 236 | references: [u'Batter, catcher, and umpire anticipating the next pitch.', u'A baseball player holding a baseball bat in the game.', u'A baseball player stands ready at the plate.', u'Baseball players on the field ready for the pitch.', u'A view from behind a mesh fence of a baseball game.'] 237 | """ 238 | -------------------------------------------------------------------------------- /bert4keras/backend.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 分离后端函数,主要是为了同时兼容原生keras和tf.keras 3 | # 通过设置环境变量TF_KERAS=1来切换tf.keras 4 | 5 | import os, sys 6 | from distutils.util import strtobool 7 | import numpy as np 8 | import tensorflow as tf 9 | from tensorflow.python.util import nest, tf_inspect 10 | from tensorflow.python.eager import tape 11 | from tensorflow.python.ops.custom_gradient import _graph_mode_decorator 12 | 13 | # 判断是tf.keras还是纯keras的标记 14 | is_tf_keras = strtobool(os.environ.get('TF_KERAS', '0')) 15 | 16 | if is_tf_keras: 17 | import tensorflow.keras as keras 18 | import tensorflow.keras.backend as K 19 | sys.modules['keras'] = keras 20 | else: 21 | import keras 22 | import keras.backend as K 23 | 24 | # 判断是否启用重计算(通过时间换空间) 25 | do_recompute = strtobool(os.environ.get('RECOMPUTE', '0')) 26 | 27 | 28 | def gelu_erf(x): 29 | """基于Erf直接计算的gelu函数 30 | """ 31 | return 0.5 * x * (1.0 + tf.math.erf(x / np.sqrt(2.0))) 32 | 33 | 34 | def gelu_tanh(x): 35 | """基于Tanh近似计算的gelu函数 36 | """ 37 | cdf = 0.5 * ( 38 | 1.0 + K.tanh((np.sqrt(2 / np.pi) * (x + 0.044715 * K.pow(x, 3)))) 39 | ) 40 | return x * cdf 41 | 42 | 43 | def set_gelu(version): 44 | """设置gelu版本 45 | """ 46 | version = version.lower() 47 | assert version in ['erf', 'tanh'], 'gelu version must be erf or tanh' 48 | if version == 'erf': 49 | keras.utils.get_custom_objects()['gelu'] = gelu_erf 50 | else: 51 | keras.utils.get_custom_objects()['gelu'] = gelu_tanh 52 | 53 | 54 | def piecewise_linear(t, schedule): 55 | """分段线性函数 56 | 其中schedule是形如{1000: 1, 2000: 0.1}的字典, 57 | 表示 t ∈ [0, 1000]时,输出从0均匀增加至1,而 58 | t ∈ [1000, 2000]时,输出从1均匀降低到0.1,最后 59 | t > 2000时,保持0.1不变。 60 | """ 61 | schedule = sorted(schedule.items()) 62 | if schedule[0][0] != 0: 63 | schedule = [(0, 0.0)] + schedule 64 | 65 | x = K.constant(schedule[0][1], dtype=K.floatx()) 66 | t = K.cast(t, K.floatx()) 67 | for i in range(len(schedule)): 68 | t_begin = schedule[i][0] 69 | x_begin = x 70 | if i != len(schedule) - 1: 71 | dx = schedule[i + 1][1] - schedule[i][1] 72 | dt = schedule[i + 1][0] - schedule[i][0] 73 | slope = 1.0 * dx / dt 74 | x = schedule[i][1] + slope * (t - t_begin) 75 | else: 76 | x = K.constant(schedule[i][1], dtype=K.floatx()) 77 | x = K.switch(t >= t_begin, x, x_begin) 78 | 79 | return x 80 | 81 | 82 | def search_layer(inputs, name, exclude_from=None): 83 | """根据inputs和name来搜索层 84 | 说明:inputs为某个层或某个层的输出;name为目标层的名字。 85 | 实现:根据inputs一直往上递归搜索,直到发现名字为name的层为止; 86 | 如果找不到,那就返回None。 87 | """ 88 | if exclude_from is None: 89 | exclude_from = 
set() 90 | 91 | if isinstance(inputs, keras.layers.Layer): 92 | layer = inputs 93 | else: 94 | layer = inputs._keras_history[0] 95 | 96 | if layer.name == name: 97 | return layer 98 | elif layer in exclude_from: 99 | return None 100 | else: 101 | exclude_from.add(layer) 102 | if isinstance(layer, keras.models.Model): 103 | model = layer 104 | for layer in model.layers: 105 | if layer.name == name: 106 | return layer 107 | inbound_layers = layer._inbound_nodes[0].inbound_layers 108 | if not isinstance(inbound_layers, list): 109 | inbound_layers = [inbound_layers] 110 | if len(inbound_layers) > 0: 111 | for layer in inbound_layers: 112 | layer = search_layer(layer, name, exclude_from) 113 | if layer is not None: 114 | return layer 115 | 116 | 117 | def sequence_masking(x, mask, mode=0, axis=None): 118 | """为序列条件mask的函数 119 | mask: 形如(batch_size, seq_len)的0-1矩阵; 120 | mode: 如果是0,则直接乘以mask; 121 | 如果是1,则在padding部分减去一个大正数。 122 | axis: 序列所在轴,默认为1; 123 | """ 124 | if mask is None or mode not in [0, 1]: 125 | return x 126 | else: 127 | if axis is None: 128 | axis = 1 129 | if axis == -1: 130 | axis = K.ndim(x) - 1 131 | assert axis > 0, 'axis must be greater than 0' 132 | for _ in range(axis - 1): 133 | mask = K.expand_dims(mask, 1) 134 | for _ in range(K.ndim(x) - K.ndim(mask) - axis + 1): 135 | mask = K.expand_dims(mask, K.ndim(mask)) 136 | if mode == 0: 137 | return x * mask 138 | else: 139 | return x - (1 - mask) * 1e12 140 | 141 | 142 | def batch_gather(params, indices): 143 | """同tf旧版本的batch_gather 144 | """ 145 | if K.dtype(indices)[:3] != 'int': 146 | indices = K.cast(indices, 'int32') 147 | 148 | try: 149 | return tf.gather(params, indices, batch_dims=K.ndim(indices) - 1) 150 | except Exception as e1: 151 | try: 152 | return tf.batch_gather(params, indices) 153 | except Exception as e2: 154 | raise ValueError('%s\n%s\n' % (e1.message, e2.message)) 155 | 156 | 157 | def pool1d( 158 | x, 159 | pool_size, 160 | strides=1, 161 | padding='valid', 162 | data_format=None, 163 | pool_mode='max' 164 | ): 165 | """向量序列的pool函数 166 | """ 167 | x = K.expand_dims(x, 1) 168 | x = K.pool2d( 169 | x, 170 | pool_size=(1, pool_size), 171 | strides=(1, strides), 172 | padding=padding, 173 | data_format=data_format, 174 | pool_mode=pool_mode 175 | ) 176 | return x[:, 0] 177 | 178 | 179 | def divisible_temporal_padding(x, n): 180 | """将一维向量序列右padding到长度能被n整除 181 | """ 182 | r_len = K.shape(x)[1] % n 183 | p_len = K.switch(r_len > 0, n - r_len, 0) 184 | return K.temporal_padding(x, (0, p_len)) 185 | 186 | 187 | def swish(x): 188 | """swish函数(这样封装过后才有 __name__ 属性) 189 | """ 190 | return tf.nn.swish(x) 191 | 192 | 193 | def leaky_relu(x, alpha=0.2): 194 | """leaky relu函数(这样封装过后才有 __name__ 属性) 195 | """ 196 | return tf.nn.leaky_relu(x, alpha=alpha) 197 | 198 | 199 | class Sinusoidal(keras.initializers.Initializer): 200 | """Sin-Cos位置向量初始化器 201 | 来自:https://arxiv.org/abs/1706.03762 202 | """ 203 | def __call__(self, shape, dtype=None): 204 | """Sin-Cos形式的位置向量 205 | """ 206 | vocab_size, depth = shape 207 | embeddings = np.zeros(shape) 208 | for pos in range(vocab_size): 209 | for i in range(depth // 2): 210 | theta = pos / np.power(10000, 2. 
* i / depth) 211 | embeddings[pos, 2 * i] = np.sin(theta) 212 | embeddings[pos, 2 * i + 1] = np.cos(theta) 213 | return embeddings 214 | 215 | 216 | def symbolic(f): 217 | """恒等装饰器(兼容旧版本keras用) 218 | """ 219 | return f 220 | 221 | 222 | def graph_mode_decorator(f, *args, **kwargs): 223 | """tf 2.1与之前版本的传参方式不一样,这里做个同步 224 | """ 225 | if tf.__version__ < '2.1': 226 | return _graph_mode_decorator(f, *args, **kwargs) 227 | else: 228 | return _graph_mode_decorator(f, args, kwargs) 229 | 230 | 231 | def recompute_grad(call): 232 | """重计算装饰器(用来装饰Keras层的call函数) 233 | 关于重计算,请参考:https://arxiv.org/abs/1604.06174 234 | """ 235 | if not do_recompute: 236 | return call 237 | 238 | def inner(self, inputs, **kwargs): 239 | """定义需要求梯度的函数以及重新定义求梯度过程 240 | (参考自官方自带的tf.recompute_grad函数) 241 | """ 242 | flat_inputs = nest.flatten(inputs) 243 | call_args = tf_inspect.getfullargspec(call).args 244 | for key in ['mask', 'training']: 245 | if key not in call_args and key in kwargs: 246 | del kwargs[key] 247 | 248 | def kernel_call(): 249 | """定义前向计算 250 | """ 251 | return call(self, inputs, **kwargs) 252 | 253 | def call_and_grad(*inputs): 254 | """定义前向计算和反向计算 255 | """ 256 | if is_tf_keras: 257 | with tape.stop_recording(): 258 | outputs = kernel_call() 259 | outputs = tf.identity(outputs) 260 | else: 261 | outputs = kernel_call() 262 | 263 | def grad_fn(doutputs, variables=None): 264 | watches = list(inputs) 265 | if variables is not None: 266 | watches += list(variables) 267 | with tf.GradientTape() as t: 268 | t.watch(watches) 269 | with tf.control_dependencies([doutputs]): 270 | outputs = kernel_call() 271 | grads = t.gradient( 272 | outputs, watches, output_gradients=[doutputs] 273 | ) 274 | del t 275 | return grads[:len(inputs)], grads[len(inputs):] 276 | 277 | return outputs, grad_fn 278 | 279 | if is_tf_keras: # 仅在tf >= 2.0下可用 280 | outputs, grad_fn = call_and_grad(*flat_inputs) 281 | flat_outputs = nest.flatten(outputs) 282 | 283 | def actual_grad_fn(*doutputs): 284 | grads = grad_fn(*doutputs, variables=self.trainable_weights) 285 | return grads[0] + grads[1] 286 | 287 | watches = flat_inputs + self.trainable_weights 288 | watches = [tf.convert_to_tensor(x) for x in watches] 289 | tape.record_operation( 290 | call.__name__, flat_outputs, watches, actual_grad_fn 291 | ) 292 | return outputs 293 | else: # keras + tf >= 1.14 均可用 294 | return graph_mode_decorator(call_and_grad, *flat_inputs) 295 | 296 | return inner 297 | 298 | 299 | # 给旧版本keras新增symbolic方法(装饰器), 300 | # 以便兼容optimizers.py中的代码 301 | K.symbolic = getattr(K, 'symbolic', None) or symbolic 302 | 303 | custom_objects = { 304 | 'gelu_erf': gelu_erf, 305 | 'gelu_tanh': gelu_tanh, 306 | 'gelu': gelu_erf, 307 | 'swish': swish, 308 | 'leaky_relu': leaky_relu, 309 | 'Sinusoidal': Sinusoidal, 310 | } 311 | 312 | keras.utils.get_custom_objects().update(custom_objects) 313 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # bert4keras 2 | - Our light reimplement of bert for keras 3 | - 更清晰、更轻量级的keras版bert 4 | - 个人博客:https://kexue.fm/ 5 | - 在线文档:http://bert4keras.spaces.ac.cn/ (还在构建中) 6 | 7 | ## 说明 8 | 这是笔者重新实现的keras版的transformer模型库,致力于用尽可能清爽的代码来实现结合transformer和keras。 9 | 10 | 本项目的初衷是为了修改、定制上的方便,所以可能会频繁更新。 11 | 12 | 因此欢迎star,但不建议fork,因为你fork下来的版本可能很快就过期了。 13 | 14 | ## 功能 15 | 目前已经实现: 16 | - 加载bert/roberta/albert的预训练权重进行finetune; 17 | - 实现语言模型、seq2seq所需要的attention mask; 18 | - 丰富的examples; 19 | - 
从零预训练代码(支持TPU、多GPU,请看pretraining); 20 | - 兼容keras、tf.keras 21 | 22 | ## 使用 23 | 安装稳定版: 24 | ```shell 25 | pip install bert4keras 26 | ``` 27 | 安装最新版: 28 | ```shell 29 | pip install git+https://www.github.com/bojone/bert4keras.git 30 | ``` 31 | 32 | 使用例子请参考examples目录。 33 | 34 | 之前基于keras-bert给出的例子,仍适用于本项目,只需要将`bert_model`的加载方式换成本项目的。 35 | 36 | 理论上兼容Python2和Python3,兼容tensorflow 1.14+和tensorflow 2.x,实验环境是Python 2.7、Tensorflow 1.14+以及Keras 2.3.1(已经在2.2.4、2.3.0、2.3.1、tf.keras下测试通过)。 37 | 38 | **为了获得最好的体验,建议你使用Tensorflow 1.14 + Keras 2.3.1组合。** 39 | 40 |
关于环境组合 41 | 42 | - 支持tf+keras和tf+tf.keras,后者需要提前传入环境变量TF_KERAS=1。 43 | 44 | - 当使用tf+keras时,建议2.2.4 <= keras <= 2.3.1,以及 1.14 <= tf <= 2.2,不能使用tf 2.3+。 45 | 46 | - keras 2.4+可以用,但事实上keras 2.4.x基本上已经完全等价于tf.keras了,因此如果你要用keras 2.4+,倒不如直接用tf.keras。 47 |
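As a small illustration of the `TF_KERAS` switch mentioned above (a minimal sketch of assumed usage, not an excerpt from the repository):

```python
# Select the tf.keras backend; the variable must be set before bert4keras is imported.
import os
os.environ['TF_KERAS'] = '1'

from bert4keras.backend import keras, K  # now resolves to tensorflow.keras rather than standalone keras
```

Setting the variable in the shell (e.g. `export TF_KERAS=1`) before launching a script has the same effect.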
48 | 49 | 当然,乐于贡献的朋友如果发现了某些bug的话,也欢迎指出修正甚至Pull Requests~ 50 | 51 | ## 权重 52 | 53 | 目前支持加载的权重: 54 | - Google原版bert: https://github.com/google-research/bert 55 | - brightmart版roberta: https://github.com/brightmart/roberta_zh 56 | - 哈工大版roberta: https://github.com/ymcui/Chinese-BERT-wwm 57 | - Google原版albert[例子]: https://github.com/google-research/ALBERT 58 | - brightmart版albert: https://github.com/brightmart/albert_zh 59 | - 转换后的albert: https://github.com/bojone/albert_zh 60 | - 华为的NEZHA: https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/NEZHA-TensorFlow 61 | - 华为的NEZHA-GEN: https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/NEZHA-Gen-TensorFlow 62 | - 自研语言模型: https://github.com/ZhuiyiTechnology/pretrained-models 63 | - T5模型: https://github.com/google-research/text-to-text-transfer-transformer 64 | - GPT_OpenAI: https://github.com/bojone/CDial-GPT-tf 65 | - GPT2_ML: https://github.com/imcaspar/gpt2-ml 66 | - Google原版ELECTRA: https://github.com/google-research/electra 67 | - 哈工大版ELECTRA: https://github.com/ymcui/Chinese-ELECTRA 68 | - CLUE版ELECTRA: https://github.com/CLUEbenchmark/ELECTRA 69 | - LaBSE(多国语言BERT): https://github.com/bojone/labse 70 | - Chinese-GEN项目下的模型: https://github.com/bojone/chinese-gen 71 | - T5.1.1: https://github.com/google-research/text-to-text-transfer-transformer/blob/master/released_checkpoints.md#t511 72 | - Multilingual T5: https://github.com/google-research/multilingual-t5/ 73 | 74 | 注意事项 75 | - 注1:brightmart版albert的开源时间早于Google版albert,这导致早期brightmart版albert的权重与Google版的不完全一致,换言之两者不能直接相互替换。为了减少代码冗余,bert4keras的0.2.4及后续版本均只支持加载Google版以brightmart版中带Google字眼的权重。如果要加载早期版本的权重,请用0.2.3版本,或者考虑作者转换过的albert_zh。 76 | - 注2:下载下来的ELECTRA权重,如果没有json配置文件的话,参考这里自己改一个(需要加上`type_vocab_size`字段)。 77 | 78 | ## 更新 79 | - 2020.12.04: `PositionEmbedding`引入层次分解,可以让BERT直接处理超长文本,在`build_transformer_model`中传入参数`hierarchical_position=True`启用。 80 | - 2020.11.19: 支持GPT2模型,参考[CPM_LM_bert4keras](https://github.com/bojone/CPM_LM_bert4keras)项目。 81 | - 2020.11.14: 新增分参数学习率`extend_with_parameter_wise_lr`,可用于给每层设置不同的学习率。 82 | - 2020.10.27: 支持T5.1.1Multilingual T5。 83 | - 2020.08.28: 支持GPT_OpenAI。 84 | - 2020.08.22: 新增`WebServing`类,允许简单地将模型转换为Web接口,详情请参考该类的说明。 85 | - 2020.07.14: `Transformer`类加入`prefix`参数;`snippets.py`引入`to_array`函数;`AutoRegressiveDecoder`修改`rtype='logits'`时的一个隐藏bug。 86 | - 2020.06.06: 强迫症作祟:将`Tokenizer`原来的`max_length`参数重命名为`maxlen`,同时保留向后兼容性,建议大家用新参数名。 87 | - 2020.04.29: 增加重计算(参考keras_recompute),可以通过时间换空间,通过设置环境变量`RECOMPUTE=1`启用。 88 | - 2020.04.25: 优化tf2下的表现。 89 | - 2020.04.16: 所有example均适配tensorflow 2.0。 90 | - 2020.04.06: 增加UniLM预训练模式(测试中)。 91 | - 2020.04.06: 完善`rematch`方法。 92 | - 2020.04.01: `Tokenizer`增加`rematch`方法,给出分词结果与原序列的映射关系。 93 | - 2020.03.30: 尽量统一py文件的写法。 94 | - 2020.03.25: 支持ELECTRA。 95 | - 2020.03.24: 继续加强`DataGenerator`,允许传入迭代器时进行局部shuffle。 96 | - 2020.03.23: 增加调整Attention的`key_size`的选项。 97 | - 2020.03.17: 增强`DataGenerator`;优化模型写法。 98 | - 2020.03.15: 支持GPT2_ML。 99 | - 2020.03.10: 支持Google的T5模型。 100 | - 2020.03.05: 将`tokenizer.py`更名为`tokenizers.py`。 101 | - 2020.03.05: `application='seq2seq'`改名为`application='unilm'`。 102 | - 2020.03.05: `build_bert_model`更名为`build_transformer_model`。 103 | - 2020.03.05: 重写`models.py`结构。 104 | - 2020.03.04: 将`bert.py`更名为`models.py`。 105 | - 2020.03.02: 重构mask机制(用回Keras自带的mask机制),以便更好地编写更复杂的应用。 106 | - 2020.02.22: 新增`AutoRegressiveDecoder`类,统一处理Seq2Seq的解码问题。 107 | - 2020.02.19: transformer block的前缀改为Transformer(本来是Encoder),使得其含义局限性更少。 108 | - 2020.02.13: 
优化`load_vocab`函数;将`build_bert_model`中的`keep_words`参数更名为`keep_tokens`,此处改动可能会对部分脚本产生影响。 109 | - 2020.01.18: 调整文本处理方式,去掉codecs的使用。 110 | - 2020.01.17: 各api日趋稳定,为了方便大家使用,打包到pypi,首个打包版本号为0.4.6。 111 | - 2020.01.10: 重写模型mask方案,某种程度上让代码更为简练清晰;后端优化。 112 | - 2019.12.27: 重构预训练代码,减少冗余;目前支持RoBERTa和GPT两种预训练方式,详见pretraining。 113 | - 2019.12.17: 适配华为的nezha权重,只需要在`build_bert_model`函数里加上`model='nezha'`;此外原来albert的加载方式`albert=True`改为`model='albert'`。 114 | - 2019.12.16: 通过跟keras 2.3+版本类似的思路给低版本引入层中层功能,从而恢复对低于2.3.0版本的keras的支持。 115 | - 2019.12.14: 新增Conditional Layer Normalization及相关demo。 116 | - 2019.12.09: 各example的data_generator规范化;修复application='lm'时的一个错误。 117 | - 2019.12.05: 优化tokenizer的do_lower_case,同时微调各个example。 118 | - 2019.11.23: 将train.py重命名为optimizers.py,更新大量优化器实现,全面兼容keras和tf.keras。 119 | - 2019.11.19: 将utils.py重命名为tokenizer.py。 120 | - 2019.11.19: 想来想去,最后还是决定把snippets放到bert4keras.snippets下面去好了。 121 | - 2019.11.18: 优化预训练权重加载逻辑,增加保存模型权重至Bert的checkpoint格式方法。 122 | - 2019.11.17: 分离一些与Bert本身不直接相关的常用代码片段到python_snippets,供其它项目共用。 123 | - 2019.11.11: 添加NSP部分。 124 | - 2019.11.05: 适配google版albert,不再支持非Google版albert_zh。 125 | - 2019.11.05: 以RoBERTa为例子的预训练代码开发完毕,同时支持TPU/多GPU训练,详见roberta。欢迎在此基础上构建更多的预训练代码。 126 | - 2019.11.01: 逐步增加预训练相关代码,详见pretraining。 127 | - 2019.10.28: 支持使用基于sentencepiece的tokenizer。 128 | - 2019.10.25: 引入原生tokenizer。 129 | - 2019.10.22: 引入梯度累积优化器。 130 | - 2019.10.21: 为了简化代码结构,决定放弃keras 2.3.0之前的版本的支持,目前只支持keras 2.3.0+以及tf.keras。 131 | - 2019.10.20: 应网友要求,现支持直接用`model.save`保存模型结构,用`load_model`加载整个模型(只需要在`load_model`之前执行`from bert4keras.layers import *`,不需要额外写`custom_objects`)。 132 | - 2019.10.09: 已兼容tf.keras,同时在tf 1.13和tf 2.0下的tf.keras测试通过,通过设置环境变量`TF_KERAS=1`来切换tf.keras。 133 | - 2019.10.09: 已兼容Keras 2.3.x,但只是临时方案,后续可能直接移除掉2.3之前版本的支持。 134 | - 2019.10.02: 适配albert,能成功加载albert_zh的权重,只需要在`load_pretrained_model`函数里加上`albert=True`。 135 | 136 | ## 背景 137 | 之前一直用CyberZHG大佬的keras-bert,如果纯粹只是为了在keras下对bert进行调用和fine tune来说,keras-bert已经足够能让人满意了。 138 | 139 | 然而,如果想要在加载官方预训练权重的基础上,对bert的内部结构进行修改,那么keras-bert就比较难满足我们的需求了,因为keras-bert为了代码的复用性,几乎将每个小模块都封装为了一个单独的库,比如keras-bert依赖于keras-transformer,而keras-transformer依赖于keras-multi-head,keras-multi-head依赖于keras-self-attention,这样一重重依赖下去,改起来就相当头疼了。 140 | 141 | 所以,我决定重新写一个keras版的bert,争取在几个文件内把它完整地实现出来,减少这些依赖性,并且保留可以加载官方预训练权重的特性。 142 | 143 | ## 鸣谢 144 | 感谢CyberZHG大佬实现的keras-bert,本实现有不少地方参考了keras-bert的源码,在此衷心感谢大佬的无私奉献。 145 | 146 | ## 交流 147 | QQ交流群:808623966,微信群请加机器人微信号spaces_ac_cn 148 | -------------------------------------------------------------------------------- /examples/task_reading_comprehension_by_seq2seq.py: -------------------------------------------------------------------------------- 1 | #! 
-*- coding: utf-8 -*- 2 | # 用seq2seq的方式做阅读理解任务 3 | # 数据集和评测同 https://github.com/bojone/dgcnn_for_reading_comprehension 4 | # 8个epoch后在valid上能达到约0.77的分数 5 | # (Accuracy=0.7259005836184343 F1=0.813860036706151 Final=0.7698803101622926) 6 | 7 | import json, os, re 8 | import numpy as np 9 | from bert4keras.backend import keras, K 10 | from bert4keras.layers import Loss 11 | from bert4keras.models import build_transformer_model 12 | from bert4keras.tokenizers import Tokenizer, load_vocab 13 | from bert4keras.optimizers import Adam 14 | from bert4keras.snippets import sequence_padding, open 15 | from bert4keras.snippets import DataGenerator, AutoRegressiveDecoder 16 | from keras.models import Model 17 | from tqdm import tqdm 18 | 19 | max_p_len = 256 20 | max_q_len = 64 21 | max_a_len = 32 22 | max_qa_len = max_q_len + max_a_len 23 | batch_size = 32 24 | epochs = 8 25 | 26 | # bert配置 27 | config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json' 28 | checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt' 29 | dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt' 30 | 31 | # 标注数据 32 | webqa_data = json.load(open('/root/qa_datasets/WebQA.json')) 33 | sogou_data = json.load(open('/root/qa_datasets/SogouQA.json')) 34 | 35 | # 保存一个随机序(供划分valid用) 36 | if not os.path.exists('../random_order.json'): 37 | random_order = list(range(len(sogou_data))) 38 | np.random.shuffle(random_order) 39 | json.dump(random_order, open('../random_order.json', 'w'), indent=4) 40 | else: 41 | random_order = json.load(open('../random_order.json')) 42 | 43 | # 划分valid 44 | train_data = [sogou_data[j] for i, j in enumerate(random_order) if i % 3 != 0] 45 | valid_data = [sogou_data[j] for i, j in enumerate(random_order) if i % 3 == 0] 46 | train_data.extend(train_data) 47 | train_data.extend(webqa_data) # 将SogouQA和WebQA按2:1的比例混合 48 | 49 | # 加载并精简词表,建立分词器 50 | token_dict, keep_tokens = load_vocab( 51 | dict_path=dict_path, 52 | simplified=True, 53 | startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'], 54 | ) 55 | tokenizer = Tokenizer(token_dict, do_lower_case=True) 56 | 57 | 58 | class data_generator(DataGenerator): 59 | """数据生成器 60 | """ 61 | def __iter__(self, random=False): 62 | """单条样本格式:[CLS]篇章[SEP]问题[SEP]答案[SEP] 63 | """ 64 | batch_token_ids, batch_segment_ids = [], [] 65 | for is_end, D in self.sample(random): 66 | question = D['question'] 67 | answers = [p['answer'] for p in D['passages'] if p['answer']] 68 | passage = np.random.choice(D['passages'])['passage'] 69 | passage = re.sub(u' |、|;|,', ',', passage) 70 | final_answer = '' 71 | for answer in answers: 72 | if all([ 73 | a in passage[:max_p_len - 2] for a in answer.split(' ') 74 | ]): 75 | final_answer = answer.replace(' ', ',') 76 | break 77 | qa_token_ids, qa_segment_ids = tokenizer.encode( 78 | question, final_answer, maxlen=max_qa_len + 1 79 | ) 80 | p_token_ids, p_segment_ids = tokenizer.encode( 81 | passage, maxlen=max_p_len 82 | ) 83 | token_ids = p_token_ids + qa_token_ids[1:] 84 | segment_ids = p_segment_ids + qa_segment_ids[1:] 85 | batch_token_ids.append(token_ids) 86 | batch_segment_ids.append(segment_ids) 87 | if len(batch_token_ids) == self.batch_size or is_end: 88 | batch_token_ids = sequence_padding(batch_token_ids) 89 | batch_segment_ids = sequence_padding(batch_segment_ids) 90 | yield [batch_token_ids, batch_segment_ids], None 91 | batch_token_ids, batch_segment_ids = [], [] 92 | 93 | 94 | class CrossEntropy(Loss): 95 | """交叉熵作为loss,并mask掉输入部分 96 | """ 97 | def compute_loss(self, inputs, mask=None): 98 | 
y_true, y_mask, y_pred = inputs 99 | y_true = y_true[:, 1:] # 目标token_ids 100 | y_mask = y_mask[:, 1:] # segment_ids,刚好指示了要预测的部分 101 | y_pred = y_pred[:, :-1] # 预测序列,错开一位 102 | loss = K.sparse_categorical_crossentropy(y_true, y_pred) 103 | loss = K.sum(loss * y_mask) / K.sum(y_mask) 104 | return loss 105 | 106 | 107 | model = build_transformer_model( 108 | config_path, 109 | checkpoint_path, 110 | application='unilm', 111 | keep_tokens=keep_tokens, # 只保留keep_tokens中的字,精简原字表 112 | ) 113 | 114 | output = CrossEntropy(2)(model.inputs + model.outputs) 115 | 116 | model = Model(model.inputs, output) 117 | model.compile(optimizer=Adam(1e-5)) 118 | model.summary() 119 | 120 | 121 | class ReadingComprehension(AutoRegressiveDecoder): 122 | """beam search解码来生成答案 123 | passages为多篇章组成的list,从多篇文章中自动决策出最优的答案, 124 | 如果没答案,则返回空字符串。 125 | mode是extractive时,按照抽取式执行,即答案必须是原篇章的一个片段。 126 | """ 127 | def __init__(self, mode='extractive', **kwargs): 128 | super(ReadingComprehension, self).__init__(**kwargs) 129 | self.mode = mode 130 | 131 | def get_ngram_set(self, x, n): 132 | """生成ngram合集,返回结果格式是: 133 | {(n-1)-gram: set([n-gram的第n个字集合])} 134 | """ 135 | result = {} 136 | for i in range(len(x) - n + 1): 137 | k = tuple(x[i:i + n]) 138 | if k[:-1] not in result: 139 | result[k[:-1]] = set() 140 | result[k[:-1]].add(k[-1]) 141 | return result 142 | 143 | @AutoRegressiveDecoder.wraps(default_rtype='probas', use_states=True) 144 | def predict(self, inputs, output_ids, states): 145 | inputs = [i for i in inputs if i[0, 0] > -1] # 过滤掉无答案篇章 146 | topk = len(inputs[0]) 147 | all_token_ids, all_segment_ids = [], [] 148 | for token_ids in inputs: # inputs里每个元素都代表一个篇章 149 | token_ids = np.concatenate([token_ids, output_ids], 1) 150 | segment_ids = np.zeros_like(token_ids) 151 | if states > 0: 152 | segment_ids[:, -output_ids.shape[1]:] = 1 153 | all_token_ids.extend(token_ids) 154 | all_segment_ids.extend(segment_ids) 155 | padded_all_token_ids = sequence_padding(all_token_ids) 156 | padded_all_segment_ids = sequence_padding(all_segment_ids) 157 | probas = model.predict([padded_all_token_ids, padded_all_segment_ids]) 158 | probas = [ 159 | probas[i, len(ids) - 1] for i, ids in enumerate(all_token_ids) 160 | ] 161 | probas = np.array(probas).reshape((len(inputs), topk, -1)) 162 | if states == 0: 163 | # 这一步主要是排除没有答案的篇章 164 | # 如果一开始最大值就为end_id,那说明该篇章没有答案 165 | argmax = probas[:, 0].argmax(axis=1) 166 | available_idxs = np.where(argmax != self.end_id)[0] 167 | if len(available_idxs) == 0: 168 | scores = np.zeros_like(probas[0]) 169 | scores[:, self.end_id] = 1 170 | return scores, states + 1 171 | else: 172 | for i in np.where(argmax == self.end_id)[0]: 173 | inputs[i][:, 0] = -1 # 无答案篇章首位标记为-1 174 | probas = probas[available_idxs] 175 | inputs = [i for i in inputs if i[0, 0] > -1] # 过滤掉无答案篇章 176 | if self.mode == 'extractive': 177 | # 如果是抽取式,那么答案必须是篇章的一个片段 178 | # 那么将非篇章片段的概率值全部置0 179 | new_probas = np.zeros_like(probas) 180 | ngrams = {} 181 | for token_ids in inputs: 182 | token_ids = token_ids[0] 183 | sep_idx = np.where(token_ids == tokenizer._token_end_id)[0][0] 184 | p_token_ids = token_ids[1:sep_idx] 185 | for k, v in self.get_ngram_set(p_token_ids, states + 1).items(): 186 | ngrams[k] = ngrams.get(k, set()) | v 187 | for i, ids in enumerate(output_ids): 188 | available_idxs = ngrams.get(tuple(ids), set()) 189 | available_idxs.add(tokenizer._token_end_id) 190 | available_idxs = list(available_idxs) 191 | new_probas[:, i, available_idxs] = probas[:, i, available_idxs] 192 | probas = new_probas 193 | return 
(probas**2).sum(0) / (probas.sum(0) + 1), states + 1 # 某种平均投票方式 194 | 195 | def answer(self, question, passages, topk=1): 196 | token_ids = [] 197 | for passage in passages: 198 | passage = re.sub(u' |、|;|,', ',', passage) 199 | p_token_ids = tokenizer.encode(passage, maxlen=max_p_len)[0] 200 | q_token_ids = tokenizer.encode(question, maxlen=max_q_len + 1)[0] 201 | token_ids.append(p_token_ids + q_token_ids[1:]) 202 | output_ids = self.beam_search( 203 | token_ids, topk=topk, states=0 204 | ) # 基于beam search 205 | return tokenizer.decode(output_ids) 206 | 207 | 208 | reader = ReadingComprehension( 209 | start_id=None, 210 | end_id=tokenizer._token_end_id, 211 | maxlen=max_a_len, 212 | mode='extractive' 213 | ) 214 | 215 | 216 | def predict_to_file(data, filename, topk=1): 217 | """将预测结果输出到文件,方便评估 218 | """ 219 | with open(filename, 'w', encoding='utf-8') as f: 220 | for d in tqdm(iter(data), desc=u'正在预测(共%s条样本)' % len(data)): 221 | q_text = d['question'] 222 | p_texts = [p['passage'] for p in d['passages']] 223 | a = reader.answer(q_text, p_texts, topk) 224 | if a: 225 | s = u'%s\t%s\n' % (d['id'], a) 226 | else: 227 | s = u'%s\t\n' % (d['id']) 228 | f.write(s) 229 | f.flush() 230 | 231 | 232 | class Evaluator(keras.callbacks.Callback): 233 | """评估与保存 234 | """ 235 | def __init__(self): 236 | self.lowest = 1e10 237 | 238 | def on_epoch_end(self, epoch, logs=None): 239 | # 保存最优 240 | if logs['loss'] <= self.lowest: 241 | self.lowest = logs['loss'] 242 | model.save_weights('./best_model.weights') 243 | 244 | 245 | if __name__ == '__main__': 246 | 247 | evaluator = Evaluator() 248 | train_generator = data_generator(train_data, batch_size) 249 | 250 | model.fit( 251 | train_generator.forfit(), 252 | steps_per_epoch=len(train_generator), 253 | epochs=epochs, 254 | callbacks=[evaluator] 255 | ) 256 | 257 | else: 258 | 259 | model.load_weights('./best_model.weights') 260 | -------------------------------------------------------------------------------- /examples/task_seq2seq_ape210k_math_word_problem.py: -------------------------------------------------------------------------------- 1 | #! 
-*- coding: utf-8 -*- 2 | # 用Seq2Seq做小学数学应用题 3 | # 数据集为ape210k:https://github.com/Chenny0808/ape210k 4 | # Base版准确率为70%+,Large版准确率为73%+ 5 | # 实测环境:tensorflow 1.14 + keras 2.3.1 + bert4keras 0.8.8 6 | # 介绍链接:https://kexue.fm/archives/7809 7 | 8 | from __future__ import division 9 | import json, re 10 | import numpy as np 11 | import pandas as pd 12 | from tqdm import tqdm 13 | from bert4keras.backend import keras, K 14 | from bert4keras.layers import Loss 15 | from bert4keras.models import build_transformer_model 16 | from bert4keras.tokenizers import Tokenizer, load_vocab 17 | from bert4keras.optimizers import Adam 18 | from bert4keras.snippets import sequence_padding, open 19 | from bert4keras.snippets import DataGenerator, AutoRegressiveDecoder 20 | from keras.models import Model 21 | from sympy import Integer 22 | 23 | # 基本参数 24 | maxlen = 192 25 | batch_size = 32 26 | epochs = 100 27 | 28 | # bert配置 29 | config_path = '/root/kg/bert/uer/mixed_corpus_bert_base_model/bert_config.json' 30 | checkpoint_path = '/root/kg/bert/uer/mixed_corpus_bert_base_model/bert_model.ckpt' 31 | dict_path = '/root/kg/bert/uer/mixed_corpus_bert_base_model/vocab.txt' 32 | 33 | 34 | def is_equal(a, b): 35 | """比较两个结果是否相等 36 | """ 37 | a = round(float(a), 6) 38 | b = round(float(b), 6) 39 | return a == b 40 | 41 | 42 | def remove_bucket(equation): 43 | """去掉冗余的括号 44 | """ 45 | l_buckets, buckets = [], [] 46 | for i, c in enumerate(equation): 47 | if c == '(': 48 | l_buckets.append(i) 49 | elif c == ')': 50 | buckets.append((l_buckets.pop(), i)) 51 | eval_equation = eval(equation) 52 | for l, r in buckets: 53 | new_equation = '%s %s %s' % ( 54 | equation[:l], equation[l + 1:r], equation[r + 1:] 55 | ) 56 | try: 57 | if is_equal(eval(new_equation.replace(' ', '')), eval_equation): 58 | equation = new_equation 59 | except: 60 | pass 61 | return equation.replace(' ', '') 62 | 63 | 64 | def load_data(filename): 65 | """读取训练数据,并做一些标准化,保证equation是可以eval的 66 | 参考:https://kexue.fm/archives/7809 67 | """ 68 | D = [] 69 | for l in open(filename): 70 | l = json.loads(l) 71 | question, equation, answer = l['original_text'], l['equation'], l['ans'] 72 | # 处理带分数 73 | question = re.sub('(\d+)\((\d+/\d+)\)', '(\\1+\\2)', question) 74 | equation = re.sub('(\d+)\((\d+/\d+)\)', '(\\1+\\2)', equation) 75 | answer = re.sub('(\d+)\((\d+/\d+)\)', '(\\1+\\2)', answer) 76 | equation = re.sub('(\d+)\(', '\\1+(', equation) 77 | answer = re.sub('(\d+)\(', '\\1+(', answer) 78 | # 分数去括号 79 | question = re.sub('\((\d+/\d+)\)', '\\1', question) 80 | # 处理百分数 81 | equation = re.sub('([\.\d]+)%', '(\\1/100)', equation) 82 | answer = re.sub('([\.\d]+)%', '(\\1/100)', answer) 83 | # 冒号转除号、剩余百分号处理 84 | equation = equation.replace(':', '/').replace('%', '/100') 85 | answer = answer.replace(':', '/').replace('%', '/100') 86 | if equation[:2] == 'x=': 87 | equation = equation[2:] 88 | try: 89 | if is_equal(eval(equation), eval(answer)): 90 | D.append((question, remove_bucket(equation), answer)) 91 | except: 92 | continue 93 | return D 94 | 95 | 96 | # 加载数据集 97 | train_data = load_data('/root/ape210k/train.ape.json') 98 | valid_data = load_data('/root/ape210k/valid.ape.json') 99 | test_data = load_data('/root/ape210k/test.ape.json') 100 | 101 | # 加载并精简词表,建立分词器 102 | token_dict, keep_tokens = load_vocab( 103 | dict_path=dict_path, 104 | simplified=True, 105 | startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'], 106 | ) 107 | tokenizer = Tokenizer(token_dict, do_lower_case=True) 108 | 109 | 110 | class data_generator(DataGenerator): 111 | """数据生成器 112 | """ 
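    # Added note: each batch is yielded as ([token_ids, segment_ids], None); no
    # explicit labels are needed because the CrossEntropy layer defined below
    # computes the seq2seq loss directly from the model's own inputs, with the
    # segment_ids marking which positions belong to the target equation.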
113 | def __iter__(self, random=False): 114 | batch_token_ids, batch_segment_ids = [], [] 115 | for is_end, (question, equation, answer) in self.sample(random): 116 | token_ids, segment_ids = tokenizer.encode( 117 | question, equation, maxlen=maxlen 118 | ) 119 | batch_token_ids.append(token_ids) 120 | batch_segment_ids.append(segment_ids) 121 | if len(batch_token_ids) == self.batch_size or is_end: 122 | batch_token_ids = sequence_padding(batch_token_ids) 123 | batch_segment_ids = sequence_padding(batch_segment_ids) 124 | yield [batch_token_ids, batch_segment_ids], None 125 | batch_token_ids, batch_segment_ids = [], [] 126 | 127 | 128 | class CrossEntropy(Loss): 129 | """交叉熵作为loss,并mask掉输入部分 130 | """ 131 | def compute_loss(self, inputs, mask=None): 132 | y_true, y_mask, y_pred = inputs 133 | y_true = y_true[:, 1:] # 目标token_ids 134 | y_mask = y_mask[:, 1:] # segment_ids,刚好指示了要预测的部分 135 | y_pred = y_pred[:, :-1] # 预测序列,错开一位 136 | loss = K.sparse_categorical_crossentropy(y_true, y_pred) 137 | loss = K.sum(loss * y_mask) / K.sum(y_mask) 138 | return loss 139 | 140 | 141 | model = build_transformer_model( 142 | config_path, 143 | checkpoint_path, 144 | application='unilm', 145 | keep_tokens=keep_tokens, # 只保留keep_tokens中的字,精简原字表 146 | ) 147 | 148 | output = CrossEntropy(2)(model.inputs + model.outputs) 149 | 150 | model = Model(model.inputs, output) 151 | model.compile(optimizer=Adam(2e-5)) 152 | model.summary() 153 | 154 | 155 | class AutoSolve(AutoRegressiveDecoder): 156 | """seq2seq解码器 157 | """ 158 | @AutoRegressiveDecoder.wraps(default_rtype='probas') 159 | def predict(self, inputs, output_ids, states): 160 | token_ids, segment_ids = inputs 161 | token_ids = np.concatenate([token_ids, output_ids], 1) 162 | segment_ids = np.concatenate([segment_ids, np.ones_like(output_ids)], 1) 163 | return model.predict([token_ids, segment_ids])[:, -1] 164 | 165 | def generate(self, text, topk=1): 166 | token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen) 167 | output_ids = self.beam_search([token_ids, segment_ids], 168 | topk=topk) # 基于beam search 169 | return tokenizer.decode(output_ids).replace(' ', '') 170 | 171 | 172 | autosolve = AutoSolve(start_id=None, end_id=tokenizer._token_end_id, maxlen=64) 173 | 174 | 175 | class Evaluator(keras.callbacks.Callback): 176 | """评估与保存 177 | """ 178 | def __init__(self): 179 | self.best_acc = 0. 
180 | 181 | def on_epoch_end(self, epoch, logs=None): 182 | metrics = self.evaluate(valid_data) # 评测模型 183 | if metrics['acc'] >= self.best_acc: 184 | self.best_acc = metrics['acc'] 185 | model.save_weights('./best_model.weights') # 保存模型 186 | metrics['best_acc'] = self.best_acc 187 | print('valid_data:', metrics) 188 | 189 | def evaluate(self, data, topk=1): 190 | total, right = 0.0, 0.0 191 | for question, equation, answer in tqdm(data): 192 | total += 1 193 | pred_equation = autosolve.generate(question, topk) 194 | try: 195 | right += int(is_equal(eval(pred_equation), eval(answer))) 196 | except: 197 | pass 198 | return {'acc': right / total} 199 | 200 | 201 | def predict(in_file, out_file, topk=1): 202 | """输出预测结果到文件 203 | 该函数主要为比赛 https://www.datafountain.cn/competitions/467 所写, 204 | 主要是读取该比赛的测试集,然后预测equation,并且根据不同的问题输出不同格式的答案, 205 | out_file可以直接提交到线上评测,线上准确率可以达到38%+。 206 | """ 207 | fw = open(out_file, 'w', encoding='utf-8') 208 | raw_data = pd.read_csv(in_file, header=None, encoding='utf-8') 209 | for i, question in tqdm(raw_data.values): 210 | question = re.sub('(\d+)_(\d+/\d+)', '(\\1+\\2)', question) 211 | pred_equation = autosolve.generate(question, topk) 212 | if '.' not in pred_equation: 213 | pred_equation = re.sub('([\d]+)', 'Integer(\\1)', pred_equation) 214 | try: 215 | pred_answer = eval(pred_equation) 216 | except: 217 | pred_answer = np.random.choice(21) + 1 218 | if '.' in pred_equation: 219 | if u'百分之几' in question: 220 | pred_answer = pred_answer * 100 221 | pred_answer = round(pred_answer, 2) 222 | if int(pred_answer) == pred_answer: 223 | pred_answer = int(pred_answer) 224 | if ( 225 | re.findall(u'多少[辆|人|个|只|箱|包本|束|头|盒|张]', question) or 226 | re.findall(u'几[辆|人|个|只|箱|包|本|束|头|盒|张]', question) 227 | ): 228 | if re.findall(u'至少|最少', question): 229 | pred_answer = np.ceil(pred_answer) 230 | elif re.findall(u'至多|最多', question): 231 | pred_answer = np.floor(pred_answer) 232 | else: 233 | pred_answer = np.ceil(pred_answer) 234 | pred_answer = int(pred_answer) 235 | pred_answer = str(pred_answer) 236 | if u'百分之几' in question: 237 | pred_answer = pred_answer + '%' 238 | else: 239 | pred_answer = str(pred_answer) 240 | if '/' in pred_answer: 241 | if re.findall('\d+/\d+', question): 242 | a, b = pred_answer.split('/') 243 | a, b = int(a), int(b) 244 | if a > b: 245 | pred_answer = '%s_%s/%s' % (a // b, a % b, b) 246 | else: 247 | if re.findall(u'至少|最少', question): 248 | pred_answer = np.ceil(eval(pred_answer)) 249 | elif re.findall(u'至多|最多', question): 250 | pred_answer = np.floor(eval(pred_answer)) 251 | else: 252 | pred_answer = np.ceil(eval(pred_answer)) 253 | pred_answer = str(int(pred_answer)) 254 | fw.write(str(i) + ',' + pred_answer + '\n') 255 | fw.flush() 256 | fw.close() 257 | 258 | 259 | if __name__ == '__main__': 260 | 261 | evaluator = Evaluator() 262 | train_generator = data_generator(train_data, batch_size) 263 | 264 | model.fit( 265 | train_generator.forfit(), 266 | steps_per_epoch=len(train_generator), 267 | epochs=epochs, 268 | callbacks=[evaluator] 269 | ) 270 | 271 | else: 272 | 273 | model.load_weights('./best_model.weights') 274 | -------------------------------------------------------------------------------- /pretraining/pretraining.py: -------------------------------------------------------------------------------- 1 | #! 
-*- coding: utf-8 -*- 2 | # 预训练脚本,多GPU版/TPU版本 3 | 4 | import os, re 5 | os.environ['TF_KERAS'] = '1' # 必须使用tf.keras 6 | 7 | import tensorflow as tf 8 | from data_utils import * 9 | from bert4keras.models import build_transformer_model 10 | from bert4keras.backend import keras, K 11 | from bert4keras.optimizers import Adam 12 | from bert4keras.optimizers import extend_with_weight_decay 13 | from bert4keras.optimizers import extend_with_layer_adaptation 14 | from bert4keras.optimizers import extend_with_piecewise_linear_lr 15 | from bert4keras.optimizers import extend_with_gradient_accumulation 16 | from keras.layers import Input, Lambda 17 | from keras.models import Model 18 | from keras.callbacks import Callback, CSVLogger 19 | 20 | model = 'roberta' 21 | 22 | # 语料路径和模型保存路径 23 | # 如果是TPU训练,那么语料必须存放在Google Cloud Storage上面, 24 | # 路径必须以gs://开头;如果是GPU训练,改为普通路径即可。 25 | model_saved_path = 'gs://xxxx/bert4keras/saved_model/bert_model.ckpt' 26 | corpus_paths = [ 27 | 'gs://xxxx/bert4keras/corpus/corpus.%s.tfrecord' % i for i in range(10) 28 | ] 29 | 30 | # 其他配置 31 | sequence_length = 512 32 | batch_size = 4096 33 | config_path = '/home/spaces_ac_cn/chinese_L-12_H-768_A-12/bert_config.json' 34 | checkpoint_path = '/home/spaces_ac_cn/chinese_L-12_H-768_A-12/bert_model.ckpt' # 如果从零训练,就设为None 35 | learning_rate = 0.00176 36 | weight_decay_rate = 0.01 37 | num_warmup_steps = 3125 38 | num_train_steps = 125000 39 | steps_per_epoch = 10000 40 | grad_accum_steps = 16 # 大于1即表明使用梯度累积 41 | epochs = num_train_steps * grad_accum_steps // steps_per_epoch 42 | exclude_from_weight_decay = ['Norm', 'bias'] 43 | exclude_from_layer_adaptation = ['Norm', 'bias'] 44 | tpu_address = 'grpc://xxx.xxx.xxx.xxx:8470' # 如果用多GPU跑,直接设为None 45 | which_optimizer = 'lamb' # adam 或 lamb,均自带weight decay 46 | lr_schedule = { 47 | num_warmup_steps * grad_accum_steps: 1.0, 48 | num_train_steps * grad_accum_steps: 0.0, 49 | } 50 | floatx = K.floatx() 51 | 52 | # 读取数据集,构建数据张量 53 | 54 | if model == 'roberta': 55 | 56 | dataset = TrainingDatasetRoBERTa.load_tfrecord( 57 | record_names=corpus_paths, 58 | sequence_length=sequence_length, 59 | batch_size=batch_size // grad_accum_steps, 60 | ) 61 | 62 | elif model == 'gpt': 63 | 64 | dataset = TrainingDatasetGPT.load_tfrecord( 65 | record_names=corpus_paths, 66 | sequence_length=sequence_length, 67 | batch_size=batch_size // grad_accum_steps, 68 | ) 69 | 70 | elif model == 'unilm': 71 | 72 | dataset = TrainingDatasetUniLM.load_tfrecord( 73 | record_names=corpus_paths, 74 | sequence_length=sequence_length, 75 | batch_size=batch_size // grad_accum_steps, 76 | token_sep_id=3, # 这里需要自己指定[SEP]的id 77 | ) 78 | 79 | 80 | def build_transformer_model_with_mlm(): 81 | """带mlm的bert模型 82 | """ 83 | bert = build_transformer_model( 84 | config_path, with_mlm='linear', return_keras_model=False 85 | ) 86 | proba = bert.model.output 87 | 88 | # 辅助输入 89 | token_ids = Input(shape=(None,), dtype='int64', name='token_ids') # 目标id 90 | is_masked = Input(shape=(None,), dtype=floatx, name='is_masked') # mask标记 91 | 92 | def mlm_loss(inputs): 93 | """计算loss的函数,需要封装为一个层 94 | """ 95 | y_true, y_pred, mask = inputs 96 | loss = K.sparse_categorical_crossentropy( 97 | y_true, y_pred, from_logits=True 98 | ) 99 | loss = K.sum(loss * mask) / (K.sum(mask) + K.epsilon()) 100 | return loss 101 | 102 | def mlm_acc(inputs): 103 | """计算准确率的函数,需要封装为一个层 104 | """ 105 | y_true, y_pred, mask = inputs 106 | y_true = K.cast(y_true, floatx) 107 | acc = keras.metrics.sparse_categorical_accuracy(y_true, y_pred) 108 | acc = K.sum(acc * 
mask) / (K.sum(mask) + K.epsilon()) 109 | return acc 110 | 111 | mlm_loss = Lambda(mlm_loss, name='mlm_loss')([token_ids, proba, is_masked]) 112 | mlm_acc = Lambda(mlm_acc, name='mlm_acc')([token_ids, proba, is_masked]) 113 | 114 | train_model = Model( 115 | bert.model.inputs + [token_ids, is_masked], [mlm_loss, mlm_acc] 116 | ) 117 | 118 | loss = { 119 | 'mlm_loss': lambda y_true, y_pred: y_pred, 120 | 'mlm_acc': lambda y_true, y_pred: K.stop_gradient(y_pred), 121 | } 122 | 123 | return bert, train_model, loss 124 | 125 | 126 | def build_transformer_model_with_lm(): 127 | """带lm的bert模型 128 | """ 129 | bert = build_transformer_model( 130 | config_path, 131 | with_mlm='linear', 132 | application='lm', 133 | return_keras_model=False 134 | ) 135 | token_ids = bert.model.inputs[0] 136 | proba = bert.model.output 137 | 138 | def lm_loss(inputs, mask=None): 139 | """计算loss的函数,需要封装为一个层 140 | """ 141 | y_true, y_pred = inputs 142 | y_true, y_pred = y_true[:, 1:], y_pred[:, :-1] 143 | 144 | if mask is None: 145 | mask = 1.0 146 | else: 147 | mask = K.cast(mask[1][:, 1:], floatx) 148 | 149 | loss = K.sparse_categorical_crossentropy( 150 | y_true, y_pred, from_logits=True 151 | ) 152 | loss = K.sum(loss * mask) / (K.sum(mask) + K.epsilon()) 153 | return loss 154 | 155 | def lm_acc(inputs, mask=None): 156 | """计算准确率的函数,需要封装为一个层 157 | """ 158 | y_true, y_pred = inputs 159 | y_true, y_pred = K.cast(y_true[:, 1:], floatx), y_pred[:, :-1] 160 | 161 | if mask is None: 162 | mask = 1.0 163 | else: 164 | mask = K.cast(mask[1][:, 1:], floatx) 165 | 166 | acc = keras.metrics.sparse_categorical_accuracy(y_true, y_pred) 167 | acc = K.sum(acc * mask) / (K.sum(mask) + K.epsilon()) 168 | return acc 169 | 170 | lm_loss = Lambda(lm_loss, name='lm_loss')([token_ids, proba]) 171 | lm_acc = Lambda(lm_acc, name='lm_acc')([token_ids, proba]) 172 | 173 | train_model = Model(bert.model.inputs, [lm_loss, lm_acc]) 174 | 175 | loss = { 176 | 'lm_loss': lambda y_true, y_pred: y_pred, 177 | 'lm_acc': lambda y_true, y_pred: K.stop_gradient(y_pred), 178 | } 179 | 180 | return bert, train_model, loss 181 | 182 | 183 | def build_transformer_model_with_unilm(): 184 | """带unilm的bert模型 185 | """ 186 | bert = build_transformer_model( 187 | config_path, 188 | with_mlm='linear', 189 | application='unilm', 190 | return_keras_model=False 191 | ) 192 | token_ids = bert.model.inputs[0] 193 | segment_ids = bert.model.inputs[1] 194 | proba = bert.model.output 195 | 196 | def unilm_loss(inputs, mask=None): 197 | """计算loss的函数,需要封装为一个层 198 | """ 199 | y_true, y_pred, segment_ids = inputs 200 | y_true, y_pred = y_true[:, 1:], y_pred[:, :-1] 201 | 202 | if mask is None: 203 | mask = 1.0 204 | else: 205 | mask = K.cast(mask[1][:, 1:], floatx) 206 | 207 | segment_ids = K.cast(segment_ids, floatx) 208 | mask = mask * segment_ids[:, 1:] 209 | 210 | loss = K.sparse_categorical_crossentropy( 211 | y_true, y_pred, from_logits=True 212 | ) 213 | loss = K.sum(loss * mask) / (K.sum(mask) + K.epsilon()) 214 | return loss 215 | 216 | def unilm_acc(inputs, mask=None): 217 | """计算准确率的函数,需要封装为一个层 218 | """ 219 | y_true, y_pred, segment_ids = inputs 220 | y_true, y_pred = K.cast(y_true[:, 1:], floatx), y_pred[:, :-1] 221 | 222 | if mask is None: 223 | mask = 1.0 224 | else: 225 | mask = K.cast(mask[1][:, 1:], floatx) 226 | 227 | segment_ids = K.cast(segment_ids, floatx) 228 | mask = mask * segment_ids[:, 1:] 229 | 230 | acc = keras.metrics.sparse_categorical_accuracy(y_true, y_pred) 231 | acc = K.sum(acc * mask) / (K.sum(mask) + K.epsilon()) 232 | return acc 233 | 
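    # How the UniLM masking above works (illustrative note): predictions are shifted
    # one step left, so y_pred[:, i] is scored against y_true[:, i + 1], and the mask
    # keeps only positions whose target token is not padding and lies in segment 1
    # (the second sentence, the part UniLM is trained to generate). For a toy sample
    #   token_ids   = [CLS, a, b, SEP, x, y, SEP]
    #   segment_ids = [0,   0, 0, 0,   1, 1, 1 ]
    # only the predictions of x, y and the final SEP contribute to loss/accuracy.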
234 | token_proba_segment = [token_ids, proba, segment_ids] 235 | unilm_loss = Lambda(unilm_loss, name='unilm_loss')(token_proba_segment) 236 | unilm_acc = Lambda(unilm_acc, name='unilm_acc')(token_proba_segment) 237 | 238 | train_model = Model(bert.model.inputs, [unilm_loss, unilm_acc]) 239 | 240 | loss = { 241 | 'unilm_loss': lambda y_true, y_pred: y_pred, 242 | 'unilm_acc': lambda y_true, y_pred: K.stop_gradient(y_pred), 243 | } 244 | 245 | return bert, train_model, loss 246 | 247 | 248 | def build_transformer_model_for_pretraining(): 249 | """构建训练模型,通用于TPU/GPU 250 | 注意全程要用keras标准的层写法,一些比较灵活的“移花接木”式的 251 | 写法可能会在TPU上训练失败。此外,要注意的是TPU并非支持所有 252 | tensorflow算子,尤其不支持动态(变长)算子,因此编写相应运算 253 | 时要格外留意。 254 | """ 255 | if model == 'roberta': 256 | bert, train_model, loss = build_transformer_model_with_mlm() 257 | elif model == 'gpt': 258 | bert, train_model, loss = build_transformer_model_with_lm() 259 | elif model == 'unilm': 260 | bert, train_model, loss = build_transformer_model_with_unilm() 261 | 262 | # 优化器 263 | optimizer = extend_with_weight_decay(Adam) 264 | if which_optimizer == 'lamb': 265 | optimizer = extend_with_layer_adaptation(optimizer) 266 | optimizer = extend_with_piecewise_linear_lr(optimizer) 267 | optimizer_params = { 268 | 'learning_rate': learning_rate, 269 | 'lr_schedule': lr_schedule, 270 | 'weight_decay_rate': weight_decay_rate, 271 | 'exclude_from_weight_decay': exclude_from_weight_decay, 272 | 'exclude_from_layer_adaptation': exclude_from_layer_adaptation, 273 | 'bias_correction': False, 274 | } 275 | if grad_accum_steps > 1: 276 | optimizer = extend_with_gradient_accumulation(optimizer) 277 | optimizer_params['grad_accum_steps'] = grad_accum_steps 278 | optimizer = optimizer(**optimizer_params) 279 | 280 | # 模型定型 281 | train_model.compile(loss=loss, optimizer=optimizer) 282 | 283 | # 如果传入权重,则加载。注:须在此处加载,才保证不报错。 284 | if checkpoint_path is not None: 285 | bert.load_weights_from_checkpoint(checkpoint_path) 286 | 287 | return train_model 288 | 289 | 290 | if tpu_address is None: 291 | # 单机多卡模式(多机多卡也类似,但需要硬软件配合,请参考https://tf.wiki) 292 | strategy = tf.distribute.MirroredStrategy() 293 | else: 294 | # TPU模式 295 | resolver = tf.distribute.cluster_resolver.TPUClusterResolver( 296 | tpu=tpu_address 297 | ) 298 | tf.config.experimental_connect_to_host(resolver.master()) 299 | tf.tpu.experimental.initialize_tpu_system(resolver) 300 | strategy = tf.distribute.experimental.TPUStrategy(resolver) 301 | 302 | with strategy.scope(): 303 | train_model = build_transformer_model_for_pretraining() 304 | train_model.summary() 305 | 306 | 307 | class ModelCheckpoint(keras.callbacks.Callback): 308 | """自动保存最新模型 309 | """ 310 | def on_epoch_end(self, epoch, logs=None): 311 | self.model.save_weights(model_saved_path, overwrite=True) 312 | 313 | 314 | # 保存模型 315 | checkpoint = ModelCheckpoint() 316 | # 记录日志 317 | csv_logger = keras.callbacks.CSVLogger('training.log') 318 | 319 | # 模型训练 320 | train_model.fit( 321 | dataset, 322 | steps_per_epoch=steps_per_epoch, 323 | epochs=epochs, 324 | callbacks=[checkpoint, csv_logger], 325 | ) 326 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 
9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 
180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /examples/task_relation_extraction.py: -------------------------------------------------------------------------------- 1 | #! -*- coding:utf-8 -*- 2 | # 三元组抽取任务,基于“半指针-半标注”结构 3 | # 文章介绍:https://kexue.fm/archives/7161 4 | # 数据集:http://ai.baidu.com/broad/download?dataset=sked 5 | # 最优f1=0.82198 6 | # 换用RoBERTa Large可以达到f1=0.829+ 7 | # 说明:由于使用了EMA,需要跑足够多的步数(5000步以上)才生效,如果 8 | # 你的数据总量比较少,那么请务必跑足够多的epoch数,或者去掉EMA。 9 | 10 | import json 11 | import numpy as np 12 | from bert4keras.backend import keras, K, batch_gather 13 | from bert4keras.layers import Loss 14 | from bert4keras.layers import LayerNormalization 15 | from bert4keras.tokenizers import Tokenizer 16 | from bert4keras.models import build_transformer_model 17 | from bert4keras.optimizers import Adam, extend_with_exponential_moving_average 18 | from bert4keras.snippets import sequence_padding, DataGenerator 19 | from bert4keras.snippets import open, to_array 20 | from keras.layers import Input, Dense, Lambda, Reshape 21 | from keras.models import Model 22 | from tqdm import tqdm 23 | 24 | maxlen = 128 25 | batch_size = 64 26 | config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json' 27 | checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt' 28 | dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt' 29 | 30 | 31 | def load_data(filename): 32 | """加载数据 33 | 单条格式:{'text': text, 'spo_list': [(s, p, o)]} 34 | """ 35 | D = [] 36 | with open(filename, encoding='utf-8') as f: 37 | for l in f: 38 | l = json.loads(l) 39 | D.append({ 40 | 'text': l['text'], 41 | 'spo_list': [(spo['subject'], spo['predicate'], spo['object']) 42 | for spo in l['spo_list']] 43 | }) 44 | return D 45 | 46 | 47 | # 加载数据集 48 | train_data = load_data('/root/kg/datasets/train_data.json') 49 | valid_data = load_data('/root/kg/datasets/dev_data.json') 50 | predicate2id, id2predicate = {}, {} 51 | 52 | with open('/root/kg/datasets/all_50_schemas') as f: 53 | for l in f: 54 | l = json.loads(l) 55 | if l['predicate'] not in predicate2id: 56 | id2predicate[len(predicate2id)] = l['predicate'] 57 | predicate2id[l['predicate']] = len(predicate2id) 58 | 59 | # 建立分词器 60 | tokenizer = Tokenizer(dict_path, do_lower_case=True) 61 | 62 | 63 | def search(pattern, sequence): 64 | """从sequence中寻找子串pattern 65 | 
如果找到,返回第一个下标;否则返回-1。 66 | """ 67 | n = len(pattern) 68 | for i in range(len(sequence)): 69 | if sequence[i:i + n] == pattern: 70 | return i 71 | return -1 72 | 73 | 74 | class data_generator(DataGenerator): 75 | """数据生成器 76 | """ 77 | def __iter__(self, random=False): 78 | batch_token_ids, batch_segment_ids = [], [] 79 | batch_subject_labels, batch_subject_ids, batch_object_labels = [], [], [] 80 | for is_end, d in self.sample(random): 81 | token_ids, segment_ids = tokenizer.encode(d['text'], maxlen=maxlen) 82 | # 整理三元组 {s: [(o, p)]} 83 | spoes = {} 84 | for s, p, o in d['spo_list']: 85 | s = tokenizer.encode(s)[0][1:-1] 86 | p = predicate2id[p] 87 | o = tokenizer.encode(o)[0][1:-1] 88 | s_idx = search(s, token_ids) 89 | o_idx = search(o, token_ids) 90 | if s_idx != -1 and o_idx != -1: 91 | s = (s_idx, s_idx + len(s) - 1) 92 | o = (o_idx, o_idx + len(o) - 1, p) 93 | if s not in spoes: 94 | spoes[s] = [] 95 | spoes[s].append(o) 96 | if spoes: 97 | # subject标签 98 | subject_labels = np.zeros((len(token_ids), 2)) 99 | for s in spoes: 100 | subject_labels[s[0], 0] = 1 101 | subject_labels[s[1], 1] = 1 102 | # 随机选一个subject 103 | start, end = np.array(list(spoes.keys())).T 104 | start = np.random.choice(start) 105 | end = np.random.choice(end[end >= start]) 106 | subject_ids = (start, end) 107 | # 对应的object标签 108 | object_labels = np.zeros((len(token_ids), len(predicate2id), 2)) 109 | for o in spoes.get(subject_ids, []): 110 | object_labels[o[0], o[2], 0] = 1 111 | object_labels[o[1], o[2], 1] = 1 112 | # 构建batch 113 | batch_token_ids.append(token_ids) 114 | batch_segment_ids.append(segment_ids) 115 | batch_subject_labels.append(subject_labels) 116 | batch_subject_ids.append(subject_ids) 117 | batch_object_labels.append(object_labels) 118 | if len(batch_token_ids) == self.batch_size or is_end: 119 | batch_token_ids = sequence_padding(batch_token_ids) 120 | batch_segment_ids = sequence_padding(batch_segment_ids) 121 | batch_subject_labels = sequence_padding( 122 | batch_subject_labels 123 | ) 124 | batch_subject_ids = np.array(batch_subject_ids) 125 | batch_object_labels = sequence_padding(batch_object_labels) 126 | yield [ 127 | batch_token_ids, batch_segment_ids, 128 | batch_subject_labels, batch_subject_ids, 129 | batch_object_labels 130 | ], None 131 | batch_token_ids, batch_segment_ids = [], [] 132 | batch_subject_labels, batch_subject_ids, batch_object_labels = [], [], [] 133 | 134 | 135 | def extract_subject(inputs): 136 | """根据subject_ids从output中取出subject的向量表征 137 | """ 138 | output, subject_ids = inputs 139 | start = batch_gather(output, subject_ids[:, :1]) 140 | end = batch_gather(output, subject_ids[:, 1:]) 141 | subject = K.concatenate([start, end], 2) 142 | return subject[:, 0] 143 | 144 | 145 | # 补充输入 146 | subject_labels = Input(shape=(None, 2), name='Subject-Labels') 147 | subject_ids = Input(shape=(2,), name='Subject-Ids') 148 | object_labels = Input(shape=(None, len(predicate2id), 2), name='Object-Labels') 149 | 150 | # 加载预训练模型 151 | bert = build_transformer_model( 152 | config_path=config_path, 153 | checkpoint_path=checkpoint_path, 154 | return_keras_model=False, 155 | ) 156 | 157 | # 预测subject 158 | output = Dense( 159 | units=2, activation='sigmoid', kernel_initializer=bert.initializer 160 | )(bert.model.output) 161 | subject_preds = Lambda(lambda x: x**2)(output) 162 | 163 | subject_model = Model(bert.model.inputs, subject_preds) 164 | 165 | # 传入subject,预测object 166 | # 通过Conditional Layer Normalization将subject融入到object的预测中 167 | output = 
bert.model.layers[-2].get_output_at(-1) # 自己想为什么是-2而不是-1
168 | subject = Lambda(extract_subject)([output, subject_ids])
169 | output = LayerNormalization(conditional=True)([output, subject])
170 | output = Dense(
171 |     units=len(predicate2id) * 2,
172 |     activation='sigmoid',
173 |     kernel_initializer=bert.initializer
174 | )(output)
175 | output = Lambda(lambda x: x**4)(output)
176 | object_preds = Reshape((-1, len(predicate2id), 2))(output)
177 | 
178 | object_model = Model(bert.model.inputs + [subject_ids], object_preds)
179 | 
180 | 
181 | class TotalLoss(Loss):
182 |     """subject_loss与object_loss之和,都是二分类交叉熵
183 |     """
184 |     def compute_loss(self, inputs, mask=None):
185 |         subject_labels, object_labels = inputs[:2]
186 |         subject_preds, object_preds, _ = inputs[2:]
187 |         if mask[4] is None:
188 |             mask = 1.0
189 |         else:
190 |             mask = K.cast(mask[4], K.floatx())
191 |         # subject部分loss
192 |         subject_loss = K.binary_crossentropy(subject_labels, subject_preds)
193 |         subject_loss = K.mean(subject_loss, 2)
194 |         subject_loss = K.sum(subject_loss * mask) / K.sum(mask)
195 |         # object部分loss
196 |         object_loss = K.binary_crossentropy(object_labels, object_preds)
197 |         object_loss = K.sum(K.mean(object_loss, 3), 2)
198 |         object_loss = K.sum(object_loss * mask) / K.sum(mask)
199 |         # 总的loss
200 |         return subject_loss + object_loss
201 | 
202 | 
203 | subject_preds, object_preds = TotalLoss([2, 3])([
204 |     subject_labels, object_labels, subject_preds, object_preds,
205 |     bert.model.output
206 | ])
207 | 
208 | # 训练模型
209 | train_model = Model(
210 |     bert.model.inputs + [subject_labels, subject_ids, object_labels],
211 |     [subject_preds, object_preds]
212 | )
213 | 
214 | AdamEMA = extend_with_exponential_moving_average(Adam, name='AdamEMA')
215 | optimizer = AdamEMA(learning_rate=1e-5)
216 | train_model.compile(optimizer=optimizer)
217 | 
218 | 
219 | def extract_spoes(text):
220 |     """抽取输入text所包含的三元组
221 |     """
222 |     tokens = tokenizer.tokenize(text, maxlen=maxlen)
223 |     mapping = tokenizer.rematch(text, tokens)
224 |     token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
225 |     token_ids, segment_ids = to_array([token_ids], [segment_ids])
226 |     # 抽取subject
227 |     subject_preds = subject_model.predict([token_ids, segment_ids])
228 |     start = np.where(subject_preds[0, :, 0] > 0.6)[0]
229 |     end = np.where(subject_preds[0, :, 1] > 0.5)[0]
230 |     subjects = []
231 |     for i in start:
232 |         j = end[end >= i]
233 |         if len(j) > 0:
234 |             j = j[0]
235 |             subjects.append((i, j))
236 |     if subjects:
237 |         spoes = []
238 |         token_ids = np.repeat(token_ids, len(subjects), 0)
239 |         segment_ids = np.repeat(segment_ids, len(subjects), 0)
240 |         subjects = np.array(subjects)
241 |         # 传入subject,抽取object和predicate
242 |         object_preds = object_model.predict([token_ids, segment_ids, subjects])
243 |         for subject, object_pred in zip(subjects, object_preds):
244 |             start = np.where(object_pred[:, :, 0] > 0.6)
245 |             end = np.where(object_pred[:, :, 1] > 0.5)
246 |             for _start, predicate1 in zip(*start):
247 |                 for _end, predicate2 in zip(*end):
248 |                     if _start <= _end and predicate1 == predicate2:
249 |                         spoes.append(
250 |                             ((mapping[subject[0]][0],
251 |                               mapping[subject[1]][-1]), predicate1,
252 |                              (mapping[_start][0], mapping[_end][-1]))
253 |                         )
254 |                         break
255 |         return [(text[s[0]:s[1] + 1], id2predicate[p], text[o[0]:o[1] + 1])
256 |                 for s, p, o in spoes]
257 |     else:
258 |         return []
259 | 
260 | 
261 | class SPO(tuple):
262 |     """用来存三元组的类
263 |     表现跟tuple基本一致,只是重写了 __hash__ 和 __eq__ 方法,
264 |     使得在判断两个三元组是否等价时容错性更好。
265 |     """
266 |     def __init__(self, spo): 
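        # self.spox stores the triple in tokenized form, so two triples that differ
        # only by normalization the tokenizer removes anyway (e.g. letter case with
        # do_lower_case=True) still hash and compare as equal during evaluation.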
267 | self.spox = ( 268 | tuple(tokenizer.tokenize(spo[0])), 269 | spo[1], 270 | tuple(tokenizer.tokenize(spo[2])), 271 | ) 272 | 273 | def __hash__(self): 274 | return self.spox.__hash__() 275 | 276 | def __eq__(self, spo): 277 | return self.spox == spo.spox 278 | 279 | 280 | def evaluate(data): 281 | """评估函数,计算f1、precision、recall 282 | """ 283 | X, Y, Z = 1e-10, 1e-10, 1e-10 284 | f = open('dev_pred.json', 'w', encoding='utf-8') 285 | pbar = tqdm() 286 | for d in data: 287 | R = set([SPO(spo) for spo in extract_spoes(d['text'])]) 288 | T = set([SPO(spo) for spo in d['spo_list']]) 289 | X += len(R & T) 290 | Y += len(R) 291 | Z += len(T) 292 | f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z 293 | pbar.update() 294 | pbar.set_description( 295 | 'f1: %.5f, precision: %.5f, recall: %.5f' % (f1, precision, recall) 296 | ) 297 | s = json.dumps({ 298 | 'text': d['text'], 299 | 'spo_list': list(T), 300 | 'spo_list_pred': list(R), 301 | 'new': list(R - T), 302 | 'lack': list(T - R), 303 | }, 304 | ensure_ascii=False, 305 | indent=4) 306 | f.write(s + '\n') 307 | pbar.close() 308 | f.close() 309 | return f1, precision, recall 310 | 311 | 312 | class Evaluator(keras.callbacks.Callback): 313 | """评估与保存 314 | """ 315 | def __init__(self): 316 | self.best_val_f1 = 0. 317 | 318 | def on_epoch_end(self, epoch, logs=None): 319 | optimizer.apply_ema_weights() 320 | f1, precision, recall = evaluate(valid_data) 321 | if f1 >= self.best_val_f1: 322 | self.best_val_f1 = f1 323 | train_model.save_weights('best_model.weights') 324 | optimizer.reset_old_weights() 325 | print( 326 | 'f1: %.5f, precision: %.5f, recall: %.5f, best f1: %.5f\n' % 327 | (f1, precision, recall, self.best_val_f1) 328 | ) 329 | 330 | 331 | if __name__ == '__main__': 332 | 333 | train_generator = data_generator(train_data, batch_size) 334 | evaluator = Evaluator() 335 | 336 | train_model.fit( 337 | train_generator.forfit(), 338 | steps_per_epoch=len(train_generator), 339 | epochs=20, 340 | callbacks=[evaluator] 341 | ) 342 | 343 | else: 344 | 345 | train_model.load_weights('best_model.weights') 346 | --------------------------------------------------------------------------------
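For reference, a minimal inference sketch for `task_relation_extraction.py` above. It assumes the script has already been run (or imported) so that `train_model`, `extract_spoes` and the tokenizer exist, and that a fine-tuned `best_model.weights` file is available; the sample sentence is made up purely for illustration.

```python
# Hypothetical usage sketch -- not part of the repository.
# Restore the fine-tuned weights (the script's `else` branch does the same).
train_model.load_weights('best_model.weights')

# Extract (subject, predicate, object) triples from a new sentence.
text = u'《红楼梦》的作者是曹雪芹'
for s, p, o in extract_spoes(text):
    print(s, p, o)
```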