├── bert_seq2seq ├── model │ ├── __init__.py │ ├── blocks │ │ └── __init__.py │ ├── layers │ │ ├── __init__.py │ │ ├── layer_norm.py │ │ └── activations.py │ ├── utils.py │ └── prompt.py ├── task │ ├── seq2seq │ │ ├── __init__.py │ │ ├── t5_seq2seq_model.py │ │ ├── gpt2_seq2seq_model.py │ │ ├── bert_seq2seq_model.py │ │ └── GLM_seq2seq_model.py │ ├── embedding │ │ ├── __init__.py │ │ └── bert_embedding.py │ ├── classification │ │ ├── __init__.py │ │ └── bert_cls_classifier.py │ ├── sequence_labeling │ │ ├── __init__.py │ │ └── bert_sequence_labeling.py │ ├── relationship_extraction │ │ ├── __init__.py │ │ └── bert_relationship_extraction.py │ └── __init__.py ├── predictor │ ├── __init__.py │ └── predictor.py ├── __init__.py ├── mpu │ ├── func_utils.py │ ├── __init__.py │ ├── utils.py │ ├── grads.py │ ├── data.py │ ├── mappings.py │ ├── cross_entropy.py │ ├── initialize.py │ └── mp_tools.py ├── config.py ├── utils.py ├── launch.py ├── dataset.py └── layers.py ├── .idea ├── .gitignore ├── vcs.xml ├── inspectionProfiles │ ├── profiles_settings.xml │ └── Project_Default.xml ├── misc.xml ├── modules.xml └── bert_seq2seq_DDP.iml ├── .DS_Store ├── data ├── auto_title │ ├── train.tgt │ └── train.src ├── semantic_matching │ └── train.tsv ├── LCCC-base-split │ ├── LCCC-base_test.json │ ├── LCCC-base_train.json │ └── LCCC-base_valid.json ├── ner │ └── china-people-daily-ner-corpus │ │ ├── example.test │ │ ├── example.dev │ │ └── example.train └── relationship_extraction │ ├── all_50_schemas │ └── dev_data.json ├── setup.py ├── examples ├── seq2seq │ ├── gpt2 │ │ ├── test_gpt2_text_writting.py │ │ ├── test_gpt2_multi_chat.py │ │ ├── test_multi_processing_generate.py │ │ └── train_gpt2_multi_chat.py │ ├── t5 │ │ └── test_t5_auto_title.py │ ├── GLM │ │ ├── glm_generate_samples.py │ │ └── train_glm_auto_title.py │ └── bert │ │ ├── test_roberta_auto_title.py │ │ ├── train_roberta_auto_title.py │ │ └── train_roberta_auto_title_multi_gpu.py ├── text_classification │ ├── test.py │ ├── train_roberta_large_news_title_classification.py │ ├── train_roberta_semantic_matching.py │ ├── train_roberta_news_title_classification.py │ └── train_roberta_news_title_classification_multi_gpu.py ├── bert_embedding │ └── get_bert_embedding.py ├── FAQ │ ├── 1_construct_data.py │ └── 2_test_bert_faq.py ├── README.md ├── ner │ ├── train_bert_ner_people_daily.py │ ├── train_roberta_ner_gp_people_daily.py │ └── train_bert_ner_crf_people_daily.py └── relationship_extraction │ └── train_bert_relationship_extraction.py ├── .gitignore └── README.md /bert_seq2seq/model/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /bert_seq2seq/model/blocks/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /bert_seq2seq/model/layers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /bert_seq2seq/task/seq2seq/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /bert_seq2seq/task/embedding/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /bert_seq2seq/task/classification/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /bert_seq2seq/task/sequence_labeling/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /bert_seq2seq/task/relationship_extraction/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /bert_seq2seq/predictor/__init__.py: -------------------------------------------------------------------------------- 1 | from .predictor import * -------------------------------------------------------------------------------- /.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /shelf/ 3 | /workspace.xml 4 | -------------------------------------------------------------------------------- /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/920232796/bert_seq2seq_DDP/HEAD/.DS_Store -------------------------------------------------------------------------------- /data/auto_title/train.tgt: -------------------------------------------------------------------------------- 1 | 修改后的立法法全文公布 2 | 深圳机场9死24伤续:司机全责赔偿或超千万 3 | 孟建柱:主动适应形势新变化提高政法机关服务大局的能力 -------------------------------------------------------------------------------- /data/semantic_matching/train.tsv: -------------------------------------------------------------------------------- 1 | 好无聊啊 啊好无聊啊 1 2 | 我好想谈恋爱呀 我多想谈一场恋爱呀 1 3 | 今天我四点就起床了 今天下午一点五十叫我起床 0 4 | 现在不需要你了不要回来了你 不要回来了 1 5 | 语音助手用不了怎么办 怎么用语音召唤小助手? 
0 -------------------------------------------------------------------------------- /bert_seq2seq/__init__.py: -------------------------------------------------------------------------------- 1 | from .tokenizer import Tokenizer 2 | from .utils import * 3 | from .predictor import * 4 | from .trainer import Trainer 5 | from .glm_tokenizer import GLMTokenizer -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /bert_seq2seq/task/__init__.py: -------------------------------------------------------------------------------- 1 | from .classification.bert_cls_classifier import * 2 | from .embedding.bert_embedding import * 3 | from .seq2seq.bert_seq2seq_model import * 4 | from .sequence_labeling.bert_sequence_labeling import * -------------------------------------------------------------------------------- /.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /data/LCCC-base-split/LCCC-base_test.json: -------------------------------------------------------------------------------- 1 | [ 2 | [ 3 | "我 饿 了 。", 4 | "去 相 机 家 里 吃 … …", 5 | "相 机 今 年 木 有 回 去 T . T" 6 | ], 7 | [ 8 | "网 络 大 实 话 里 说 的 是 也 许 你 能 在 网 络 里 找 到 你 想 要 的 友 情 但 永 远 不 会 找 到 你 想 要 的 爱 情", 9 | "你 过 来 我 们 什 么 关 系" 10 | ] 11 | ] -------------------------------------------------------------------------------- /data/auto_title/train.src: -------------------------------------------------------------------------------- 1 | 新华社受权于18日全文播发修改后的《中华人民共和国立法法》修改后的立法法分为“总则”“法律”“行政法规”“地方性法规自治条例和单行条例规章”“适用与备案审查”“附则”等6章共计105条 2 | 一辆小轿车一名女司机竟造成9死24伤日前深圳市交警局对事故进行通报:从目前证据看事故系司机超速行驶且操作不当导致目前24名伤员已有6名治愈出院其余正接受治疗预计事故赔偿费或超一千万元 3 | 1月18日习近平总书记对政法工作作出重要指示:2014年政法战线各项工作特别是改革工作取得新成效新形势下希望全国政法机关主动适应新形势为公正司法和提高执法司法公信力提供有力制度保障 4 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /data/LCCC-base-split/LCCC-base_train.json: -------------------------------------------------------------------------------- 1 | [ 2 | [ 3 | "你 去 那儿 竟然 不喊 我 生气 了 , 快点 给 我 道歉", 4 | "道歉 ! ! 
再有 时间 找 你 去", 5 | "领个 搓衣板 去 吧" 6 | ], 7 | [ 8 | "我用 SEED.24 小时 签到 一次 可以 用 4 小时 , 对于 我 这种 每天晚上 逛 一下 的 感觉 不错", 9 | "SEED 早上 刚 被 禁用 还有 一个月 的 VIP 路线 呢 禁 了 之后 才 买 的 另 一个 买 了 一年 结果 用 了 一 下午 就 挂 了 现在 用 了 个 极速网 速差 的 很", 10 | "心疼 你" 11 | ], 12 | 13 | ] -------------------------------------------------------------------------------- /data/ner/china-people-daily-ner-corpus/example.test: -------------------------------------------------------------------------------- 1 | 我 O 2 | 们 O 3 | 变 O 4 | 而 O 5 | 以 O 6 | 书 O 7 | 会 O 8 | 友 O 9 | , O 10 | 以 O 11 | 书 O 12 | 结 O 13 | 缘 O 14 | , O 15 | 把 O 16 | 欧 B-LOC 17 | 美 B-LOC 18 | 、 O 19 | 港 B-LOC 20 | 台 B-LOC 21 | 流 O 22 | 行 O 23 | 的 O 24 | 食 O 25 | 品 O 26 | 类 O 27 | 图 O 28 | 谱 O 29 | 、 O 30 | 画 O 31 | 册 O 32 | 、 O 33 | 工 O 34 | 具 O 35 | 书 O 36 | 汇 O 37 | 集 O 38 | 一 O 39 | 堂 O 40 | 。 O -------------------------------------------------------------------------------- /.idea/bert_seq2seq_DDP.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /data/LCCC-base-split/LCCC-base_valid.json: -------------------------------------------------------------------------------- 1 | [ 2 | [ 3 | "啊 我 好 爱 虾 仁 蛋 黄 酱 金 枪 鱼 蛋 黄 酱", 4 | "那 个 饭 凉 了 吧 唧 的 怎 么 吃 啊 摔" 5 | ], 6 | [ 7 | "考 试 撞 墙 关 驾 校 屁 事 ? 你 怎 么 不 顺 便 把 考 场 施 工 单 位 也 告 了 ?", 8 | "看 了 下 全 文 , 那 女 的 考 试 当 天 就 表 明 身 体 不 舒 服 了 , 考 试 不 是 她 预 约 是 教 练 自 己 安 排 的 , 教 练 还 让 她 考 试 不 就 是 教 练 的 错 吗 ? 而 且 她 住 院 花 了 3 1 万 , 赔 3 0 万 不 过 分 吧", 9 | "更 改 要 提 前 3 天 , 当 天 不 上 场 视 为 放 弃 考 试 , 又 要 重 新 报 名 重 新 交 钱 , 估 计 她 也 不 肯 的 吧 ?" 10 | ] 11 | ] -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='bert_seq2seq_DDP', 5 | version='0.3.0', 6 | description='use torch to do bert_seq2seq task', 7 | long_description='bert_seq2seq_DDP: https://github.com/920232796/bert_seq2seq_DDP', 8 | license='Apache License 2.0', 9 | url='https://github.com/920232796/bert_seq2seq_DDP', 10 | author='xingzhaohu', 11 | author_email='920232796@qq.com', 12 | packages=find_packages() 13 | ) 14 | -------------------------------------------------------------------------------- /examples/seq2seq/gpt2/test_gpt2_text_writting.py: -------------------------------------------------------------------------------- 1 | from bert_seq2seq.utils import load_model 2 | from bert_seq2seq.tokenizer import Tokenizer 3 | from bert_seq2seq import Predictor 4 | 5 | model_path = "../state_dict/gpt2/pytorch_model.bin" 6 | vocab_path = "../state_dict/gpt2/vocab.txt" 7 | 8 | tokenizer = Tokenizer(vocab_path) 9 | 10 | model = load_model(tokenizer.vocab, 11 | model_name="gpt2", 12 | task_name="seq2seq") 13 | model.load_pretrain_params(model_path) 14 | predictor = Predictor(model, tokenizer) 15 | 16 | if __name__ == '__main__': 17 | text = "今天天气好," 18 | out = predictor.predict_generate_randomsample(text, out_max_length=100, 19 | repetition_penalty=1.5, 20 | top_p=1.0, top_k=20) 21 | print(out) -------------------------------------------------------------------------------- /examples/seq2seq/t5/test_t5_auto_title.py: -------------------------------------------------------------------------------- 1 | from bert_seq2seq.utils import load_model 2 | from bert_seq2seq.tokenizer import T5PegasusTokenizer 3 | from bert_seq2seq import Predictor 4 | 5 | model_path 
= "../state_dict/t5-chinese/pytorch_model.bin" 6 | vocab_path = "../state_dict/t5-chinese/vocab.txt" 7 | 8 | tokenizer = T5PegasusTokenizer(vocab_path) 9 | 10 | model = load_model(tokenizer.vocab, 11 | model_name="t5", 12 | task_name="seq2seq") 13 | model.load_pretrain_params(model_path) 14 | 15 | predictor = Predictor(model, tokenizer) 16 | 17 | if __name__ == '__main__': 18 | text = "本文总结了十个可穿戴产品的设计原则,而这些原则同样也是笔者认为是这个行业最吸引人的地方:1.为人们解决重复性问题,2.从人开始而不是从机器开始,3.要引起注意但不要刻意,4.提升用户能力而不是取代人" 19 | out = predictor.predict_generate_randomsample(text, out_max_length=100, 20 | repetition_penalty=1.0, 21 | top_p=0.9, top_k=50) 22 | print(out) -------------------------------------------------------------------------------- /bert_seq2seq/task/embedding/bert_embedding.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | from bert_seq2seq.basic_bert import BasicBert 3 | 4 | class BertEmbedding(BasicBert): 5 | """ 6 | """ 7 | def __init__(self, vocab, 8 | model_name="roberta", 9 | size="base", 10 | **kwargs): 11 | super(BertEmbedding, self).__init__(word2ix=vocab, model_name=model_name, size=size) 12 | self.layer_norm_cond = None 13 | self.cls.predictions.decoder = None 14 | 15 | def forward(self, **data): 16 | 17 | input_ids = data["input_ids"] 18 | token_type_ids = data.get("token_type_ids", None) 19 | 20 | all_layers, _ = self.bert(input_ids, token_type_ids=token_type_ids, 21 | output_all_encoded_layers=True) 22 | sequence_out = all_layers[-1] 23 | tokens_hidden_state = self.cls.predictions.transform(sequence_out) 24 | 25 | return_data = {"logits": tokens_hidden_state, } 26 | 27 | return return_data 28 | -------------------------------------------------------------------------------- /examples/text_classification/test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from bert_seq2seq import Tokenizer 4 | from bert_seq2seq import load_model 5 | from bert_seq2seq import Predictor 6 | 7 | 8 | model_name = "roberta" # 选择模型名字 9 | task_name = "cls" 10 | vocab_path = "../state_dict/roberta/vocab.txt" # roberta模型字典的位置 11 | model_save_path = "./bert_emotion_analysis.bin" 12 | # 加载字典 13 | tokenizer = Tokenizer(vocab_path) 14 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 15 | 16 | target = ["中性", "积极", "消极"] 17 | 18 | def main(): 19 | bert_model = load_model(tokenizer.vocab, model_name=model_name, task_name=task_name, target_size=3) 20 | bert_model.load_all_params(model_save_path) 21 | predictor = Predictor(bert_model, tokenizer) 22 | 23 | text = ["今天天气很好,挺喜欢。", 24 | "你今天是生谁的气了?怎么这么不开心??", 25 | "明天要下雨了。"] 26 | 27 | for t in text: 28 | ids = predictor.predict_cls_classifier(t).argmax(dim=0) 29 | print(target[ids]) 30 | 31 | if __name__ == '__main__': 32 | main() 33 | -------------------------------------------------------------------------------- /data/ner/china-people-daily-ner-corpus/example.dev: -------------------------------------------------------------------------------- 1 | 在 O 2 | 这 O 3 | 里 O 4 | 恕 O 5 | 弟 O 6 | 不 O 7 | 恭 O 8 | 之 O 9 | 罪 O 10 | , O 11 | 敢 O 12 | 在 O 13 | 尊 O 14 | 前 O 15 | 一 O 16 | 诤 O 17 | : O 18 | 前 O 19 | 人 O 20 | 论 O 21 | 书 O 22 | , O 23 | 每 O 24 | 曰 O 25 | “ O 26 | 字 O 27 | 字 O 28 | 有 O 29 | 来 O 30 | 历 O 31 | , O 32 | 笔 O 33 | 笔 O 34 | 有 O 35 | 出 O 36 | 处 O 37 | ” O 38 | , O 39 | 细 O 40 | 读 O 41 | 公 O 42 | 字 O 43 | , O 44 | 何 O 45 | 尝 O 46 | 跳 O 47 | 出 O 48 | 前 O 49 | 人 O 50 | 藩 O 51 | 篱 O 52 | , O 53 | 自 O 54 | 隶 O 55 | 变 O 56 | 
而 O 57 | 后 O 58 | , O 59 | 直 O 60 | 至 O 61 | 明 O 62 | 季 O 63 | , O 64 | 兄 O 65 | 有 O 66 | 何 O 67 | 新 O 68 | 出 O 69 | ? O 70 | 71 | 相 O 72 | 比 O 73 | 之 O 74 | 下 O 75 | , O 76 | 青 B-ORG 77 | 岛 I-ORG 78 | 海 I-ORG 79 | 牛 I-ORG 80 | 队 I-ORG 81 | 和 O 82 | 广 B-ORG 83 | 州 I-ORG 84 | 松 I-ORG 85 | 日 I-ORG 86 | 队 I-ORG 87 | 的 O 88 | 雨 O 89 | 中 O 90 | 之 O 91 | 战 O 92 | 虽 O 93 | 然 O 94 | 也 O 95 | 是 O 96 | 0 O 97 | ∶ O 98 | 0 O 99 | , O 100 | 但 O 101 | 乏 O 102 | 善 O 103 | 可 O 104 | 陈 O 105 | 。 O -------------------------------------------------------------------------------- /examples/seq2seq/gpt2/test_gpt2_multi_chat.py: -------------------------------------------------------------------------------- 1 | ## 多轮对话,测试 2 | from bert_seq2seq.utils import load_model 3 | from bert_seq2seq.tokenizer import Tokenizer 4 | from bert_seq2seq import Predictor 5 | import os 6 | 7 | vocab_path = "../state_dict/gpt2/vocab.txt" 8 | model_save_path = "./gpt2_multi_chat_model.bin" # 训练好的模型保存位置。 9 | 10 | tokenizer = Tokenizer(vocab_path) 11 | 12 | model = load_model(tokenizer.vocab, 13 | model_name="gpt2", 14 | task_name="seq2seq") 15 | model.load_all_params(model_save_path) 16 | predictor = Predictor(model, tokenizer) 17 | 18 | if __name__ == '__main__': 19 | sentences_list = [["今天我去吃了火锅,还可以,想不想尝尝?"], 20 | ["今天天气很好", "是啊,真的非常好,我也出去玩了一会"], 21 | ["今天天气很好", "是啊,真的非常好", "你也出去玩了吗?"]] 22 | 23 | for sentences in sentences_list: 24 | out = predictor.predict_multi_response(sentences, 25 | repetition_penalty=1.2, 26 | temperature=1.2, 27 | top_p=1.0, top_k=30) 28 | print(out) -------------------------------------------------------------------------------- /examples/bert_embedding/get_bert_embedding.py: -------------------------------------------------------------------------------- 1 | 2 | import torch 3 | from bert_seq2seq import Tokenizer 4 | from bert_seq2seq import load_model 5 | from bert_seq2seq import Predictor 6 | import numpy as np 7 | 8 | def compute_similarity(in_1, in_2): 9 | res = np.dot(in_1, in_2) / (np.linalg.norm(in_1) * np.linalg.norm(in_2)) 10 | return res 11 | 12 | maxlen = 256 13 | model_name = "bert" # 选择模型名字 14 | task_name = "embedding" 15 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 16 | 17 | vocab_path = "../state_dict/bert-base-chinese/vocab.txt" # roberta模型字典的位置 18 | model_path = "../state_dict/bert-base-chinese/pytorch_model.bin" # roberta模型位置 19 | 20 | tokenizer = Tokenizer(vocab_path, do_lower_case=True, max_len=maxlen) 21 | bert_model = load_model(tokenizer.vocab, model_name=model_name, task_name=task_name) 22 | bert_model.load_pretrain_params(model_path, strict=False) 23 | 24 | predictor = Predictor(bert_model, tokenizer) 25 | text = ["今天天气很好", "今天天气不错", "今天有事出去忙"] 26 | 27 | embedding_1 = predictor.predict_embedding(text[0], maxlen=maxlen) 28 | embedding_2 = predictor.predict_embedding(text[1], maxlen=maxlen) 29 | embedding_3 = predictor.predict_embedding(text[2], maxlen=maxlen) 30 | 31 | print(f"cos sim 1-2 is {compute_similarity(embedding_1, embedding_2)}") 32 | print(f"cos sim 1-3 is {compute_similarity(embedding_1, embedding_3)}") 33 | 34 | 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /bert_seq2seq/task/seq2seq/t5_seq2seq_model.py: -------------------------------------------------------------------------------- 1 | 2 | import torch 3 | from bert_seq2seq.model.t5_model import T5ForConditionalGeneration, T5Config, T5SmallConfig 4 | from bert_seq2seq.basic_bert import BasicT5 5 | import torch.nn.functional as F 6 | 7 
| class T5Model(BasicT5): 8 | 9 | def __init__(self, vocab, 10 | model_name="t5", 11 | size="base", 12 | **kwargs): 13 | super().__init__() 14 | if size == "base": 15 | config = T5Config(vocab_size=len(vocab)) 16 | elif size == "small": 17 | config = T5SmallConfig(vocab_size=len(vocab)) 18 | else: 19 | raise Exception("not support this model type") 20 | self.model = T5ForConditionalGeneration(config) 21 | print(f"model is {model_name}") 22 | 23 | def forward(self, **data): 24 | input_ids = data.get("input_ids", None) 25 | decoder_input_ids = data["decoder_input_ids"] 26 | encoder_last_hidden_state = data.get("encoder_last_hidden_state", None) 27 | if encoder_last_hidden_state is not None: 28 | encoder_last_hidden_state = [encoder_last_hidden_state] 29 | labels = data.get("labels", None) 30 | t5_out = self.model(input_ids=input_ids, encoder_outputs=encoder_last_hidden_state, decoder_input_ids=decoder_input_ids, labels=labels) 31 | if labels is not None: 32 | return {"logits": t5_out[1], "loss": t5_out[0], "encoder_last_hidden_state": t5_out[2]} 33 | 34 | return {"logits": t5_out[0], "encoder_last_hidden_state": t5_out[1]} -------------------------------------------------------------------------------- /bert_seq2seq/mpu/func_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright © 2022 BAAI. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License") 4 | # coding=utf-8 5 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
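# Note: the constant 0.7978845608028654 in gelu_impl below is sqrt(2 / pi), so
# the function computes the standard tanh approximation of GELU:
# 0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x**3))).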
18 | 19 | import torch 20 | import math 21 | 22 | 23 | @torch.jit.script 24 | def gelu_impl(x): 25 | """OpenAI's gelu implementation.""" 26 | return 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * x * 27 | (1.0 + 0.044715 * x * x))) 28 | 29 | 30 | def gelu(x): 31 | return gelu_impl(x) 32 | 33 | 34 | def unscaled_init_method(sigma): 35 | """Init method based on N(0, sigma).""" 36 | def init_(tensor): 37 | return torch.nn.init.normal_(tensor, mean=0.0, std=sigma) 38 | 39 | return init_ 40 | 41 | 42 | def scaled_init_method(sigma, num_layers): 43 | """Init method based on N(0, sigma/sqrt(2*num_layers).""" 44 | std = sigma / math.sqrt(2.0 * num_layers) 45 | 46 | def init_(tensor): 47 | return torch.nn.init.normal_(tensor, mean=0.0, std=std) 48 | 49 | return init_ 50 | 51 | 52 | def sqrt(x): 53 | return int(math.sqrt(x) + 1e-4) 54 | -------------------------------------------------------------------------------- /bert_seq2seq/task/seq2seq/gpt2_seq2seq_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from bert_seq2seq.model.gpt2_model import GPT2LMHeadModel, GPT2Config 3 | from bert_seq2seq.basic_bert import BasicGPT 4 | 5 | class GPT2(BasicGPT): 6 | def __init__(self, vocab, 7 | model_name="gpt2", 8 | **kwargs 9 | ): 10 | super().__init__() 11 | self.word2ix = vocab 12 | if model_name == "gpt2": 13 | self.config = GPT2Config(len(vocab)) 14 | else : 15 | self.config = None 16 | self.model = GPT2LMHeadModel(self.config) 17 | print(f"model is {model_name}") 18 | 19 | def _make_causal_mask(self, input_ids): 20 | device = input_ids.device 21 | bsz, tgt_len = input_ids.shape 22 | mask = torch.full((tgt_len, tgt_len), 0.0).to(device) 23 | mask_cond = torch.arange(mask.size(-1)).to(device) 24 | mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 1.0) 25 | 26 | return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len) 27 | 28 | def forward(self, **data): 29 | input_ids = data["input_ids"] 30 | labels = data.get("labels", None) 31 | extend_mask = (input_ids > 0).float() 32 | 33 | return_data = {} 34 | attention_mask = self._make_causal_mask(input_ids) 35 | extend_mask = extend_mask.unsqueeze(1).unsqueeze(1) * attention_mask 36 | if labels is not None: 37 | loss, lm_logits = self.model(input_ids, labels=labels, attention_mask=extend_mask) 38 | return_data["loss"] = loss 39 | 40 | else : 41 | lm_logits = self.model(input_ids, attention_mask=attention_mask) 42 | return_data["logits"] = lm_logits 43 | 44 | return return_data -------------------------------------------------------------------------------- /examples/seq2seq/GLM/glm_generate_samples.py: -------------------------------------------------------------------------------- 1 | # Copyright © 2022 BAAI. All rights reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License") 4 | 5 | import torch 6 | from bert_seq2seq import Predictor 7 | from bert_seq2seq import GLMTokenizer 8 | from bert_seq2seq.utils import load_model 9 | import torch 10 | import os 11 | 12 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 13 | 14 | if __name__ == '__main__': 15 | tokenizer = GLMTokenizer("../state_dict/GLM-large-ch/cog-pretrain.model") 16 | model = load_model(model_name="glm", 17 | task_name="seq2seq", 18 | size="large") 19 | 20 | model.load_pretrain_params("../state_dict/GLM-large-ch/pytorch_model.bin") 21 | model.to(device) 22 | 23 | predictor = Predictor(model, tokenizer) 24 | # generate samples 25 | text = [ 26 | '问题:啤酒伤胃吗?回答:[gMASK]', "问题:隔夜菜能吃吗?回答:[gMASK]", "问题:如何评价许嵩?回答:[gMASK]" 27 | ] 28 | for t in text: 29 | output = predictor.predict_generate_randomsample( 30 | t, top_k=50, repetition_penalty=4.0, top_p=1.0) 31 | print(t, '\n', output) 32 | 33 | text = ['北京故宫是中国[MASK]非物质文化遗产。', "上海是中国[MASK]大都市。", "天津大学是[MASK]现代大学。"] 34 | for t in text: 35 | output = predictor.predict_generate_randomsample( 36 | t, top_k=50, repetition_penalty=4.0, top_p=1.0) 37 | print(t, '\n', output) 38 | # 39 | text = [ 40 | "人工智能是一个以计算机科学为基础,由计算机、数学、哲学等多学科交叉融合的交叉学科,[sMASK],具有非常巨大的前景。", 41 | "最近十多年来,人工神经网络的研究工作不断深入,已经取得了很大的进展,[sMASK],表现出了良好的智能特性。" 42 | ] 43 | for t in text: 44 | output = predictor.predict_generate_randomsample( 45 | t, top_k=50, repetition_penalty=4.0, top_p=1.0) 46 | print(t, '\n', output) 47 | -------------------------------------------------------------------------------- /examples/FAQ/1_construct_data.py: -------------------------------------------------------------------------------- 1 | ## 构建数据库 2 | ## 数据来源 https://github.com/murufeng/ChineseNlpCorpus 3 | import torch 4 | from bert_seq2seq import Tokenizer 5 | from bert_seq2seq import load_model 6 | from bert_seq2seq import Predictor 7 | import pandas as pd 8 | import numpy as np 9 | from tqdm import tqdm 10 | import collections 11 | import faiss 12 | 13 | faq_data_path = "../data/financezhidao_filter.csv" 14 | answer_save_path = "../data/finance_fqa.json" 15 | embeddings_save_path = "../data/finance_embeddings.json" 16 | 17 | maxlen = 256 18 | model_name = "bert" # 选择模型名字 19 | task_name = "embedding" 20 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 21 | 22 | vocab_path = "../state_dict/roberta/vocab.txt" # roberta模型字典的位置 23 | model_path = "../state_dict/roberta/pytorch_model.bin" # roberta模型位置 24 | 25 | tokenizer = Tokenizer(vocab_path, do_lower_case=True, max_len=maxlen) 26 | bert_model = load_model(tokenizer.vocab, model_name=model_name, task_name=task_name) 27 | bert_model.load_pretrain_params(model_path) 28 | bert_model.to(device) 29 | predictor = Predictor(bert_model, tokenizer) 30 | 31 | def resave_data(): 32 | answer = collections.OrderedDict() 33 | embeddings = [] 34 | df = pd.read_csv(faq_data_path) 35 | for index, row in tqdm(df.iterrows(), total=len(df)): 36 | if type(row[0]) == str: 37 | if row[0] not in answer: 38 | answer[row[0]] = row[2] 39 | embeddings.append(predictor.predict_embedding(row[0], maxlen=maxlen).numpy()) 40 | 41 | embeddings = np.array(embeddings) 42 | torch.save(answer, answer_save_path) 43 | torch.save(embeddings, embeddings_save_path) 44 | 45 | print(f"数据保存成功: {answer_save_path}, {embeddings_save_path}") 46 | 47 | if __name__ == '__main__': 48 | 49 | resave_data() -------------------------------------------------------------------------------- 
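A quick round-trip check for the two artifacts written by 1_construct_data.py above, given as a sketch: it assumes the same save paths, and that predict_embedding returns one fixed-size vector per question. Note that both files are torch pickles despite their .json suffix, so they must be read back with torch.load:

import torch

answer = torch.load("../data/finance_fqa.json")             # OrderedDict: question -> answer
embeddings = torch.load("../data/finance_embeddings.json")  # np.ndarray, one row per stored question

assert len(answer) == embeddings.shape[0]
print(len(answer), embeddings.shape)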
/bert_seq2seq/mpu/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Model parallel utility interface.""" 16 | from .data import broadcast_data 17 | 18 | from .grads import clip_grad_norm 19 | 20 | from .initialize import destroy_model_parallel 21 | from .initialize import get_data_parallel_group 22 | from .initialize import get_data_parallel_rank 23 | from .initialize import get_data_parallel_world_size 24 | from .initialize import get_model_parallel_group 25 | from .initialize import get_model_parallel_rank 26 | from .initialize import get_model_parallel_src_rank 27 | from .initialize import get_model_parallel_world_size 28 | from .initialize import initialize_model_parallel 29 | from .initialize import model_parallel_is_initialized 30 | 31 | from .mappings import copy_to_model_parallel_region 32 | from .mappings import gather_from_model_parallel_region 33 | from .mappings import reduce_from_model_parallel_region 34 | from .mappings import scatter_to_model_parallel_region 35 | 36 | from .random import checkpoint 37 | from .random import partition_activations_in_checkpoint 38 | from .random import get_cuda_rng_tracker 39 | from .random import model_parallel_cuda_manual_seed 40 | -------------------------------------------------------------------------------- /bert_seq2seq/task/classification/bert_cls_classifier.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | from bert_seq2seq.basic_bert import BasicBert 3 | 4 | class BertClsClassifier(BasicBert): 5 | """ 6 | """ 7 | def __init__(self, vocab, 8 | target_size, 9 | model_name="roberta", 10 | **kwargs): 11 | super(BertClsClassifier, self).__init__(word2ix=vocab, model_name=model_name) 12 | self.target_size = target_size 13 | self.final_dense = nn.Linear(self.config.hidden_size, self.target_size) 14 | self.cls = None 15 | self.layer_norm_cond = None 16 | 17 | def compute_loss(self, predictions, labels): 18 | """ 19 | 计算loss 20 | predictions: (batch_size, 1) 21 | """ 22 | predictions = predictions.view(-1, self.target_size) 23 | labels = labels.view(-1) 24 | loss = nn.CrossEntropyLoss(reduction="mean") 25 | return loss(predictions, labels) 26 | 27 | def compute_loss_sigmoid(self, predictions, labels): 28 | predictions = predictions.view(-1) 29 | labels = labels.view(-1).float() 30 | 31 | loss_sigmoid = nn.BCEWithLogitsLoss() 32 | return loss_sigmoid(predictions, labels) 33 | 34 | def forward(self, **data): 35 | 36 | input_ids = data["input_ids"] 37 | token_type_ids = data["token_type_ids"] 38 | labels = data.get("labels", None) 39 | 40 | all_layers, pooled_out = self.bert(input_ids, token_type_ids=token_type_ids, 41 | output_all_encoded_layers=True) 42 | 43 | predictions = self.final_dense(pooled_out) 44 | return_data = {"logits": predictions, } 45 | if labels 
is not None: 46 | ## 计算loss 47 | if self.target_size == 1: 48 | loss = self.compute_loss_sigmoid(predictions, labels) 49 | else : 50 | loss = self.compute_loss(predictions, labels) 51 | return_data["loss"] = loss 52 | 53 | return return_data -------------------------------------------------------------------------------- /data/ner/china-people-daily-ner-corpus/example.train: -------------------------------------------------------------------------------- 1 | 海 O 2 | 钓 O 3 | 比 O 4 | 赛 O 5 | 地 O 6 | 点 O 7 | 在 O 8 | 厦 B-LOC 9 | 门 I-LOC 10 | 与 O 11 | 金 B-LOC 12 | 门 I-LOC 13 | 之 O 14 | 间 O 15 | 的 O 16 | 海 O 17 | 域 O 18 | 。 O 19 | 20 | 这 O 21 | 座 O 22 | 依 O 23 | 山 O 24 | 傍 O 25 | 水 O 26 | 的 O 27 | 博 O 28 | 物 O 29 | 馆 O 30 | 由 O 31 | 国 O 32 | 内 O 33 | 一 O 34 | 流 O 35 | 的 O 36 | 设 O 37 | 计 O 38 | 师 O 39 | 主 O 40 | 持 O 41 | 设 O 42 | 计 O 43 | , O 44 | 整 O 45 | 个 O 46 | 建 O 47 | 筑 O 48 | 群 O 49 | 精 O 50 | 美 O 51 | 而 O 52 | 恢 O 53 | 宏 O 54 | 。 O 55 | 56 | 但 O 57 | 作 O 58 | 为 O 59 | 一 O 60 | 个 O 61 | 共 O 62 | 产 O 63 | 党 O 64 | 员 O 65 | 、 O 66 | 人 O 67 | 民 O 68 | 公 O 69 | 仆 O 70 | , O 71 | 应 O 72 | 当 O 73 | 胸 O 74 | 怀 O 75 | 宽 O 76 | 阔 O 77 | , O 78 | 真 O 79 | 正 O 80 | 做 O 81 | 到 O 82 | “ O 83 | 先 O 84 | 天 O 85 | 下 O 86 | 之 O 87 | 忧 O 88 | 而 O 89 | 忧 O 90 | , O 91 | 后 O 92 | 天 O 93 | 下 O 94 | 之 O 95 | 乐 O 96 | 而 O 97 | 乐 O 98 | ” O 99 | , O 100 | 淡 O 101 | 化 O 102 | 个 O 103 | 人 O 104 | 的 O 105 | 名 O 106 | 利 O 107 | 得 O 108 | 失 O 109 | 和 O 110 | 宠 O 111 | 辱 O 112 | 悲 O 113 | 喜 O 114 | , O 115 | 把 O 116 | 改 O 117 | 革 O 118 | 大 O 119 | 业 O 120 | 摆 O 121 | 在 O 122 | 首 O 123 | 位 O 124 | , O 125 | 这 O 126 | 样 O 127 | 才 O 128 | 能 O 129 | 超 O 130 | 越 O 131 | 自 O 132 | 我 O 133 | , O 134 | 摆 O 135 | 脱 O 136 | 世 O 137 | 俗 O 138 | , O 139 | 有 O 140 | 所 O 141 | 作 O 142 | 为 O 143 | 。 O 144 | 145 | 在 O 146 | 发 O 147 | 达 O 148 | 国 O 149 | 家 O 150 | , O 151 | 急 O 152 | 救 O 153 | 保 O 154 | 险 O 155 | 十 O 156 | 分 O 157 | 普 O 158 | 及 O 159 | , O 160 | 已 O 161 | 成 O 162 | 为 O 163 | 社 O 164 | 会 O 165 | 保 O 166 | 障 O 167 | 体 O 168 | 系 O 169 | 的 O 170 | 重 O 171 | 要 O 172 | 组 O 173 | 成 O 174 | 部 O 175 | 分 O 176 | 。 O 177 | 178 | 日 B-LOC 179 | 俄 B-LOC 180 | 两 O 181 | 国 O 182 | 国 O 183 | 内 O 184 | 政 O 185 | 局 O 186 | 都 O 187 | 充 O 188 | 满 O 189 | 变 O 190 | 数 O 191 | , O 192 | 尽 O 193 | 管 O 194 | 日 B-LOC 195 | 俄 B-LOC 196 | 关 O 197 | 系 O 198 | 目 O 199 | 前 O 200 | 是 O 201 | 历 O 202 | 史 O 203 | 最 O 204 | 佳 O 205 | 时 O 206 | 期 O 207 | , O 208 | 但 O 209 | 其 O 210 | 脆 O 211 | 弱 O 212 | 性 O 213 | 不 O 214 | 言 O 215 | 自 O 216 | 明 O 217 | 。 O -------------------------------------------------------------------------------- /examples/seq2seq/gpt2/test_multi_processing_generate.py: -------------------------------------------------------------------------------- 1 | from cgi import test 2 | from multiprocessing import Pool, Process 3 | import os 4 | from nbformat import write 5 | import pandas as pd 6 | import torch 7 | from torch.utils.data import Dataset 8 | from bert_seq2seq import Tokenizer 9 | from bert_seq2seq import load_model 10 | from bert_seq2seq import Predictor 11 | 12 | vocab_path = "../state_dict/gpt2/vocab.txt" 13 | model_save_path = "./gpt2_writing_model.bin" # 训练好的模型保存位置。 14 | 15 | model_name = "gpt2" # 选择模型名字 16 | task_name = "seq2seq" # 任务名字 17 | 18 | data_path = "../data/xzwaz2kx4cu.csv" 19 | 20 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 21 | 22 | tokenizer = Tokenizer(vocab_path) 23 | model = load_model(tokenizer.vocab, model_name=model_name, task_name=task_name) 24 | 
model.load_all_params(model_save_path) 25 | model.to(device) 26 | 27 | predictor = Predictor(model, tokenizer) 28 | 29 | def read_file(): 30 | ## 更换数据集只需要实现这个函数即可,返回框架需要的src、tgt 31 | data = [] 32 | 33 | df = pd.read_csv(data_path) 34 | for index, row in df.iterrows(): 35 | if type(row[0]) is str: 36 | data.append(row[0]) 37 | 38 | return data 39 | 40 | test_data = read_file() 41 | print(f"data len is {len(test_data)}") 42 | 43 | def generate_multiprocess(data): 44 | print(f"data is {data}") 45 | out = predictor.predict_generate_randomsample(data, 46 | input_max_length=100, 47 | out_max_length=900, 48 | top_k=50, 49 | top_p=0.8, 50 | repetition_penalty=3.0, 51 | temperature=1.5) 52 | 53 | with open(os.path.join("./gene", f"{data}.txt"), "w+") as f : 54 | f.write(str(out)) 55 | # return (out, data) 56 | 57 | 58 | if __name__ == "__main__": 59 | torch.multiprocessing.set_start_method("spawn") 60 | p = Pool(3) 61 | p.map_async(generate_multiprocess, test_data, chunksize=3) 62 | p.close() 63 | p.join() 64 | print('done.') -------------------------------------------------------------------------------- /bert_seq2seq/model/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright © 2022 BAAI. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License") 4 | import math 5 | 6 | import torch 7 | 8 | 9 | def ensure_divisibility(numerator, denominator): 10 | """Ensure that numerator is divisible by the denominator.""" 11 | assert numerator % denominator == 0, '{} is not divisible by {}'.format( 12 | numerator, denominator) 13 | 14 | 15 | def divide(numerator, denominator): 16 | """Ensure that numerator is divisible by the denominator and return 17 | the division value.""" 18 | ensure_divisibility(numerator, denominator) 19 | return numerator // denominator 20 | 21 | 22 | def split_tensor_along_last_dim(tensor, 23 | num_partitions, 24 | contiguous_split_chunks=False): 25 | """Split a tensor along its last dimension. 26 | Arguments: 27 | tensor: input tensor. 28 | num_partitions: number of partitions to split the tensor 29 | contiguous_split_chunks: If True, make each chunk contiguous 30 | in memory. 31 | """ 32 | # Get the size and dimension. 33 | last_dim = tensor.dim() - 1 34 | last_dim_size = divide(tensor.size()[last_dim], num_partitions) 35 | # Split. 36 | tensor_list = torch.split(tensor, last_dim_size, dim=last_dim) 37 | # Note: torch.split does not create contiguous tensors by default. 
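    # Each chunk is therefore a view into the input tensor; callers that need
    # contiguous memory (e.g. before a reshape) should pass
    # contiguous_split_chunks=True.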
38 | if contiguous_split_chunks: 39 | return tuple(chunk.contiguous() for chunk in tensor_list) 40 | 41 | return tensor_list 42 | 43 | 44 | def unscaled_init_method(sigma): 45 | """Init method based on N(0, sigma).""" 46 | def init_(tensor): 47 | return torch.nn.init.normal_(tensor, mean=0.0, std=sigma) 48 | 49 | return init_ 50 | 51 | 52 | def scaled_init_method(mean, sigma, num_layers): 53 | """Init method based on N(0, sigma/sqrt(2*num_layers).""" 54 | std = sigma / math.sqrt(2.0 * num_layers) 55 | 56 | def init_(tensor): 57 | return torch.nn.init.normal_(tensor, mean=mean, std=std) 58 | 59 | return init_ 60 | 61 | 62 | def sqrt(x): 63 | return int(math.sqrt(x) + 1e-4) 64 | 65 | 66 | def normal_init_method(mean=0.0, std=0.02): 67 | def init_(tensor): 68 | return torch.nn.init.normal_(tensor, mean=mean, std=std) 69 | 70 | return init_ 71 | -------------------------------------------------------------------------------- /examples/FAQ/2_test_bert_faq.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from bert_seq2seq import Tokenizer 3 | from bert_seq2seq import load_model 4 | from bert_seq2seq import Predictor 5 | import faiss 6 | 7 | faq_data_path = "../data/financezhidao_filter.csv" 8 | answer_save_path = "../data/finance_fqa.json" 9 | embeddings_save_path = "../data/finance_embeddings.json" 10 | 11 | maxlen = 256 12 | d = 768 13 | nlist = 5 14 | 15 | model_name = "bert" # 选择模型名字 16 | task_name = "embedding" 17 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 18 | 19 | vocab_path = "../state_dict/roberta/vocab.txt" # roberta模型字典的位置 20 | model_path = "../state_dict/roberta/pytorch_model.bin" # roberta模型位置 21 | 22 | tokenizer = Tokenizer(vocab_path, do_lower_case=True, max_len=maxlen) 23 | bert_model = load_model(tokenizer.vocab, model_name=model_name, task_name=task_name) 24 | bert_model.load_pretrain_params(model_path) 25 | 26 | predictor = Predictor(bert_model, tokenizer) 27 | 28 | class Search: 29 | def __init__(self, training_vectors, d, nlist=10, nprobe=1): 30 | quantizer = faiss.IndexFlatIP(d) # the other index,需要以其他index作为基础 31 | self.index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_L2) 32 | assert not self.index.is_trained 33 | self.index.train(training_vectors) 34 | assert self.index.is_trained 35 | self.index.nprobe = nprobe # default nprobe is 1, try a few more 36 | self.index.add(training_vectors) # add may be a bit slower as well 37 | self.d = d 38 | 39 | def search(self, answer, query, k=10): 40 | query = query.numpy().reshape(-1, self.d) 41 | D, I = self.index.search(query, k) # actual search 42 | result = [] 43 | all_question = list(answer.keys()) 44 | for s, i in zip(D[0], I[0]): 45 | print(i) 46 | if i != -1: 47 | result.append({all_question[i]: s}) 48 | 49 | print(result) 50 | 51 | if __name__ == '__main__': 52 | # load data 53 | answer = torch.load(answer_save_path) 54 | embeddings = torch.load(embeddings_save_path) 55 | 56 | method = Search(training_vectors=embeddings, d=d, nlist=nlist, nprobe=2) 57 | 58 | while True: 59 | question = input("请输入问题:") 60 | if question == "q": 61 | break 62 | question_embedding = predictor.predict_embedding(question, maxlen=maxlen) 63 | method.search(answer, question_embedding, k=10) 64 | 65 | 66 | 67 | 68 | 69 | -------------------------------------------------------------------------------- /bert_seq2seq/config.py: -------------------------------------------------------------------------------- 1 | 2 | max_length = 256 3 | 4 | yayun_list = [ 5 | 
"东同铜桐筒童僮瞳中衷忠虫终戎崇嵩弓躬宫融雄熊穹穷冯风枫丰充隆空公功工攻蒙笼聋珑洪红鸿虹丛翁聪通蓬烘潼胧砻峒螽梦讧冻忡酆恫总侗窿懵庞种盅芎倥艨绒葱匆骢", 6 | "冬农宗钟龙舂松冲容蓉庸封胸雍浓重从逢缝踪茸峰锋烽蛩慵恭供淙侬松凶墉镛佣溶邛共憧喁邕壅纵龚枞脓淞匈汹禺蚣榕彤", 7 | "江扛窗邦缸降双庞逄腔撞幢桩淙豇", 8 | "支枝移为垂吹陂碑奇宜仪皮儿离施知驰池规危夷师姿迟眉悲之芝时诗棋旗辞词期祠基疑姬丝司葵医帷思滋持随痴维卮麋螭麾墀弥慈遗肌脂雌披嬉尸狸炊篱兹差疲茨卑亏蕤陲骑曦歧岐谁斯私窥熙欺疵赀笞羁彝颐资糜饥衰锥姨楣夔涯伊蓍追", 9 | "缁箕椎罴篪萎匙脾坻嶷治骊尸綦怡尼漪累牺饴而鸱推縻璃祁绥逵羲羸肢骐訾狮奇嗤咨堕其睢漓蠡噫馗辎胝鳍蛇陴淇淄丽筛厮氏痍貔比僖贻祺嘻鹂瓷琦嵋怩熹孜台蚩罹魑丕琪耆衰惟剂提禧居栀戏畸椅磁痿离佳虽仔寅委崎隋逶倭黎犁郦", 10 | "微薇晖徽挥韦围帏违霏菲妃绯飞非扉肥腓威畿机几讥矶稀希衣依沂巍归诽痱欷葳颀圻", 11 | "鱼渔初书舒居裾车渠余予誉舆胥狙锄疏蔬梳虚嘘徐猪闾庐驴诸除储如墟与畲疽苴于茹蛆且沮祛蜍榈淤好雎纾躇趄滁屠据匹咀衙涂虑", 12 | "虞愚娱隅刍无芜巫于盂衢儒濡襦须株诛蛛殊瑜榆谀愉腴区驱躯朱珠趋扶符凫雏敷夫肤纡输枢厨俱驹模谟蒲胡湖瑚乎壶狐弧孤辜姑觚菰徒途涂荼图屠奴呼吾七虞梧吴租卢鲈苏酥乌枯都铺禺诬竽吁瞿劬需俞逾觎揄萸臾渝岖镂娄夫孚桴俘迂姝拘摹糊鸪沽呱蛄驽逋舻垆徂孥泸栌嚅蚨诹扶母毋芙喁颅轳句邾洙麸机膜瓠恶芋呕驺喻枸侏龉葫懦帑拊", 13 | "齐蛴脐黎犁梨黧妻萋凄堤低氐诋题提荑缔折篦鸡稽兮奚嵇蹊倪霓西栖犀嘶撕梯鼙批挤迷泥溪圭闺睽奎携畦骊鹂儿", 14 | "佳街鞋牌柴钗差涯阶偕谐骸排乖怀淮豺侪埋霾斋娲蜗娃哇皆喈揩蛙楷槐俳", 15 | "灰恢魁隈回徊枚梅媒煤瑰雷催摧堆陪杯醅嵬推开哀埃台苔该才材财裁来莱栽哉灾猜胎孩虺崔裴培坏垓陔徕皑傀崃诙煨桅唉颏能茴酶偎隗咳", 16 | "真因茵辛新薪晨辰臣人仁神亲申伸绅身宾滨邻鳞麟珍尘陈春津秦频苹颦银垠筠巾民珉缗贫淳醇纯唇伦纶轮沦匀旬巡驯钧均臻榛姻寅彬鹑皴遵循振甄岷谆椿询恂峋莘堙屯呻粼磷辚濒闽豳逡填狺泯洵溱夤荀竣娠纫鄞抡畛嶙斌氤", 17 | "文闻纹云氛分纷芬焚坟群裙君军勤斤筋勋薰曛熏荤耘芸汾氲员欣芹殷昕贲郧雯蕲", 18 | "元原源园猿辕坦烦繁蕃樊翻萱喧冤言轩藩魂浑温孙门尊存蹲敦墩暾屯豚村盆奔论坤昏婚阍痕根恩吞沅媛援爰幡番反埙鸳宛掀昆琨鲲扪荪髡跟垠抡蕴犍袁怨蜿溷昆炖饨臀喷纯", 19 | "寒韩翰丹殚单安难餐滩坛檀弹残干肝竿乾阑栏澜兰看刊丸桓纨端湍酸团抟攒官观冠鸾銮栾峦欢宽盘蟠漫汗郸叹摊奸剜棺钻瘢谩瞒潘胖弁拦完莞獾拌掸萑倌繁曼馒鳗谰洹滦", 20 | "删潸关弯湾还环鹌鬟寰班斑颁般蛮颜菅攀顽山鳏艰闲娴悭孱潺殷扳讪患", 21 | "先前千阡笺天坚肩贤弦烟燕莲怜田填钿年颠巅牵妍研眠渊涓蠲编玄县泉迁仙鲜钱煎然延筵禅蝉缠连联涟篇偏便全宣镌穿川缘鸢铅捐旋娟船涎鞭专圆员乾虔愆骞权拳椽传焉跹溅舷咽零骈阗鹃翩扁平沿诠痊悛荃遄卷挛戋佃滇婵颛犍搴嫣癣澶单竣鄢扇键蜷棉", 22 | "萧箫挑貂刁凋雕迢条跳苕调枭浇聊辽寥撩僚寮尧幺宵消霄绡销超朝潮嚣樵谯骄娇焦蕉椒饶烧遥姚摇谣瑶韶昭招飚标杓镳瓢苗描猫要腰邀乔桥侨妖夭漂飘翘祧佻徼侥哨娆陶橇劭潇骁獠料硝灶鹞钊蛲峤轿荞嘹逍燎憔剽", 23 | "肴巢交郊茅嘲钞包胶爻苞梢蛟庖匏坳敲胞抛鲛崤铙炮哮捎茭淆泡跑咬啁教咆鞘剿刨佼抓姣唠", 24 | "豪毫操髦刀萄猱桃糟漕旄袍挠蒿涛皋号陶翱敖遭篙羔高嘈搔毛艘滔骚韬缫膏牢醪逃槽劳洮叨绸饕骜熬臊涝淘尻挑嚣捞嗥薅咎谣", 25 | "歌多罗河戈阿和波科柯陀娥蛾鹅萝荷过磨螺禾哥娑驼佗沱峨那苛诃珂轲莎蓑梭婆摩魔讹坡颇俄哦呵皤么涡窝茄迦伽磋跎番蹉搓驮献蝌箩锅倭罗嵯锣", 26 | "麻花霞家茶华沙车牙蛇瓜斜邪芽嘉瑕纱鸦遮叉葩奢楂琶衙赊涯夸巴加耶嗟遐笳差蟆蛙虾拿葭茄挝呀枷哑娲爬杷蜗爷芭鲨珈骅娃哇洼畲丫夸裟瘕些桠杈痂哆爹椰咤笆桦划迦揶吾佘", 27 | "阳杨扬香乡光昌堂章张王房芳长塘妆常凉霜藏场央泱鸯秧嫱床方浆觞梁娘庄黄仓皇装殇襄骧相湘箱缃创忘芒望尝偿樯枪坊囊郎唐狂强肠康冈苍匡荒遑行妨棠翔良航倡伥羌庆姜僵缰疆粮穰将墙桑刚祥详洋徉佯粱量羊伤汤鲂樟彰漳璋猖商防", 28 | "筐煌隍凰蝗惶璜廊浪裆沧纲亢吭潢钢丧盲簧忙茫傍汪臧琅当庠裳昂障糖疡锵杭邙赃滂禳攘瓤抢螳踉眶炀阊彭蒋亡殃蔷镶孀搪彷胱磅膀螃八庚更羹盲横觥彭棚亨英瑛烹平评京惊荆明盟鸣荣莹兵卿生甥笙牲檠擎鲸迎行衡耕萌氓宏闳茎莺樱泓橙筝争清情晴精睛菁旌晶盈瀛嬴营婴缨贞成盛城诚呈程声征正轻名令并倾萦琼赓撑瞠枪伧峥猩珩蘅铿嵘丁嘤鹦铮砰绷轰訇瞪侦顷榜抨趟坪请", 29 | "青经泾形刑邢型陉亭庭廷霆蜓停丁宁钉仃馨星腥醒惺娉灵棂龄铃苓伶零玲翎瓴囹聆听厅汀冥溟螟铭瓶屏萍荧萤荥扃町瞑暝", 30 | "蒸承丞惩陵凌绫冰膺鹰应蝇绳渑乘升胜兴缯凭仍兢矜征凝称登灯僧增曾憎层能棱朋鹏弘肱腾滕藤恒冯瞢扔誊", 31 | "尤邮优忧流留榴骝刘由油游猷悠攸牛修羞秋周州洲舟酬仇柔俦畴筹稠邱抽湫遒收鸠不愁休囚求裘球浮谋牟眸矛侯猴喉讴沤鸥瓯楼娄陬偷头投钩沟幽彪疣绸浏瘤犹啾酋售蹂揉搜叟邹貅泅球逑俅蜉桴罘欧搂抠髅蝼兜句妯惆呕缪繇偻篓馗区", 32 | "侵寻浔林霖临针箴斟沈深淫心琴禽擒钦衾吟今襟金音阴岑簪琳琛椹谌忱壬任黔歆禁喑森参淋郴妊湛", 33 | "覃潭谭参骖南男谙庵含涵函岚蚕探贪耽龛堪戡谈甘三酣篮柑惭蓝郯婪庵颔褴澹", 34 | "盐檐廉帘嫌严占髯谦奁纤签瞻蟾炎添兼缣尖潜阎镰粘淹箝甜恬拈暹詹渐歼黔沾苫占崦阉砭", 35 | "咸缄谗衔岩帆衫杉监凡馋芟喃嵌掺搀严"] -------------------------------------------------------------------------------- /bert_seq2seq/model/layers/layer_norm.py: -------------------------------------------------------------------------------- 1 | # Copyright © 2022 BAAI. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License") 4 | # layer norm 5 | 6 | import torch 7 | import torch.nn as nn 8 | 9 | class LayerNorm(nn.Module): 10 | def __init__(self, hidden_size, eps=1e-6): 11 | super(LayerNorm, self).__init__() 12 | self.eps = eps 13 | self.gamma = nn.Parameter(torch.ones(hidden_size)) 14 | self.beta = nn.Parameter(torch.zeros(hidden_size)) 15 | 16 | def forward(self, x): 17 | """Perform layer normalization to input x, with two learnable variables gamma and beta""" 18 | mean = x.mean(-1, keepdim=True) 19 | std = x.std(-1, keepdim=True) 20 | hidden_states = self.gamma * (x - mean) / (std + self.eps) 21 | 22 | return hidden_states + self.beta 23 | 24 | 25 | class T5LayerNorm(nn.Module): 26 | def __init__(self, hidden_size, eps=1e-6): 27 | """ 28 | Construct a layernorm module in the T5 style No bias and no subtraction of mean. 
29 | """ 30 | super().__init__() 31 | self.weight = nn.Parameter(torch.ones(hidden_size)) 32 | self.variance_epsilon = eps 33 | 34 | def forward(self, hidden_states): 35 | # layer norm should always be calculated in float32 36 | variance = hidden_states.to(torch.float32).pow(2).mean(-1, 37 | keepdim=True) 38 | hidden_states = hidden_states * torch.rsqrt(variance + 39 | self.variance_epsilon) 40 | 41 | # convert into float16 if necessary 42 | if self.weight.dtype == torch.float16: 43 | hidden_states = hidden_states.to(torch.float16) 44 | return self.weight * hidden_states 45 | 46 | 47 | class BertLayerNorm(nn.Module): 48 | def __init__(self, hidden_size, eps=1e-12): 49 | """Construct a layernorm module in the TF style (epsilon inside the square root). 50 | """ 51 | super(BertLayerNorm, self).__init__() 52 | self.weight = nn.Parameter(torch.ones(hidden_size)) 53 | self.bias = nn.Parameter(torch.zeros(hidden_size)) 54 | self.variance_epsilon = eps 55 | 56 | def forward(self, x): 57 | u = x.mean(-1, keepdim=True) 58 | s = (x - u).pow(2).mean(-1, keepdim=True) 59 | x = (x - u) / torch.sqrt(s + self.variance_epsilon) 60 | return self.weight * x + self.bias 61 | -------------------------------------------------------------------------------- /bert_seq2seq/task/seq2seq/bert_seq2seq_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from bert_seq2seq.basic_bert import BasicBert 4 | 5 | class BertSeq2SeqModel(BasicBert): 6 | """ 7 | """ 8 | def __init__(self, vocab, 9 | model_name="roberta", 10 | size="base", 11 | **kwargs): 12 | super(BertSeq2SeqModel, self).__init__(word2ix=vocab, model_name=model_name, size=size) 13 | 14 | self.hidden_dim = self.config.hidden_size 15 | self.vocab_size = len(vocab) 16 | 17 | def compute_loss(self, predictions, labels, target_mask): 18 | """ 19 | target_mask : 句子a部分和pad部分全为0, 而句子b部分为1 20 | """ 21 | predictions = predictions.view(-1, self.vocab_size) 22 | labels = labels.view(-1) 23 | target_mask = target_mask.view(-1).float() 24 | loss = nn.CrossEntropyLoss(ignore_index=0, reduction="none") 25 | return (loss(predictions, labels) * target_mask).sum() / target_mask.sum() ## 通过mask 取消 pad 和句子a部分预测的影响 26 | 27 | def forward(self, **data): 28 | input_ids = data["input_ids"] 29 | token_type_ids = data["token_type_ids"] 30 | labels = data.get("labels", None) 31 | device = input_ids.device 32 | 33 | input_shape = input_ids.shape 34 | seq_len = input_shape[1] 35 | ## 构建特殊的mask 36 | ones = torch.ones((1, 1, seq_len, seq_len), dtype=torch.float32, device=device) 37 | a_mask = ones.tril() 38 | s_ex12 = token_type_ids.unsqueeze(1).unsqueeze(2).float() 39 | s_ex13 = token_type_ids.unsqueeze(1).unsqueeze(3).float() 40 | a_mask = (1.0 - s_ex12) * (1.0 - s_ex13) + s_ex13 * a_mask 41 | 42 | enc_layers, _ = self.bert(input_ids, position_ids=None, token_type_ids=token_type_ids, attention_mask=a_mask, 43 | output_all_encoded_layers=True) 44 | squence_out = enc_layers[-1] ## 取出来最后一层输出 (batch, seq_len, 768) 45 | 46 | tokens_hidden_state, predictions = self.cls(squence_out) 47 | result_data = {"logits": predictions, "hidden_states": tokens_hidden_state} 48 | 49 | if labels is not None: 50 | 51 | predictions = predictions[:, :-1].contiguous() 52 | target_mask = token_type_ids[:, 1:].contiguous() 53 | loss = self.compute_loss(predictions, labels, target_mask) 54 | result_data["loss"] = loss 55 | 56 | return result_data 57 | 58 | 59 | 
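# A tiny worked example of the mask construction in forward() above, assuming a
# batch with token_type_ids [0, 0, 1, 1] (two source tokens, two target tokens):
#
#     import torch
#     token_type_ids = torch.tensor([[0, 0, 1, 1]])
#     ones = torch.ones((1, 1, 4, 4))
#     a_mask = ones.tril()
#     s_ex12 = token_type_ids.unsqueeze(1).unsqueeze(2).float()
#     s_ex13 = token_type_ids.unsqueeze(1).unsqueeze(3).float()
#     a_mask = (1.0 - s_ex12) * (1.0 - s_ex13) + s_ex13 * a_mask
#     print(a_mask[0, 0])
#     # tensor([[1., 1., 0., 0.],
#     #         [1., 1., 0., 0.],
#     #         [1., 1., 1., 0.],
#     #         [1., 1., 1., 1.]])
#
# Source (segment a) tokens attend bidirectionally within the source, while
# target (segment b) tokens attend to the full source plus previously generated
# target tokens: the UniLM-style seq2seq attention pattern.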
-------------------------------------------------------------------------------- /examples/seq2seq/bert/test_roberta_auto_title.py: -------------------------------------------------------------------------------- 1 | from bert_seq2seq.tokenizer import Tokenizer 2 | from bert_seq2seq import Predictor 3 | from bert_seq2seq import load_model 4 | import torch 5 | 6 | model_name = "roberta" # 选择模型名字 7 | task_name = "seq2seq" # 任务名字 8 | model_path = "./roberta_auto_title_model.bin" 9 | vocab_path = "../state_dict/roberta/vocab.txt" 10 | 11 | tokenizer = Tokenizer(vocab_path) 12 | bert_model = load_model(tokenizer.vocab, model_name=model_name, task_name=task_name) 13 | bert_model.load_all_params(model_path) 14 | predictor = Predictor(bert_model, tokenizer) 15 | 16 | if __name__ == '__main__': 17 | textset = ["近期,美国国会众院通过法案,重申美国对台湾的承诺。对此,中国外交部发言人表示,有关法案严重违反一个中国原则和中美三个联合公报规定,粗暴干涉中国内政,中方对此坚决反对并已向美方提出严正交涉。\n" \ 18 | "事实上,中国高度关注美国国内打“台湾牌”、挑战一中原则的危险动向。近年来,作为“亲台”势力大本营的美国国会动作不断,先后通过“与台湾交往法”“亚洲再保证倡议法”等一系列“挺台”法案,“2019" \ 19 | "财年国防授权法案”也多处触及台湾问题。", 20 | "在推进“双一流”高校建设进程中,我们要紧紧围绕为党育人、为国育才,找准问题、破解难题,以一流意识和担当精神,大力推进高校的治理能力建设。", 21 | "增强政治引领力。坚持党对高校工作的全面领导,始终把政治建设摆在首位,增强校党委的政治领导力,全面推进党的建设各项工作。落实立德树人根本任务,把培养社会主义建设者和接班人放在中心位置。紧紧抓住思想政治工作这条生命线,全面加强师生思想政治工作,推进“三全育人”综合改革,将思想政治工作贯穿学校教育管理服务全过程,努力让学生成为德才兼备、全面发展的人才。", 22 | "提升人才聚集力。人才是创新的核心要素,创新驱动本质上是人才驱动。要坚持引育并举,建立绿色通道,探索知名专家举荐制,完善“一事一议”支持机制。在大力支持自然科学人才队伍建设的同时,实施哲学社会科学人才工程。立足实际,在条件成熟的学院探索“一院一策”改革。创新科研组织形式,为人才成长创设空间,建设更加崇尚学术、更加追求卓越、更加关爱学生、更加担当有为的学术共同体。", 23 | "培养学生竞争力。遵循学生成长成才的规律培育人才,着力培养具有国际竞争力的拔尖创新人才和各类专门人才,使优势学科、优秀教师、优质资源、优良环境围绕立德树人的根本任务配置。淘汰“水课”,打造“金课”,全力打造世界一流本科教育。深入推进研究生教育综合改革,加强事关国家重大战略的高精尖急缺人才培养,建设具有国际竞争力的研究生教育。", 24 | "激发科技创新力。在国家急需发展的领域挑大梁,就要更加聚焦科技前沿和国家需求,狠抓平台建设,包括加快牵头“武汉光源”建设步伐,积极参与国家实验室建设,建立校级大型科研仪器设备共享平台。关键核心技术领域“卡脖子”问题,归根结底是基础科学研究薄弱。要加大基础研究的支持力度,推进理论、技术和方法创新,鼓励支持重大原创和颠覆性技术创新,催生一批高水平、原创性研究成果。", 25 | "发展社会服务力。在贡献和服务中体现价值,推动合作共建、多元投入的格局,大力推进政产学研用结合,强化科技成果转移转化及产业化。探索校城融合发展、校地联动发展的新模式,深度融入地方创新发展网络,为地方经济社会发展提供人才支撑,不断拓展和优化社会服务网络。", 26 | "涵育文化软实力。加快体制机制改革,优化学校、学部、学院三级评审机制,充分发挥优秀学者特别是德才兼备的年轻学者在学术治理中的重要作用。牢固树立一流意识、紧紧围绕一流目标、认真执行一流标准,让成就一流事业成为普遍追求和行动自觉。培育具有强大凝聚力的大学文化,营造积极团结、向上向善、干事创业的氛围,让大学成为吸引和留住一大批优秀人才建功立业的沃土,让敢干事、肯干事、能干事的人有更多的荣誉感和获得感。", 27 | "建设中国特色、世界一流大学不是等得来、喊得来的,而是脚踏实地拼出来、干出来的。对标一流,深化改革,坚持按章程办学,构建以一流质量标准为核心的制度规范体系,扎实推进学校综合改革,探索更具活力、更富效率的管理体制和运行机制,我们就一定能构建起具有中国特色的现代大学治理体系,进一步提升管理服务水平和工作效能。" 28 | ] 29 | for text in textset: 30 | out = predictor.predict_generate_beamsearch(text, beam_size=3, input_max_length=200, out_max_length=40) 31 | print(out) 32 | -------------------------------------------------------------------------------- /bert_seq2seq/task/relationship_extraction/bert_relationship_extraction.py: -------------------------------------------------------------------------------- 1 | 2 | import torch.nn as nn 3 | from bert_seq2seq.basic_bert import BasicBert 4 | from bert_seq2seq.layers import GlobalPointer 5 | 6 | class BertRelationshipExtraction(BasicBert): 7 | """ 8 | """ 9 | def __init__(self, vocab, 10 | target_size, 11 | inner_dim=64, 12 | size="base", 13 | model_name="roberta", 14 | **kwargs): 15 | super(BertRelationshipExtraction, self).__init__(word2ix=vocab, model_name=model_name, size=size) 16 | self.entity_output = GlobalPointer(self.config.hidden_size, 2, 17 | inner_dim, RoPE=True, trill_mask=True) 18 | self.head_output = GlobalPointer(self.config.hidden_size, target_size, 19 | inner_dim, RoPE=False, trill_mask=False) 20 | self.tail_output = GlobalPointer(self.config.hidden_size, target_size, 21 | inner_dim, RoPE=False, trill_mask=False) 22 | 
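        # Triple extraction is decomposed across the three pointers above:
        # entity_output scores subject/object spans (2 types, with RoPE and the
        # lower-triangle mask), while head_output / tail_output score, per
        # relation type, links between subject and object start positions and
        # between their end positions.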
self.layer_norm_cond = None 23 | self.cls = None 24 | 25 | def forward(self, **data): 26 | input_ids = data["input_ids"] 27 | token_type_ids = data.get("token_type_ids", None) 28 | head_labels = data.get("head_labels", None) 29 | tail_labels = data.get("tail_labels", None) 30 | entity_labels = data.get("entity_labels", None) 31 | 32 | padding_mask = (input_ids > 0).float() 33 | 34 | all_layers, _ = self.bert(input_ids, token_type_ids=token_type_ids, 35 | output_all_encoded_layers=True) 36 | sequence_out = all_layers[-1] 37 | 38 | entity_output = self.entity_output(sequence_out, padding_mask) 39 | head_output = self.head_output(sequence_out, padding_mask) 40 | tail_output = self.tail_output(sequence_out, padding_mask) 41 | 42 | return_data = {"entity_output": entity_output, "head_output": head_output, "tail_output": tail_output} 43 | if entity_labels is not None: 44 | loss_entity = self.entity_output.compute_loss_sparse(entity_output, entity_labels, mask_zero=True) 45 | loss_head = self.head_output.compute_loss_sparse(head_output, head_labels, mask_zero=True) 46 | loss_tail = self.tail_output.compute_loss_sparse(tail_output, tail_labels, mask_zero=True) 47 | 48 | return_data["loss"] = (loss_entity + loss_head + loss_tail) / 3 49 | return return_data -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | no_push 12 | 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | pip-wheel-metadata/ 26 | share/python-wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 35 | *.manifest 36 | *.spec 37 | help.txt 38 | examples/text_classification/train_roberta_emotion_analysis.py 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | *.py,cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | db.sqlite3 65 | db.sqlite3-journal 66 | 67 | # Flask stuff: 68 | instance/ 69 | .webassets-cache 70 | 71 | # Scrapy stuff: 72 | .scrapy 73 | 74 | # Sphinx documentation 75 | docs/_build/ 76 | 77 | # PyBuilder 78 | target/ 79 | 80 | # Jupyter Notebook 81 | .ipynb_checkpoints 82 | 83 | # IPython 84 | profile_default/ 85 | ipython_config.py 86 | 87 | # pyenv 88 | .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 98 | __pypackages__/ 99 | 100 | # Celery stuff 101 | celerybeat-schedule 102 | celerybeat.pid 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | -------------------------------------------------------------------------------- /bert_seq2seq/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from bert_seq2seq.task.seq2seq.bert_seq2seq_model import BertSeq2SeqModel 3 | import os 4 | from bert_seq2seq.task.embedding.bert_embedding import BertEmbedding 5 | from bert_seq2seq.task.classification.bert_cls_classifier import BertClsClassifier 6 | from bert_seq2seq.task.sequence_labeling.bert_sequence_labeling import BertNERGP, BertNERCRF, BertSequenceLabling 7 | from bert_seq2seq.task.seq2seq.gpt2_seq2seq_model import GPT2 8 | from bert_seq2seq.task.seq2seq.t5_seq2seq_model import T5Model 9 | from bert_seq2seq.task.relationship_extraction.bert_relationship_extraction import BertRelationshipExtraction 10 | # from bert_seq2seq.GLM.model.modeling_glm import GLMModel 11 | # from GLM.model.modeling_glm import GLMModel 12 | from bert_seq2seq.task.seq2seq.GLM_seq2seq_model import GLMSeq2SeqModel 13 | 14 | ALL_TASK = { 15 | "bert_seq2seq": BertSeq2SeqModel, 16 | "roberta_seq2seq": BertSeq2SeqModel, 17 | "roberta-large_seq2seq": BertSeq2SeqModel, 18 | "bert_classification": BertClsClassifier, 19 | "roberta_classification": BertClsClassifier, 20 | "roberta-large_classification": BertClsClassifier, 21 | "bert_sequence_labeling_gp": BertNERGP, 22 | "roberta_sequence_labeling_gp": BertNERGP, 23 | "roberta-large_sequence_labeling_gp": BertNERGP, 24 | "bert_sequence_labeling_crf": BertNERCRF, 25 | "roberta_sequence_labeling_crf": BertNERCRF, 26 | "roberta-large_sequence_labeling_crf": BertNERCRF, 27 | "bert_sequence_labeling": BertSequenceLabling, 28 | "roberta_sequence_labeling": BertSequenceLabling, 29 | "roberta-large_sequence_labeling": BertSequenceLabling, 30 | "bert_embedding": BertEmbedding, 31 | "roberta_embedding": BertEmbedding, 32 | "roberta-large_embedding": BertEmbedding, 33 | "gpt2_seq2seq": GPT2, 34 | "t5_seq2seq": T5Model, 35 | "bert_relationship_extraction":BertRelationshipExtraction, 36 | "roberta_relationship_extraction":BertRelationshipExtraction, 37 | "nezha_relationship_extraction":BertRelationshipExtraction, 38 | "glm": GLMSeq2SeqModel, 39 | "glm_seq2seq": GLMSeq2SeqModel, 40 | "glm_lm": GLMSeq2SeqModel, 41 | 42 | } 43 | 44 | def load_model(vocab=None, 45 | model_name="roberta", 46 | task_name="seq2seq", 47 | target_size=0, 48 | ner_inner_dim=-1, 49 | size="base"): 50 | if model_name != "glm": 51 | assert vocab is not None, "vocab 字典不能为空" 52 | task_model = ALL_TASK.get(f"{model_name}_{task_name}", None) 53 | if task_model is None : 54 | print("no this task") 55 | os._exit(0) 56 | 57 | return task_model(vocab=vocab, 58 | model_name=model_name, 59 | size=size, 60 | target_size=target_size, 61 | ent_type_size=target_size, 62 | inner_dim=ner_inner_dim) 63 | 64 | -------------------------------------------------------------------------------- /examples/README.md: 
--------------------------------------------------------------------------------
 1 | ## Example files
 2 | 
 3 | ### bert embedding
 4 | bert / roberta / nezha models: feed in a sentence and get its embedding
 5 | 1. [get_bert_embedding.py](https://github.com/920232796/bert_seq2seq_DDP/blob/master/examples/bert_embedding/get_bert_embedding.py)
 6 | 
 7 | ### ner
 8 | bert / roberta / nezha models, named entity recognition, supporting both CRF and global pointer heads
 9 | 1. [train_bert_ner_crf_people_daily.py](https://github.com/920232796/bert_seq2seq_DDP/blob/master/examples/ner/train_bert_ner_crf_people_daily.py) NER with a CRF head
10 | 2. [train_roberta_ner_gp_people_daily.py](https://github.com/920232796/bert_seq2seq_DDP/blob/master/examples/ner/train_roberta_ner_gp_people_daily.py) NER with a global pointer head
11 | 
12 | ### seq2seq
13 | Generation tasks, supporting bert, roberta, nezha, gpt2, t5, bart and other models
14 | 1. [test_gpt2_text_writting.py](https://github.com/920232796/bert_seq2seq_DDP/blob/master/examples/seq2seq/gpt2/test_gpt2_text_writting.py) gpt2 continuation test
15 | 2. [train_roberta_auto_title.py](https://github.com/920232796/bert_seq2seq_DDP/blob/master/examples/seq2seq/bert/train_roberta_auto_title.py) roberta auto-title training
16 | 3. [train_roberta_auto_title_multi_gpu.py](https://github.com/920232796/bert_seq2seq_DDP/blob/master/examples/seq2seq/bert/train_roberta_auto_title_multi_gpu.py) roberta auto-title training (multi-GPU version)
17 | 4. [train_gpt2_multi_chat.py](https://github.com/920232796/bert_seq2seq_DDP/blob/master/examples/seq2seq/gpt2/train_gpt2_multi_chat.py) gpt2 multi-turn dialogue training
18 | 5. [test_t5_auto_title.py](https://github.com/920232796/bert_seq2seq_DDP/blob/master/examples/seq2seq/t5/test_t5_auto_title.py) T5 auto-title test code
19 | 6. [test_gpt2_multi_chat.py](https://github.com/920232796/bert_seq2seq_DDP/blob/master/examples/seq2seq/gpt2/test_gpt2_multi_chat.py) gpt2 multi-turn dialogue test
20 | 7. [test_multi_processing_generate.py](https://github.com/920232796/bert_seq2seq_DDP/blob/master/examples/seq2seq/gpt2/test_multi_processing_generate.py) multi-process generation example
21 | 
22 | ### text classification
23 | bert / roberta / nezha models, supporting text classification, sentiment analysis and semantic matching
24 | 1. [train_roberta_news_title_classification.py](https://github.com/920232796/bert_seq2seq_DDP/blob/master/examples/text_classification/train_roberta_news_title_classification.py) news title classification training
25 | 2. [train_roberta_news_title_classification_multi_gpu.py](https://github.com/920232796/bert_seq2seq_DDP/blob/master/examples/text_classification/train_roberta_news_title_classification_multi_gpu.py) news title classification training (multi-GPU version)
26 | 3. [train_roberta_semantic_matching.py](https://github.com/920232796/bert_seq2seq_DDP/blob/master/examples/text_classification/train_roberta_semantic_matching.py) semantic matching training
27 | 4. [test.py](https://github.com/920232796/bert_seq2seq_DDP/blob/master/examples/text_classification/test.py) load a trained model for testing
28 | 
29 | ### FAQ retrieval-based question answering
30 | 1. [1_construct_data.py](https://github.com/920232796/bert_seq2seq_DDP/blob/master/examples/FAQ/1_construct_data.py) build the dataset and pre-extract embedding features
31 | 2. [2_test_bert_faq.py](https://github.com/920232796/bert_seq2seq_DDP/blob/master/examples/FAQ/2_test_bert_faq.py) load the built embeddings and retrieve similar questions with faiss
--------------------------------------------------------------------------------
/bert_seq2seq/model/prompt.py:
--------------------------------------------------------------------------------
 1 | # Copyright © 2022 BAAI. All rights reserved.
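# Note (descriptive, added for orientation): PromptSpell below implements
# P-tuning style continuous prompts. It learns `spell_length` trainable prompt
# embeddings and, depending on `spell_func`, reparameterizes them through an
# LSTM+MLP head, an MLP head, or uses them directly before they are consumed
# by the GLM model.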
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License") 4 | import random 5 | import torch 6 | 7 | 8 | class PromptSpell(torch.nn.Module): 9 | def __init__(self, spell_length, hidden_size, spell_func): 10 | super(PromptSpell, self).__init__() 11 | self.spell_length = spell_length 12 | self.hidden_size = hidden_size 13 | self.spell_embeddings = torch.nn.Embedding(self.spell_length, self.hidden_size) 14 | self.spell_func = spell_func 15 | if self.spell_func == "lstm": 16 | self.lstm_head = torch.nn.LSTM(input_size=self.hidden_size, 17 | hidden_size=self.hidden_size, 18 | num_layers=2, 19 | # dropout=self.lstm_dropout, 20 | bidirectional=True, 21 | batch_first=True) # .to(torch.device("cuda")) 22 | self.mlp_head = torch.nn.Sequential(torch.nn.Linear(2 * self.hidden_size, self.hidden_size), 23 | torch.nn.ReLU(), 24 | torch.nn.Linear(self.hidden_size, self.hidden_size)) 25 | elif self.spell_func == "mlp": 26 | self.mlp_head = torch.nn.Sequential(torch.nn.Linear(self.hidden_size, self.hidden_size), 27 | torch.nn.ReLU(), 28 | torch.nn.Linear(self.hidden_size, self.hidden_size)) 29 | elif self.spell_func != "none": 30 | raise NotImplementedError("Prompt function " + self.spell_func) 31 | 32 | def init_embedding(self, word_embeddings=None, task_tokens=None): 33 | num_words = 5000 34 | with torch.no_grad(): 35 | for i in range(self.spell_length): 36 | rand_token = random.randrange(num_words) 37 | if task_tokens is None: 38 | target_embedding = word_embeddings[rand_token] 39 | else: 40 | word_embedding = word_embeddings[rand_token] 41 | task_token = random.choice(task_tokens) 42 | task_embedding = word_embeddings[task_token] 43 | ratio = random.random() 44 | target_embedding = word_embedding * ratio + task_embedding * (1 - ratio) 45 | self.spell_embeddings.weight.data[i] = target_embedding 46 | 47 | def forward(self): 48 | prompt_embeds = self.spell_embeddings.weight.unsqueeze(0) 49 | if self.spell_func == "lstm": 50 | prompt_embeds = self.lstm_head(prompt_embeds)[0] 51 | if self.spell_func == "lstm" or self.spell_func == "mlp": 52 | prompt_embeds = self.mlp_head(prompt_embeds) 53 | return prompt_embeds 54 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/Project_Default.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 44 | -------------------------------------------------------------------------------- /bert_seq2seq/mpu/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright © 2022 BAAI. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License") 4 | # coding=utf-8 5 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
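# Illustrative usage sketch (`_demo_split_qkv` is a hypothetical helper kept
# purely as documentation, not part of the upstream Megatron-style source):
# `split_tensor_along_last_dim` below is the routine typically used to shard
# a fused QKV projection across model-parallel ranks.
def _demo_split_qkv():
    import torch  # local import: the module-level import only happens below
    qkv = torch.randn(4, 16, 6)  # (batch, seq, 3 * head_dim)
    # Returns three contiguous (4, 16, 2) chunks, one each for Q, K and V.
    return split_tensor_along_last_dim(qkv, 3, contiguous_split_chunks=True)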
18 | 
19 | import torch
20 | 
21 | 
22 | def ensure_divisibility(numerator, denominator):
23 |     """Ensure that numerator is divisible by the denominator."""
24 |     assert numerator % denominator == 0, '{} is not divisible by {}'.format(
25 |         numerator, denominator)
26 | 
27 | 
28 | def divide(numerator, denominator):
29 |     """Ensure that numerator is divisible by the denominator and return
30 |     the division value."""
31 |     ensure_divisibility(numerator, denominator)
32 |     return numerator // denominator
33 | 
34 | 
35 | def split_tensor_along_last_dim(tensor,
36 |                                 num_partitions,
37 |                                 contiguous_split_chunks=False):
38 |     """Split a tensor along its last dimension.
39 |     Arguments:
40 |         tensor: input tensor.
41 |         num_partitions: number of partitions to split the tensor into.
42 |         contiguous_split_chunks: If True, make each chunk contiguous
43 |                                  in memory.
44 |     """
45 |     # Get the size and dimension.
46 |     last_dim = tensor.dim() - 1
47 |     last_dim_size = divide(tensor.size()[last_dim], num_partitions)
48 |     # Split.
49 |     tensor_list = torch.split(tensor, last_dim_size, dim=last_dim)
50 |     # Note: torch.split does not create contiguous tensors by default.
51 |     if contiguous_split_chunks:
52 |         return tuple(chunk.contiguous() for chunk in tensor_list)
53 | 
54 |     return tensor_list
55 | 
56 | 
57 | class VocabUtility:
58 |     """Split the vocabulary into `world_size` chunks and return the
59 |     first and last index of the vocabulary belonging to the `rank`
60 |     partition. Note that indices are in [first, last)."""
61 |     @staticmethod
62 |     def vocab_range_from_per_partition_vocab_size(per_partition_vocab_size,
63 |                                                   rank, world_size):
64 |         index_f = rank * per_partition_vocab_size
65 |         index_l = index_f + per_partition_vocab_size
66 |         return index_f, index_l
67 | 
68 |     @staticmethod
69 |     def vocab_range_from_global_vocab_size(global_vocab_size, rank,
70 |                                            world_size):
71 |         per_partition_vocab_size = divide(global_vocab_size, world_size)
72 |         return VocabUtility.vocab_range_from_per_partition_vocab_size(
73 |             per_partition_vocab_size, rank, world_size)
74 | 
--------------------------------------------------------------------------------
/bert_seq2seq/mpu/grads.py:
--------------------------------------------------------------------------------
 1 | # Copyright © 2022 BAAI. All rights reserved.
 2 | # 
 3 | # Licensed under the Apache License, Version 2.0 (the "License")
 4 | # coding=utf-8
 5 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
 6 | #
 7 | # Licensed under the Apache License, Version 2.0 (the "License");
 8 | # you may not use this file except in compliance with the License.
 9 | # You may obtain a copy of the License at
10 | #
11 | #     http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | 
19 | # Parts of the code here are adapted from PyTorch
20 | # repo: https://github.com/pytorch/pytorch
21 | 
22 | import torch
23 | from torch._six import inf
24 | 
25 | from .initialize import get_model_parallel_group
26 | from .initialize import get_model_parallel_rank
27 | 
28 | 
29 | def clip_grad_norm(parameters, max_norm, norm_type=2):
30 |     """Clips gradient norm of an iterable of parameters.
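    Example (illustrative, assuming ``model`` is any ``torch.nn.Module``)::

        grad_norm = clip_grad_norm(model.parameters(), max_norm=1.0)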
31 | 32 | This is adapted from torch.nn.utils.clip_grad.clip_grad_norm_ and 33 | added functionality to handle model parallel parameters. Note that 34 | the gradients are modified in place. 35 | 36 | Arguments: 37 | parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a 38 | single Tensor that will have gradients normalized 39 | max_norm (float or int): max norm of the gradients 40 | norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for 41 | infinity norm. 42 | 43 | Returns: 44 | Total norm of the parameters (viewed as a single vector). 45 | """ 46 | if isinstance(parameters, torch.Tensor): 47 | parameters = [parameters] 48 | parameters = list(filter(lambda p: p.grad is not None, parameters)) 49 | max_norm = float(max_norm) 50 | norm_type = float(norm_type) 51 | if norm_type == inf: 52 | total_norm = max(p.grad.data.abs().max() for p in parameters) 53 | total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) 54 | # Take max across all GPUs. 55 | torch.distributed.all_reduce(total_norm_cuda, 56 | op=torch.distributed.ReduceOp.MAX, 57 | group=get_model_parallel_group()) 58 | total_norm = total_norm_cuda[0].item() 59 | else: 60 | total_norm = 0 61 | for p in parameters: 62 | if p.model_parallel or (get_model_parallel_rank() == 0): 63 | param_norm = p.grad.data.norm(norm_type) 64 | total_norm += param_norm.item()**norm_type 65 | # Sum across all model parallel GPUs. 66 | total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) 67 | torch.distributed.all_reduce(total_norm_cuda, 68 | op=torch.distributed.ReduceOp.SUM, 69 | group=get_model_parallel_group()) 70 | total_norm = total_norm_cuda[0].item()**(1. / norm_type) 71 | clip_coef = max_norm / (total_norm + 1e-6) 72 | if clip_coef < 1: 73 | for p in parameters: 74 | p.grad.data.mul_(clip_coef) 75 | return total_norm 76 | -------------------------------------------------------------------------------- /data/relationship_extraction/all_50_schemas: -------------------------------------------------------------------------------- 1 | {"object_type": "地点", "predicate": "祖籍", "subject_type": "人物"} 2 | {"object_type": "人物", "predicate": "父亲", "subject_type": "人物"} 3 | {"object_type": "地点", "predicate": "总部地点", "subject_type": "企业"} 4 | {"object_type": "地点", "predicate": "出生地", "subject_type": "人物"} 5 | {"object_type": "目", "predicate": "目", "subject_type": "生物"} 6 | {"object_type": "Number", "predicate": "面积", "subject_type": "行政区"} 7 | {"object_type": "Text", "predicate": "简称", "subject_type": "机构"} 8 | {"object_type": "Date", "predicate": "上映时间", "subject_type": "影视作品"} 9 | {"object_type": "人物", "predicate": "妻子", "subject_type": "人物"} 10 | {"object_type": "音乐专辑", "predicate": "所属专辑", "subject_type": "歌曲"} 11 | {"object_type": "Number", "predicate": "注册资本", "subject_type": "企业"} 12 | {"object_type": "城市", "predicate": "首都", "subject_type": "国家"} 13 | {"object_type": "人物", "predicate": "导演", "subject_type": "影视作品"} 14 | {"object_type": "Text", "predicate": "字", "subject_type": "历史人物"} 15 | {"object_type": "Number", "predicate": "身高", "subject_type": "人物"} 16 | {"object_type": "企业", "predicate": "出品公司", "subject_type": "影视作品"} 17 | {"object_type": "Number", "predicate": "修业年限", "subject_type": "学科专业"} 18 | {"object_type": "Date", "predicate": "出生日期", "subject_type": "人物"} 19 | {"object_type": "人物", "predicate": "制片人", "subject_type": "影视作品"} 20 | {"object_type": "人物", "predicate": "母亲", "subject_type": "人物"} 21 | {"object_type": "人物", "predicate": "编剧", "subject_type": "影视作品"} 22 | {"object_type": "国家", 
"predicate": "国籍", "subject_type": "人物"} 23 | {"object_type": "Number", "predicate": "海拔", "subject_type": "地点"} 24 | {"object_type": "网站", "predicate": "连载网站", "subject_type": "网络小说"} 25 | {"object_type": "人物", "predicate": "丈夫", "subject_type": "人物"} 26 | {"object_type": "Text", "predicate": "朝代", "subject_type": "历史人物"} 27 | {"object_type": "Text", "predicate": "民族", "subject_type": "人物"} 28 | {"object_type": "Text", "predicate": "号", "subject_type": "历史人物"} 29 | {"object_type": "出版社", "predicate": "出版社", "subject_type": "书籍"} 30 | {"object_type": "人物", "predicate": "主持人", "subject_type": "电视综艺"} 31 | {"object_type": "Text", "predicate": "专业代码", "subject_type": "学科专业"} 32 | {"object_type": "人物", "predicate": "歌手", "subject_type": "歌曲"} 33 | {"object_type": "人物", "predicate": "作词", "subject_type": "歌曲"} 34 | {"object_type": "人物", "predicate": "主角", "subject_type": "网络小说"} 35 | {"object_type": "人物", "predicate": "董事长", "subject_type": "企业"} 36 | {"object_type": "Date", "predicate": "成立日期", "subject_type": "机构"} 37 | {"object_type": "学校", "predicate": "毕业院校", "subject_type": "人物"} 38 | {"object_type": "Number", "predicate": "占地面积", "subject_type": "机构"} 39 | {"object_type": "语言", "predicate": "官方语言", "subject_type": "国家"} 40 | {"object_type": "Text", "predicate": "邮政编码", "subject_type": "行政区"} 41 | {"object_type": "Number", "predicate": "人口数量", "subject_type": "行政区"} 42 | {"object_type": "城市", "predicate": "所在城市", "subject_type": "景点"} 43 | {"object_type": "人物", "predicate": "作者", "subject_type": "图书作品"} 44 | {"object_type": "Date", "predicate": "成立日期", "subject_type": "企业"} 45 | {"object_type": "人物", "predicate": "作曲", "subject_type": "歌曲"} 46 | {"object_type": "气候", "predicate": "气候", "subject_type": "行政区"} 47 | {"object_type": "人物", "predicate": "嘉宾", "subject_type": "电视综艺"} 48 | {"object_type": "人物", "predicate": "主演", "subject_type": "影视作品"} 49 | {"object_type": "作品", "predicate": "改编自", "subject_type": "影视作品"} 50 | {"object_type": "人物", "predicate": "创始人", "subject_type": "企业"} -------------------------------------------------------------------------------- /examples/seq2seq/gpt2/train_gpt2_multi_chat.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | from torch.utils.data import Dataset 5 | from bert_seq2seq import Tokenizer 6 | from bert_seq2seq import load_model 7 | from bert_seq2seq import Trainer 8 | from bert_seq2seq.dataset import gpt_collate_fn 9 | from bert_seq2seq import Predictor 10 | import json 11 | 12 | model_name = "gpt2" # 选择模型名字 13 | task_name = "seq2seq" # 任务名字 14 | 15 | model_path = "../state_dict/gpt2/pytorch_model.bin" 16 | vocab_path = "../state_dict/gpt2/vocab.txt" 17 | model_save_path = "./gpt2_multi_chat_model.bin" # 训练好的模型保存位置。 18 | lr = 2e-5 19 | maxlen = 1024 20 | data_path = '../data/LCCC-base-split/LCCC-base_train.json' # 数据位置 21 | 22 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 23 | tokenizer = Tokenizer(vocab_path) 24 | model = load_model(tokenizer.vocab, model_name=model_name, task_name=task_name) 25 | model.load_pretrain_params(model_path) 26 | predictor = Predictor(model, tokenizer) 27 | 28 | trainer = Trainer(env_type="pytorch", 29 | epoches=5, 30 | val_every_step=500, 31 | device=device, 32 | batch_size=8, 33 | gradient_accmulation_step=8) 34 | 35 | def read_file(): 36 | ## 更换数据集只需要实现这个函数即可,返回框架需要的src 37 | 38 | with open(data_path) as f: 39 | data = json.loads(f.read()) 40 | 41 | return data 42 | 43 | data = read_file() 44 | print(data[:5]) 45 | 46 
| class ChatDataset(Dataset): 47 | """ 48 | 针对特定数据集,定义一个相关的取数据的方式 49 | """ 50 | def __init__(self, data) : 51 | ## 一般init函数是加载所有数据 52 | super().__init__() 53 | self.data = data 54 | 55 | def __getitem__(self, i): 56 | ## 得到单个数据 57 | # print(i) 58 | data = self.data[i] 59 | input_ids = [tokenizer.token_start_id] 60 | 61 | for index, text in enumerate(data): 62 | if (index + 1) % 2 == 1: 63 | text = "A:" + text 64 | else : 65 | text = "B:" + text 66 | 67 | text_ids = tokenizer.encode_plus(text, max_length=maxlen)["input_ids"][1:] 68 | input_ids.extend(text_ids) 69 | 70 | output = { 71 | "input_ids": input_ids, 72 | } 73 | return output 74 | 75 | def __len__(self): 76 | 77 | return len(self.data) 78 | 79 | class Evaluator: 80 | 81 | def on_validation(self, data): 82 | loss = data["loss"] 83 | step = data["iteration"] 84 | ## 自己定义validate函数实现,十分灵活。 85 | test_data = [["A:今天天气很好,你觉得呢?"], 86 | ["A:我去吃了火锅。"], 87 | ["A:我去吃了火锅。", "B:我也是,真不错,你吃的哪家?"] 88 | ] 89 | for text in test_data: 90 | print(predictor.predict_multi_response(text, 91 | input_max_length=200, 92 | out_max_length=40, 93 | top_k=30, top_p=0.9, 94 | repetition_penalty=1.2, 95 | temperature=1.2)) 96 | 97 | torch.save(model.state_dict(), model_save_path) 98 | print(f"模型保存成功~") 99 | 100 | def main(): 101 | ## 加载数据 102 | data = read_file() 103 | 104 | optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=1e-3) 105 | train_dataset = ChatDataset(data) 106 | 107 | trainer.train(model, optimizer, train_dataset=train_dataset, evaluator=Evaluator, 108 | collate_fn=gpt_collate_fn) 109 | 110 | if __name__ == '__main__': 111 | main() 112 | -------------------------------------------------------------------------------- /data/relationship_extraction/dev_data.json: -------------------------------------------------------------------------------- 1 | {"postag": [{"word": "查尔斯", "pos": "nr"}, {"word": "·", "pos": "w"}, {"word": "阿兰基斯", "pos": "nr"}, {"word": "(", "pos": "w"}, {"word": "Charles Aránguiz", "pos": "nz"}, {"word": ")", "pos": "w"}, {"word": ",", "pos": "w"}, {"word": "1989年4月17日", "pos": "t"}, {"word": "出生", "pos": "v"}, {"word": "于", "pos": "p"}, {"word": "智利圣地亚哥", "pos": "ns"}, {"word": ",", "pos": "w"}, {"word": "智利", "pos": "ns"}, {"word": "职业", "pos": "n"}, {"word": "足球", "pos": "n"}, {"word": "运动员", "pos": "n"}, {"word": ",", "pos": "w"}, {"word": "司职", "pos": "v"}, {"word": "中场", "pos": "n"}, {"word": ",", "pos": "w"}, {"word": "效力", "pos": "v"}, {"word": "于", "pos": "p"}, {"word": "德国", "pos": "ns"}, {"word": "足球", "pos": "n"}, {"word": "甲级", "pos": "a"}, {"word": "联赛", "pos": "n"}, {"word": "勒沃库森足球俱乐部", "pos": "nt"}], "text": "查尔斯·阿兰基斯(Charles Aránguiz),1989年4月17日出生于智利圣地亚哥,智利职业足球运动员,司职中场,效力于德国足球甲级联赛勒沃库森足球俱乐部", "spo_list": [{"predicate": "出生地", "object_type": "地点", "subject_type": "人物", "object": "圣地亚哥", "subject": "查尔斯·阿兰基斯"}, {"predicate": "出生日期", "object_type": "Date", "subject_type": "人物", "object": "1989年4月17日", "subject": "查尔斯·阿兰基斯"}]} 2 | {"postag": [{"word": "《", "pos": "w"}, {"word": "离开", "pos": "nw"}, {"word": "》", "pos": "w"}, {"word": "是", "pos": "v"}, {"word": "由", "pos": "p"}, {"word": "张宇", "pos": "nr"}, {"word": "谱曲", "pos": "v"}, {"word": ",", "pos": "w"}, {"word": "演唱", "pos": "v"}], "text": "《离开》是由张宇谱曲,演唱", "spo_list": [{"predicate": "歌手", "object_type": "人物", "subject_type": "歌曲", "object": "张宇", "subject": "离开"}, {"predicate": "作曲", "object_type": "人物", "subject_type": "歌曲", "object": "张宇", "subject": "离开"}]} 3 | {"postag": [{"word": "《", "pos": "w"}, {"word": "愤怒的唐僧", "pos": "nw"}, {"word": "》", 
"pos": "w"}, {"word": "由", "pos": "p"}, {"word": "北京吴意波影视文化工作室", "pos": "nt"}, {"word": "与", "pos": "p"}, {"word": "优酷", "pos": "nt"}, {"word": "电视剧", "pos": "n"}, {"word": "频道", "pos": "n"}, {"word": "联合", "pos": "vd"}, {"word": "制作", "pos": "v"}, {"word": ",", "pos": "w"}, {"word": "故事", "pos": "n"}, {"word": "以", "pos": "p"}, {"word": "喜剧", "pos": "n"}, {"word": "元素", "pos": "n"}, {"word": "为主", "pos": "v"}, {"word": ",", "pos": "w"}, {"word": "讲述", "pos": "v"}, {"word": "唐僧", "pos": "nr"}, {"word": "与", "pos": "c"}, {"word": "佛祖", "pos": "n"}, {"word": "打牌", "pos": "v"}, {"word": ",", "pos": "w"}, {"word": "得罪", "pos": "v"}, {"word": "了", "pos": "u"}, {"word": "佛祖", "pos": "n"}, {"word": ",", "pos": "w"}, {"word": "被", "pos": "p"}, {"word": "踢", "pos": "v"}, {"word": "下", "pos": "v"}, {"word": "人间", "pos": "n"}, {"word": "再", "pos": "d"}, {"word": "渡", "pos": "v"}, {"word": "九九八十一难", "pos": "nz"}, {"word": "的", "pos": "u"}, {"word": "故事", "pos": "n"}], "text": "《愤怒的唐僧》由北京吴意波影视文化工作室与优酷电视剧频道联合制作,故事以喜剧元素为主,讲述唐僧与佛祖打牌,得罪了佛祖,被踢下人间再渡九九八十一难的故事", "spo_list": [{"predicate": "出品公司", "object_type": "企业", "subject_type": "影视作品", "object": "北京吴意波影视文化工作室", "subject": "愤怒的唐僧"}, {"predicate": "导演", "object_type": "人物", "subject_type": "影视作品", "object": "吴意波", "subject": "愤怒的唐僧"}]} 4 | {"postag": [{"word": "李治", "pos": "nr"}, {"word": "即位", "pos": "v"}, {"word": "后", "pos": "f"}, {"word": ",", "pos": "w"}, {"word": "萧淑妃", "pos": "nr"}, {"word": "受宠", "pos": "v"}, {"word": ",", "pos": "w"}, {"word": "王皇后", "pos": "nr"}, {"word": "为了", "pos": "p"}, {"word": "排挤", "pos": "v"}, {"word": "萧淑妃", "pos": "nr"}, {"word": ",", "pos": "w"}, {"word": "答应", "pos": "v"}, {"word": "李治", "pos": "nr"}, {"word": "让", "pos": "v"}, {"word": "身", "pos": "n"}, {"word": "在", "pos": "v"}, {"word": "感业寺", "pos": "ns"}, {"word": "的", "pos": "u"}, {"word": "武则天", "pos": "nr"}, {"word": "续", "pos": "v"}, {"word": "起", "pos": "v"}, {"word": "头发", "pos": "n"}, {"word": ",", "pos": "w"}, {"word": "重新", "pos": "d"}, {"word": "纳入", "pos": "v"}, {"word": "后宫", "pos": "n"}], "text": "李治即位后,萧淑妃受宠,王皇后为了排挤萧淑妃,答应李治让身在感业寺的武则天续起头发,重新纳入后宫", "spo_list": [{"predicate": "妻子", "object_type": "人物", "subject_type": "人物", "object": "萧淑妃", "subject": "李治"}, {"predicate": "丈夫", "object_type": "人物", "subject_type": "人物", "object": "李治", "subject": "萧淑妃"}]} 5 | -------------------------------------------------------------------------------- /examples/seq2seq/bert/train_roberta_auto_title.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import torch 4 | from torch.utils.data import Dataset, DataLoader 5 | from bert_seq2seq import Tokenizer 6 | from bert_seq2seq import load_model 7 | from bert_seq2seq import Trainer 8 | from bert_seq2seq.dataset import bert_seq2seq_collate_fn 9 | from bert_seq2seq import Predictor 10 | 11 | model_name = "roberta" # 选择模型名字 12 | task_name = "seq2seq" # 任务名字 13 | 14 | vocab_path = "../state_dict/roberta/vocab.txt" # roberta模型字典的位置 15 | model_path = "../state_dict/roberta/pytorch_model.bin" # 预训练模型位置 16 | model_save_path = "./roberta_auto_title_model.bin" # 训练好的模型保存位置。 17 | 18 | lr = 1e-5 19 | maxlen=256 20 | src_dir = '../data/auto_title/train.src' # 数据位置 21 | tgt_dir = '../data/auto_title/train.tgt' 22 | 23 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 24 | tokenizer = Tokenizer(vocab_path) 25 | bert_model = load_model(tokenizer.vocab, model_name=model_name, task_name=task_name) 26 | bert_model.load_pretrain_params(model_path) 27 | predictor = 
Predictor(bert_model, tokenizer) 28 | 29 | trainer = Trainer(env_type="pytorch", 30 | epoches=5, 31 | val_every_step=500, 32 | device=device, 33 | batch_size=16) 34 | 35 | def read_file(): 36 | ## 更换数据集只需要实现这个函数即可,返回框架需要的src、tgt 37 | src = [] 38 | tgt = [] 39 | 40 | with open(src_dir,'r',encoding='utf-8') as f: 41 | lines = f.readlines() 42 | for line in lines: 43 | src.append(line.strip('\n').lower()) 44 | 45 | with open(tgt_dir,'r',encoding='utf-8') as f: 46 | lines = f.readlines() 47 | for line in lines: 48 | tgt.append(line.strip('\n').lower()) 49 | 50 | return src,tgt 51 | 52 | class AutoTitleDataset(Dataset): 53 | """ 54 | 针对特定数据集,定义一个相关的取数据的方式 55 | """ 56 | def __init__(self, sents_src, sents_tgt) : 57 | ## 一般init函数是加载所有数据 58 | super().__init__() 59 | self.sents_src = sents_src 60 | self.sents_tgt = sents_tgt 61 | 62 | def __getitem__(self, i): 63 | ## 得到单个数据 64 | # print(i) 65 | src = self.sents_src[i] 66 | tgt = self.sents_tgt[i] 67 | tokenizer_out = tokenizer.encode_plus(src, tgt, max_length=maxlen) 68 | 69 | output = { 70 | "input_ids": tokenizer_out["input_ids"], 71 | "token_type_ids": tokenizer_out["token_type_ids"], 72 | } 73 | return output 74 | 75 | def __len__(self): 76 | 77 | return len(self.sents_src) 78 | 79 | class Evaluator: 80 | 81 | def on_validation(self, data): 82 | loss = data["loss"] 83 | step = data["iteration"] 84 | ## 自己定义validate函数实现,十分灵活。 85 | test_data = ["本文总结了十个可穿戴产品的设计原则而这些原则同样也是笔者认为是这个行业最吸引人的地方1为人们解决重复性问题2从人开始而不是从机器开始3要引起注意但不要刻意4提升用户能力而不是取代人", 86 | "2007年乔布斯向人们展示iPhone并宣称它将会改变世界还有人认为他在夸大其词然而在8年后以iPhone为代表的触屏智能手机已经席卷全球各个角落未来智能手机将会成为真正的个人电脑为人类发展做出更大的贡献", 87 | "雅虎发布2014年第四季度财报并推出了免税方式剥离其持有的阿里巴巴集团15%股权的计划打算将这一价值约400亿美元的宝贵投资分配给股东截止发稿前雅虎股价上涨了大约7%至5145美元"] 88 | for text in test_data: 89 | print(predictor.predict_generate_beamsearch(text, beam_size=3, input_max_length=200, out_max_length=40)) 90 | 91 | torch.save(bert_model.state_dict(), model_save_path) 92 | print(f"模型保存成功~") 93 | 94 | def main(): 95 | ## 加载数据 96 | all_src, all_tgt = read_file() 97 | train_size = int(len(all_src) * 0.9) 98 | train_src, train_tgt = all_src[:train_size], all_tgt[:train_size] 99 | # 声明需要优化的参数 100 | optimizer = torch.optim.Adam(bert_model.parameters(), lr=lr, weight_decay=1e-3) 101 | train_dataset = AutoTitleDataset(train_src, train_tgt) 102 | 103 | trainer.train(bert_model, optimizer, train_dataset=train_dataset, evaluator=Evaluator, 104 | collate_fn=bert_seq2seq_collate_fn) 105 | 106 | if __name__ == '__main__': 107 | main() 108 | -------------------------------------------------------------------------------- /examples/seq2seq/bert/train_roberta_auto_title_multi_gpu.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.data import Dataset, DataLoader 3 | from bert_seq2seq import Tokenizer 4 | from bert_seq2seq import load_model 5 | from bert_seq2seq import Trainer 6 | from bert_seq2seq.dataset import bert_seq2seq_collate_fn 7 | from bert_seq2seq import Predictor 8 | import os 9 | 10 | model_name = "roberta" # 选择模型名字 11 | task_name = "seq2seq" # 任务名字 12 | 13 | vocab_path = "../state_dict/roberta/vocab.txt" # roberta模型字典的位置 14 | model_path = "../state_dict/roberta/pytorch_model.bin" # 预训练模型位置 15 | model_save_path = "./roberta_auto_title_model.bin" # 训练好的模型保存位置。 16 | lr = 1e-5 17 | maxlen=256 18 | src_dir = '../data/auto_title/train.src' # 数据位置 19 | tgt_dir = '../data/auto_title/train.tgt' 20 | 21 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 22 | 23 | # 
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" 24 | 25 | num_gpus = 4 # gpu个数 26 | num_nodes = 1 ## 机器个数 目前只支持1 ,多机待测试。 27 | trainer = Trainer(env_type="DDP", 28 | epoches=5, 29 | val_every_step=500, 30 | device=device, 31 | batch_size=16, 32 | num_gpus=num_gpus, 33 | num_nodes=num_nodes, 34 | training_script=__file__, 35 | ) 36 | 37 | tokenizer = Tokenizer(vocab_path) 38 | bert_model = load_model(tokenizer.vocab, model_name=model_name, task_name=task_name) 39 | bert_model.load_pretrain_params(model_path) 40 | predictor = Predictor(bert_model, tokenizer) 41 | 42 | def read_file(): 43 | ## 更换数据集只需要实现这个函数即可,返回框架需要的src、tgt 44 | src = [] 45 | tgt = [] 46 | 47 | with open(src_dir,'r',encoding='utf-8') as f: 48 | lines = f.readlines() 49 | for line in lines: 50 | src.append(line.strip('\n').lower()) 51 | 52 | with open(tgt_dir,'r',encoding='utf-8') as f: 53 | lines = f.readlines() 54 | for line in lines: 55 | tgt.append(line.strip('\n').lower()) 56 | 57 | return src,tgt 58 | 59 | class AutoTitleDataset(Dataset): 60 | """ 61 | 针对特定数据集,定义一个相关的取数据的方式 62 | """ 63 | def __init__(self, sents_src, sents_tgt) : 64 | ## 一般init函数是加载所有数据 65 | super().__init__() 66 | self.sents_src = sents_src 67 | self.sents_tgt = sents_tgt 68 | 69 | def __getitem__(self, i): 70 | ## 得到单个数据 71 | # print(i) 72 | src = self.sents_src[i] 73 | tgt = self.sents_tgt[i] 74 | tokenizer_out = tokenizer.encode_plus(src, tgt, max_length=maxlen) 75 | 76 | output = { 77 | "input_ids": tokenizer_out["input_ids"], 78 | "token_type_ids": tokenizer_out["token_type_ids"], 79 | } 80 | return output 81 | 82 | def __len__(self): 83 | 84 | return len(self.sents_src) 85 | 86 | class Evaluator: 87 | 88 | def on_validation(self, data): 89 | loss = data["loss"] 90 | step = data["iteration"] 91 | ## 自己定义validate函数实现,十分灵活。 92 | test_data = ["本文总结了十个可穿戴产品的设计原则而这些原则同样也是笔者认为是这个行业最吸引人的地方1为人们解决重复性问题2从人开始而不是从机器开始3要引起注意但不要刻意4提升用户能力而不是取代人", 93 | "2007年乔布斯向人们展示iPhone并宣称它将会改变世界还有人认为他在夸大其词然而在8年后以iPhone为代表的触屏智能手机已经席卷全球各个角落未来智能手机将会成为真正的个人电脑为人类发展做出更大的贡献", 94 | "雅虎发布2014年第四季度财报并推出了免税方式剥离其持有的阿里巴巴集团15%股权的计划打算将这一价值约400亿美元的宝贵投资分配给股东截止发稿前雅虎股价上涨了大约7%至5145美元"] 95 | for text in test_data: 96 | print(predictor.predict_generate_beamsearch(text, beam_size=3, input_max_length=200, out_max_length=40)) 97 | 98 | torch.save(bert_model.state_dict(), model_save_path) 99 | print(f"模型保存成功~") 100 | 101 | 102 | def main(): 103 | ## 加载数据 104 | all_src, all_tgt = read_file() 105 | train_size = int(len(all_src) * 0.9) 106 | train_src, train_tgt = all_src[:train_size], all_tgt[:train_size] 107 | # 声明需要优化的参数 108 | optimizer = torch.optim.Adam(bert_model.parameters(), lr=lr, weight_decay=1e-3) 109 | train_dataset = AutoTitleDataset(train_src, train_tgt) 110 | 111 | trainer.train(bert_model, optimizer, train_dataset=train_dataset, evaluator=Evaluator, 112 | collate_fn=bert_seq2seq_collate_fn) 113 | 114 | if __name__ == '__main__': 115 | main() 116 | -------------------------------------------------------------------------------- /bert_seq2seq/task/sequence_labeling/bert_sequence_labeling.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | from bert_seq2seq.basic_bert import BasicBert 3 | from bert_seq2seq.layers import GlobalPointer, CRFLayer 4 | 5 | class BertSequenceLabling(BasicBert): 6 | """ 7 | """ 8 | def __init__(self, vocab, 9 | target_size, 10 | model_name="roberta", 11 | size="base", 12 | **kwargs): 13 | super(BertSequenceLabling, self).__init__(word2ix=vocab, model_name=model_name, size=size) 14 | self.cls = None 15 | 
self.layer_norm_cond = None 16 | self.target_size = target_size 17 | self.final_dense = nn.Linear(self.config.hidden_size, target_size) 18 | 19 | def compute_loss(self, predictions, labels): 20 | """ 21 | 计算loss 22 | predictions: (batch_size, 1) 23 | """ 24 | predictions = predictions.view(-1, self.target_size) 25 | labels = labels.view(-1) 26 | loss = nn.CrossEntropyLoss(reduction="mean") 27 | return loss(predictions, labels) 28 | 29 | def forward(self, **data): 30 | 31 | input_ids = data["input_ids"] 32 | token_type_ids = data.get("token_type_ids", None) 33 | labels = data.get("labels", None) 34 | 35 | all_layers, pooled_out = self.bert(input_ids, token_type_ids=token_type_ids, 36 | output_all_encoded_layers=True) 37 | 38 | sequence_out = all_layers[-1] 39 | predictions = self.final_dense(sequence_out) 40 | 41 | return_data = {"logits": predictions, } 42 | 43 | if labels is not None: 44 | ## 计算loss 45 | loss = self.compute_loss(predictions, labels) 46 | return_data["loss"] = loss 47 | 48 | return return_data 49 | 50 | class BertNERGP(BasicBert): 51 | """ 52 | """ 53 | def __init__(self, vocab, ent_type_size, inner_dim=64, size="base", model_name="roberta", **kwargs): 54 | super(BertNERGP, self).__init__(word2ix=vocab, model_name=model_name, size=size) 55 | self.gp = GlobalPointer(self.config.hidden_size, ent_type_size, inner_dim, RoPE=True) 56 | self.layer_norm_cond = None 57 | self.cls = None 58 | def compute_loss(self, logits, labels): 59 | pass 60 | 61 | def forward(self, **data): 62 | input_ids = data["input_ids"] 63 | token_type_ids = data.get("token_type_ids", None) 64 | padding_mask = (input_ids > 0).float() 65 | labels = data.get("labels", None) 66 | 67 | all_layers, _ = self.bert(input_ids, token_type_ids=token_type_ids, 68 | output_all_encoded_layers=True) 69 | sequence_out = all_layers[-1] 70 | 71 | gp_out = self.gp(sequence_out, padding_mask) 72 | return_data = {"logits": gp_out, } 73 | 74 | if labels is not None: 75 | return_data["loss"] = self.gp.compute_loss(gp_out, labels) 76 | return return_data 77 | 78 | class BertNERCRF(BasicBert): 79 | """ 80 | """ 81 | def __init__(self, vocab, target_size=-1, size="base", model_name="roberta", **kwargs): 82 | super(BertNERCRF, self).__init__(word2ix=vocab, model_name=model_name, size=size,) 83 | self.layer_norm_cond = None 84 | self.cls = None 85 | self.final_dense = nn.Linear(self.config.hidden_size, target_size) 86 | self.crf_layer = CRFLayer(target_size) 87 | 88 | def compute_loss(self, logits, labels, target_mask): 89 | loss = self.crf_layer(logits, labels, target_mask) 90 | 91 | return loss.mean() 92 | 93 | def forward(self, **data): 94 | input_ids = data["input_ids"] 95 | token_type_ids = data.get("token_type_ids", None) 96 | padding_mask = (input_ids > 0).float() 97 | labels = data.get("labels", None) 98 | 99 | all_layers, _ = self.bert(input_ids, token_type_ids=token_type_ids, 100 | output_all_encoded_layers=True) 101 | sequence_out = all_layers[-1] 102 | 103 | predictions = self.final_dense(sequence_out) 104 | 105 | return_data = {"logits": predictions, } 106 | 107 | if labels is not None: 108 | ## 计算loss 109 | return_data["loss"] = self.compute_loss(predictions, labels, padding_mask) 110 | 111 | return return_data -------------------------------------------------------------------------------- /examples/text_classification/train_roberta_large_news_title_classification.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.data import Dataset 3 | from 
bert_seq2seq import Tokenizer
  4 | from bert_seq2seq import load_model
  5 | from bert_seq2seq.dataset import bert_cls_collate_fn
  6 | from bert_seq2seq.trainer import Trainer
  7 | from sklearn.model_selection import train_test_split
  8 | from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
  9 | from bert_seq2seq import Predictor
 10 | import os
 11 | 
 12 | target = ["财经", "彩票", "房产", "股票", "家居", "教育", "科技", "社会", "时尚", "时政", "体育", "星座", "游戏", "娱乐"]
 13 | train_path = "../data/新闻标题文本分类/Train.txt"
 14 | model_name = "roberta" # 选择模型名字
 15 | task_name = "classification"
 16 | vocab_path = "../state_dict/roberta/vocab.txt"  # roberta模型字典的位置
 17 | model_path = "../state_dict/roberta/pytorch_model.bin"  # roberta模型位置
 18 | model_save_path = "./bert_news_title_classification.bin"
 19 | batch_size = 16
 20 | lr = 1e-5
 21 | # 加载字典
 22 | tokenizer = Tokenizer(vocab_path)
 23 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 24 | 
 25 | trainer = Trainer(epoches=3, val_every_step=500, batch_size=16, env_type="pytorch",
 26 |                   device=device,)
 27 | 
 28 | bert_model = load_model(tokenizer.vocab,
 29 |                         model_name=model_name,
 30 |                         task_name=task_name,
 31 |                         size="large",
 32 |                         target_size=len(target))
 33 | ## 加载预训练的模型参数~
 34 | bert_model.load_pretrain_params(model_path)
 35 | predictor = Predictor(bert_model, tokenizer)
 36 | 
 37 | def read_corpus():
 38 |     """
 39 |     读原始数据
 40 |     """
 41 |     sents_src = []
 42 |     sents_tgt = []
 43 | 
 44 |     with open(train_path) as f:
 45 |         lines = f.readlines()
 46 |         for line in lines:
 47 |             line = line.split("\t")
 48 |             sents_tgt.append(int(line[0]))
 49 |             sents_src.append(line[2])
 50 | 
 51 |     return sents_src, sents_tgt
 52 | 
 53 | ## 加载数据
 54 | all_input, all_label = read_corpus()
 55 | train_input, val_input, train_label, val_label = train_test_split(all_input, all_label, train_size=0.8, random_state=123)
 56 | 
 57 | 
 58 | ## 自定义dataset
 59 | class ClassificationDataset(Dataset):
 60 |     """
 61 |     针对特定数据集,定义一个相关的取数据的方式
 62 |     """
 63 |     def __init__(self, sents_src, sents_tgt):
 64 |         super(ClassificationDataset, self).__init__()
 65 |         # 读原始数据
 66 |         self.sents_src = sents_src
 67 |         self.sents_tgt = sents_tgt
 68 | 
 69 |     def __getitem__(self, i):
 70 |         ## 得到单个数据
 71 |         src = self.sents_src[i]
 72 |         tgt = self.sents_tgt[i]
 73 |         tokenizer_out = tokenizer.encode_plus(src)
 74 | 
 75 |         output = {
 76 |             "input_ids": tokenizer_out["input_ids"],
 77 |             "token_type_ids": tokenizer_out["token_type_ids"],
 78 |             "labels": tgt
 79 |         }
 80 |         return output
 81 | 
 82 |     def __len__(self):
 83 |         return len(self.sents_src)
 84 | 
 85 | def validate():
 86 |     res = []
 87 |     for data in val_input:
 88 |         pred = predictor.predict_cls_classifier(data)
 89 |         pred = pred.argmax(dim=1).numpy()
 90 |         res.append(pred)
 91 | 
 92 |     f1 = f1_score(val_label, res, average="macro")
 93 |     accuracy = accuracy_score(val_label, res)
 94 |     recall = recall_score(val_label, res, average="macro")
 95 |     precision = precision_score(val_label, res, average="macro")
 96 | 
 97 |     print(f" f1 is {f1}, acc is {accuracy}, recall is {recall} precision is {precision}")
 98 |     return accuracy
 99 | 
100 | class Evaluator:
101 |     def __init__(self):
102 |         self.best_acc = 0.0
103 | 
104 |     def on_validation(self, data):
105 |         loss = data["loss"]
106 |         step = data["iteration"]
107 |         acc = validate()
108 |         if acc > self.best_acc:
109 |             self.best_acc = acc
110 |             torch.save(bert_model.state_dict(), model_save_path)
111 |             print(f"模型保存成功~")
112 | 
113 | 
114 | def main():
115 | 
116 |     # 声明需要优化的参数
117 |     optimizer = torch.optim.Adam(bert_model.parameters(), lr=lr, weight_decay=1e-3)
118 |     train_dataset = ClassificationDataset(train_input,
train_label) 119 | 120 | trainer.train(bert_model, optimizer=optimizer, 121 | train_dataset=train_dataset, 122 | evaluator=Evaluator, 123 | collate_fn=bert_cls_collate_fn) 124 | 125 | if __name__ == '__main__': 126 | main() 127 | -------------------------------------------------------------------------------- /examples/text_classification/train_roberta_semantic_matching.py: -------------------------------------------------------------------------------- 1 | # https://tianchi.aliyun.com/competition/entrance/531851/information 2 | import torch 3 | from torch.utils.data import Dataset, DataLoader 4 | from bert_seq2seq import Tokenizer 5 | from bert_seq2seq import load_model 6 | from bert_seq2seq.dataset import bert_cls_collate_fn 7 | from bert_seq2seq.trainer import Trainer 8 | from sklearn.model_selection import train_test_split 9 | from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score 10 | from bert_seq2seq import Predictor 11 | import os 12 | 13 | target = [0, 1] 14 | train_path = "../data/语义匹配/train.tsv" 15 | model_name = "roberta" # 选择模型名字 16 | task_name = "classification" 17 | vocab_path = "../state_dict/roberta/vocab.txt" # roberta模型字典的位置 18 | model_path = "../state_dict/roberta/pytorch_model.bin" # roberta模型位置 19 | model_save_path = "./bert_semantic_matching.bin" 20 | batch_size = 16 21 | lr = 1e-5 22 | # 加载字典 23 | tokenizer = Tokenizer(vocab_path) 24 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 25 | 26 | trainer = Trainer(epoches=3, 27 | val_every_step=100, 28 | batch_size=16, 29 | env_type="pytorch", 30 | device=device) 31 | 32 | bert_model = load_model(tokenizer.vocab, model_name=model_name, task_name=task_name, target_size=len(target)) 33 | ## 加载预训练的模型参数~ 34 | bert_model.load_pretrain_params(model_path) 35 | # 声明需要优化的参数 36 | predictor = Predictor(bert_model, tokenizer) 37 | 38 | def read_corpus(data_path): 39 | """ 40 | 读原始数据 41 | """ 42 | sents_src = [] 43 | sents_tgt = [] 44 | 45 | with open(data_path) as f: 46 | lines = f.readlines() 47 | for line in lines: 48 | line = line.split("\t") 49 | if len(line) == 3: 50 | sents_tgt.append(int(line[2])) 51 | sents_src.append(line[0] + "#" +line[1]) 52 | 53 | return sents_src, sents_tgt 54 | 55 | ## 加载数据 56 | all_input, all_label = read_corpus(train_path) 57 | train_input, val_input, train_label, val_label = train_test_split(all_input, all_label, train_size=0.8, random_state=123) 58 | 59 | 60 | ## 自定义dataset 61 | class SemanticMatchingDataset(Dataset): 62 | """ 63 | 针对特定数据集,定义一个相关的取数据的方式 64 | """ 65 | def __init__(self, sents_src, sents_tgt) : 66 | ## 一般init函数是加载所有数据 67 | super(SemanticMatchingDataset, self).__init__() 68 | # 读原始数据 69 | self.sents_src = sents_src 70 | self.sents_tgt = sents_tgt 71 | 72 | def __getitem__(self, i): 73 | ## 得到单个数据 74 | # print(i) 75 | src = self.sents_src[i] 76 | tgt = self.sents_tgt[i] 77 | tokenizer_out = tokenizer.encode_plus(src) 78 | 79 | output = { 80 | "input_ids": tokenizer_out["input_ids"], 81 | "token_type_ids": tokenizer_out["token_type_ids"], 82 | "labels": tgt 83 | } 84 | return output 85 | 86 | def __len__(self): 87 | return len(self.sents_src) 88 | 89 | class Evaluator: 90 | def __init__(self): 91 | self.best_acc = 0.0 92 | 93 | def on_validation(self, data): 94 | loss = data["loss"] 95 | step = data["iteration"] 96 | res = [] 97 | for data in val_input: 98 | pred = predictor.predict_cls_classifier(data) 99 | pred = pred.argmax(dim=0).numpy() 100 | res.append(pred) 101 | 102 | f1 = f1_score(val_label, res) 103 | accuracy = 
accuracy_score(val_label, res) 104 | recall = recall_score(val_label, res) 105 | precision = precision_score(val_label, res) 106 | 107 | print(f" f1 is {f1}, acc is {accuracy}, recall is {recall} precision is {precision}") 108 | 109 | if accuracy > self.best_acc: 110 | self.best_acc = accuracy 111 | torch.save(bert_model.state_dict(), model_save_path) 112 | print(f"模型保存成功~") 113 | 114 | 115 | def main(): 116 | 117 | optimizer = torch.optim.Adam(bert_model.parameters(), lr=lr, weight_decay=1e-3) 118 | train_dataset = SemanticMatchingDataset(train_input, train_label) 119 | 120 | trainer.train(bert_model, optimizer=optimizer, 121 | train_dataset=train_dataset, 122 | evaluator=Evaluator, 123 | collate_fn=bert_cls_collate_fn) 124 | 125 | if __name__ == '__main__': 126 | main() 127 | -------------------------------------------------------------------------------- /examples/text_classification/train_roberta_news_title_classification.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.data import Dataset 3 | from bert_seq2seq import Tokenizer 4 | from bert_seq2seq import load_model 5 | from bert_seq2seq.dataset import bert_cls_collate_fn 6 | from bert_seq2seq.trainer import Trainer 7 | from sklearn.model_selection import train_test_split 8 | from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score 9 | from bert_seq2seq import Predictor 10 | import os 11 | from tqdm import tqdm 12 | 13 | target = ["财经", "彩票", "房产", "股票", "家居", "教育", "科技", "社会", "时尚", "时政", "体育", "星座", "游戏", "娱乐"] 14 | train_path = "../data/新闻标题文本分类/Train.txt" 15 | model_name = "roberta" # 选择模型名字 16 | task_name = "classification" 17 | vocab_path = "../state_dict/roberta/vocab.txt" # roberta模型字典的位置 18 | model_path = "../state_dict/roberta/pytorch_model.bin" # roberta模型位置 19 | model_save_path = "./bert_news_title_classification.bin" 20 | batch_size = 16 21 | lr = 1e-5 22 | # 加载字典 23 | tokenizer = Tokenizer(vocab_path) 24 | device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu") 25 | 26 | trainer = Trainer(epoches=3, 27 | val_every_step=10000, 28 | batch_size=16, 29 | env_type="pytorch", 30 | device=device) 31 | bert_model = load_model(tokenizer.vocab, 32 | model_name=model_name, 33 | task_name=task_name, 34 | target_size=len(target)) 35 | ## 加载预训练的模型参数~ 36 | bert_model.load_pretrain_params(model_path) 37 | predictor = Predictor(bert_model, tokenizer) 38 | 39 | def read_corpus(): 40 | """ 41 | 读原始数据 42 | """ 43 | sents_src = [] 44 | sents_tgt = [] 45 | 46 | with open(train_path) as f: 47 | lines = f.readlines() 48 | for line in lines: 49 | line = line.split("\t") 50 | sents_tgt.append(int(line[0])) 51 | sents_src.append(line[2]) 52 | 53 | return sents_src, sents_tgt 54 | 55 | ## 加载数据 56 | all_input, all_label = read_corpus() 57 | train_input, val_input, train_label, val_label = train_test_split(all_input, all_label, train_size=0.8, random_state=123) 58 | 59 | 60 | ## 自定义dataset 61 | class ClassificationDataset(Dataset): 62 | """ 63 | 针对特定数据集,定义一个相关的取数据的方式 64 | """ 65 | def __init__(self, sents_src, sents_tgt) : 66 | super(ClassificationDataset, self).__init__() 67 | # 读原始数据 68 | self.sents_src = sents_src 69 | self.sents_tgt = sents_tgt 70 | 71 | def __getitem__(self, i): 72 | ## 得到单个数据 73 | src = self.sents_src[i] 74 | tgt = self.sents_tgt[i] 75 | tokenizer_out = tokenizer.encode_plus(src) 76 | 77 | output = { 78 | "input_ids": tokenizer_out["input_ids"], 79 | "token_type_ids": tokenizer_out["token_type_ids"], 80 | 
"labels": tgt 81 | } 82 | return output 83 | 84 | def __len__(self): 85 | return len(self.sents_src) 86 | 87 | def validate(): 88 | res = [] 89 | for data in tqdm(val_input, total=len(val_input)): 90 | pred = predictor.predict_cls_classifier(data) 91 | pred = pred.argmax(dim=0).numpy() 92 | res.append(pred) 93 | 94 | f1 = f1_score(val_label, res, average="macro") 95 | accuracy = accuracy_score(val_label, res) 96 | recall = recall_score(val_label, res, average="macro") 97 | precision = precision_score(val_label, res, average="macro") 98 | 99 | print(f" f1 is {f1}, acc is {accuracy}, recall is {recall} precision is {precision}") 100 | return accuracy 101 | 102 | class Evaluator: 103 | def __init__(self): 104 | self.best_acc = 0.0 105 | 106 | def on_epoch_end(self): 107 | acc = validate() 108 | if acc > self.best_acc: 109 | self.best_acc = acc 110 | torch.save(bert_model.state_dict(), model_save_path) 111 | print(f"模型保存成功~") 112 | 113 | def on_validation(self, data): 114 | loss = data["loss"] 115 | step = data["iteration"] 116 | pass 117 | 118 | def main(): 119 | 120 | # 声明需要优化的参数 121 | optimizer = torch.optim.Adam(bert_model.parameters(), lr=lr, weight_decay=1e-3) 122 | train_dataset = ClassificationDataset(train_input, train_label) 123 | 124 | trainer.train(bert_model, optimizer=optimizer, 125 | train_dataset=train_dataset, 126 | evaluator=Evaluator, 127 | collate_fn=bert_cls_collate_fn) 128 | 129 | if __name__ == '__main__': 130 | main() 131 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # bert_seq2seq_DDP 2 | bert_seq2seq的DDP(分布式训练)版本。 3 | 此项目是对bert_seq2seq项目的重构并且很好的支持pytorch的DDP多卡训练。examples里面是各种训练例子,data中是样例数据。 4 | 5 | 本项目可以轻松调用不同种类transformer结构的模型(Bert、Roberta、T5、Nezha、Bart等)针对不同的任务(生成、序列标注、文本分类、关系抽取、命名实体识别等)进行快速的训练、预测,并且无缝进行分布式(DDP)训练。 6 | 7 | **一个不同的数据集,只需要花5-10分钟修改好构建输入输出的函数,即可快速开始训练!** 8 | #### 欢迎加入交流群~ 可以提问题,提建议,互相交流,还会提供部分数据与模型的下载 QQ群: 975907202 微信群: w11267191 加好友拉入群~ 9 | 10 | 11 | 更多关于bert_seq2seq相关的内容请看:https://github.com/920232796/bert_seq2seq 12 | 13 | ### 项目特点一: 14 | 单卡训练与多卡训练方式相同,无需添加额外代码和使用额外命令运行。 15 | 16 | 单卡与多卡的运行方式均为: 17 | ```shell 18 | python "./train.py" ## train.py为example中以train开头的训练脚本文件 19 | ``` 20 | 切换多卡训练只需要修改 ```train.py``` 文件中的环境设置即可: 21 | 22 | ```python 23 | num_gpus = 4 # gpu个数 24 | num_nodes = 1 ## 机器个数 目前只支持1 ,多机待测试。 25 | trainer = Trainer(env_type="DDP",## DDP为pytorch的分布式数据并行训练 26 | epoches=5, model_save_dir=model_save_dir, 27 | val_every_step=500, device=device, 28 | batch_size=16, num_gpus=num_gpus, num_nodes=num_nodes, 29 | training_script=__file__, 30 | ) 31 | ``` 32 | 具体例子代码可以参考: 33 | 34 | [train_roberta_auto_title_multi_gpu.py](https://github.com/920232796/bert_seq2seq_DDP/blob/master/examples/seq2seq/bert/train_roberta_auto_title_multi_gpu.py) 自动标题任务,多gpu训练。 35 | 36 | ### 项目特点二: 37 | 虽然使用Trainer类进行了封装,也能做到比较灵活的evaluation. 
38 | 
39 | #### Define your own Evaluator class for free-form validation
40 | 
41 | [train_roberta_auto_title.py](https://github.com/920232796/bert_seq2seq_DDP/blob/master/examples/seq2seq/bert/train_roberta_auto_title.py) auto-title task; generated output is conveniently printed during training.
42 | 
43 | [train_roberta_semantic_matching.py](https://github.com/920232796/bert_seq2seq_DDP/blob/master/examples/text_classification/train_roberta_semantic_matching.py) semantic matching task; a validation step is easily inserted into training.
44 | 
45 | ### Project feature 3:
46 | Sample data is provided in the data directory to help you follow how the code runs (full data for some tasks is available in the QQ group files).
47 | ### Environment setup
48 | #### Install PyTorch; any reasonably recent version will do.
49 | https://pytorch.org/
50 | #### Install the extra packages
51 | ```commandline
52 | pip install bert_seq2seq_DDP
53 | pip install tqdm
54 | pip install scikit-learn # optional
55 | ```
56 | If your network connection is poor, switch to a domestic (Chinese) mirror for installation.
57 | 
58 | #### Pretrained model weights and vocab downloads
59 | 1. roberta model (base and large supported): download the model and vocab files from https://drive.google.com/file/d/1iNeYFhCBJWeUsIlnW_2K6SMwXkM4gLb_/view . See the GitHub repo https://github.com/ymcui/Chinese-BERT-wwm ; the roberta-large model can be downloaded there as well.
60 | 2. bert model (base and large supported): download the Chinese bert pretrained weights "bert-base-chinese": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-pytorch_model.bin", and the Chinese bert vocab "bert-base-chinese": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt".
61 | 3. nezha model, vocab and weights (base and large supported): nezha-base download link: https://pan.baidu.com/s/1Z0SJbISsKzAgs0lT9hFyZQ extraction code: 4awe
62 | 4. Chinese gpt2 model: general Chinese gpt2 model and vocab download: https://pan.baidu.com/s/1vTYc8fJUmlQrre5p0JRelw password: f5un. Once downloaded, you can run the continuation test in [test_gpt2_text_writting.py](https://github.com/920232796/bert_seq2seq_DDP/blob/master/examples/seq2seq/gpt2/test_gpt2_text_writting.py).
63 | 5. Chinese t5 model (base and small supported), pretrained weights download: https://github.com/renmada/t5-pegasus-pytorch
64 | 6. SimBert model, for similar-sentence generation; bert, roberta or nezha pretrained weights all work.
65 | 7. Chinese bart model download: https://huggingface.co/fnlp/bart-base-chinese
66 | 
67 | #### Parameter overview, using text classification as the example
68 | ```python
69 | import torch
70 | from bert_seq2seq import Tokenizer, Predictor, load_model, Trainer
71 | target = ["财经", "彩票", "房产", "股票", "家居", "教育", "科技", "社会", "时尚", "时政", "体育", "星座", "游戏", "娱乐"] # all labels
72 | train_path = "../data/新闻标题文本分类/Train.txt" # where the data lives
73 | model_name = "roberta" # model name
74 | task_name = "classification" # task name (must match a key suffix in ALL_TASK)
75 | vocab_path = "../state_dict/roberta-large/vocab.txt" # location of the roberta vocab
76 | model_path = "../state_dict/roberta-large/pytorch_model.bin" # location of the roberta weights
77 | model_save_path = "./bert_news_title_classification.bin" ## where to save the trained model
78 | batch_size = 16
79 | lr = 1e-5
80 | # load the vocab
81 | tokenizer = Tokenizer(vocab_path)
82 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
83 | # load the roberta large model for classification on the cls vector.
84 | bert_model = load_model(tokenizer.vocab,
85 |                         model_name=model_name,
86 |                         size="large", ## load large model
87 |                         task_name=task_name,
88 |                         target_size=len(target))
89 | ## load the pretrained weights~
90 | bert_model.load_pretrain_params(model_path)
91 | # trainer settings
92 | trainer = Trainer(epoches=3, val_every_step=500, # validate every 500 steps
93 |                   batch_size=batch_size,
94 |                   env_type="pytorch", # single-GPU training
95 |                   device=device,
96 |                   )
97 | ```
98 | #### Running
99 | Decide which task you want, find the matching train_*.py file in examples, download the model and vocab, understand how the data is built, and run it (sample data lives in the data directory for reference).
100 | 
101 | 
--------------------------------------------------------------------------------
/bert_seq2seq/mpu/data.py:
--------------------------------------------------------------------------------
 1 | # Copyright © 2022 BAAI. All rights reserved.
 2 | # 
 3 | # Licensed under the Apache License, Version 2.0 (the "License")
 4 | # coding=utf-8
 5 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
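# Usage sketch (illustrative, not a prescribed API): once model parallelism
# has been initialized (see initialize.py), rank 0 of each model-parallel
# group loads a batch and `broadcast_data` below ships it to the peer ranks:
#
#     batch = broadcast_data(["input_ids", "labels"], data, torch.int64)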
 6 | #
 7 | # Licensed under the Apache License, Version 2.0 (the "License");
 8 | # you may not use this file except in compliance with the License.
 9 | # You may obtain a copy of the License at
10 | #
11 | #     http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | 
19 | import torch
20 | 
21 | from .initialize import get_model_parallel_group
22 | from .initialize import get_model_parallel_rank
23 | from .initialize import get_model_parallel_src_rank
24 | 
25 | _MAX_DATA_DIM = 5
26 | 
27 | 
28 | def _check_data_types(keys, data, target_dtype):
29 |     """Check that all the keys have the same target data type."""
30 |     for key in keys:
31 |         assert data[key].dtype == target_dtype, '{} has data type {} which '\
32 |             'is different than {}'.format(key, data[key].dtype, target_dtype)
33 | 
34 | 
35 | def _build_key_size_numel_dictionaries(keys, data):
36 |     """Build the size on rank 0 and broadcast."""
37 |     max_dim = _MAX_DATA_DIM
38 |     sizes = [0 for _ in range(max_dim) for _ in keys]
39 | 
40 |     # Pack the sizes on rank zero.
41 |     if get_model_parallel_rank() == 0:
42 |         offset = 0
43 |         for key in keys:
44 |             assert data[key].dim(
45 |             ) < max_dim, 'you should increase MAX_DATA_DIM'
46 |             size = data[key].size()
47 |             for i, s in enumerate(size):
48 |                 sizes[i + offset] = s
49 |             offset += max_dim
50 | 
51 |     # Move to GPU and broadcast.
52 |     sizes_cuda = torch.cuda.LongTensor(sizes)
53 |     torch.distributed.broadcast(sizes_cuda,
54 |                                 get_model_parallel_src_rank(),
55 |                                 group=get_model_parallel_group())
56 | 
57 |     # Move back to cpu and unpack.
58 |     sizes_cpu = sizes_cuda.cpu()
59 |     key_size = {}
60 |     key_numel = {}
61 |     total_numel = 0
62 |     offset = 0
63 |     for key in keys:
64 |         i = 0
65 |         size = []
66 |         numel = 1
67 |         while sizes_cpu[offset + i] > 0:
68 |             this_size = sizes_cpu[offset + i]
69 |             size.append(this_size)
70 |             numel *= this_size
71 |             i += 1
72 |         key_size[key] = size
73 |         key_numel[key] = numel
74 |         total_numel += numel
75 |         offset += max_dim
76 | 
77 |     return key_size, key_numel, total_numel
78 | 
79 | 
80 | def broadcast_data(keys, data, datatype):
81 |     """Broadcast data from rank zero of each model parallel group to the
82 |     members of the same model parallel group.
83 | 
84 |     Arguments:
85 |         keys: list of keys in the data dictionary to be broadcast
86 |         data: data dictionary of string keys and cpu tensor values.
87 |         datatype: torch data type of all tensors in data associated
88 |                   with keys.
89 |     """
90 |     # Build (key, size) and (key, number of elements) dictionaries along
91 |     # with the total number of elements on all ranks.
92 |     key_size, key_numel, total_numel = _build_key_size_numel_dictionaries(
93 |         keys, data)
94 | 
95 |     # Pack on rank zero.
96 |     if get_model_parallel_rank() == 0:
97 |         # Check that all keys have the same data type.
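        # (Only rank 0 packs real tensors; every other rank allocates an empty
        # `total_numel`-element receive buffer in the `else` branch below.)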
 98 |         _check_data_types(keys, data, datatype)
 99 |         # Flatten the data associated with the keys
100 |         flatten_data = torch.cat(
101 |             [data[key].contiguous().view(-1) for key in keys], dim=0).cuda()
102 |     else:
103 |         flatten_data = torch.empty(total_numel,
104 |                                    device=torch.cuda.current_device(),
105 |                                    dtype=datatype)
106 | 
107 |     # Broadcast
108 |     torch.distributed.broadcast(flatten_data,
109 |                                 get_model_parallel_src_rank(),
110 |                                 group=get_model_parallel_group())
111 | 
112 |     # Unpack
113 |     output = {}
114 |     offset = 0
115 |     for key in keys:
116 |         size = key_size[key]
117 |         numel = key_numel[key]
118 |         output[key] = flatten_data.narrow(0, offset, numel).view(size)
119 |         offset += numel
120 | 
121 |     return output
122 | 
--------------------------------------------------------------------------------
/bert_seq2seq/task/seq2seq/GLM_seq2seq_model.py:
--------------------------------------------------------------------------------
 1 | 
 2 | from bert_seq2seq.basic_bert import BasicGLM
 3 | from bert_seq2seq.model.glm_model import GLMModel
 4 | import os
 5 | 
 6 | large_ch_config = {
 7 |     "num_layers": 24,
 8 |     "vocab_size": 50048,
 9 |     "hidden_size": 1024,
10 |     "num_attention_heads": 16,
11 |     "embedding_dropout_prob": 0.1,
12 |     "attention_dropout_prob": 0.1,
13 |     "output_dropout_prob": 0.1,
14 |     "max_sequence_length": 1024,
15 |     "max_memory_length": 511,
16 |     "checkpoint_activations": False,
17 |     "checkpoint_num_layers": 1,
18 |     "parallel_output": True,
19 |     "relative_encoding": False,
20 |     "block_position_encoding": True,
21 |     "output_predict": True,
22 |     "spell_length": None,
23 |     "spell_func": "lstm",
24 |     "attention_scale": 1.0
25 | }
26 | class GLMLargeChConfig:
27 |     def __init__(self):
28 |         config = large_ch_config
29 |         self.num_layers = config["num_layers"]
30 |         self.vocab_size = config["vocab_size"]
31 |         self.hidden_size = config["hidden_size"]
32 |         self.num_attention_heads = config["num_attention_heads"]
33 |         self.embedding_dropout_prob = config["embedding_dropout_prob"]
34 |         self.attention_dropout_prob = config["attention_dropout_prob"]
35 |         self.output_dropout_prob = config["output_dropout_prob"]
36 |         self.max_sequence_length = config["max_sequence_length"]
37 |         self.max_memory_length = config["max_memory_length"]
38 |         self.checkpoint_activations = config["checkpoint_activations"]
39 |         self.checkpoint_num_layers = config["checkpoint_num_layers"]
40 |         self.parallel_output = config["parallel_output"]
41 |         self.relative_encoding = config["relative_encoding"]
42 |         self.block_position_encoding = config["block_position_encoding"]
43 |         self.output_predict = config["output_predict"]
44 |         self.spell_length = config["spell_length"]
45 |         self.spell_func = config["spell_func"]
46 |         self.attention_scale = config["attention_scale"]
47 | 
48 | class GLMSeq2SeqModel(BasicGLM):
49 |     """
50 |     """
51 |     def __init__(self,
52 |                  size="base", **kwargs):
53 |         super(GLMSeq2SeqModel, self).__init__()
54 |         if size == "base":
55 |             # only the large Chinese GLM configuration is provided here
56 |             print("the GLM base model is not supported")
57 |             os._exit(0)
58 |         elif size == "large":
59 |             config = GLMLargeChConfig()
60 | 
61 |         else:
62 |             print("unsupported size")
63 |             os._exit(0)
64 | 
65 |         self.config = config
66 |         self.model = GLMModel(num_layers=config.num_layers,
67 |                               vocab_size=config.vocab_size,
68 |                               hidden_size=config.hidden_size,
69 |                               num_attention_heads=config.num_attention_heads,
70 |                               embedding_dropout_prob=config.embedding_dropout_prob,
71 |                               attention_dropout_prob=config.attention_dropout_prob,
72 |                               output_dropout_prob=config.output_dropout_prob,
73 |                               max_sequence_length=config.max_sequence_length,
74 | 
-------------------------------------------------------------------------------- /bert_seq2seq/task/seq2seq/GLM_seq2seq_model.py: --------------------------------------------------------------------------------
1 |
2 | from bert_seq2seq.basic_bert import BasicGLM
3 | from bert_seq2seq.model.glm_model import GLMModel
4 | import os
5 |
6 | large_ch_config = {
7 | "num_layers": 24,
8 | "vocab_size": 50048,
9 | "hidden_size": 1024,
10 | "num_attention_heads": 16,
11 | "embedding_dropout_prob": 0.1,
12 | "attention_dropout_prob": 0.1,
13 | "output_dropout_prob": 0.1,
14 | "max_sequence_length": 1024,
15 | "max_memory_length": 511,
16 | "checkpoint_activations": False,
17 | "checkpoint_num_layers": 1,
18 | "parallel_output": True,
19 | "relative_encoding": False,
20 | "block_position_encoding": True,
21 | "output_predict": True,
22 | "spell_length": None,
23 | "spell_func": "lstm",
24 | "attention_scale": 1.0
25 | }
26 | class GLMLargeChConfig:
27 | def __init__(self):
28 | config = large_ch_config
29 | self.num_layers = config["num_layers"]
30 | self.vocab_size = config["vocab_size"]
31 | self.hidden_size = config["hidden_size"]
32 | self.num_attention_heads = config["num_attention_heads"]
33 | self.embedding_dropout_prob = config["embedding_dropout_prob"]
34 | self.attention_dropout_prob = config["attention_dropout_prob"]
35 | self.output_dropout_prob = config["output_dropout_prob"]
36 | self.max_sequence_length = config["max_sequence_length"]
37 | self.max_memory_length = config["max_memory_length"]
38 | self.checkpoint_activations = config["checkpoint_activations"]
39 | self.checkpoint_num_layers = config["checkpoint_num_layers"]
40 | self.parallel_output = config["parallel_output"]
41 | self.relative_encoding = config["relative_encoding"]
42 | self.block_position_encoding = config["block_position_encoding"]
43 | self.output_predict = config["output_predict"]
44 | self.spell_length = config["spell_length"]
45 | self.spell_func = config["spell_func"]
46 | self.attention_scale = config["attention_scale"]
47 |
48 | class GLMSeq2SeqModel(BasicGLM):
49 | """Seq2seq wrapper around the GLM language model. Only the
50 | "large" Chinese configuration is currently supported."""
51 | def __init__(self,
52 | size="base", **kwargs):
53 | super(GLMSeq2SeqModel, self).__init__()
54 | if size == "base":
55 | print("The GLM base model is not supported.")
56 | os._exit(0)
57 | elif size == "large":
58 | config = GLMLargeChConfig()
59 | else:
60 | print("Unsupported size.")
61 | os._exit(0)
62 |
63 | self.config = config
64 | self.model = GLMModel(num_layers=config.num_layers,
65 | vocab_size=config.vocab_size,
66 | hidden_size=config.hidden_size,
67 | num_attention_heads=config.num_attention_heads,
68 | embedding_dropout_prob=config.embedding_dropout_prob,
69 | attention_dropout_prob=config.attention_dropout_prob,
70 | output_dropout_prob=config.output_dropout_prob,
71 | max_sequence_length=config.max_sequence_length,
72 | max_memory_length=config.max_memory_length,
73 | checkpoint_activations=config.checkpoint_activations,
74 | checkpoint_num_layers=config.checkpoint_num_layers,
75 | output_predict=config.output_predict,
76 | parallel_output=config.parallel_output,
77 | relative_encoding=config.relative_encoding,
78 | block_position_encoding=config.block_position_encoding,
79 | spell_length=config.spell_length,
80 | spell_func=config.spell_func,
81 | attention_scale=config.attention_scale)
82 |
83 | self.hidden_dim = self.config.hidden_size
84 | self.vocab_size = self.config.vocab_size
85 |
86 | def forward(self, **data):
87 | input_ids = data["input_ids"]
88 | labels = data.get("labels", None)
89 | position_ids = data["position_ids"]
90 | attention_mask = data["attention_mask"]
91 | return_memory = data.get("return_memory", False)
92 | mems = data.get("mems", None)
93 |
94 | return self.model(input_ids=input_ids, position_ids=position_ids,
95 | attention_mask=attention_mask, labels=labels,
96 | return_memory=return_memory, mems=mems)
97 |
98 | def load_weights(self, checkpoints_path):
99 | self.model.load_weights_glm(checkpoints_path)
100 |
-------------------------------------------------------------------------------- /examples/text_classification/train_roberta_news_title_classification_multi_gpu.py: --------------------------------------------------------------------------------
1 | import torch
2 | from torch.utils.data import Dataset
3 | from bert_seq2seq import Tokenizer
4 | from bert_seq2seq import load_model
5 | from bert_seq2seq.dataset import bert_cls_collate_fn
6 | from bert_seq2seq.trainer import Trainer
7 | from sklearn.model_selection import train_test_split
8 | from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
9 | from bert_seq2seq import Predictor
10 | import os
11 | from tqdm import tqdm
12 |
13 | target = ["财经", "彩票", "房产", "股票", "家居", "教育", "科技", "社会", "时尚", "时政", "体育", "星座", "游戏", "娱乐"]
14 | train_path = "../data/新闻标题文本分类/Train.txt"
15 | model_name = "roberta"  # which pretrained model to use
16 | task_name = "classification"
17 | vocab_path = "../state_dict/roberta/vocab.txt"  # path to the roberta vocabulary
18 | model_path = "../state_dict/roberta/pytorch_model.bin"  # path to the pretrained roberta weights
19 | model_save_path = "./bert_news_title_classification.bin"
20 | batch_size = 16
21 | lr = 1e-5
22 | os.environ['CUDA_VISIBLE_DEVICES'] = "1,2"
23 | num_gpus = 2
24 |
25 | # Load the vocabulary.
26 | tokenizer = Tokenizer(vocab_path)
27 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
28 |
29 | trainer = Trainer(epoches=3,
30 | val_every_step=500,
31 | batch_size=batch_size,
32 | env_type="DDP",
33 | device=device,
34 | num_nodes=1,
35 | num_gpus=num_gpus,
36 | training_script=__file__)
37 |
38 | bert_model = load_model(tokenizer.vocab,
39 | model_name=model_name,
40 | task_name=task_name,
41 | target_size=len(target))
42 | ## Load the pretrained model parameters.
43 | bert_model.load_pretrain_params(model_path)
44 | predictor = Predictor(bert_model, tokenizer)
45 |
46 | def read_corpus():
47 | """
48 | Read the raw data.
49 | """
50 | sents_src = []
51 | sents_tgt = []
52 |
53 | with open(train_path) as f:
54 | lines = f.readlines()
55 | for line in lines:
56 | line = line.split("\t")
57 | sents_tgt.append(int(line[0]))
58 | sents_src.append(line[2])
59 |
60 | return sents_src, sents_tgt
61 |
62 | ## Load the data.
63 | all_input, all_label = read_corpus()
64 | train_input, val_input, train_label, val_label = train_test_split(all_input, all_label, train_size=0.8, random_state=123)
65 |
66 |
67 | ## Custom dataset.
68 | class ClassificationDataset(Dataset):
69 | """
70 | Defines how to fetch one example from this particular dataset.
71 | """
72 | def __init__(self, sents_src, sents_tgt):
73 | super(ClassificationDataset, self).__init__()
74 | # Keep the raw texts and labels.
75 | self.sents_src = sents_src
76 | self.sents_tgt = sents_tgt
77 |
78 | def __getitem__(self, i):
79 | ## Fetch a single example.
80 | src = self.sents_src[i]
81 | tgt = self.sents_tgt[i]
82 | tokenizer_out = tokenizer.encode_plus(src)
83 |
84 | output = {
85 | "input_ids": tokenizer_out["input_ids"],
86 | "token_type_ids": tokenizer_out["token_type_ids"],
87 | "labels": tgt
88 | }
89 | return output
90 |
91 | def __len__(self):
92 | return len(self.sents_src)
93 |
94 | def validate():
95 | res = []
96 | for data in tqdm(val_input, total=len(val_input)):
97 | pred = predictor.predict_cls_classifier(data)
98 | pred = pred.argmax(dim=0).numpy()
99 | res.append(pred)
100 |
101 | f1 = f1_score(val_label, res, average="macro")
102 | accuracy = accuracy_score(val_label, res)
103 | recall = recall_score(val_label, res, average="macro")
104 | precision = precision_score(val_label, res, average="macro")
105 |
106 | print(f"f1 is {f1}, acc is {accuracy}, recall is {recall}, precision is {precision}")
107 | return accuracy
108 |
109 | class Evaluator:
110 | def __init__(self):
111 | self.best_acc = 0.0
112 |
113 | def on_epoch_end(self):
114 | acc = validate()
115 | if acc > self.best_acc:
116 | self.best_acc = acc
117 | torch.save(bert_model.state_dict(), model_save_path)
118 | print("Model saved successfully.")
119 |
120 | def on_validation(self, data):
121 | loss = data["loss"]
122 | step = data["iteration"]
123 | # Nothing extra needs to happen at validation steps for this task.
124 |
125 | def main():
126 |
127 | # Declare the parameters to optimize.
128 | optimizer = torch.optim.Adam(bert_model.parameters(), lr=lr, weight_decay=1e-3)
129 | train_dataset = ClassificationDataset(train_input, train_label)
130 |
131 | trainer.train(bert_model, optimizer=optimizer,
132 | train_dataset=train_dataset,
133 | evaluator=Evaluator,
134 | collate_fn=bert_cls_collate_fn)
135 |
136 | if __name__ == '__main__':
137 | main()
138 |
-------------------------------------------------------------------------------- /bert_seq2seq/mpu/mappings.py: --------------------------------------------------------------------------------
1 | # Copyright © 2022 BAAI. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License")
4 | # coding=utf-8
5 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
6 | #
7 | # Licensed under the Apache License, Version 2.0 (the "License");
8 | # you may not use this file except in compliance with the License.
9 | # You may obtain a copy of the License at
10 | #
11 | #     http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 |
19 | import torch
20 |
21 | from .initialize import get_model_parallel_group
22 | from .utils import split_tensor_along_last_dim
23 |
24 |
25 | def _reduce(input_):
26 | """All-reduce the input tensor across the model parallel group."""
27 | group = get_model_parallel_group()
28 |
29 | # Bypass the function if we are using only 1 GPU.
30 | if torch.distributed.get_world_size(group=group) == 1:
31 | return input_
32 |
33 | # All-reduce.
34 | torch.distributed.all_reduce(input_, group=group)
35 |
36 | return input_
37 |
38 |
39 | def _split(input_):
40 | """Split the tensor along its last dimension and keep the
41 | corresponding slice."""
42 | group = get_model_parallel_group()
43 |
44 | # Bypass the function if we are using only 1 GPU.
45 | if torch.distributed.get_world_size(group=group) == 1:
46 | return input_
47 |
48 | # Split along last dimension.
49 | world_size = torch.distributed.get_world_size(group=group)
50 | input_list = split_tensor_along_last_dim(input_, world_size)
51 |
52 | # Note: torch.split does not create contiguous tensors by default.
53 | rank = torch.distributed.get_rank(group=group)
54 | output = input_list[rank].contiguous()
55 |
56 | return output
57 |
58 |
59 | def _gather(input_):
60 | """Gather tensors and concatenate along the last dimension."""
61 | group = get_model_parallel_group()
62 |
63 | # Bypass the function if we are using only 1 GPU.
64 | if torch.distributed.get_world_size(group=group) == 1:
65 | return input_
66 |
67 | # Size and dimension.
68 | last_dim = input_.dim() - 1
69 | rank = torch.distributed.get_rank(group=group)
70 | world_size = torch.distributed.get_world_size(group=group)
71 |
72 | tensor_list = [torch.empty_like(input_) for _ in range(world_size)]
73 | tensor_list[rank] = input_
74 | torch.distributed.all_gather(tensor_list, input_, group=group)
75 |
76 | # Note: torch.cat already creates a contiguous tensor.
77 | output = torch.cat(tensor_list, dim=last_dim).contiguous()
78 |
79 | return output
80 |
81 |
82 | class _CopyToModelParallelRegion(torch.autograd.Function):
83 | """Pass the input to the model parallel region."""
84 | @staticmethod
85 | def forward(ctx, input_):
86 | return input_
87 |
88 | @staticmethod
89 | def backward(ctx, grad_output):
90 | return _reduce(grad_output)
91 |
92 |
93 | class _ReduceFromModelParallelRegion(torch.autograd.Function):
94 | """All-reduce the input from the model parallel region."""
95 | @staticmethod
96 | def forward(ctx, input_):
97 | return _reduce(input_)
98 |
99 | @staticmethod
100 | def backward(ctx, grad_output):
101 | return grad_output
102 |
103 |
104 | class _ScatterToModelParallelRegion(torch.autograd.Function):
105 | """Split the input and keep only the chunk corresponding to the rank."""
106 | @staticmethod
107 | def forward(ctx, input_):
108 | return _split(input_)
109 |
110 | @staticmethod
111 | def backward(ctx, grad_output):
112 | return _gather(grad_output)
113 |
114 |
115 | class _GatherFromModelParallelRegion(torch.autograd.Function):
116 | """Gather the input from the model parallel region and concatenate."""
117 | @staticmethod
118 | def forward(ctx, input_):
119 | return _gather(input_)
120 |
121 | @staticmethod
122 | def backward(ctx, grad_output):
123 | return _split(grad_output)
124 |
125 |
126 | # -----------------
127 | # Helper functions.
128 | # -----------------
129 |
130 |
131 | def copy_to_model_parallel_region(input_):
132 | return _CopyToModelParallelRegion.apply(input_)
133 |
134 |
135 | def reduce_from_model_parallel_region(input_):
136 | return _ReduceFromModelParallelRegion.apply(input_)
137 |
138 |
139 | def scatter_to_model_parallel_region(input_):
140 | return _ScatterToModelParallelRegion.apply(input_)
141 |
142 |
143 | def gather_from_model_parallel_region(input_):
144 | return _GatherFromModelParallelRegion.apply(input_)
145 |
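# --- Illustrative usage (editor's sketch, not part of the original file). ---
# The four helpers above form two conjugate pairs: copy/reduce and
# scatter/gather. A Megatron-style column-parallel linear layer, for example,
# copies its input into the model parallel region (identity forward,
# all-reduce backward) and gathers the partitioned output (all-gather
# forward, split backward). The layer below is an assumed sketch for
# illustration only, not code from this repository:
#
#     import torch.nn as nn
#
#     class ColumnParallelLinearSketch(nn.Module):
#         def __init__(self, in_features, out_features_per_partition):
#             super().__init__()
#             # each rank holds one column slice of the full weight matrix
#             self.linear = nn.Linear(in_features, out_features_per_partition)
#
#         def forward(self, x):
#             x = copy_to_model_parallel_region(x)   # identity fwd / all-reduce bwd
#             y = self.linear(x)                     # each rank computes its slice
#             return gather_from_model_parallel_region(y)  # all-gather fwd / split bwd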
"2007年乔布斯向人们展示iPhone并宣称它将会改变世界还有人认为他在夸大其词然而在8年后以iPhone为代表的触屏智能手机已经席卷全球各个角落未来智能手机将会成为真正的个人电脑为人类发展做出更大的贡献", 104 | "雅虎发布2014年第四季度财报并推出了免税方式剥离其持有的阿里巴巴集团15%股权的计划打算将这一价值约400亿美元的宝贵投资分配给股东截止发稿前雅虎股价上涨了大约7%至5145美元"] 105 | for text in test_data: 106 | text = self.prompt(text) 107 | print(predictor.predict_generate_randomsample(text, 108 | top_k=50, 109 | top_p=0.9, 110 | repetition_penalty=1.0, 111 | input_max_length=600, 112 | out_max_length=100, 113 | )) 114 | 115 | torch.save(model.state_dict(), model_save_path) 116 | print(f"模型保存成功~") 117 | 118 | def main(): 119 | ## 加载数据 120 | all_src, all_tgt = read_file() 121 | train_size = int(len(all_src) * 0.9) 122 | train_src, train_tgt = all_src[:train_size], all_tgt[:train_size] 123 | # 声明需要优化的参数 124 | optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=1e-3) 125 | train_dataset = AutoTitleDataset(train_src, train_tgt) 126 | 127 | trainer.train(model, optimizer, train_dataset=train_dataset, evaluator=Evaluator, 128 | collate_fn=glm_generation_collate_fn) 129 | 130 | if __name__ == '__main__': 131 | main() 132 | -------------------------------------------------------------------------------- /bert_seq2seq/mpu/cross_entropy.py: -------------------------------------------------------------------------------- 1 | # Copyright © 2022 BAAI. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License") 4 | # coding=utf-8 5 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | import torch 20 | 21 | from .initialize import get_model_parallel_group 22 | from .initialize import get_model_parallel_rank 23 | from .initialize import get_model_parallel_world_size 24 | from .utils import VocabUtility 25 | 26 | 27 | class _VocabParallelCrossEntropy(torch.autograd.Function): 28 | @staticmethod 29 | def forward(ctx, vocab_parallel_logits, target): 30 | 31 | # Copy so the input remains unchanged. 32 | logits = vocab_parallel_logits.clone() 33 | # Maximum value along vocab dimension across all GPUs. 34 | logits_max = torch.max(logits, dim=-1)[0] 35 | torch.distributed.all_reduce(logits_max, 36 | op=torch.distributed.ReduceOp.MAX, 37 | group=get_model_parallel_group()) 38 | # Subtract the maximum value. 39 | logits.sub_(logits_max.unsqueeze(dim=-1)) 40 | # Sum of exponential of logits along vocab dimension across all GPUs. 
41 | exp_logits = logits.exp()
42 | sum_exp_logits = exp_logits.sum(dim=-1)
43 | torch.distributed.all_reduce(sum_exp_logits,
44 | op=torch.distributed.ReduceOp.SUM,
45 | group=get_model_parallel_group())
46 |
47 | # Get the partition's vocab indices.
48 | get_vocab_range = VocabUtility.vocab_range_from_per_partition_vocab_size
49 | partition_vocab_size = vocab_parallel_logits.size()[-1]
50 | rank = get_model_parallel_rank()
51 | world_size = get_model_parallel_world_size()
52 | vocab_start_index, vocab_end_index = get_vocab_range(
53 | partition_vocab_size, rank, world_size)
54 |
55 | # Create a mask of valid vocab ids (1 means it needs to be masked).
56 | target_mask = (target < vocab_start_index) | (target >=
57 | vocab_end_index)
58 | masked_target = target.clone() - vocab_start_index
59 | masked_target[target_mask] = 0
60 |
61 | # Get predicted-logits = logits[target].
62 | # For simplicity, we convert logits to a 2-D tensor with size
63 | # [*, partition-vocab-size] and target to a 1-D tensor of size [*].
64 | logits_2d = logits.view(-1, partition_vocab_size)
65 | masked_target_1d = masked_target.view(-1)
66 | arange_1d = torch.arange(start=0,
67 | end=logits_2d.size()[0],
68 | device=logits_2d.device)
69 | predicted_logits_1d = logits_2d[arange_1d, masked_target_1d]
70 | predicted_logits = predicted_logits_1d.view_as(target)
71 | predicted_logits[target_mask] = 0.0
72 | # All-reduce is needed to get the chunks from other GPUs.
73 | torch.distributed.all_reduce(predicted_logits,
74 | op=torch.distributed.ReduceOp.SUM,
75 | group=get_model_parallel_group())
76 |
77 | # Loss = log(sum(exp(logits))) - predicted-logit.
78 | loss = torch.log(sum_exp_logits) - predicted_logits
79 |
80 | # Store softmax, target-mask and masked-target for backward pass.
81 | exp_logits.div_(sum_exp_logits.unsqueeze(dim=-1))
82 | ctx.save_for_backward(exp_logits, target_mask, masked_target_1d)
83 |
84 | return loss
85 |
86 | @staticmethod
87 | def backward(ctx, grad_output):
88 |
89 | # Retrieve tensors from the forward path.
90 | softmax, target_mask, masked_target_1d = ctx.saved_tensors
91 |
92 | # All the inputs have softmax as their gradient.
93 | grad_input = softmax
94 | # For simplicity, work with the 2D gradient.
95 | partition_vocab_size = softmax.size()[-1]
96 | grad_2d = grad_input.view(-1, partition_vocab_size)
97 |
98 | # Add the gradient from matching classes.
99 | arange_1d = torch.arange(start=0,
100 | end=grad_2d.size()[0],
101 | device=grad_2d.device)
102 | grad_2d[arange_1d,
103 | masked_target_1d] -= (1.0 - target_mask.view(-1).float())
104 |
105 | # Finally, elementwise multiplication with the output gradients.
106 | grad_input.mul_(grad_output.unsqueeze(dim=-1))
107 |
108 | return grad_input, None
109 |
110 |
111 | def vocab_parallel_cross_entropy(vocab_parallel_logits, target):
112 | """Helper function for the cross entropy."""
113 | return _VocabParallelCrossEntropy.apply(vocab_parallel_logits, target)
114 |
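# --- Illustrative usage (editor's sketch, not part of the original file). ---
# vocab_parallel_cross_entropy expects logits partitioned along the vocabulary
# dimension, while the targets hold *global* vocabulary ids, and it returns an
# unreduced per-token loss. The shapes below are assumptions for illustration;
# every rank in the model parallel group must execute this collectively:
#
#     # model parallel size 2, full vocab 50048 -> each rank holds 25024 columns
#     logits_partition = model_output                 # [batch, seq_len, 25024]
#     loss = vocab_parallel_cross_entropy(logits_partition, target_ids)
#     loss = loss.mean()                              # reduce per-token losses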
-------------------------------------------------------------------------------- /bert_seq2seq/mpu/initialize.py: --------------------------------------------------------------------------------
1 | # Copyright © 2022 BAAI. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License")
4 | # coding=utf-8
5 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
6 | #
7 | # Licensed under the Apache License, Version 2.0 (the "License");
8 | # you may not use this file except in compliance with the License.
9 | # You may obtain a copy of the License at
10 | #
11 | #     http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | """Model and data parallel groups."""
19 |
20 | import torch
21 |
22 | from .utils import ensure_divisibility
23 |
24 | # Model parallel group that the current rank belongs to.
25 | _MODEL_PARALLEL_GROUP = None
26 | # Data parallel group that the current rank belongs to.
27 | _DATA_PARALLEL_GROUP = None
28 |
29 |
30 | def initialize_model_parallel(model_parallel_size_):
31 | """
32 | Initialize model and data parallel groups.
33 |
34 | Arguments:
35 | model_parallel_size_: number of GPUs used to parallelize the model.
36 |
37 | Let's say we have a total of 8 GPUs denoted by g0 ... g7 and we
38 | use 2 GPUs to parallelize the model. The present function will
39 | create 4 model parallel groups and 2 data parallel groups as:
40 | 4 model parallel groups:
41 | [g0, g1], [g2, g3], [g4, g5], [g6, g7]
42 | 2 data parallel groups:
43 | [g0, g2, g4, g6], [g1, g3, g5, g7]
44 | Note that for efficiency, the caller should make sure adjacent ranks
45 | are on the same DGX box. For example if we are using 2 DGX-1 boxes
46 | with a total of 16 GPUs, ranks 0 to 7 belong to the first box and
47 | ranks 8 to 15 belong to the second box.
48 | """
49 | if torch.distributed.get_rank() == 0:
50 | print('> initializing model parallel with size {}'.format(
51 | model_parallel_size_))
52 | # Get world size and rank. Ensure some consistencies.
53 | assert torch.distributed.is_initialized()
54 | world_size = torch.distributed.get_world_size()
55 | model_parallel_size = min(model_parallel_size_, world_size)
56 | ensure_divisibility(world_size, model_parallel_size)
57 | rank = torch.distributed.get_rank()
58 |
59 | # Build the data parallel groups.
60 | global _DATA_PARALLEL_GROUP
61 | assert _DATA_PARALLEL_GROUP is None, \
62 | 'data parallel group is already initialized'
63 | for i in range(model_parallel_size):
64 | ranks = range(i, world_size, model_parallel_size)
65 | group = torch.distributed.new_group(ranks)
66 | if i == (rank % model_parallel_size):
67 | _DATA_PARALLEL_GROUP = group
68 |
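# (Editor's aside: torch.distributed.new_group must be called collectively by
# *every* rank with the same arguments, which is why the loops above and below
# iterate over all groups while each rank keeps only the group it belongs to.
# A typical initialization order, with assumed sizes, would be:
#
#     torch.distributed.init_process_group(backend="nccl")  # one process per GPU
#     initialize_model_parallel(2)  # 8 GPUs -> 4 model groups, 2 data groups
# )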
69 | # Build the model parallel groups.
70 | global _MODEL_PARALLEL_GROUP
71 | assert _MODEL_PARALLEL_GROUP is None, \
72 | 'model parallel group is already initialized'
73 | for i in range(world_size // model_parallel_size):
74 | ranks = range(i * model_parallel_size, (i + 1) * model_parallel_size)
75 | group = torch.distributed.new_group(ranks)
76 | if i == (rank // model_parallel_size):
77 | _MODEL_PARALLEL_GROUP = group
78 |
79 |
80 | def model_parallel_is_initialized():
81 | """Check if model and data parallel groups are initialized."""
82 | if _MODEL_PARALLEL_GROUP is None or _DATA_PARALLEL_GROUP is None:
83 | return False
84 | return True
85 |
86 |
87 | def get_model_parallel_group():
88 | """Get the model parallel group the caller rank belongs to."""
89 | assert _MODEL_PARALLEL_GROUP is not None, \
90 | 'model parallel group is not initialized'
91 | return _MODEL_PARALLEL_GROUP
92 |
93 |
94 | def get_data_parallel_group():
95 | """Get the data parallel group the caller rank belongs to."""
96 | assert _DATA_PARALLEL_GROUP is not None, \
97 | 'data parallel group is not initialized'
98 | return _DATA_PARALLEL_GROUP
99 |
100 |
101 | def get_model_parallel_world_size():
102 | """Return world size for the model parallel group."""
103 | return torch.distributed.get_world_size(group=get_model_parallel_group())
104 |
105 |
106 | def get_model_parallel_rank():
107 | """Return my rank for the model parallel group."""
108 | return torch.distributed.get_rank(group=get_model_parallel_group())
109 |
110 |
111 | def get_model_parallel_src_rank():
112 | """Calculate the global rank corresponding to local rank zero
113 | in the model parallel group."""
114 | global_rank = torch.distributed.get_rank()
115 | local_world_size = get_model_parallel_world_size()
116 | return (global_rank // local_world_size) * local_world_size
117 |
118 |
119 | def get_data_parallel_world_size():
120 | """Return world size for the data parallel group."""
121 | return torch.distributed.get_world_size(group=get_data_parallel_group())
122 |
123 |
124 | def get_data_parallel_rank():
125 | """Return my rank for the data parallel group."""
126 | return torch.distributed.get_rank(group=get_data_parallel_group())
127 |
128 |
129 | def destroy_model_parallel():
130 | """Set the groups to none."""
131 | global _MODEL_PARALLEL_GROUP
132 | _MODEL_PARALLEL_GROUP = None
133 | global _DATA_PARALLEL_GROUP
134 | _DATA_PARALLEL_GROUP = None
135 |
-------------------------------------------------------------------------------- /examples/ner/train_bert_ner_people_daily.py: --------------------------------------------------------------------------------
1 | ## People's Daily NER data
2 | import torch
3 | from tqdm import tqdm
4 | from torch.utils.data import Dataset
5 | from bert_seq2seq import Tokenizer
6 | from bert_seq2seq import load_model
7 | from bert_seq2seq.dataset import bert_sequence_label_collate_fn
8 | from bert_seq2seq import Trainer
9 | from bert_seq2seq import Predictor
10 |
11 | train_path = "../data/china-people-daily-ner-corpus/example.train"
12 | valid_path = '../data/china-people-daily-ner-corpus/example.dev'
13 | test_path = '../data/china-people-daily-ner-corpus/example.test'
14 |
15 | model_name = "roberta"  # which pretrained model to use
16 | task_name = "sequence_labeling"
17 |
18 | vocab_path = "../state_dict/roberta/vocab.txt"  # path to the roberta vocabulary
19 | model_path = "../state_dict/roberta/pytorch_model.bin"  # path to the pretrained roberta weights
20 |
21 | model_save_path = "./bert_sequence_labeling.bin"
22 |
23 | batch_size = 16
24 | lr = 1e-5
25 | # Load the vocabulary.
26 | maxlen = 256
27 | device = torch.device("cuda:5" if
torch.cuda.is_available() else "cpu") 28 | tokenizer = Tokenizer(vocab_path, do_lower_case=True, max_len=maxlen) 29 | 30 | trainer = Trainer(epoches=10, 31 | env_type="pytorch", 32 | val_every_step=500, 33 | batch_size=batch_size, 34 | device=device, 35 | ) 36 | 37 | target = ["O", "B-LOC", "I-LOC", "B-ORG", "I-ORG", "B-PER", "I-PER"] 38 | 39 | def load_data(filename): 40 | """加载数据 41 | 单条格式:[text, (start, end, label), (start, end, label), ...], 42 | 意味着text[start:end + 1]是类型为label的实体。 43 | """ 44 | D = [] 45 | with open(filename, encoding='utf-8') as f: 46 | f = f.read() 47 | for l in f.split('\n\n'): 48 | if not l: 49 | continue 50 | d = [''] 51 | for i, c in enumerate(l.split('\n')): 52 | char, flag = c.split(' ') 53 | d[0] += char 54 | if flag[0] == 'B': 55 | d.append([i, i, flag[2:]]) 56 | elif flag[0] == 'I': 57 | d[-1][1] = i 58 | 59 | D.append(d) 60 | return D 61 | 62 | train_data = load_data(train_path) 63 | val_data = load_data(valid_path) 64 | test_data = load_data(test_path) 65 | 66 | print(f"all target is {target}") 67 | 68 | bert_model = load_model(tokenizer.vocab, model_name=model_name, task_name=task_name, 69 | target_size=len(target)) 70 | bert_model.load_pretrain_params(model_path) 71 | 72 | predictor = Predictor(bert_model, tokenizer) 73 | 74 | ## 自定义dataset 75 | class NERDataset(Dataset): 76 | """ 77 | 针对特定数据集,定义一个相关的取数据的方式 78 | """ 79 | def __init__(self, data) : 80 | ## 一般init函数是加载所有数据 81 | super(NERDataset, self).__init__() 82 | # 读原始数据 83 | self.data = data 84 | def __getitem__(self, i): 85 | ## 得到单个数据 86 | # print(i) 87 | data = self.data[i] 88 | 89 | tokens = tokenizer.tokenize(data[0], maxlen=maxlen, add_spatial_tokens=True) 90 | input_ids = tokenizer.convert_tokens_to_ids(tokens) 91 | 92 | mapping = tokenizer.rematch(data[0], tokens) 93 | start_mapping = {j[0]: i for i, j in enumerate(mapping) if j} 94 | end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j} 95 | length = len(tokens) 96 | labels = [0] * length 97 | 98 | for start, end, label in data[1:]: 99 | if start in start_mapping and end in end_mapping: 100 | # 说明找到这个token了。 101 | start = start_mapping[start] 102 | end = end_mapping[end] 103 | 104 | labels[start] = target.index(f"B-{label}") 105 | for j in range(start + 1, end + 1): 106 | labels[j] = target.index(f"I-{label}") 107 | 108 | output = { 109 | "input_ids": input_ids, 110 | "labels": labels 111 | } 112 | return output 113 | 114 | def __len__(self): 115 | return len(self.data) 116 | 117 | def evaluate(data): 118 | """评测函数 119 | """ 120 | X, Y, Z = 1e-10, 1e-10, 1e-10 121 | for d in tqdm(data, ncols=100): 122 | R = set(predictor.predict_ner(d[0], target, maxlen=maxlen)) 123 | T = set([tuple(i) for i in d[1:]]) 124 | X += len(R & T) 125 | Y += len(R) 126 | Z += len(T) 127 | f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z 128 | return f1, precision, recall 129 | 130 | class Evaluator: 131 | 132 | def __init__(self): 133 | self.best_val_f1 = 0.0 134 | 135 | def on_epoch_end(self): 136 | 137 | text = ["6月15日,河南省文物考古研究所曹操高陵文物队公开发表声明承认:“从来没有说过出土的珠子是墓主人的", 138 | "4月8日,北京冬奥会、冬残奥会总结表彰大会在人民大会堂隆重举行。习近平总书记出席大会并发表重要讲话。在讲话中,总书记充分肯定了北京冬奥会、冬残奥会取得的优异成绩,全面回顾了7年筹办备赛的不凡历程,深入总结了筹备举办北京冬奥会、冬残奥会的宝贵经验,深刻阐释了北京冬奥精神,对运用好冬奥遗产推动高质量发展提出明确要求。", 139 | "当地时间8日,欧盟委员会表示,欧盟各成员国政府现已冻结共计约300亿欧元与俄罗斯寡头及其他被制裁的俄方人员有关的资产。", 140 | "这一盘口状态下英国必发公司亚洲盘交易数据显示博洛尼亚热。而从欧赔投注看,也是主队热。巴勒莫两连败,", 141 | ] 142 | for t in text: 143 | entities = predictor.predict_ner(t, target, maxlen=maxlen) 144 | result = {} 145 | for e in entities: 146 | if e[2] not in result: 147 | result[e[2]] = [t[e[0]: 
e[1]+1]] 148 | else : 149 | result[e[2]].append(t[e[0]: e[1]+1]) 150 | print(f"result is {result}") 151 | 152 | f1, precision, recall = evaluate(val_data) 153 | # 保存最优 154 | if f1 >= self.best_val_f1: 155 | self.best_val_f1 = f1 156 | torch.save(bert_model.state_dict(), model_save_path) 157 | print(f"模型保存成功~") 158 | print( 159 | 'valid: f1: %.5f, precision: %.5f, recall: %.5f, best f1: %.5f\n' % 160 | (f1, precision, recall, self.best_val_f1) 161 | ) 162 | 163 | f1, precision, recall = evaluate(test_data) 164 | print( 165 | 'test: f1: %.5f, precision: %.5f, recall: %.5f\n' % 166 | (f1, precision, recall) 167 | ) 168 | 169 | def main(): 170 | 171 | 172 | optimizer = torch.optim.Adam(bert_model.parameters(), lr=lr, weight_decay=1e-5) 173 | train_dataset = NERDataset(train_data) 174 | 175 | trainer.train(model=bert_model, optimizer=optimizer, 176 | train_dataset=train_dataset, evaluator=Evaluator, 177 | collate_fn=bert_sequence_label_collate_fn) 178 | 179 | if __name__ == '__main__': 180 | main() 181 | -------------------------------------------------------------------------------- /examples/ner/train_roberta_ner_gp_people_daily.py: -------------------------------------------------------------------------------- 1 | # 人民日报数据 2 | import torch 3 | from tqdm import tqdm 4 | from torch.utils.data import Dataset, DataLoader 5 | from bert_seq2seq import Tokenizer 6 | from bert_seq2seq import load_model 7 | from bert_seq2seq.dataset import bert_sequence_label_gp_collate_fn 8 | from bert_seq2seq import Trainer 9 | import numpy as np 10 | import os 11 | from bert_seq2seq import Predictor 12 | 13 | train_path = "../data/china-people-daily-ner-corpus/example.train" 14 | valid_path = '../data/china-people-daily-ner-corpus/example.dev' 15 | test_path = '../data/china-people-daily-ner-corpus/example.test' 16 | 17 | model_name = "roberta" # 选择模型名字 18 | task_name = "sequence_labeling_gp" 19 | 20 | vocab_path = "../state_dict/roberta/vocab.txt" # roberta模型字典的位置 21 | model_path = "../state_dict/roberta/pytorch_model.bin" # roberta模型位置 22 | 23 | model_save_path = "./bert_sequence_labeling_gp.bin" 24 | 25 | batch_size = 16 26 | lr = 2e-5 27 | # 加载字典 28 | tokenizer = Tokenizer(vocab_path) 29 | maxlen = 256 30 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 31 | 32 | trainer = Trainer(epoches=10, 33 | env_type="pytorch", 34 | val_every_step=500, 35 | batch_size=batch_size, 36 | device=device, 37 | ) 38 | target = set() 39 | 40 | def load_data(filename): 41 | """加载数据 42 | 单条格式:[text, (start, end, label), (start, end, label), ...], 43 | 意味着text[start:end + 1]是类型为label的实体。 44 | """ 45 | D = [] 46 | with open(filename, encoding='utf-8') as f: 47 | f = f.read() 48 | for l in f.split('\n\n'): 49 | if not l: 50 | continue 51 | d = [''] 52 | for i, c in enumerate(l.split('\n')): 53 | char, flag = c.split(' ') 54 | d[0] += char 55 | if flag[0] == 'B': 56 | d.append([i, i, flag[2:]]) 57 | target.add(flag[2:]) 58 | elif flag[0] == 'I': 59 | d[-1][1] = i 60 | 61 | D.append(d) 62 | return D 63 | 64 | train_data = load_data(train_path) 65 | val_data = load_data(valid_path) 66 | test_data = load_data(test_path) 67 | target = list(sorted(target)) 68 | 69 | bert_model = load_model(tokenizer.vocab, model_name=model_name, task_name=task_name, 70 | target_size=len(target), ner_inner_dim=64) 71 | # ## 加载预训练的模型参数~ 72 | bert_model.load_pretrain_params(model_path) 73 | predictor = Predictor(bert_model, tokenizer) 74 | 75 | ## 自定义dataset 76 | class NERDataset(Dataset): 77 | """ 78 | 针对特定数据集,定义一个相关的取数据的方式 79 | """ 
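# (Editor's note: an illustrative example of the span format produced by
# load_data above — the raw text followed by [start, end, label] spans, where
# text[start:end + 1] is the entity. The sentence is made up for illustration:
#
#     ["海钓比赛地点在厦门与金门之间的海域。",
#      [7, 8, "LOC"], [10, 11, "LOC"]]
# )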
80 | def __init__(self, data) : 81 | ## 一般init函数是加载所有数据 82 | super(NERDataset, self).__init__() 83 | # 读原始数据 84 | self.data = data 85 | def __getitem__(self, i): 86 | ## 得到单个数据 87 | # print(i) 88 | data = self.data[i] 89 | 90 | tokens = tokenizer.tokenize(data[0], maxlen=maxlen, add_spatial_tokens=True) 91 | input_ids = tokenizer.convert_tokens_to_ids(tokens) 92 | mapping = tokenizer.rematch(data[0], tokens) 93 | start_mapping = {j[0]: i for i, j in enumerate(mapping) if j} 94 | end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j} 95 | length = len(tokens) 96 | labels = np.zeros((len(target), length, length)) 97 | 98 | for start, end, label in data[1:]: 99 | if start in start_mapping and end in end_mapping: 100 | # 说明找到这个token了。 101 | start = start_mapping[start] 102 | end = end_mapping[end] 103 | 104 | label_index = target.index(label) 105 | labels[label_index, start, end] = 1 106 | 107 | output = { 108 | "input_ids": input_ids, 109 | "labels": labels 110 | } 111 | return output 112 | 113 | def __len__(self): 114 | return len(self.data) 115 | 116 | 117 | def evaluate(data): 118 | """评测函数 119 | """ 120 | X, Y, Z = 1e-10, 1e-10, 1e-10 121 | for d in tqdm(data, ncols=100): 122 | R = set(predictor.predict_ner(d[0], target, maxlen=maxlen)) 123 | T = set([tuple(i) for i in d[1:]]) 124 | X += len(R & T) 125 | Y += len(R) 126 | Z += len(T) 127 | f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z 128 | return f1, precision, recall 129 | 130 | class Evaluator: 131 | 132 | def __init__(self): 133 | self.best_val_f1 = 0.0 134 | 135 | def on_validation(self, data): 136 | loss = data["loss"] 137 | step = data["iteration"] 138 | 139 | text = ["6月15日,河南省文物考古研究所曹操高陵文物队公开发表声明承认:“从来没有说过出土的珠子是墓主人的", 140 | "4月8日,北京冬奥会、冬残奥会总结表彰大会在人民大会堂隆重举行。习近平总书记出席大会并发表重要讲话。在讲话中,总书记充分肯定了北京冬奥会、冬残奥会取得的优异成绩,全面回顾了7年筹办备赛的不凡历程,深入总结了筹备举办北京冬奥会、冬残奥会的宝贵经验,深刻阐释了北京冬奥精神,对运用好冬奥遗产推动高质量发展提出明确要求。", 141 | "当地时间8日,欧盟委员会表示,欧盟各成员国政府现已冻结共计约300亿欧元与俄罗斯寡头及其他被制裁的俄方人员有关的资产。", 142 | "这一盘口状态下英国必发公司亚洲盘交易数据显示博洛尼亚热。而从欧赔投注看,也是主队热。巴勒莫两连败,", 143 | ] 144 | for t in text: 145 | entities = predictor.predict_ner(t, target, maxlen=maxlen) 146 | result = {} 147 | for e in entities: 148 | if e[2] not in result: 149 | result[e[2]] = [t[e[0]: e[1]+1]] 150 | else : 151 | result[e[2]].append(t[e[0]: e[1]+1]) 152 | print(f"result is {result}") 153 | 154 | f1, precision, recall = evaluate(val_data) 155 | # 保存最优 156 | if f1 >= self.best_val_f1: 157 | self.best_val_f1 = f1 158 | torch.save(bert_model.state_dict(), model_save_path) 159 | print(f"模型保存成功~") 160 | 161 | print( 162 | 'valid: f1: %.5f, precision: %.5f, recall: %.5f, best f1: %.5f\n' % 163 | (f1, precision, recall, self.best_val_f1) 164 | ) 165 | 166 | f1, precision, recall = evaluate(test_data) 167 | print( 168 | 'test: f1: %.5f, precision: %.5f, recall: %.5f\n' % 169 | (f1, precision, recall) 170 | ) 171 | 172 | 173 | def main(): 174 | 175 | optimizer = torch.optim.Adam(bert_model.parameters(), lr=lr, weight_decay=1e-5) 176 | train_dataset = NERDataset(train_data) 177 | trainer.train(model=bert_model, optimizer=optimizer, evaluator=Evaluator, 178 | train_dataset=train_dataset, collate_fn=bert_sequence_label_gp_collate_fn, 179 | ) 180 | 181 | if __name__ == '__main__': 182 | main() 183 | -------------------------------------------------------------------------------- /bert_seq2seq/launch.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The Microsoft DeepSpeed Team 2 | """ 3 | sailing runner is the main front-end to launching multi-worker 4 | 
training jobs with DeepSpeed. By default this uses pdsh to parallel-
5 | ssh into multiple worker nodes and launch all the necessary processes
6 | per rank for training.
7 | """
8 |
9 | import os
10 | import sys
11 | import json
12 | import subprocess
13 | import collections
14 | import socket
15 | import signal
16 | import logging
17 |
18 | import torch.distributed as dist
19 |
20 |
21 | def fetch_hostfile(hostfile_path):
22 | if not os.path.isfile(hostfile_path):
23 | print("Unable to find hostfile, will proceed with training "
24 | "with local resources only.")
25 | return None
26 | # e.g., worker-0 slots=16
27 | with open(hostfile_path, 'r') as fd:
28 | resource_pool = collections.OrderedDict()
29 | for line in fd.readlines():
30 | line = line.strip()
31 | if line == '':
32 | # skip empty lines
33 | continue
34 | try:
35 | hostname, slots = line.split()
36 | _, slot_count = slots.split("=")
37 | slot_count = int(slot_count)
38 | except ValueError as err:
39 | raise err
40 | if hostname in resource_pool:
41 | raise ValueError(f"host {hostname} is already defined")
42 | resource_pool[hostname] = slot_count
43 |
44 | return resource_pool
45 |
46 |
47 | def cmd_load_hyperparam(config_path, format=None, encoding="utf-8"):
48 | """
49 | Load hyperparameters from a config file and convert them into
50 | command-line arguments.
51 | """
52 | # Derive the format from the file extension unless one is given explicitly.
53 | if format is None:
54 | format = config_path.rsplit('.', 1)[-1]
55 | with open(config_path, 'r', encoding=encoding) as f:
56 | if format == "json":
57 | config_dict = json.load(f)
58 | else:
59 | raise NameError("current format %s for hyperparam file is invalid" %
60 | format)
61 | config_cmd = []
62 | for key in config_dict:
63 | if len(str(config_dict[key])) == 0:
64 | config_cmd.append('--' + key)
65 | else:
66 | config_cmd.append('--' + key)
67 | config_cmd.append(str(config_dict[key]))
68 | return config_cmd
69 |
70 |
71 | def launch_dist(
72 | env_type="DDP",
73 | num_nodes=1,
74 | gpus_per_node=1,
75 | master_addr='localhost',
76 | master_port=17500,
77 | training_script='train.py',
78 | hostfile=None,
79 | ):
80 |
81 | if num_nodes != 1:
82 | print("Multi-node multi-GPU launching is untested and not supported yet.")
83 | os._exit(0)
84 | if env_type == "DDP":
85 | cmd_launch = []
86 | cmd_launch.extend([
87 | # 'export NUM_NODES=' + str(num_nodes) + ';',
88 | # 'export GPUS_PER_NODE=' + str(gpus_per_node) + ';',
89 | sys.executable,
90 | # "python",
91 | '-m', 'torch.distributed.launch'
92 | ])
93 | torch_distributed_args = [
94 | '--nproc_per_node',
95 | str(gpus_per_node),
96 | '--nnodes',
97 | str(num_nodes),
98 | '--node_rank',
99 | str(0),
100 | '--master_addr',
101 | master_addr,
102 | '--master_port',
103 | str(master_port),
104 | ]
105 | cmd_launch.extend(torch_distributed_args)
106 | cmd_launch.append(training_script)
107 | cmd_launch.append('--not_call_launch')
108 | run_cmd = ' '.join(cmd_launch)
109 | p = subprocess.Popen(run_cmd, shell=True, preexec_fn=os.setsid)
110 | def signal_handler(signal, frame):
111 | os.killpg(os.getpgid(p.pid), 9)
112 | signal.signal(signal.SIGINT, signal_handler)
113 | p.wait()
114 | print('finish')
115 |
116 | elif env_type == "deepspeed":
117 |
118 | # if hostfile is None:
119 | # print(
120 |
119 | # ) 120 | # os.makedirs("./tmp", exist_ok=True) 121 | # 122 | # with open('./tmp/hostfile', 'w') as w: 123 | # w.write(socket.gethostname() + ' slots=2') 124 | # hostfile = './tmp/hostfile' 125 | 126 | cmd_launch = ['deepspeed'] 127 | 128 | cmd_launch.extend([ 129 | '--master_port', 130 | str(master_port), 131 | '--num_nodes', 132 | str(num_nodes), 133 | '--num_gpus', 134 | str(gpus_per_node), 135 | # '--hostfile', 136 | # hostfile, 137 | ]) 138 | 139 | cmd_launch.append(training_script) 140 | 141 | cmd_launch.append('--not_call_launch') 142 | run_cmd = ' '.join(cmd_launch) 143 | p = subprocess.Popen(run_cmd, shell=True, preexec_fn=os.setsid) 144 | def signal_handler(signal, frame): 145 | os.killpg(os.getpgid(p.pid), 9) 146 | signal.signal(signal.SIGINT, signal_handler) 147 | p.wait() 148 | print ('finish') 149 | 150 | elif env_type == "deepspeed+mpu": 151 | 152 | # if hostfile is None: 153 | # print( 154 | # 'Unable to find hostfile, will proceed with training with local resources only.' 155 | # ) 156 | # os.makedirs("./tmp", exist_ok=True) 157 | # 158 | # with open('./tmp/hostfile', 'w') as w: 159 | # w.write(socket.gethostname() + ' slots=2') 160 | # hostfile = './tmp/hostfile' 161 | 162 | cmd_launch = ["export ENV_TYPE=deepspeed+mpu;",'deepspeed'] 163 | 164 | cmd_launch.extend([ 165 | '--master_port', 166 | str(master_port), 167 | '--num_nodes', 168 | str(num_nodes), 169 | '--num_gpus', 170 | str(gpus_per_node), 171 | # '--hostfile', 172 | # hostfile, 173 | ]) 174 | 175 | cmd_launch.append(training_script) 176 | 177 | cmd_launch.append('--not_call_launch') 178 | run_cmd = ' '.join(cmd_launch) 179 | p = subprocess.Popen(run_cmd, shell=True, preexec_fn=os.setsid) 180 | def signal_handler(signal, frame): 181 | os.killpg(os.getpgid(p.pid), 9) 182 | signal.signal(signal.SIGINT, signal_handler) 183 | p.wait() 184 | print ('finish') 185 | 186 | else : 187 | print("不支持的env_type") 188 | os._exit(0) 189 | -------------------------------------------------------------------------------- /examples/ner/train_bert_ner_crf_people_daily.py: -------------------------------------------------------------------------------- 1 | ## 人民日报数据 2 | import torch 3 | from tqdm import tqdm 4 | from torch.utils.data import Dataset 5 | from bert_seq2seq import Tokenizer 6 | from bert_seq2seq import load_model 7 | from bert_seq2seq.dataset import bert_sequence_label_collate_fn 8 | from bert_seq2seq import Trainer 9 | from bert_seq2seq import Predictor 10 | 11 | train_path = "../data/china-people-daily-ner-corpus/example.train" 12 | valid_path = '../data/china-people-daily-ner-corpus/example.dev' 13 | test_path = '../data/china-people-daily-ner-corpus/example.test' 14 | 15 | model_name = "roberta" # 选择模型名字 16 | task_name = "sequence_labeling_crf" 17 | 18 | vocab_path = "../state_dict/roberta/vocab.txt" # roberta模型字典的位置 19 | model_path = "../state_dict/roberta/pytorch_model.bin" # roberta模型位置 20 | 21 | model_save_path = "./bert_sequence_labeling_crf.bin" 22 | 23 | batch_size = 16 24 | lr = 1e-5 25 | crf_lr = 0.01 26 | # 加载字典 27 | maxlen = 256 28 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 29 | tokenizer = Tokenizer(vocab_path, do_lower_case=True, max_len=maxlen) 30 | 31 | trainer = Trainer(epoches=10, 32 | env_type="pytorch", 33 | val_every_step=500, 34 | batch_size=batch_size, 35 | device=device, 36 | ) 37 | 38 | target = ["O", "B-LOC", "I-LOC", "B-ORG", "I-ORG", "B-PER", "I-PER"] 39 | 40 | def load_data(filename): 41 | """加载数据 42 | 单条格式:[text, (start, end, label), (start, end, label), 
...], 43 | 意味着text[start:end + 1]是类型为label的实体。 44 | """ 45 | D = [] 46 | with open(filename, encoding='utf-8') as f: 47 | f = f.read() 48 | for l in f.split('\n\n'): 49 | if not l: 50 | continue 51 | d = [''] 52 | for i, c in enumerate(l.split('\n')): 53 | char, flag = c.split(' ') 54 | d[0] += char 55 | if flag[0] == 'B': 56 | d.append([i, i, flag[2:]]) 57 | elif flag[0] == 'I': 58 | d[-1][1] = i 59 | 60 | D.append(d) 61 | return D 62 | 63 | train_data = load_data(train_path) 64 | val_data = load_data(valid_path) 65 | test_data = load_data(test_path) 66 | 67 | print(f"all target is {target}") 68 | 69 | bert_model = load_model(tokenizer.vocab, model_name=model_name, task_name=task_name, 70 | target_size=len(target)) 71 | bert_model.load_pretrain_params(model_path, strict=False) 72 | 73 | predictor = Predictor(bert_model, tokenizer) 74 | 75 | ## 自定义dataset 76 | class NERDataset(Dataset): 77 | """ 78 | 针对特定数据集,定义一个相关的取数据的方式 79 | """ 80 | def __init__(self, data) : 81 | ## 一般init函数是加载所有数据 82 | super(NERDataset, self).__init__() 83 | # 读原始数据 84 | self.data = data 85 | 86 | def __getitem__(self, i): 87 | ## 得到单个数据 88 | # print(i) 89 | data = self.data[i] 90 | 91 | tokens = tokenizer.tokenize(data[0], maxlen=maxlen, add_spatial_tokens=True) 92 | input_ids = tokenizer.convert_tokens_to_ids(tokens) 93 | 94 | mapping = tokenizer.rematch(data[0], tokens) 95 | start_mapping = {j[0]: i for i, j in enumerate(mapping) if j} 96 | end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j} 97 | length = len(tokens) 98 | labels = [0] * length 99 | 100 | for start, end, label in data[1:]: 101 | if start in start_mapping and end in end_mapping: 102 | # 说明找到这个token了。 103 | start = start_mapping[start] 104 | end = end_mapping[end] 105 | 106 | labels[start] = target.index(f"B-{label}") 107 | for j in range(start + 1, end + 1): 108 | labels[j] = target.index(f"I-{label}") 109 | 110 | output = { 111 | "input_ids": input_ids, 112 | "labels": labels 113 | } 114 | return output 115 | 116 | def __len__(self): 117 | return len(self.data) 118 | 119 | def evaluate(data): 120 | """评测函数 121 | """ 122 | X, Y, Z = 1e-10, 1e-10, 1e-10 123 | for d in tqdm(data, ncols=100): 124 | R = set(predictor.predict_ner(d[0], target, maxlen=maxlen)) 125 | T = set([tuple(i) for i in d[1:]]) 126 | X += len(R & T) 127 | Y += len(R) 128 | Z += len(T) 129 | f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z 130 | return f1, precision, recall 131 | 132 | class Evaluator: 133 | 134 | def __init__(self): 135 | self.best_val_f1 = 0.0 136 | 137 | def on_epoch_end(self): 138 | 139 | text = ["6月15日,河南省文物考古研究所曹操高陵文物队公开发表声明承认:“从来没有说过出土的珠子是墓主人的", 140 | "4月8日,北京冬奥会、冬残奥会总结表彰大会在人民大会堂隆重举行。习近平总书记出席大会并发表重要讲话。在讲话中,总书记充分肯定了北京冬奥会、冬残奥会取得的优异成绩,全面回顾了7年筹办备赛的不凡历程,深入总结了筹备举办北京冬奥会、冬残奥会的宝贵经验,深刻阐释了北京冬奥精神,对运用好冬奥遗产推动高质量发展提出明确要求。", 141 | "当地时间8日,欧盟委员会表示,欧盟各成员国政府现已冻结共计约300亿欧元与俄罗斯寡头及其他被制裁的俄方人员有关的资产。", 142 | "这一盘口状态下英国必发公司亚洲盘交易数据显示博洛尼亚热。而从欧赔投注看,也是主队热。巴勒莫两连败,", 143 | ] 144 | for t in text: 145 | entities = predictor.predict_ner(t, target, maxlen=maxlen) 146 | result = {} 147 | for e in entities: 148 | if e[2] not in result: 149 | result[e[2]] = [t[e[0]: e[1]+1]] 150 | else : 151 | result[e[2]].append(t[e[0]: e[1]+1]) 152 | print(f"result is {result}") 153 | 154 | f1, precision, recall = evaluate(val_data) 155 | # 保存最优 156 | if f1 >= self.best_val_f1: 157 | self.best_val_f1 = f1 158 | torch.save(bert_model.state_dict(), model_save_path) 159 | print(f"模型保存成功~") 160 | print( 161 | 'valid: f1: %.5f, precision: %.5f, recall: %.5f, best f1: %.5f\n' % 162 | (f1, precision, recall, 
self.best_val_f1) 163 | ) 164 | 165 | f1, precision, recall = evaluate(test_data) 166 | print( 167 | 'test: f1: %.5f, precision: %.5f, recall: %.5f\n' % 168 | (f1, precision, recall) 169 | ) 170 | 171 | def main(): 172 | 173 | crf_params = list(map(id, bert_model.crf_layer.parameters())) ## 单独把crf层参数拿出来 174 | base_params = filter(lambda p: id(p) not in crf_params, bert_model.parameters()) 175 | 176 | optimizer = torch.optim.Adam([ 177 | {"params": base_params}, 178 | {"params": bert_model.crf_layer.parameters(), "lr": crf_lr}], lr=lr, weight_decay=1e-3) 179 | 180 | train_dataset = NERDataset(train_data) 181 | 182 | trainer.train(model=bert_model, optimizer=optimizer, 183 | train_dataset=train_dataset, evaluator=Evaluator, 184 | collate_fn=bert_sequence_label_collate_fn) 185 | 186 | if __name__ == '__main__': 187 | main() 188 | -------------------------------------------------------------------------------- /bert_seq2seq/model/layers/activations.py: -------------------------------------------------------------------------------- 1 | # Copyright © 2022 BAAI. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License") 4 | # Copyright 2020 The HuggingFace Team. All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | import math 19 | import torch 20 | from packaging import version 21 | from torch import Tensor, nn 22 | 23 | 24 | class NewGELUActivation(nn.Module): 25 | """ 26 | Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT). Also see 27 | the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415 28 | """ 29 | def forward(self, input: Tensor) -> Tensor: 30 | return 0.5 * input * (1.0 + torch.tanh( 31 | math.sqrt(2.0 / math.pi) * 32 | (input + 0.044715 * torch.pow(input, 3.0)))) 33 | 34 | 35 | class GELUActivation(nn.Module): 36 | """ 37 | Original Implementation of the GELU activation function in Google BERT repo when initially created. For 38 | information: OpenAI GPT's GELU is slightly different (and gives slightly different results): 0.5 * x * (1 + 39 | torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) This is now written in C in nn.functional 40 | Also see the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415 41 | """ 42 | def __init__(self, use_gelu_python: bool = False): 43 | super().__init__() 44 | if version.parse( 45 | torch.__version__) < version.parse("1.4") or use_gelu_python: 46 | self.act = self._gelu_python 47 | else: 48 | self.act = nn.functional.gelu 49 | 50 | def _gelu_python(self, input: Tensor) -> Tensor: 51 | return input * 0.5 * (1.0 + torch.erf(input / math.sqrt(2.0))) 52 | 53 | def forward(self, input: Tensor) -> Tensor: 54 | return self.act(input) 55 | 56 | 57 | class FastGELUActivation(nn.Module): 58 | """ 59 | Applies GELU approximation that is slower than QuickGELU but more accurate. 
See: https://github.com/hendrycks/GELUs
60 | """
61 | def forward(self, input: Tensor) -> Tensor:
62 | return 0.5 * input * (1.0 +
63 | torch.tanh(input * 0.7978845608 *
64 | (1.0 + 0.044715 * input * input)))
65 |
66 |
67 | class QuickGELUActivation(nn.Module):
68 | """
69 | Applies a GELU approximation that is fast but somewhat inaccurate. See: https://github.com/hendrycks/GELUs
70 | """
71 | def forward(self, input: Tensor) -> Tensor:
72 | return input * torch.sigmoid(1.702 * input)
73 |
74 |
75 | class ClippedGELUActivation(nn.Module):
76 | """
77 | Clip the range of possible GeLU outputs between [min, max]. This is especially useful for quantization purposes, as
78 | it allows mapping negative values in the GeLU spectrum. For more information on this trick, please refer to
79 | https://arxiv.org/abs/2004.09602.
80 | Gaussian Error Linear Unit. Original implementation of the gelu activation function in the Google BERT repo when
81 | initially created.
82 | For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): 0.5 * x * (1 +
83 | torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))). See https://arxiv.org/abs/1606.08415
84 | """
85 | def __init__(self, min: float, max: float):
86 | if min > max:
87 | raise ValueError(
88 | f"min should be < max (got min: {min}, max: {max})")
89 |
90 | super().__init__()
91 | self.min = min
92 | self.max = max
93 |
94 | def forward(self, x: Tensor) -> Tensor:
95 | return torch.clip(gelu(x), self.min, self.max)
96 |
97 |
98 | class SiLUActivation(nn.Module):
99 | """
100 | See Gaussian Error Linear Units (Hendrycks et al., https://arxiv.org/abs/1606.08415) where the SiLU (Sigmoid Linear
101 | Unit) was originally introduced and coined, and see Sigmoid-Weighted Linear Units for Neural Network Function
102 | Approximation in Reinforcement Learning (Elfwing et al., https://arxiv.org/abs/1702.03118) and Swish: a Self-Gated
103 | Activation Function (Ramachandran et al., https://arxiv.org/abs/1710.05941v1) where the SiLU was experimented with
104 | later.
105 | """
106 | def __init__(self):
107 | super().__init__()
108 | if version.parse(torch.__version__) < version.parse("1.7"):
109 | self.act = self._silu_python
110 | else:
111 | self.act = nn.functional.silu
112 |
113 | def _silu_python(self, input: Tensor) -> Tensor:
114 | return input * torch.sigmoid(input)
115 |
116 | def forward(self, input: Tensor) -> Tensor:
117 | return self.act(input)
118 |
119 |
120 | class MishActivation(nn.Module):
121 | """
122 | See Mish: A Self-Regularized Non-Monotonic Activation Function (Misra, https://arxiv.org/abs/1908.08681). Also
123 | visit the official repository for the paper: https://github.com/digantamisra98/Mish
124 | """
125 | def __init__(self):
126 | super().__init__()
127 | if version.parse(torch.__version__) < version.parse("1.9"):
128 | self.act = self._mish_python
129 | else:
130 | self.act = nn.functional.mish
131 |
132 | def _mish_python(self, input: Tensor) -> Tensor:
133 | return input * torch.tanh(nn.functional.softplus(input))
134 |
135 | def forward(self, input: Tensor) -> Tensor:
136 | return self.act(input)
137 |
138 |
139 | class LinearActivation(nn.Module):
140 | """
141 | Applies the linear activation function, i.e. forwarding input directly to output.
142 | """
143 | def forward(self, input: Tensor) -> Tensor:
144 | return input
145 |
146 |
147 | @torch.jit.script
148 | def gelu_impl(x):
149 | """OpenAI's gelu implementation."""
150 | return 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * x *
151 | (1.0 + 0.044715 * x * x)))
152 |
153 |
154 | ACT2FN = {
155 | "relu": nn.ReLU(),
156 | "gelu_impl": gelu_impl,
157 | "silu": SiLUActivation(),
158 | "swish": SiLUActivation(),
159 | "gelu": GELUActivation(),
160 | "tanh": nn.Tanh(),
161 | "gelu_python": GELUActivation(use_gelu_python=True),
162 | "gelu_new": NewGELUActivation(),
163 | "gelu_fast": FastGELUActivation(),
164 | "quick_gelu": QuickGELUActivation(),
165 | "gelu_10": ClippedGELUActivation(-10, 10),
166 | "mish": MishActivation(),
167 | "linear": LinearActivation(),
168 | "sigmoid": nn.Sigmoid(),
169 | }
170 |
171 |
172 | def get_activation(activation_string):
173 | if activation_string in ACT2FN:
174 | return ACT2FN[activation_string]
175 | else:
176 | raise KeyError(
177 | f"function {activation_string} not found in ACT2FN mapping {list(ACT2FN.keys())}"
178 | )
179 |
180 |
181 | # For backwards compatibility with: from activations import gelu_python
182 | gelu_python = get_activation("gelu_python")
183 | gelu_new = get_activation("gelu_new")
184 | gelu = get_activation("gelu")
185 | gelu_fast = get_activation("gelu_fast")
186 | quick_gelu = get_activation("quick_gelu")
187 | silu = get_activation("silu")
188 | mish = get_activation("mish")
189 | linear_act = get_activation("linear")
190 | gelu_impl = get_activation("gelu_impl")
191 | relu = get_activation("relu")
192 |
-------------------------------------------------------------------------------- /bert_seq2seq/dataset.py: --------------------------------------------------------------------------------
1 | import torch
2 | from torch.utils.data import DataLoader, Dataset
3 | import numpy as np
4 |
5 | def padding(indice, max_length, pad_idx=0):
6 | """
7 | Pad each id list in `indice` to `max_length` with `pad_idx`.
8 | """
9 |
10 | pad_indice = [item + [pad_idx] * max(0, max_length - len(item)) for item in indice]
11 | return torch.tensor(pad_indice)
12 |
13 | def sequence_padding(inputs, length=None, value=0, seq_dims=1, mode='post'):
14 | # Pad a batch of (possibly multi-dimensional) numpy arrays to a common shape.
15 | if length is None:
16 | length = np.max([np.shape(x)[:seq_dims] for x in inputs], axis=0)
17 | elif not hasattr(length, '__getitem__'):
18 | length = [length]
19 |
20 | slices = [np.s_[:length[i]] for i in range(seq_dims)]
21 | slices = tuple(slices) if len(slices) > 1 else slices[0]
22 | pad_width = [(0, 0) for _ in np.shape(inputs[0])]
23 |
24 | outputs = []
25 | for x in inputs:
26 | x = x[slices]
27 | for i in range(seq_dims):
28 | if mode == 'post':
29 | pad_width[i] = (0, length[i] - np.shape(x)[i])
30 | elif mode == 'pre':
31 | pad_width[i] = (length[i] - np.shape(x)[i], 0)
32 | else:
33 | raise ValueError('"mode" argument must be "post" or "pre".')
34 | x = np.pad(x, pad_width, 'constant', constant_values=value)
35 | outputs.append(x)
36 |
37 | return np.array(outputs)
38 |
39 | def gpt_collate_fn(batch):
40 |
41 | token_ids = [data["input_ids"] for data in batch]
42 | max_length = max([len(t) for t in token_ids])
43 |
44 | token_ids_padded = padding(token_ids, max_length)
45 | target_ids_padded = token_ids_padded.clone()
46 | target_ids_padded[target_ids_padded == 0] = -100
47 |
48 | return {
49 | "input_ids": token_ids_padded,
50 | "labels": target_ids_padded
51 | }
52 |
53 | def t5_seq2seq_collate_fn(batch):
54 |
55 | token_ids_src = [data["input_ids"] for data in batch]
56 | max_length_src = max([len(t) for t in token_ids_src])
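# (Editor's aside, for illustration: the target side below is padded and then
# shifted by one position for teacher forcing — decoder_input_ids drop the
# last token and labels drop the first, with pad positions set to -100 so
# CrossEntropyLoss ignores them. With an assumed padded target [101, 8, 9, 102]:
#
#     decoder_input_ids = [101, 8, 9]
#     labels            = [8, 9, 102]
# )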
57 | token_ids_tgt = [data["target_ids"] for data in batch] 58 | max_length_tgt = max([len(t) for t in token_ids_tgt]) 59 | 60 | token_ids_padded = padding(token_ids_src, max_length_src) 61 | target_ids_padded = padding(token_ids_tgt, max_length_tgt) 62 | labels_ids = target_ids_padded.clone() 63 | labels_ids[labels_ids == 0] = -100 64 | target_ids_padded = target_ids_padded[:, :-1].contiguous() 65 | labels_ids = labels_ids[:, 1:].contiguous() 66 | 67 | return { 68 | "input_ids": token_ids_padded, 69 | "decoder_input_ids": target_ids_padded, 70 | "labels": labels_ids 71 | } 72 | 73 | def bert_seq2seq_collate_fn(batch): 74 | 75 | token_ids = [data["input_ids"] for data in batch] 76 | max_length = max([len(t) for t in token_ids]) 77 | token_type_ids = [data["token_type_ids"] for data in batch] 78 | 79 | token_ids_padded = padding(token_ids, max_length) 80 | token_type_ids_padded = padding(token_type_ids, max_length) 81 | target_ids_padded = token_ids_padded[:, 1:].contiguous() 82 | 83 | return { 84 | "input_ids": token_ids_padded, 85 | "token_type_ids": token_type_ids_padded, 86 | "labels": target_ids_padded 87 | } 88 | 89 | def bert_cls_collate_fn(batch): 90 | 91 | token_ids = [data["input_ids"] for data in batch] 92 | max_length = max([len(t) for t in token_ids]) 93 | token_type_ids = [data["token_type_ids"] for data in batch] 94 | target_ids = [data["labels"] for data in batch] 95 | target_ids = torch.tensor(target_ids, dtype=torch.long) 96 | 97 | token_ids_padded = padding(token_ids, max_length) 98 | token_type_ids_padded = padding(token_type_ids, max_length) 99 | 100 | return { 101 | "input_ids": token_ids_padded, 102 | "token_type_ids": token_type_ids_padded, 103 | "labels": target_ids 104 | } 105 | 106 | def bert_sequence_label_collate_fn(batch): 107 | 108 | token_ids = [data["input_ids"] for data in batch] 109 | 110 | max_length = max([len(t) for t in token_ids]) 111 | target_ids = [data["labels"] for data in batch] 112 | 113 | token_ids_padded = padding(token_ids, max_length) 114 | target_ids_padded = padding(target_ids, max_length) 115 | 116 | return { 117 | "input_ids": token_ids_padded, 118 | "token_type_ids": None, 119 | "labels": target_ids_padded 120 | } 121 | 122 | def bert_sequence_label_gp_collate_fn(batch): 123 | 124 | token_ids = [data["input_ids"] for data in batch] 125 | labels = [data["labels"] for data in batch] 126 | token_ids_padded = sequence_padding(token_ids) 127 | labels_padded = sequence_padding(labels, seq_dims=3) 128 | token_ids_padded = torch.from_numpy(token_ids_padded) 129 | labels_padded = torch.from_numpy(labels_padded) 130 | 131 | return { 132 | "input_ids": token_ids_padded, 133 | "token_type_ids": None, 134 | "labels": labels_padded 135 | } 136 | 137 | def bert_gplinker_collate_fn(batch): 138 | input_ids = [data["input_ids"] for data in batch] 139 | token_type_ids = [data["token_type_ids"] for data in batch] 140 | entity_labels = [data["entity_labels"] for data in batch] 141 | head_labels = [data["head_labels"] for data in batch] 142 | tail_labels = [data["tail_labels"] for data in batch] 143 | 144 | input_ids = sequence_padding(input_ids) 145 | token_type_ids = sequence_padding(token_type_ids) 146 | entity_labels = sequence_padding(entity_labels, seq_dims=2) 147 | head_labels = sequence_padding(head_labels, seq_dims=2) 148 | tail_labels = sequence_padding(tail_labels, seq_dims=2) 149 | 150 | input_ids = torch.from_numpy(input_ids).long() 151 | token_type_ids = torch.from_numpy(token_type_ids).long() 152 | entity_labels = 
torch.from_numpy(entity_labels).long()
153 |     head_labels = torch.from_numpy(head_labels).long()
154 |     tail_labels = torch.from_numpy(tail_labels).long()
155 | 
156 |     return {
157 |         "input_ids": input_ids,
158 |         "token_type_ids": token_type_ids,
159 |         "entity_labels": entity_labels,
160 |         "head_labels": head_labels,
161 |         "tail_labels": tail_labels
162 |     }
163 | 
164 | def pad_token(tokens, max_length):
165 |     pad_len = max_length - len(tokens)
166 |     # 50000 is the GLM pad id
167 |     tokens += [50000] * pad_len
168 |     return tokens
169 | 
170 | def pad_position_ids(position_ids, max_length):
171 |     pad_len = max_length - len(position_ids[0])
172 |     position_ids[0] += [len(position_ids[0]) + x for x in range(pad_len)]  # row 0: absolute positions
173 |     position_ids[1] += [1] * pad_len  # row 1: block position ids (GLM uses 2-D position ids)
174 |     return position_ids
175 | 
176 | def pad_loss_mask(loss_mask, max_length):
177 |     pad_len = max_length - len(loss_mask)
178 |     loss_mask += [0] * pad_len
179 |     return loss_mask
180 | 
181 | def glm_generation_collate_fn(batch):  # padding is done per batch
182 | 
183 |     input_ids = [data["input_ids"] for data in batch]
184 |     position_ids = [data["position_ids"] for data in batch]
185 |     attention_mask = [data['attention_mask'] for data in batch]
186 |     loss_mask = [data['loss_mask'] for data in batch]
187 |     labels = [data['labels'] for data in batch]
188 | 
189 |     max_length = max([len(t) for t in input_ids])
190 |     for i in range(len(input_ids)):
191 |         input_ids[i] = pad_token(input_ids[i], max_length)
192 |         labels[i] = pad_token(labels[i], max_length)
193 |         position_ids[i] = pad_position_ids(position_ids[i],
194 |                                            max_length)
195 |         loss_mask[i] = pad_loss_mask(loss_mask[i], max_length)
196 |     return {
197 |         'input_ids': torch.LongTensor(input_ids),
198 |         'position_ids': torch.LongTensor(position_ids),
199 |         'attention_mask': torch.LongTensor(attention_mask),
200 |         'loss_mask': torch.LongTensor(loss_mask),
201 |         'labels': torch.LongTensor(labels),
202 |     }
203 | 
--------------------------------------------------------------------------------
/bert_seq2seq/mpu/mp_tools.py:
--------------------------------------------------------------------------------
1 | # Copyright © 2022 BAAI. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License")
4 | import sys
5 | import os
6 | import torch
7 | import copy
8 | 
9 | from_1_to_n_models = {
10 |     "gpt": {
11 |         "wte.weight": 0,
12 |         "attn.c_attn.weight": 30,
13 |         "attn.c_attn.bias": 30,
14 |         "attn.c_proj.weight": 1,
15 |         "mlp.c_fc.weight": 0,
16 |         "mlp.c_fc.bias": 0,
17 |         "mlp.c_proj.weight": 1,
18 |     },
19 |     "opt": {
20 |         "decoder.embed_tokens.weight": 0,
21 |         "self_attn.k_proj.weight": 0,
22 |         "self_attn.k_proj.bias": 0,
23 |         "self_attn.q_proj.weight": 0,
24 |         "self_attn.q_proj.bias": 0,
25 |         "self_attn.v_proj.weight": 0,
26 |         "self_attn.v_proj.bias": 0,
27 | 
28 |         "self_attn.out_proj.weight": 1,
29 |         "fc1.weight": 0,
30 |         "fc1.bias": 0,
31 |         "fc2.weight": 1,
32 |     },
33 |     "glm": {
34 |         "word_embeddings.weight": 0,
35 |         "attention.query_key_value.weight": 30,
36 |         "attention.query_key_value.bias": 30,
37 |         "attention.dense.weight": 1,
38 |         "mlp.dense_h_to_4h.weight": 0,
39 |         "mlp.dense_h_to_4h.bias": 0,
40 |         "mlp.dense_4h_to_h.weight": 1,
41 |     },
42 |     "t5": {
43 | 
44 |     },
45 | }
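# split-dimension codes used in the table above: 0 = split along dim 0
# (column-parallel weights and their biases), 1 = split along dim 1
# (row-parallel weights), 30 = fused query/key/value tensor that is first cut
# into Q/K/V thirds and each third split separately (see the loop below)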
46 | 
47 | def check_pytorch_model_mp_size(checkpoint: str, target_mp: int):
48 |     """
49 |     check whether the checkpoint directory contains weights for mp_size == target_mp
50 |     """
51 |     assert os.path.isdir(checkpoint)
52 |     filenames = os.listdir(checkpoint)
53 |     filenames = [
54 |         filename for filename in filenames
55 |         if filename.startswith("pytorch_model")
56 |     ]
57 |     if 'pytorch_model.bin' in filenames and target_mp == 1:
58 |         return True
59 |     else:
60 |         filenames = [f for f in filenames if f != 'pytorch_model.bin']
61 |         print(
62 |             "check the weight files in {}, the number of mp_size({}) {} num_of_files({})"
63 |             .format(checkpoint, target_mp,
64 |                     "=" if target_mp == len(filenames) else "!=", len(filenames)))
65 |         return target_mp == len(filenames)
66 | 
67 | def change_pytorch_model_mp_from_1_to_n(model_name_brief, checkpoint: str, target_mp: int):
68 |     trans_keys = from_1_to_n_models.get(model_name_brief, None)
69 |     if trans_keys is None:
70 |         print(f"Unsupported model_name: {model_name_brief}")
71 |         os._exit(0)
72 | 
73 |     if check_pytorch_model_mp_size(checkpoint, target_mp):
74 |         return
75 |     assert os.path.isdir(checkpoint)
76 |     filenames = os.listdir(checkpoint)
77 |     filenames = [
78 |         filename for filename in filenames
79 |         if filename.startswith("pytorch_model")
80 |     ]
81 |     if 'pytorch_model.bin' in filenames and target_mp > 1:
82 |         filenames = ['pytorch_model.bin']
83 |     filenames = [os.path.join(checkpoint, x) for x in filenames]
84 | 
85 |     if target_mp == len(filenames):
86 |         print("MP size unchanged.")
87 |         exit(0)
88 | 
89 |     if checkpoint[-1] == '/':
90 |         new_checkpoint = checkpoint[:-1]
91 |     else:
92 |         new_checkpoint = checkpoint
93 |     preserve_keys = [
94 |         "lr_scheduler",
95 |         "skipped_steps",
96 |         "global_steps",
97 |         "global_samples",
98 |         "dp_world_size",
99 |         "iteration",
100 |         "client_lr_scheduler",
101 |         "np_rng_state",
102 |         "random_rng_state",
103 |         "torch_rng_state",
104 |         "cuda_rng_state",
105 |         "rng_tracker_states",
106 |     ]
107 | 
108 |     if target_mp > len(filenames):
109 |         print("Increasing MP size.")
110 |         assert target_mp % len(filenames) == 0
111 |         ratio = target_mp // len(filenames)
112 |         for i in range(len(filenames)):
113 |             start = ratio * i
114 |             end = ratio * (i + 1)
115 |             d = torch.load(filenames[i], map_location='cpu')
116 |             # unwrap once so the metadata in `d` stays available for every shard
117 |             # (previously `d` was overwritten inside the loop, losing it for j > start)
118 |             d_model = d.get("module", d)
119 | 
120 |             for j in range(start, end):
121 |                 d_new = {}
122 |                 shift = j - start
123 |                 for k, v in d.items():
124 |                     if k != 'module':
125 |                         if k in preserve_keys:
126 |                             d_new[k] = copy.deepcopy(d[k])
127 |                         elif k == "mp_world_size":
128 |                             d_new[k] = target_mp
129 |                         else:
130 |                             d_new[k] = None
131 |                 d_new['module'] = {}
132 |                 with torch.no_grad():
133 |                     for k, v in d_model.items():
134 |                         assert len(v.shape) < 3
135 |                         flag = 0
136 |                         for keys in trans_keys:
137 |                             if keys in k:
138 |                                 flag = 1
139 |                                 # found a tensor that has to be cut
140 |                                 dim = trans_keys[keys]
141 | 
142 |                                 if len(v.shape) == 2:
143 |                                     if dim == 30:
144 |                                         # fused QKV: split each of the Q/K/V thirds separately
145 |                                         part = v.shape[0] // ratio // 3
146 |                                         d_new['module'][k] = torch.cat([
147 |                                             v[shift * part:(shift + 1) * part, :].clone(),
148 |                                             v[(shift + ratio) * part:(shift + 1 + ratio) * part, :].clone(),
149 |                                             v[(shift + 2 * ratio) * part:(shift + 1 + 2 * ratio) * part, :].clone()
150 |                                         ], 0)
151 |                                         break
152 |                                     elif dim == 0:
153 |                                         part = v.shape[dim] // ratio
154 |                                         d_new['module'][k] = v[shift * part:(shift + 1) * part, :].clone()
155 |                                         break
156 |                                     elif dim == 1:
157 |                                         part = v.shape[dim] // ratio
158 |                                         d_new['module'][k] = v[:, shift * part:(shift + 1) * part].clone()
159 |                                         break
160 |                                 elif len(v.shape) == 1:
161 |                                     if dim == 30:
162 |                                         part = v.shape[0] // ratio // 3
163 |                                         d_new['module'][k] = torch.cat([
164 |                                             v[shift * part:(shift + 1) * part].clone(),
165 |                                             v[(shift + ratio) * part:(shift + 1 + ratio) * part].clone(),
166 |                                             v[(shift + 2 * ratio) * part:(shift + 1 + 2 * ratio) * part].clone()
167 |                                         ], 0)
168 |                                         break
169 |                                     else:
170 |                                         part = v.shape[0] // ratio  # was missing: `part` was undefined on this path
171 |                                         d_new['module'][k] = v[shift * part:(shift + 1) * part].clone()
172 |                                         break
173 | 
174 |                         if flag == 0:
175 |                             d_new['module'][k] = v.clone()
176 | 
177 |                 print("saving mp_size = {:02d}".format(j))
178 |                 filename = os.path.join(new_checkpoint,
179 |                                         "pytorch_model_{:02d}.bin".format(j))
180 |                 torch.save(d_new, filename)
181 | 
--------------------------------------------------------------------------------
/bert_seq2seq/layers.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | import numpy as np
5 | 
6 | class GlobalPointer(nn.Module):
7 |     def __init__(self, hidden_size, ent_type_size, inner_dim, RoPE=True, trill_mask=True):
8 |         super().__init__()
9 |         self.ent_type_size = ent_type_size
10 |         self.inner_dim = inner_dim
11 |         self.hidden_size = hidden_size
12 |         self.dense = nn.Linear(self.hidden_size, self.ent_type_size * self.inner_dim * 2)
13 |         self.trill_mask = trill_mask  # whether to mask out spans with start > end
14 |         self.RoPE = RoPE
15 | 
16 |     def sinusoidal_position_embedding(self, batch_size, seq_len, output_dim):
17 |         position_ids = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(-1)
18 | 
19 |         indices = torch.arange(0, output_dim // 2, dtype=torch.float)
20 |         indices = torch.pow(10000, -2 * indices / output_dim)
21 |         embeddings = position_ids * indices
22 |         embeddings = torch.stack([torch.sin(embeddings), torch.cos(embeddings)], dim=-1)
23 |         embeddings = embeddings.repeat((batch_size, *([1]*len(embeddings.shape))))
24 |         embeddings = torch.reshape(embeddings, (batch_size, seq_len, output_dim))
25 |         embeddings = embeddings.to(self.device)
26 |         return embeddings
27 | 
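    # rotary position embedding (RoPE): each (even, odd) feature pair of q and k
    # is rotated by a position-dependent angle m*theta, so the attention score
    # q_m . k_n depends only on the relative offset m - n:
    #   q'_m = q_m * cos(m*theta) + rotate_half(q_m) * sin(m*theta)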
28 |     def rope(self, batch_size, seq_len, dim, qw, kw):
29 |         # pos_emb: (batch_size, seq_len, inner_dim)
30 |         pos_emb = self.sinusoidal_position_embedding(batch_size, seq_len, dim)
31 |         # cos_pos, sin_pos: (batch_size, seq_len, 1, inner_dim)
32 |         cos_pos = pos_emb[..., None, 1::2].repeat_interleave(2, dim=-1)
33 |         sin_pos = pos_emb[..., None, ::2].repeat_interleave(2, dim=-1)
34 |         qw2 = torch.stack([-qw[..., 1::2], qw[..., ::2]], -1)
35 |         qw2 = qw2.reshape(qw.shape)
36 |         qw = qw * cos_pos + qw2 * sin_pos
37 |         kw2 = torch.stack([-kw[..., 1::2], kw[..., ::2]], -1)
38 |         kw2 = kw2.reshape(kw.shape)
39 |         kw = kw * cos_pos + kw2 * sin_pos
40 |         return qw, kw
41 | 
42 |     def forward(self, last_hidden_state, padding_mask):
43 |         self.device = last_hidden_state.device
44 |         batch_size = last_hidden_state.size()[0]
45 |         seq_len = last_hidden_state.size()[1]
46 | 
47 |         # outputs: (batch_size, seq_len, ent_type_size * inner_dim * 2)
48 |         outputs = self.dense(last_hidden_state)
49 |         outputs = torch.split(outputs, self.inner_dim * 2, dim=-1)
50 |         # outputs: (batch_size, seq_len, ent_type_size, inner_dim * 2)
51 |         outputs = torch.stack(outputs, dim=-2)
52 |         # qw, kw: (batch_size, seq_len, ent_type_size, inner_dim)
53 |         qw, kw = outputs[..., :self.inner_dim], outputs[..., self.inner_dim:]  # TODO: obtain q and k from separate Linear layers instead?
54 | 
55 |         if self.RoPE:
56 |             qw, kw = self.rope(batch_size, seq_len, self.inner_dim, qw, kw)
57 | 
58 |         # logits: (batch_size, ent_type_size, seq_len, seq_len)
59 |         logits = torch.einsum('bmhd,bnhd->bhmn', qw, kw)
60 | 
61 |         # padding mask
62 |         pad_mask = padding_mask.unsqueeze(1).unsqueeze(1).expand(batch_size, self.ent_type_size, seq_len, seq_len)
63 |         # pad_mask_h = attention_mask.unsqueeze(1).unsqueeze(-1).expand(batch_size, self.ent_type_size, seq_len, seq_len)
64 |         # pad_mask = pad_mask_v & pad_mask_h
65 |         logits = logits * pad_mask - (1 - pad_mask) * 1e12
66 | 
67 |         # mask out the lower triangle: only spans with start <= end are valid
68 |         if self.trill_mask:
69 |             mask = torch.tril(torch.ones_like(logits), -1)
70 |             logits = logits - mask * 1e12
71 | 
72 |         return logits / self.inner_dim ** 0.5
73 | 
74 |     def compute_loss(self, logits, labels):
75 |         # logits: (batch_size, ent_type_size, seq_len, seq_len)
76 |         # labels: same shape, multi-hot over the valid spans
77 | 
78 | 
79 |         bh = logits.shape[0] * logits.shape[1]
80 |         labels = torch.reshape(labels, shape=(bh, -1))
81 |         logits = torch.reshape(logits, shape=(bh, -1))
82 |         return multilabel_crossentropy(logits, labels)
83 | 
84 |     def compute_loss_sparse(self, logits, labels, mask_zero=False):
85 |         return sparse_multilabel_categorical_crossentropy(y_pred=logits, y_true=labels, mask_zero=mask_zero)
86 | 
87 | 
88 | def multilabel_crossentropy(y_pred, y_true):
89 |     """
90 |     https://kexue.fm/archives/7359
91 |     """
92 |     y_pred = (1 - 2 * y_true) * y_pred  # flip the sign of positive-class scores
93 |     y_pred_neg = y_pred - y_true * 1e12  # mask out the scores of positive classes
94 |     y_pred_pos = (y_pred - (1 - y_true) * 1e12)  # mask out the scores of negative classes
95 |     zeros = torch.zeros_like(y_pred[..., :1])
96 |     y_pred_neg = torch.cat([y_pred_neg, zeros], dim=-1)
97 |     y_pred_pos = torch.cat([y_pred_pos, zeros], dim=-1)
98 |     neg_loss = torch.logsumexp(y_pred_neg, dim=-1)
99 |     pos_loss = torch.logsumexp(y_pred_pos, dim=-1)
100 | 
101 |     return (neg_loss + pos_loss).mean()
102 | 
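# multilabel_crossentropy above is the "unified multi-label CE" of the linked
# post: it evaluates log(1 + sum_neg e^{s}) + log(1 + sum_pos e^{-s}) via two
# logsumexp terms, each padded with a zero logit that acts as the decision
# threshold, so positive scores are pushed above 0 and negative scores below 0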
103 | def sparse_multilabel_categorical_crossentropy(y_true=None, y_pred=None, mask_zero=False):
104 |     '''
105 |     PyTorch implementation of the sparse multi-label categorical cross-entropy.
106 |     '''
107 |     shape = y_pred.shape
108 |     y_true = y_true[..., 0] * shape[2] + y_true[..., 1]  # flatten (start, end) pairs into flat indices
109 |     y_pred = y_pred.reshape(shape[0], -1, np.prod(shape[2:]))
110 |     zeros = torch.zeros_like(y_pred[..., :1])
111 |     y_pred = torch.cat([y_pred, zeros], dim=-1)
112 |     if mask_zero:
113 |         infs = zeros + 1e12
114 |         y_pred = torch.cat([infs, y_pred[..., 1:]], dim=-1)
115 |     y_pos_2 = torch.gather(y_pred, index=y_true, dim=-1)
116 |     y_pos_1 = torch.cat([y_pos_2, zeros], dim=-1)
117 |     if mask_zero:
118 |         y_pred = torch.cat([-infs, y_pred[..., 1:]], dim=-1)
119 |         y_pos_2 = torch.gather(y_pred, index=y_true, dim=-1)
120 |     pos_loss = torch.logsumexp(-y_pos_1, dim=-1)
121 |     all_loss = torch.logsumexp(y_pred, dim=-1)
122 |     aux_loss = torch.logsumexp(y_pos_2, dim=-1) - all_loss
123 |     aux_loss = torch.clip(1 - torch.exp(aux_loss), 1e-10, 1)
124 |     neg_loss = all_loss + torch.log(aux_loss)
125 |     loss = torch.mean(torch.sum(pos_loss + neg_loss))
126 |     return loss
127 | 
128 | class CRFLayer(nn.Module):
129 |     """Linear-chain CRF layer: provides the negative log-likelihood training loss.
130 |     """
131 |     def __init__(self, output_dim):
132 |         super(CRFLayer, self).__init__()
133 | 
134 |         self.output_dim = output_dim
135 |         self.trans = nn.Parameter(torch.Tensor(output_dim, output_dim))
136 |         self.trans.data.uniform_(-0.1, 0.1)
137 | 
138 |     def compute_loss(self, y_pred, y_true, mask):
139 |         """
140 |         Compute the CRF loss: log Z(x) - score(y, x)
141 |         """
142 |         y_pred = y_pred * mask
143 |         y_true = y_true * mask
144 |         target_score = self.target_score(y_pred, y_true)
145 |         log_norm = self.log_norm_step(y_pred, mask)
146 |         log_norm = self.logsumexp(log_norm, dim=1)  # reduce to a scalar per sample
147 |         return log_norm - target_score
148 | 
149 |     def forward(self, y_pred, y_true, mask):
150 |         """
151 |         y_true: [[1, 2, 3], [2, 3, 0]]
152 |         mask: [[1, 1, 1], [1, 1, 0]]
153 |         """
154 |         if y_pred.shape[0] != mask.shape[0] or y_pred.shape[1] != mask.shape[1]:
155 |             raise ValueError("mask shape does not match y_pred shape")
156 |         mask = mask.reshape((mask.shape[0], mask.shape[1], 1))
157 |         mask = mask.float()
158 |         y_true = y_true.reshape(y_pred.shape[:-1])
159 |         y_true = y_true.long()
160 |         y_true_onehot = F.one_hot(y_true, self.output_dim)
161 |         y_true_onehot = y_true_onehot.float()
162 | 
163 |         return self.compute_loss(y_pred, y_true_onehot, mask)
164 | 
165 |     def target_score(self, y_pred, y_true):
166 |         """
167 |         emission score plus transition score of the gold path
168 |         y_true: (batch, seq_len, out_dim)
169 |         y_pred: (batch, seq_len, out_dim)
170 |         """
171 | 
172 | 
173 |         point_score = torch.einsum("bni,bni->b", y_pred, y_true)
174 |         trans_score = torch.einsum("bni,ij,bnj->b", y_true[:, :-1], self.trans, y_true[:, 1:])
175 | 
176 |         return point_score + trans_score
177 | 
178 |     def log_norm_step(self, y_pred, mask):
179 |         """
180 |         Compute the normalization factor log Z(x) by forward recursion.
181 |         """
182 |         state = y_pred[:, 0]  # initial forward variables
183 |         y_pred = y_pred[:, 1:].contiguous()
184 |         mask = mask[:, 1:].contiguous()
185 |         batch, seq_len, out_dim = y_pred.shape
186 |         for t in range(seq_len):
187 |             cur_mask = mask[:, t]
188 |             state = torch.unsqueeze(state, 2)  # (batch, out_dim, 1)
189 |             g = torch.unsqueeze(self.trans, 0)  # (1, out_dim, out_dim)
190 |             outputs = self.logsumexp(state + g, dim=1)  # (batch, out_dim)
191 |             outputs = outputs + y_pred[:, t]
192 |             outputs = cur_mask * outputs + (1 - cur_mask) * state.squeeze(-1)
193 |             state = outputs
194 | 
195 |         return outputs
196 | 
197 |     def logsumexp(self, x, dim=None, keepdim=False):
198 |         """
199 |         Numerically stable log-sum-exp.
200 |         """
201 |         if dim is None:
202 |             x, dim = x.view(-1), 0
203 |         xm, _ = torch.max(x, dim, keepdim=True)
204 |         out = xm + torch.log(torch.sum(torch.exp(x - xm), dim=dim, keepdim=True))
205 |         return out if keepdim else out.squeeze(dim)
206 | 
--------------------------------------------------------------------------------
/examples/relationship_extraction/train_bert_relationship_extraction.py:
--------------------------------------------------------------------------------
1 | 
2 | import json
3 | import numpy as np
4 | import torch
5 | import os
6 | from tqdm import tqdm
7 | from bert_seq2seq import Trainer
8 | from bert_seq2seq import Tokenizer
9 | from torch.utils.data import Dataset
10 | from bert_seq2seq.dataset import bert_gplinker_collate_fn, sequence_padding
11 | from bert_seq2seq.utils import load_model
12 | 
13 | vocab_path = "../state_dict/roberta/vocab.txt"
14 | model_path = "../state_dict/roberta/pytorch_model.bin"
15 | model_save_path = "./bert_relation_extraction.bin"
16 | task_name = "relationship_extraction"
17 | model_name = "roberta"
18 | epoches = 5
19 | data_dir = "../data/relationship_extraction"  # the repo ships dev_data.json and all_50_schemas here
20 | train_path = os.path.join(data_dir, "train_data.json")
21 | val_path = os.path.join(data_dir, "dev_data.json")
22 | 
23 | batch_size = 8
24 | maxlen = 128
25 | lr = 1e-5
26 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
27 | trainer = Trainer(epoches=epoches, env_type="pytorch",
28 |                   val_every_step=1000, batch_size=batch_size,
29 |                   device=device,
30 |                   # num_nodes=1,
31 |                   # num_gpus=4,
32 |                   # training_script=__file__,
33 |                   )
34 | 
35 | def load_data(filename):
36 |     """Load the data.
37 |     One record per line: {'text': text, 'spo_list': [(s, p, o)]}
38 |     """
39 |     D = []
40 |     with open(filename, encoding='utf-8') as f:
41 |         for l in f:
42 |             l = json.loads(l)
43 |             D.append({
44 |                 'text': l['text'],
45 |                 'spo_list': [(spo['subject'], spo['predicate'], spo['object'])
46 |                              for spo in l['spo_list']]
47 |             })
48 |     return D
49 | 
50 | def load_target():
51 |     target = []
52 |     with open(os.path.join(data_dir, 'all_50_schemas')) as f:
53 |         for l in f:
54 |             l = json.loads(l)
55 |             if l['predicate'] not in target:
56 |                 target.append(l['predicate'])
57 |     return target
58 | 
59 | def search(pattern, sequence):
60 |     """Find the sublist `pattern` inside `sequence`.
61 |     Return the first matching index, or -1 if not found.
62 |     """
63 |     n = len(pattern)
64 |     for i in range(len(sequence)):
65 |         if sequence[i:i + n] == pattern:
66 |             return i
67 |     return -1
68 | 
69 | # build the tokenizer
70 | tokenizer = Tokenizer(vocab_path)
71 | train_data = load_data(train_path)
72 | valid_data = load_data(val_path)
73 | target = load_target()
74 | model = load_model(tokenizer.vocab, model_name=model_name,
75 |                    task_name=task_name, target_size=len(target),
76 |                    ner_inner_dim=64)
77 | 
78 | class RelationshipDataset(Dataset):
79 |     def __init__(self, data):
80 | 
81 |         self.data = data
82 | 
83 |     def __getitem__(self, i):
84 |         data = self.data[i]
85 | 
86 |         tokenizer_out = tokenizer.encode_plus(data["text"], max_length=maxlen, truncation=True)
87 |         input_ids = tokenizer_out["input_ids"]
88 |         token_type_ids = tokenizer_out["token_type_ids"]
89 | 
90 |         spoes = set()
91 |         for s, p, o in data['spo_list']:
92 |             s = tokenizer.encode_plus(s)["input_ids"][1:-1]  # strip [CLS]/[SEP]
93 |             p = target.index(p)
94 | 
95 |             o = tokenizer.encode_plus(o)["input_ids"][1:-1]
96 |             sh = search(s, input_ids)
97 |             oh = search(o, input_ids)
98 |             if sh != -1 and oh != -1:
99 |                 spoes.add((sh, sh + len(s) - 1, p, oh, oh + len(o) - 1))
100 | 
101 |         # build the labels
102 |         entity_labels = [set() for _ in range(2)]
103 |         head_labels = [set() for _ in range(len(target))]
104 |         tail_labels = [set() for _ in range(len(target))]
105 |         for sh, st, p, oh, ot in spoes:
106 |             entity_labels[0].add((sh, st))
107 |             entity_labels[1].add((oh, ot))
108 |             head_labels[p].add((sh, oh))
109 |             tail_labels[p].add((st, ot))
110 | 
111 |         for label in entity_labels + head_labels + tail_labels:
112 |             if not label:  # every label set needs at least one entry
113 |                 label.add((0, 0))  # pad empty sets with (0, 0)
114 | 
115 |         entity_labels = sequence_padding([list(l) for l in entity_labels])
116 |         head_labels = sequence_padding([list(l) for l in head_labels])
117 |         tail_labels = sequence_padding([list(l) for l in tail_labels])
118 | 
119 |         output = {
120 |             "input_ids": input_ids,
121 |             "token_type_ids": token_type_ids,
122 |             "entity_labels": entity_labels,
123 |             "head_labels": head_labels,
124 |             "tail_labels": tail_labels,
125 |         }
126 |         return output
127 | 
128 |     def __len__(self):
129 |         return len(self.data)
130 | 
131 | class SPO(tuple):
132 |     """Container class for a triple.
133 |     Behaves like a plain tuple, but overrides __hash__ and __eq__
134 |     so that equivalence checks between triples are more tolerant.
135 |     """
136 |     def __init__(self, spo):
137 |         self.spox = (
138 |             tuple(tokenizer.tokenize(spo[0])),
139 |             spo[1],
140 |             tuple(tokenizer.tokenize(spo[2])),
141 |         )
142 | 
143 |     def __hash__(self):
144 |         return self.spox.__hash__()
145 | 
146 |     def __eq__(self, spo):
147 |         return self.spox == spo.spox
148 | 
149 | def extract_spoes(text, threshold=0):
150 |     """Extract all triples contained in the input text.
151 |     """
152 |     tokens = tokenizer.tokenize(text, maxlen=maxlen, add_spatial_tokens=True)
153 |     mapping = tokenizer.rematch(text, tokens)
154 |     tokenizer_out = tokenizer.encode_plus(text, max_length=maxlen)
155 |     input_ids = tokenizer_out["input_ids"]
156 |     token_type_ids = tokenizer_out["token_type_ids"]
157 | 
158 |     input_ids = torch.tensor(input_ids, device=device)
159 |     token_type_ids = torch.tensor(token_type_ids, device=device)
160 |     if input_ids.ndim == 1:
161 |         input_ids = input_ids.view(1, -1)
162 |         token_type_ids = token_type_ids.view(1, -1)
163 |     with torch.no_grad():
164 |         model_out = model(**{"input_ids": input_ids, "token_type_ids": token_type_ids})
165 | 
166 |     outputs = [model_out["entity_output"].cpu().numpy(),
167 |                model_out["head_output"].cpu().numpy(),
168 |                model_out["tail_output"].cpu().numpy()]
169 | 
170 |     outputs = [o[0] for o in outputs]
171 |     # extract subjects and objects
172 |     subjects, objects = set(), set()
173 |     outputs[0][:, [0, -1]] -= np.inf
174 |     outputs[0][:, :, [0, -1]] -= np.inf
175 |     for l, h, t in zip(*np.where(outputs[0] > threshold)):
176 |         if l == 0:
177 |             subjects.add((h, t))
178 |         else:
179 |             objects.add((h, t))
180 |     # match the corresponding predicates
181 |     spoes = set()
182 |     for sh, st in subjects:
183 |         for oh, ot in objects:
184 |             p1s = np.where(outputs[1][:, sh, oh] > threshold)[0]
185 |             p2s = np.where(outputs[2][:, st, ot] > threshold)[0]
186 |             ps = set(p1s) & set(p2s)
187 |             for p in ps:
188 |                 try:
189 |                     spoes.add((
190 |                         text[mapping[sh][0]:mapping[st][-1] + 1], target[p],
191 |                         text[mapping[oh][0]:mapping[ot][-1] + 1]
192 |                     ))
193 |                 except IndexError:
194 |                     continue
195 | 
196 |     return list(spoes)
197 | 
198 | def evaluate(data):
199 |     """Evaluation: compute f1, precision and recall.
200 |     """
201 |     X, Y, Z = 1e-10, 1e-10, 1e-10
202 |     f = open('dev_pred.json', 'w', encoding='utf-8')
203 | 
204 |     for d in tqdm(data, total=len(data)):
205 |         R = set([SPO(spo) for spo in extract_spoes(d['text'])])
206 |         T = set([SPO(spo) for spo in d['spo_list']])
207 |         X += len(R & T)
208 |         Y += len(R)
209 |         Z += len(T)
210 |         f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
211 |         s = json.dumps({
212 |             'text': d['text'],
213 |             'spo_list': list(T),
214 |             'spo_list_pred': list(R),
215 |             'new': list(R - T),
216 |             'lack': list(T - R),
217 |         },
218 |                        ensure_ascii=False,
219 |                        indent=4)
220 |         f.write(s + '\n')
221 |     f.close()
222 |     return f1, precision, recall
223 | 
224 | def validate():
225 |     text = "南京京九思新能源有限公司于2015年05月15日在南京市江宁区市场监督管理局登记成立"
226 |     spo_list = extract_spoes(text)
227 |     print(f"spo_list is {spo_list}")
228 |     f1, precision, recall = evaluate(valid_data)
229 |     print(f"f1 is {f1}, precision is {precision}, recall is {recall}")
230 |     return f1
231 | 
232 | class Evaluator:
233 |     def __init__(self):
234 |         self.best_f1 = 0.0
235 | 
236 |     def on_validation(self, data):
237 |         loss = data["loss"]
238 |         step = data["iteration"]
239 | 
240 | 
241 |     def on_epoch_end(self):
242 |         f1 = validate()
243 |         if f1 > self.best_f1:  # was inverted, and best_f1 was never updated
244 |             self.best_f1 = f1
245 |             torch.save(model.state_dict(), model_save_path)
246 |             print(f"model saved to {model_save_path}")
247 | 
248 | if __name__ == "__main__":
249 | 
250 |     train_dataset = RelationshipDataset(train_data)
251 |     optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=1e-5)
252 | 
253 |     trainer.train(model=model, optimizer=optimizer,
254 |                   train_dataset=train_dataset,
255 |                   evaluator=Evaluator,
256 |                   collate_fn=bert_gplinker_collate_fn)
--------------------------------------------------------------------------------
/bert_seq2seq/predictor/predictor.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from typing import List
3 | import torch
4 | import os
5 | import math
6 | from bert_seq2seq.predictor.utils import viterbi_decode, decode_labels, \
7 |     bert_beamsearch, t5_random_sample, gpt_random_sample, \
8 |     t5_beamsearch, gpt_beamsearch, bert_random_sample, \
9 |     gpt_random_sample_from_ids, glm_random_sample
10 | class Predictor:
11 | 
12 |     def __init__(self, model, tokenizer):
13 |         self.tokenizer = tokenizer
14 |         self.model = model
15 |         self.model.eval()
16 |         self.class_name = type(model).__name__
17 | 
18 |     def predict_embedding(self, text, maxlen=256, pred_type="cls"):
19 |         device = next(self.model.parameters()).device
20 |         tokenizer_out = self.tokenizer.encode_plus(text, max_length=maxlen, truncation=True)
21 | 
22 |         input_ids = tokenizer_out["input_ids"]
23 |         token_type_ids = tokenizer_out["token_type_ids"]
24 |         input_ids = torch.tensor(input_ids, device=device)
25 |         token_type_ids = torch.tensor(token_type_ids, device=device)
26 |         if input_ids.ndim == 1:
27 |             input_ids = input_ids.view(1, -1)
28 |             token_type_ids = token_type_ids.view(1, -1)
29 |         with torch.no_grad():
30 |             if pred_type == "cls":
31 |                 score = self.model(**{"input_ids": input_ids, "token_type_ids": token_type_ids})["logits"].cpu()[0, 0]
32 |             elif pred_type == "mean":
33 |                 score = self.model(**{"input_ids": input_ids, "token_type_ids": token_type_ids})["logits"].cpu().mean(dim=1)[0]
34 | 
35 |         return score
36 | 
37 |     def predict_cls_classifier(self, text, max_len=512):
38 |         # text may be a single text or a text pair
39 |         device = next(self.model.parameters()).device
40 |         if isinstance(text, str):
41 |             tokenizer_out = self.tokenizer.encode_plus(text, max_length=max_len, truncation=True)
42 |         else:
43 |             assert len(text) == 2
44 |             tokenizer_out = self.tokenizer.encode_plus(text[0], text[1], max_length=max_len, truncation=True)
45 | 
46 |         input_ids = tokenizer_out["input_ids"]
47 |         token_type_ids = tokenizer_out["token_type_ids"]
48 |         input_ids = torch.tensor(input_ids, device=device)
49 |         token_type_ids = torch.tensor(token_type_ids, device=device)
50 |         if input_ids.ndim == 1:
51 |             input_ids = input_ids.view(1, -1)
52 |             token_type_ids = token_type_ids.view(1, -1)
53 |         with torch.no_grad():
54 |             score = self.model(**{"input_ids": input_ids, "token_type_ids": token_type_ids})["logits"].cpu()[0]
55 |         return score
56 | 
57 |     def predict_masklm(self, text, max_len=512):
58 |         device = next(self.model.parameters()).device
59 |         tokenizer_out = self.tokenizer.encode_plus(text, max_length=max_len, truncation=True)
60 | 
61 |         input_ids = tokenizer_out["input_ids"]
62 |         token_type_ids = tokenizer_out["token_type_ids"]
63 |         input_ids = torch.tensor(input_ids, device=device)
64 |         token_type_ids = torch.tensor(token_type_ids, device=device)
65 |         if input_ids.ndim == 1:
66 |             input_ids = input_ids.view(1, -1)
67 |             token_type_ids = token_type_ids.view(1, -1)
68 |         with torch.no_grad():
69 |             score = self.model(**{"input_ids": input_ids, "token_type_ids": token_type_ids})["logits"].cpu()
70 |             score = score.argmax(dim=-1).numpy()[0]
71 |         return self.tokenizer.decode(score)
72 | 
73 |     def predict_ner(self, text, target, maxlen=256):
74 |         model = self.model
75 |         model.eval()
76 |         device = next(model.parameters()).device
77 |         tokenizer = self.tokenizer
78 |         tokens = tokenizer.tokenize(text, maxlen=maxlen, add_spatial_tokens=True)
79 |         mapping = tokenizer.rematch(text, tokens)
80 |         token_ids = tokenizer.convert_tokens_to_ids(tokens)
81 |         token_ids = torch.tensor([token_ids], dtype=torch.long, device=device)
82 | 
83 |         trans = model.state_dict().get("crf_layer.trans", None)
84 |         if trans is not None:
85 |             # CRF head: Viterbi decoding over the transition matrix
86 |             trans = trans.cpu()
87 |             with torch.no_grad():
88 |                 out = model(**{"input_ids": token_ids})["logits"][0].cpu()
89 |             labels = viterbi_decode(out, trans)
90 |             entities = decode_labels(labels, target)
91 |             return [(mapping[w[0]][0], mapping[w[-1]][-1], l) for w, l in entities if mapping[w[0]] and mapping[w[-1]]]
92 | 
93 |         elif getattr(model, "gp", None) is not None:
94 |             entities = []
95 |             with torch.no_grad():
96 |                 scores = model(**{"input_ids": token_ids})["logits"].cpu().numpy()[0]
97 |             # GlobalPointer head: every span with a positive score is an entity
98 |             scores[:, [0, -1]] -= np.inf
99 |             scores[:, :, [0, -1]] -= np.inf
100 |             for l, start, end in zip(*np.where(scores > 0)):
101 |                 if mapping[start] and mapping[end]:
102 |                     entities.append(
103 |                         (mapping[start][0], mapping[end][-1], target[l])
104 |                     )
105 |             return entities
106 | 
107 |         else:
108 |             with torch.no_grad():
109 |                 scores = model(**{"input_ids": token_ids})["logits"].cpu()[0]
110 |             labels = scores.argmax(dim=-1)
111 |             entities = decode_labels(labels, target)
112 |             return [(mapping[w[0]][0], mapping[w[-1]][-1], l) for w, l in entities if mapping[w[0]] and mapping[w[-1]]]
113 | 
114 |     def predict_generate_beamsearch(self, text, input_max_length=256, out_max_length=100, beam_size=1):
115 |         self.model.eval()
116 |         if "bert" in self.class_name.lower():
117 |             assert "seq2seq" in self.class_name.lower(), "this function only supports the seq2seq task"
118 |             return bert_beamsearch(self.model, self.tokenizer, text, input_max_length=input_max_length,
119 |                                    out_max_length=out_max_length, beam_size=beam_size)
120 |         elif "t5" in self.class_name.lower():
121 |             return t5_beamsearch(self.model, self.tokenizer, text, input_max_length=input_max_length,
122 |                                  out_max_length=out_max_length, beam_size=beam_size)
123 | 
124 |         elif "gpt" in self.class_name.lower():
125 |             return gpt_beamsearch(self.model, self.tokenizer, text, input_max_length=input_max_length,
126 |                                   out_max_length=out_max_length, beam_size=beam_size)
127 | 
128 |         else:
129 |             print("unsupported decoding method")
130 |             os._exit(0)
131 | 
132 | 
133 |     def predict_generate_randomsample(self, text, input_max_length=256,
134 |                                       out_max_length=200, top_k=30, top_p=1.0,
135 |                                       repetition_penalty=1.0, temperature=1.0, add_sep=False,
136 |                                       ):
137 |         device = next(self.model.parameters()).device
138 |         if "t5" in self.class_name.lower():
139 |             return t5_random_sample(self.model, self.tokenizer, text, input_max_length,
140 |                                     out_max_length, top_k, top_p, repetition_penalty, temperature, device)
141 | 
142 |         elif "gpt" in self.class_name.lower():
143 |             return gpt_random_sample(self.model, self.tokenizer, text, input_max_length,
144 |                                      out_max_length, top_k, top_p, repetition_penalty, temperature, device, add_sep=add_sep)
145 | 
146 |         elif "bert" in self.class_name.lower():
147 |             return bert_random_sample(self.model, self.tokenizer, text, input_max_length,
148 |                                       out_max_length, top_k, top_p, repetition_penalty, temperature, device)
149 | 
150 |         elif "glm" in self.class_name.lower():
151 |             return glm_random_sample(self.model, self.tokenizer, text, input_max_length,
152 |                                      out_max_length, top_k, top_p, repetition_penalty,
153 |                                      temperature, device)
154 | 
155 |         else:
156 |             print("unsupported decoding method")
157 |             os._exit(0)
158 | 
159 | 
160 |     def predict_multi_response(self, sentences: List[str], top_k, top_p,
161 |                                repetition_penalty, temperature, input_max_length=1024,
162 |                                out_max_length=100):
163 | 
164 | 
165 |         length = sum([len(text) for text in sentences])
166 |         if length > input_max_length:
167 |             print(f"dialogue too long: {length}")
168 |             os._exit(0)
169 |         device = next(self.model.parameters()).device
170 |         input_ids = [self.tokenizer.token_start_id]
171 |         for index, text in enumerate(sentences):
172 |             if (index + 1) % 2 == 1:
173 |                 input_ids += self.tokenizer.encode_plus("A:" + text, max_length=input_max_length)["input_ids"][1:]
174 |             else:
175 |                 input_ids += self.tokenizer.encode_plus("B:" + text, max_length=input_max_length)["input_ids"][1:]
176 | 
177 |         if "gpt" in self.class_name.lower():
178 |             return gpt_random_sample_from_ids(self.model, self.tokenizer, input_ids,
179 |                                               out_max_length, top_k, top_p, repetition_penalty,
180 |                                               temperature, device)
181 | 
182 |         else:
183 |             print(f"unsupported decoding method: {self.class_name}")
184 |             os._exit(0)
185 | 
186 | 
187 | 
188 | 
189 | 
--------------------------------------------------------------------------------
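Usage sketch (illustrative, not a file from the repository): how the pieces above
fit together at inference time, assuming a trained checkpoint. `Tokenizer`,
`load_model` and `Predictor` are the repo's own entry points shown earlier; the
file paths and the task_name value are assumptions.

    import torch
    from bert_seq2seq import Tokenizer
    from bert_seq2seq.utils import load_model
    from bert_seq2seq.predictor import Predictor

    tokenizer = Tokenizer("./state_dict/roberta/vocab.txt")        # hypothetical path
    model = load_model(tokenizer.vocab, model_name="roberta",
                       task_name="seq2seq")                        # assumed task name
    model.load_state_dict(torch.load("./auto_title_model.bin",     # hypothetical path
                                     map_location="cpu"))
    predictor = Predictor(model, tokenizer)
    # beam-search decoding, mirroring predict_generate_beamsearch above
    print(predictor.predict_generate_beamsearch("输入正文……", beam_size=3))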