├── bert_seq2seq
│   ├── model
│   │   ├── __init__.py
│   │   ├── blocks
│   │   │   └── __init__.py
│   │   ├── layers
│   │   │   ├── __init__.py
│   │   │   ├── layer_norm.py
│   │   │   └── activations.py
│   │   ├── utils.py
│   │   └── prompt.py
│   ├── task
│   │   ├── seq2seq
│   │   │   ├── __init__.py
│   │   │   ├── t5_seq2seq_model.py
│   │   │   ├── gpt2_seq2seq_model.py
│   │   │   ├── bert_seq2seq_model.py
│   │   │   └── GLM_seq2seq_model.py
│   │   ├── embedding
│   │   │   ├── __init__.py
│   │   │   └── bert_embedding.py
│   │   ├── classification
│   │   │   ├── __init__.py
│   │   │   └── bert_cls_classifier.py
│   │   ├── sequence_labeling
│   │   │   ├── __init__.py
│   │   │   └── bert_sequence_labeling.py
│   │   ├── relationship_extraction
│   │   │   ├── __init__.py
│   │   │   └── bert_relationship_extraction.py
│   │   └── __init__.py
│   ├── predictor
│   │   ├── __init__.py
│   │   └── predictor.py
│   ├── __init__.py
│   ├── mpu
│   │   ├── func_utils.py
│   │   ├── __init__.py
│   │   ├── utils.py
│   │   ├── grads.py
│   │   ├── data.py
│   │   ├── mappings.py
│   │   ├── cross_entropy.py
│   │   ├── initialize.py
│   │   └── mp_tools.py
│   ├── config.py
│   ├── utils.py
│   ├── launch.py
│   ├── dataset.py
│   └── layers.py
├── .idea
│   ├── .gitignore
│   ├── vcs.xml
│   ├── inspectionProfiles
│   │   ├── profiles_settings.xml
│   │   └── Project_Default.xml
│   ├── misc.xml
│   ├── modules.xml
│   └── bert_seq2seq_DDP.iml
├── .DS_Store
├── data
│   ├── auto_title
│   │   ├── train.tgt
│   │   └── train.src
│   ├── semantic_matching
│   │   └── train.tsv
│   ├── LCCC-base-split
│   │   ├── LCCC-base_test.json
│   │   ├── LCCC-base_train.json
│   │   └── LCCC-base_valid.json
│   ├── ner
│   │   └── china-people-daily-ner-corpus
│   │       ├── example.test
│   │       ├── example.dev
│   │       └── example.train
│   └── relationship_extraction
│       ├── all_50_schemas
│       └── dev_data.json
├── setup.py
├── examples
│   ├── seq2seq
│   │   ├── gpt2
│   │   │   ├── test_gpt2_text_writting.py
│   │   │   ├── test_gpt2_multi_chat.py
│   │   │   ├── test_multi_processing_generate.py
│   │   │   └── train_gpt2_multi_chat.py
│   │   ├── t5
│   │   │   └── test_t5_auto_title.py
│   │   ├── GLM
│   │   │   ├── glm_generate_samples.py
│   │   │   └── train_glm_auto_title.py
│   │   └── bert
│   │       ├── test_roberta_auto_title.py
│   │       ├── train_roberta_auto_title.py
│   │       └── train_roberta_auto_title_multi_gpu.py
│   ├── text_classification
│   │   ├── test.py
│   │   ├── train_roberta_large_news_title_classification.py
│   │   ├── train_roberta_semantic_matching.py
│   │   ├── train_roberta_news_title_classification.py
│   │   └── train_roberta_news_title_classification_multi_gpu.py
│   ├── bert_embedding
│   │   └── get_bert_embedding.py
│   ├── FAQ
│   │   ├── 1_construct_data.py
│   │   └── 2_test_bert_faq.py
│   ├── README.md
│   ├── ner
│   │   ├── train_bert_ner_people_daily.py
│   │   ├── train_roberta_ner_gp_people_daily.py
│   │   └── train_bert_ner_crf_people_daily.py
│   └── relationship_extraction
│       └── train_bert_relationship_extraction.py
├── .gitignore
└── README.md
/bert_seq2seq/model/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/bert_seq2seq/model/blocks/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/bert_seq2seq/model/layers/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/bert_seq2seq/task/seq2seq/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/bert_seq2seq/task/embedding/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/bert_seq2seq/task/classification/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/bert_seq2seq/task/sequence_labeling/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/bert_seq2seq/task/relationship_extraction/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/bert_seq2seq/predictor/__init__.py:
--------------------------------------------------------------------------------
1 | from .predictor import *
--------------------------------------------------------------------------------
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 |
--------------------------------------------------------------------------------
/data/auto_title/train.tgt:
--------------------------------------------------------------------------------
1 | 修改后的立法法全文公布
2 | 深圳机场9死24伤续:司机全责赔偿或超千万
3 | 孟建柱:主动适应形势新变化提高政法机关服务大局的能力
--------------------------------------------------------------------------------
/data/semantic_matching/train.tsv:
--------------------------------------------------------------------------------
1 | 好无聊啊 啊好无聊啊 1
2 | 我好想谈恋爱呀 我多想谈一场恋爱呀 1
3 | 今天我四点就起床了 今天下午一点五十叫我起床 0
4 | 现在不需要你了不要回来了你 不要回来了 1
5 | 语音助手用不了怎么办 怎么用语音召唤小助手? 0
--------------------------------------------------------------------------------
/bert_seq2seq/__init__.py:
--------------------------------------------------------------------------------
1 | from .tokenizer import Tokenizer
2 | from .utils import *
3 | from .predictor import *
4 | from .trainer import Trainer
5 | from .glm_tokenizer import GLMTokenizer
--------------------------------------------------------------------------------
/bert_seq2seq/task/__init__.py:
--------------------------------------------------------------------------------
1 | from .classification.bert_cls_classifier import *
2 | from .embedding.bert_embedding import *
3 | from .seq2seq.bert_seq2seq_model import *
4 | from .sequence_labeling.bert_sequence_labeling import *
--------------------------------------------------------------------------------
/data/LCCC-base-split/LCCC-base_test.json:
--------------------------------------------------------------------------------
1 | [
2 | [
3 | "我 饿 了 。",
4 | "去 相 机 家 里 吃 … …",
5 | "相 机 今 年 木 有 回 去 T . T"
6 | ],
7 | [
8 | "网 络 大 实 话 里 说 的 是 也 许 你 能 在 网 络 里 找 到 你 想 要 的 友 情 但 永 远 不 会 找 到 你 想 要 的 爱 情",
9 | "你 过 来 我 们 什 么 关 系"
10 | ]
11 | ]
--------------------------------------------------------------------------------
/data/auto_title/train.src:
--------------------------------------------------------------------------------
1 | 新华社受权于18日全文播发修改后的《中华人民共和国立法法》修改后的立法法分为“总则”“法律”“行政法规”“地方性法规自治条例和单行条例规章”“适用与备案审查”“附则”等6章共计105条
2 | 一辆小轿车一名女司机竟造成9死24伤日前深圳市交警局对事故进行通报:从目前证据看事故系司机超速行驶且操作不当导致目前24名伤员已有6名治愈出院其余正接受治疗预计事故赔偿费或超一千万元
3 | 1月18日习近平总书记对政法工作作出重要指示:2014年政法战线各项工作特别是改革工作取得新成效新形势下希望全国政法机关主动适应新形势为公正司法和提高执法司法公信力提供有力制度保障
4 |
--------------------------------------------------------------------------------
/data/LCCC-base-split/LCCC-base_train.json:
--------------------------------------------------------------------------------
1 | [
2 | [
3 | "你 去 那儿 竟然 不喊 我 生气 了 , 快点 给 我 道歉",
4 | "道歉 ! ! 再有 时间 找 你 去",
5 | "领个 搓衣板 去 吧"
6 | ],
7 | [
8 | "我用 SEED.24 小时 签到 一次 可以 用 4 小时 , 对于 我 这种 每天晚上 逛 一下 的 感觉 不错",
9 | "SEED 早上 刚 被 禁用 还有 一个月 的 VIP 路线 呢 禁 了 之后 才 买 的 另 一个 买 了 一年 结果 用 了 一 下午 就 挂 了 现在 用 了 个 极速网 速差 的 很",
10 | "心疼 你"
11 | ],
12 |
13 | ]
--------------------------------------------------------------------------------
/data/ner/china-people-daily-ner-corpus/example.test:
--------------------------------------------------------------------------------
1 | 我 O
2 | 们 O
3 | 变 O
4 | 而 O
5 | 以 O
6 | 书 O
7 | 会 O
8 | 友 O
9 | , O
10 | 以 O
11 | 书 O
12 | 结 O
13 | 缘 O
14 | , O
15 | 把 O
16 | 欧 B-LOC
17 | 美 B-LOC
18 | 、 O
19 | 港 B-LOC
20 | 台 B-LOC
21 | 流 O
22 | 行 O
23 | 的 O
24 | 食 O
25 | 品 O
26 | 类 O
27 | 图 O
28 | 谱 O
29 | 、 O
30 | 画 O
31 | 册 O
32 | 、 O
33 | 工 O
34 | 具 O
35 | 书 O
36 | 汇 O
37 | 集 O
38 | 一 O
39 | 堂 O
40 | 。 O
--------------------------------------------------------------------------------
/data/LCCC-base-split/LCCC-base_valid.json:
--------------------------------------------------------------------------------
1 | [
2 | [
3 | "啊 我 好 爱 虾 仁 蛋 黄 酱 金 枪 鱼 蛋 黄 酱",
4 | "那 个 饭 凉 了 吧 唧 的 怎 么 吃 啊 摔"
5 | ],
6 | [
7 | "考 试 撞 墙 关 驾 校 屁 事 ? 你 怎 么 不 顺 便 把 考 场 施 工 单 位 也 告 了 ?",
8 | "看 了 下 全 文 , 那 女 的 考 试 当 天 就 表 明 身 体 不 舒 服 了 , 考 试 不 是 她 预 约 是 教 练 自 己 安 排 的 , 教 练 还 让 她 考 试 不 就 是 教 练 的 错 吗 ? 而 且 她 住 院 花 了 3 1 万 , 赔 3 0 万 不 过 分 吧",
9 | "更 改 要 提 前 3 天 , 当 天 不 上 场 视 为 放 弃 考 试 , 又 要 重 新 报 名 重 新 交 钱 , 估 计 她 也 不 肯 的 吧 ?"
10 | ]
11 | ]
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 |
3 | setup(
4 | name='bert_seq2seq_DDP',
5 | version='0.3.0',
6 | description='BERT-based seq2seq tasks in PyTorch, with DDP training support',
7 | long_description='bert_seq2seq_DDP: https://github.com/920232796/bert_seq2seq_DDP',
8 | license='Apache License 2.0',
9 | url='https://github.com/920232796/bert_seq2seq_DDP',
10 | author='xingzhaohu',
11 | author_email='920232796@qq.com',
12 | packages=find_packages()
13 | )
14 |
--------------------------------------------------------------------------------
/examples/seq2seq/gpt2/test_gpt2_text_writting.py:
--------------------------------------------------------------------------------
1 | from bert_seq2seq.utils import load_model
2 | from bert_seq2seq.tokenizer import Tokenizer
3 | from bert_seq2seq import Predictor
4 |
5 | model_path = "../state_dict/gpt2/pytorch_model.bin"
6 | vocab_path = "../state_dict/gpt2/vocab.txt"
7 |
8 | tokenizer = Tokenizer(vocab_path)
9 |
10 | model = load_model(tokenizer.vocab,
11 | model_name="gpt2",
12 | task_name="seq2seq")
13 | model.load_pretrain_params(model_path)
14 | predictor = Predictor(model, tokenizer)
15 |
16 | if __name__ == '__main__':
17 | text = "今天天气好,"
18 | out = predictor.predict_generate_randomsample(text, out_max_length=100,
19 | repetition_penalty=1.5,
20 | top_p=1.0, top_k=20)
21 | print(out)
--------------------------------------------------------------------------------
/examples/seq2seq/t5/test_t5_auto_title.py:
--------------------------------------------------------------------------------
1 | from bert_seq2seq.utils import load_model
2 | from bert_seq2seq.tokenizer import T5PegasusTokenizer
3 | from bert_seq2seq import Predictor
4 |
5 | model_path = "../state_dict/t5-chinese/pytorch_model.bin"
6 | vocab_path = "../state_dict/t5-chinese/vocab.txt"
7 |
8 | tokenizer = T5PegasusTokenizer(vocab_path)
9 |
10 | model = load_model(tokenizer.vocab,
11 | model_name="t5",
12 | task_name="seq2seq")
13 | model.load_pretrain_params(model_path)
14 |
15 | predictor = Predictor(model, tokenizer)
16 |
17 | if __name__ == '__main__':
18 | text = "本文总结了十个可穿戴产品的设计原则,而这些原则同样也是笔者认为是这个行业最吸引人的地方:1.为人们解决重复性问题,2.从人开始而不是从机器开始,3.要引起注意但不要刻意,4.提升用户能力而不是取代人"
19 | out = predictor.predict_generate_randomsample(text, out_max_length=100,
20 | repetition_penalty=1.0,
21 | top_p=0.9, top_k=50)
22 | print(out)
--------------------------------------------------------------------------------
/bert_seq2seq/task/embedding/bert_embedding.py:
--------------------------------------------------------------------------------
1 | import torch.nn as nn
2 | from bert_seq2seq.basic_bert import BasicBert
3 |
4 | class BertEmbedding(BasicBert):
5 | """
6 | """
7 | def __init__(self, vocab,
8 | model_name="roberta",
9 | size="base",
10 | **kwargs):
11 | super(BertEmbedding, self).__init__(word2ix=vocab, model_name=model_name, size=size)
12 | self.layer_norm_cond = None
13 | self.cls.predictions.decoder = None
14 |
15 | def forward(self, **data):
16 |
17 | input_ids = data["input_ids"]
18 | token_type_ids = data.get("token_type_ids", None)
19 |
20 | all_layers, _ = self.bert(input_ids, token_type_ids=token_type_ids,
21 | output_all_encoded_layers=True)
22 | sequence_out = all_layers[-1]
23 | tokens_hidden_state = self.cls.predictions.transform(sequence_out)
24 |
25 | return_data = {"logits": tokens_hidden_state, }
26 |
27 | return return_data
28 |
--------------------------------------------------------------------------------
/examples/text_classification/test.py:
--------------------------------------------------------------------------------
1 | import os
2 | import torch
3 | from bert_seq2seq import Tokenizer
4 | from bert_seq2seq import load_model
5 | from bert_seq2seq import Predictor
6 |
7 |
8 | model_name = "roberta" # 选择模型名字
9 | task_name = "cls"
10 | vocab_path = "../state_dict/roberta/vocab.txt" # roberta模型字典的位置
11 | model_save_path = "./bert_emotion_analysis.bin"
12 | # 加载字典
13 | tokenizer = Tokenizer(vocab_path)
14 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
15 |
16 | target = ["中性", "积极", "消极"]
17 |
18 | def main():
19 | bert_model = load_model(tokenizer.vocab, model_name=model_name, task_name=task_name, target_size=3)
20 | bert_model.load_all_params(model_save_path)
21 | predictor = Predictor(bert_model, tokenizer)
22 |
23 | text = ["今天天气很好,挺喜欢。",
24 | "你今天是生谁的气了?怎么这么不开心??",
25 | "明天要下雨了。"]
26 |
27 | for t in text:
28 | ids = predictor.predict_cls_classifier(t).argmax(dim=0)
29 | print(target[ids])
30 |
31 | if __name__ == '__main__':
32 | main()
33 |
--------------------------------------------------------------------------------
/data/ner/china-people-daily-ner-corpus/example.dev:
--------------------------------------------------------------------------------
1 | 在 O
2 | 这 O
3 | 里 O
4 | 恕 O
5 | 弟 O
6 | 不 O
7 | 恭 O
8 | 之 O
9 | 罪 O
10 | , O
11 | 敢 O
12 | 在 O
13 | 尊 O
14 | 前 O
15 | 一 O
16 | 诤 O
17 | : O
18 | 前 O
19 | 人 O
20 | 论 O
21 | 书 O
22 | , O
23 | 每 O
24 | 曰 O
25 | “ O
26 | 字 O
27 | 字 O
28 | 有 O
29 | 来 O
30 | 历 O
31 | , O
32 | 笔 O
33 | 笔 O
34 | 有 O
35 | 出 O
36 | 处 O
37 | ” O
38 | , O
39 | 细 O
40 | 读 O
41 | 公 O
42 | 字 O
43 | , O
44 | 何 O
45 | 尝 O
46 | 跳 O
47 | 出 O
48 | 前 O
49 | 人 O
50 | 藩 O
51 | 篱 O
52 | , O
53 | 自 O
54 | 隶 O
55 | 变 O
56 | 而 O
57 | 后 O
58 | , O
59 | 直 O
60 | 至 O
61 | 明 O
62 | 季 O
63 | , O
64 | 兄 O
65 | 有 O
66 | 何 O
67 | 新 O
68 | 出 O
69 | ? O
70 |
71 | 相 O
72 | 比 O
73 | 之 O
74 | 下 O
75 | , O
76 | 青 B-ORG
77 | 岛 I-ORG
78 | 海 I-ORG
79 | 牛 I-ORG
80 | 队 I-ORG
81 | 和 O
82 | 广 B-ORG
83 | 州 I-ORG
84 | 松 I-ORG
85 | 日 I-ORG
86 | 队 I-ORG
87 | 的 O
88 | 雨 O
89 | 中 O
90 | 之 O
91 | 战 O
92 | 虽 O
93 | 然 O
94 | 也 O
95 | 是 O
96 | 0 O
97 | ∶ O
98 | 0 O
99 | , O
100 | 但 O
101 | 乏 O
102 | 善 O
103 | 可 O
104 | 陈 O
105 | 。 O
--------------------------------------------------------------------------------
/examples/seq2seq/gpt2/test_gpt2_multi_chat.py:
--------------------------------------------------------------------------------
1 | ## Multi-turn dialogue, inference test
2 | from bert_seq2seq.utils import load_model
3 | from bert_seq2seq.tokenizer import Tokenizer
4 | from bert_seq2seq import Predictor
5 | import os
6 |
7 | vocab_path = "../state_dict/gpt2/vocab.txt"
8 | model_save_path = "./gpt2_multi_chat_model.bin" # path of the trained model
9 |
10 | tokenizer = Tokenizer(vocab_path)
11 |
12 | model = load_model(tokenizer.vocab,
13 | model_name="gpt2",
14 | task_name="seq2seq")
15 | model.load_all_params(model_save_path)
16 | predictor = Predictor(model, tokenizer)
17 |
18 | if __name__ == '__main__':
19 | sentences_list = [["今天我去吃了火锅,还可以,想不想尝尝?"],
20 | ["今天天气很好", "是啊,真的非常好,我也出去玩了一会"],
21 | ["今天天气很好", "是啊,真的非常好", "你也出去玩了吗?"]]
22 |
23 | for sentences in sentences_list:
24 | out = predictor.predict_multi_response(sentences,
25 | repetition_penalty=1.2,
26 | temperature=1.2,
27 | top_p=1.0, top_k=30)
28 | print(out)
--------------------------------------------------------------------------------
/examples/bert_embedding/get_bert_embedding.py:
--------------------------------------------------------------------------------
1 |
2 | import torch
3 | from bert_seq2seq import Tokenizer
4 | from bert_seq2seq import load_model
5 | from bert_seq2seq import Predictor
6 | import numpy as np
7 |
8 | def compute_similarity(in_1, in_2):
9 | res = np.dot(in_1, in_2) / (np.linalg.norm(in_1) * np.linalg.norm(in_2))
10 | return res
11 |
12 | maxlen = 256
13 | model_name = "bert" # 选择模型名字
14 | task_name = "embedding"
15 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
16 |
17 | vocab_path = "../state_dict/bert-base-chinese/vocab.txt" # path to the pretrained vocab
18 | model_path = "../state_dict/bert-base-chinese/pytorch_model.bin" # path to the pretrained weights
19 |
20 | tokenizer = Tokenizer(vocab_path, do_lower_case=True, max_len=maxlen)
21 | bert_model = load_model(tokenizer.vocab, model_name=model_name, task_name=task_name)
22 | bert_model.load_pretrain_params(model_path, strict=False)
23 |
24 | predictor = Predictor(bert_model, tokenizer)
25 | text = ["今天天气很好", "今天天气不错", "今天有事出去忙"]
26 |
27 | embedding_1 = predictor.predict_embedding(text[0], maxlen=maxlen)
28 | embedding_2 = predictor.predict_embedding(text[1], maxlen=maxlen)
29 | embedding_3 = predictor.predict_embedding(text[2], maxlen=maxlen)
30 |
31 | print(f"cos sim 1-2 is {compute_similarity(embedding_1, embedding_2)}")
32 | print(f"cos sim 1-3 is {compute_similarity(embedding_1, embedding_3)}")
33 |
34 |
35 |
36 |
37 |
38 |
--------------------------------------------------------------------------------
/bert_seq2seq/task/seq2seq/t5_seq2seq_model.py:
--------------------------------------------------------------------------------
1 |
2 | import torch
3 | from bert_seq2seq.model.t5_model import T5ForConditionalGeneration, T5Config, T5SmallConfig
4 | from bert_seq2seq.basic_bert import BasicT5
5 | import torch.nn.functional as F
6 |
7 | class T5Model(BasicT5):
8 |
9 | def __init__(self, vocab,
10 | model_name="t5",
11 | size="base",
12 | **kwargs):
13 | super().__init__()
14 | if size == "base":
15 | config = T5Config(vocab_size=len(vocab))
16 | elif size == "small":
17 | config = T5SmallConfig(vocab_size=len(vocab))
18 | else:
19 | raise Exception(f"unsupported model size: {size}")
20 | self.model = T5ForConditionalGeneration(config)
21 | print(f"model is {model_name}")
22 |
23 | def forward(self, **data):
24 | input_ids = data.get("input_ids", None)
25 | decoder_input_ids = data["decoder_input_ids"]
26 | encoder_last_hidden_state = data.get("encoder_last_hidden_state", None)
27 | if encoder_last_hidden_state is not None:
28 | encoder_last_hidden_state = [encoder_last_hidden_state]
29 | labels = data.get("labels", None)
30 | t5_out = self.model(input_ids=input_ids, encoder_outputs=encoder_last_hidden_state, decoder_input_ids=decoder_input_ids, labels=labels)
31 | if labels is not None:
32 | return {"logits": t5_out[1], "loss": t5_out[0], "encoder_last_hidden_state": t5_out[2]}
33 |
34 | return {"logits": t5_out[0], "encoder_last_hidden_state": t5_out[1]}
--------------------------------------------------------------------------------
/bert_seq2seq/mpu/func_utils.py:
--------------------------------------------------------------------------------
1 | # Copyright © 2022 BAAI. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License")
4 | # coding=utf-8
5 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
6 | #
7 | # Licensed under the Apache License, Version 2.0 (the "License");
8 | # you may not use this file except in compliance with the License.
9 | # You may obtain a copy of the License at
10 | #
11 | # http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 |
19 | import torch
20 | import math
21 |
22 |
23 | @torch.jit.script
24 | def gelu_impl(x):
25 | """OpenAI's gelu implementation."""
26 | return 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * x *
27 | (1.0 + 0.044715 * x * x)))
28 |
29 |
30 | def gelu(x):
31 | return gelu_impl(x)
32 |
33 |
34 | def unscaled_init_method(sigma):
35 | """Init method based on N(0, sigma)."""
36 | def init_(tensor):
37 | return torch.nn.init.normal_(tensor, mean=0.0, std=sigma)
38 |
39 | return init_
40 |
41 |
42 | def scaled_init_method(sigma, num_layers):
43 | """Init method based on N(0, sigma/sqrt(2*num_layers)."""
44 | std = sigma / math.sqrt(2.0 * num_layers)
45 |
46 | def init_(tensor):
47 | return torch.nn.init.normal_(tensor, mean=0.0, std=std)
48 |
49 | return init_
50 |
51 |
52 | def sqrt(x):
53 | return int(math.sqrt(x) + 1e-4)
54 |
--------------------------------------------------------------------------------
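Note: gelu_impl above is the standard tanh approximation of GELU (0.7978845608... is sqrt(2/pi)). A quick standalone check against the exact erf-based form, illustrative only and not part of the repo:

import math
import torch

def gelu_tanh(x):
    # same formula as gelu_impl above, without torch.jit
    return 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * x * (1.0 + 0.044715 * x * x)))

def gelu_exact(x):
    # exact GELU: 0.5 * x * (1 + erf(x / sqrt(2)))
    return 0.5 * x * (1.0 + torch.erf(x / math.sqrt(2.0)))

x = torch.linspace(-5, 5, steps=101)
print(torch.max(torch.abs(gelu_tanh(x) - gelu_exact(x))))  # small, on the order of 1e-3 at worst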
/bert_seq2seq/task/seq2seq/gpt2_seq2seq_model.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from bert_seq2seq.model.gpt2_model import GPT2LMHeadModel, GPT2Config
3 | from bert_seq2seq.basic_bert import BasicGPT
4 |
5 | class GPT2(BasicGPT):
6 | def __init__(self, vocab,
7 | model_name="gpt2",
8 | **kwargs
9 | ):
10 | super().__init__()
11 | self.word2ix = vocab
12 | if model_name == "gpt2":
13 | self.config = GPT2Config(len(vocab))
14 | else:
15 | raise Exception(f"unsupported model name: {model_name}") # a None config would crash GPT2LMHeadModel anyway
16 | self.model = GPT2LMHeadModel(self.config)
17 | print(f"model is {model_name}")
18 |
19 | def _make_causal_mask(self, input_ids):
20 | device = input_ids.device
21 | bsz, tgt_len = input_ids.shape
22 | mask = torch.full((tgt_len, tgt_len), 0.0).to(device)
23 | mask_cond = torch.arange(mask.size(-1)).to(device)
24 | mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 1.0)
25 |
26 | return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len)
27 |
28 | def forward(self, **data):
29 | input_ids = data["input_ids"]
30 | labels = data.get("labels", None)
31 | extend_mask = (input_ids > 0).float()
32 |
33 | return_data = {}
34 | attention_mask = self._make_causal_mask(input_ids)
35 | extend_mask = extend_mask.unsqueeze(1).unsqueeze(1) * attention_mask
36 | if labels is not None:
37 | loss, lm_logits = self.model(input_ids, labels=labels, attention_mask=extend_mask)
38 | return_data["loss"] = loss
39 |
40 | else:
41 | lm_logits = self.model(input_ids, attention_mask=extend_mask) # use the combined padding + causal mask, matching the labels branch
42 | return_data["logits"] = lm_logits
43 |
44 | return return_data
--------------------------------------------------------------------------------
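A standalone sketch, illustrative and not repo code, of what _make_causal_mask builds and how forward combines it with the padding mask:

import torch

input_ids = torch.tensor([[5, 6, 7, 0]])  # one sequence whose last token is padding (id 0)
bsz, tgt_len = input_ids.shape

# same construction as _make_causal_mask: 1.0 on the lower triangle (j <= i)
mask = torch.full((tgt_len, tgt_len), 0.0)
mask_cond = torch.arange(tgt_len)
mask.masked_fill_(mask_cond < (mask_cond + 1).view(tgt_len, 1), 1.0)
causal = mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len)

# padding mask zeroes out attention to padding columns, as in forward()
padding = (input_ids > 0).float().unsqueeze(1).unsqueeze(1)
print(causal * padding)  # 1.0 where position j <= i AND token j is not padding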
/examples/seq2seq/GLM/glm_generate_samples.py:
--------------------------------------------------------------------------------
1 | # Copyright © 2022 BAAI. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License")
4 |
5 | import torch
6 | from bert_seq2seq import Predictor
7 | from bert_seq2seq import GLMTokenizer
8 | from bert_seq2seq.utils import load_model
9 | import os
10 |
11 |
12 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
13 |
14 | if __name__ == '__main__':
15 | tokenizer = GLMTokenizer("../state_dict/GLM-large-ch/cog-pretrain.model")
16 | model = load_model(model_name="glm",
17 | task_name="seq2seq",
18 | size="large")
19 |
20 | model.load_pretrain_params("../state_dict/GLM-large-ch/pytorch_model.bin")
21 | model.to(device)
22 |
23 | predictor = Predictor(model, tokenizer)
24 | # generate samples
25 | text = [
26 | '问题:啤酒伤胃吗?回答:[gMASK]', "问题:隔夜菜能吃吗?回答:[gMASK]", "问题:如何评价许嵩?回答:[gMASK]"
27 | ]
28 | for t in text:
29 | output = predictor.predict_generate_randomsample(
30 | t, top_k=50, repetition_penalty=4.0, top_p=1.0)
31 | print(t, '\n', output)
32 |
33 | text = ['北京故宫是中国[MASK]非物质文化遗产。', "上海是中国[MASK]大都市。", "天津大学是[MASK]现代大学。"]
34 | for t in text:
35 | output = predictor.predict_generate_randomsample(
36 | t, top_k=50, repetition_penalty=4.0, top_p=1.0)
37 | print(t, '\n', output)
38 | #
39 | text = [
40 | "人工智能是一个以计算机科学为基础,由计算机、数学、哲学等多学科交叉融合的交叉学科,[sMASK],具有非常巨大的前景。",
41 | "最近十多年来,人工神经网络的研究工作不断深入,已经取得了很大的进展,[sMASK],表现出了良好的智能特性。"
42 | ]
43 | for t in text:
44 | output = predictor.predict_generate_randomsample(
45 | t, top_k=50, repetition_penalty=4.0, top_p=1.0)
46 | print(t, '\n', output)
47 |
--------------------------------------------------------------------------------
/examples/FAQ/1_construct_data.py:
--------------------------------------------------------------------------------
1 | ## Build the FAQ database
2 | ## Data source: https://github.com/murufeng/ChineseNlpCorpus
3 | import torch
4 | from bert_seq2seq import Tokenizer
5 | from bert_seq2seq import load_model
6 | from bert_seq2seq import Predictor
7 | import pandas as pd
8 | import numpy as np
9 | from tqdm import tqdm
10 | import collections
11 | import faiss
12 |
13 | faq_data_path = "../data/financezhidao_filter.csv"
14 | answer_save_path = "../data/finance_fqa.json"
15 | embeddings_save_path = "../data/finance_embeddings.json"
16 |
17 | maxlen = 256
18 | model_name = "bert" # 选择模型名字
19 | task_name = "embedding"
20 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
21 |
22 | vocab_path = "../state_dict/roberta/vocab.txt" # path to the roberta vocab
23 | model_path = "../state_dict/roberta/pytorch_model.bin" # path to the roberta weights
24 |
25 | tokenizer = Tokenizer(vocab_path, do_lower_case=True, max_len=maxlen)
26 | bert_model = load_model(tokenizer.vocab, model_name=model_name, task_name=task_name)
27 | bert_model.load_pretrain_params(model_path)
28 | bert_model.to(device)
29 | predictor = Predictor(bert_model, tokenizer)
30 |
31 | def resave_data():
32 | answer = collections.OrderedDict()
33 | embeddings = []
34 | df = pd.read_csv(faq_data_path)
35 | for index, row in tqdm(df.iterrows(), total=len(df)):
36 | if type(row[0]) == str:
37 | if row[0] not in answer:
38 | answer[row[0]] = row[2]
39 | embeddings.append(predictor.predict_embedding(row[0], maxlen=maxlen).numpy())
40 |
41 | embeddings = np.array(embeddings)
42 | torch.save(answer, answer_save_path) # note: torch.save pickles the objects, despite the .json file extension
43 | torch.save(embeddings, embeddings_save_path)
44 |
45 | print(f"Data saved to {answer_save_path} and {embeddings_save_path}")
46 |
47 | if __name__ == '__main__':
48 |
49 | resave_data()
--------------------------------------------------------------------------------
/bert_seq2seq/mpu/__init__.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Model parallel utility interface."""
16 | from .data import broadcast_data
17 |
18 | from .grads import clip_grad_norm
19 |
20 | from .initialize import destroy_model_parallel
21 | from .initialize import get_data_parallel_group
22 | from .initialize import get_data_parallel_rank
23 | from .initialize import get_data_parallel_world_size
24 | from .initialize import get_model_parallel_group
25 | from .initialize import get_model_parallel_rank
26 | from .initialize import get_model_parallel_src_rank
27 | from .initialize import get_model_parallel_world_size
28 | from .initialize import initialize_model_parallel
29 | from .initialize import model_parallel_is_initialized
30 |
31 | from .mappings import copy_to_model_parallel_region
32 | from .mappings import gather_from_model_parallel_region
33 | from .mappings import reduce_from_model_parallel_region
34 | from .mappings import scatter_to_model_parallel_region
35 |
36 | from .random import checkpoint
37 | from .random import partition_activations_in_checkpoint
38 | from .random import get_cuda_rng_tracker
39 | from .random import model_parallel_cuda_manual_seed
40 |
--------------------------------------------------------------------------------
/bert_seq2seq/task/classification/bert_cls_classifier.py:
--------------------------------------------------------------------------------
1 | import torch.nn as nn
2 | from bert_seq2seq.basic_bert import BasicBert
3 |
4 | class BertClsClassifier(BasicBert):
5 | """
6 | """
7 | def __init__(self, vocab,
8 | target_size,
9 | model_name="roberta",
10 | **kwargs):
11 | super(BertClsClassifier, self).__init__(word2ix=vocab, model_name=model_name)
12 | self.target_size = target_size
13 | self.final_dense = nn.Linear(self.config.hidden_size, self.target_size)
14 | self.cls = None
15 | self.layer_norm_cond = None
16 |
17 | def compute_loss(self, predictions, labels):
18 | """
19 | Compute the cross-entropy loss.
20 | predictions: (batch_size, target_size)
21 | """
22 | predictions = predictions.view(-1, self.target_size)
23 | labels = labels.view(-1)
24 | loss = nn.CrossEntropyLoss(reduction="mean")
25 | return loss(predictions, labels)
26 |
27 | def compute_loss_sigmoid(self, predictions, labels):
28 | predictions = predictions.view(-1)
29 | labels = labels.view(-1).float()
30 |
31 | loss_sigmoid = nn.BCEWithLogitsLoss()
32 | return loss_sigmoid(predictions, labels)
33 |
34 | def forward(self, **data):
35 |
36 | input_ids = data["input_ids"]
37 | token_type_ids = data["token_type_ids"]
38 | labels = data.get("labels", None)
39 |
40 | all_layers, pooled_out = self.bert(input_ids, token_type_ids=token_type_ids,
41 | output_all_encoded_layers=True)
42 |
43 | predictions = self.final_dense(pooled_out)
44 | return_data = {"logits": predictions, }
45 | if labels is not None:
46 | ## compute the loss
47 | if self.target_size == 1:
48 | loss = self.compute_loss_sigmoid(predictions, labels)
49 | else:
50 | loss = self.compute_loss(predictions, labels)
51 | return_data["loss"] = loss
52 |
53 | return return_data
--------------------------------------------------------------------------------
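The classifier picks its loss by target_size: CrossEntropyLoss for multi-class, BCEWithLogitsLoss when there is a single logit per sample. A minimal standalone sketch of the two paths, not repo code:

import torch
import torch.nn as nn

logits_multi = torch.randn(4, 3)           # target_size = 3: one logit per class
labels_multi = torch.tensor([0, 2, 1, 2])  # integer class ids
print(nn.CrossEntropyLoss(reduction="mean")(logits_multi, labels_multi))

logits_binary = torch.randn(4, 1)          # target_size = 1: a single logit
labels_binary = torch.tensor([1, 0, 0, 1])
# BCEWithLogitsLoss wants flat float targets, matching compute_loss_sigmoid above
print(nn.BCEWithLogitsLoss()(logits_binary.view(-1), labels_binary.float()))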
/data/ner/china-people-daily-ner-corpus/example.train:
--------------------------------------------------------------------------------
1 | 海 O
2 | 钓 O
3 | 比 O
4 | 赛 O
5 | 地 O
6 | 点 O
7 | 在 O
8 | 厦 B-LOC
9 | 门 I-LOC
10 | 与 O
11 | 金 B-LOC
12 | 门 I-LOC
13 | 之 O
14 | 间 O
15 | 的 O
16 | 海 O
17 | 域 O
18 | 。 O
19 |
20 | 这 O
21 | 座 O
22 | 依 O
23 | 山 O
24 | 傍 O
25 | 水 O
26 | 的 O
27 | 博 O
28 | 物 O
29 | 馆 O
30 | 由 O
31 | 国 O
32 | 内 O
33 | 一 O
34 | 流 O
35 | 的 O
36 | 设 O
37 | 计 O
38 | 师 O
39 | 主 O
40 | 持 O
41 | 设 O
42 | 计 O
43 | , O
44 | 整 O
45 | 个 O
46 | 建 O
47 | 筑 O
48 | 群 O
49 | 精 O
50 | 美 O
51 | 而 O
52 | 恢 O
53 | 宏 O
54 | 。 O
55 |
56 | 但 O
57 | 作 O
58 | 为 O
59 | 一 O
60 | 个 O
61 | 共 O
62 | 产 O
63 | 党 O
64 | 员 O
65 | 、 O
66 | 人 O
67 | 民 O
68 | 公 O
69 | 仆 O
70 | , O
71 | 应 O
72 | 当 O
73 | 胸 O
74 | 怀 O
75 | 宽 O
76 | 阔 O
77 | , O
78 | 真 O
79 | 正 O
80 | 做 O
81 | 到 O
82 | “ O
83 | 先 O
84 | 天 O
85 | 下 O
86 | 之 O
87 | 忧 O
88 | 而 O
89 | 忧 O
90 | , O
91 | 后 O
92 | 天 O
93 | 下 O
94 | 之 O
95 | 乐 O
96 | 而 O
97 | 乐 O
98 | ” O
99 | , O
100 | 淡 O
101 | 化 O
102 | 个 O
103 | 人 O
104 | 的 O
105 | 名 O
106 | 利 O
107 | 得 O
108 | 失 O
109 | 和 O
110 | 宠 O
111 | 辱 O
112 | 悲 O
113 | 喜 O
114 | , O
115 | 把 O
116 | 改 O
117 | 革 O
118 | 大 O
119 | 业 O
120 | 摆 O
121 | 在 O
122 | 首 O
123 | 位 O
124 | , O
125 | 这 O
126 | 样 O
127 | 才 O
128 | 能 O
129 | 超 O
130 | 越 O
131 | 自 O
132 | 我 O
133 | , O
134 | 摆 O
135 | 脱 O
136 | 世 O
137 | 俗 O
138 | , O
139 | 有 O
140 | 所 O
141 | 作 O
142 | 为 O
143 | 。 O
144 |
145 | 在 O
146 | 发 O
147 | 达 O
148 | 国 O
149 | 家 O
150 | , O
151 | 急 O
152 | 救 O
153 | 保 O
154 | 险 O
155 | 十 O
156 | 分 O
157 | 普 O
158 | 及 O
159 | , O
160 | 已 O
161 | 成 O
162 | 为 O
163 | 社 O
164 | 会 O
165 | 保 O
166 | 障 O
167 | 体 O
168 | 系 O
169 | 的 O
170 | 重 O
171 | 要 O
172 | 组 O
173 | 成 O
174 | 部 O
175 | 分 O
176 | 。 O
177 |
178 | 日 B-LOC
179 | 俄 B-LOC
180 | 两 O
181 | 国 O
182 | 国 O
183 | 内 O
184 | 政 O
185 | 局 O
186 | 都 O
187 | 充 O
188 | 满 O
189 | 变 O
190 | 数 O
191 | , O
192 | 尽 O
193 | 管 O
194 | 日 B-LOC
195 | 俄 B-LOC
196 | 关 O
197 | 系 O
198 | 目 O
199 | 前 O
200 | 是 O
201 | 历 O
202 | 史 O
203 | 最 O
204 | 佳 O
205 | 时 O
206 | 期 O
207 | , O
208 | 但 O
209 | 其 O
210 | 脆 O
211 | 弱 O
212 | 性 O
213 | 不 O
214 | 言 O
215 | 自 O
216 | 明 O
217 | 。 O
--------------------------------------------------------------------------------
/examples/seq2seq/gpt2/test_multi_processing_generate.py:
--------------------------------------------------------------------------------
1 | from multiprocessing import Pool
2 | import os
3 |
4 | import pandas as pd
5 | import torch
6 | from torch.utils.data import Dataset
7 |
8 | from bert_seq2seq import Tokenizer
9 | from bert_seq2seq import load_model
10 | from bert_seq2seq import Predictor
11 |
12 | vocab_path = "../state_dict/gpt2/vocab.txt"
13 | model_save_path = "./gpt2_writing_model.bin" # path of the trained model
14 |
15 | model_name = "gpt2" # 选择模型名字
16 | task_name = "seq2seq" # 任务名字
17 |
18 | data_path = "../data/xzwaz2kx4cu.csv"
19 |
20 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
21 |
22 | tokenizer = Tokenizer(vocab_path)
23 | model = load_model(tokenizer.vocab, model_name=model_name, task_name=task_name)
24 | model.load_all_params(model_save_path)
25 | model.to(device)
26 |
27 | predictor = Predictor(model, tokenizer)
28 |
29 | def read_file():
30 | ## To switch datasets, just reimplement this function to return the src/tgt the framework expects
31 | data = []
32 |
33 | df = pd.read_csv(data_path)
34 | for index, row in df.iterrows():
35 | if type(row[0]) is str:
36 | data.append(row[0])
37 |
38 | return data
39 |
40 | test_data = read_file()
41 | print(f"data len is {len(test_data)}")
42 |
43 | def generate_multiprocess(data):
44 | print(f"data is {data}")
45 | out = predictor.predict_generate_randomsample(data,
46 | input_max_length=100,
47 | out_max_length=900,
48 | top_k=50,
49 | top_p=0.8,
50 | repetition_penalty=3.0,
51 | temperature=1.5)
52 |
53 | with open(os.path.join("./gene", f"{data}.txt"), "w+") as f :
54 | f.write(str(out))
55 | # return (out, data)
56 |
57 |
58 | if __name__ == "__main__":
59 | torch.multiprocessing.set_start_method("spawn")
60 | p = Pool(3)
61 | p.map_async(generate_multiprocess, test_data, chunksize=3)
62 | p.close()
63 | p.join()
64 | print('done.')
--------------------------------------------------------------------------------
/bert_seq2seq/model/utils.py:
--------------------------------------------------------------------------------
1 | # Copyright © 2022 BAAI. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License")
4 | import math
5 |
6 | import torch
7 |
8 |
9 | def ensure_divisibility(numerator, denominator):
10 | """Ensure that numerator is divisible by the denominator."""
11 | assert numerator % denominator == 0, '{} is not divisible by {}'.format(
12 | numerator, denominator)
13 |
14 |
15 | def divide(numerator, denominator):
16 | """Ensure that numerator is divisible by the denominator and return
17 | the division value."""
18 | ensure_divisibility(numerator, denominator)
19 | return numerator // denominator
20 |
21 |
22 | def split_tensor_along_last_dim(tensor,
23 | num_partitions,
24 | contiguous_split_chunks=False):
25 | """Split a tensor along its last dimension.
26 | Arguments:
27 | tensor: input tensor.
28 | num_partitions: number of partitions to split the tensor
29 | contiguous_split_chunks: If True, make each chunk contiguous
30 | in memory.
31 | """
32 | # Get the size and dimension.
33 | last_dim = tensor.dim() - 1
34 | last_dim_size = divide(tensor.size()[last_dim], num_partitions)
35 | # Split.
36 | tensor_list = torch.split(tensor, last_dim_size, dim=last_dim)
37 | # Note: torch.split does not create contiguous tensors by default.
38 | if contiguous_split_chunks:
39 | return tuple(chunk.contiguous() for chunk in tensor_list)
40 |
41 | return tensor_list
42 |
43 |
44 | def unscaled_init_method(sigma):
45 | """Init method based on N(0, sigma)."""
46 | def init_(tensor):
47 | return torch.nn.init.normal_(tensor, mean=0.0, std=sigma)
48 |
49 | return init_
50 |
51 |
52 | def scaled_init_method(mean, sigma, num_layers):
53 | """Init method based on N(0, sigma/sqrt(2*num_layers)."""
54 | std = sigma / math.sqrt(2.0 * num_layers)
55 |
56 | def init_(tensor):
57 | return torch.nn.init.normal_(tensor, mean=mean, std=std)
58 |
59 | return init_
60 |
61 |
62 | def sqrt(x):
63 | return int(math.sqrt(x) + 1e-4)
64 |
65 |
66 | def normal_init_method(mean=0.0, std=0.02):
67 | def init_(tensor):
68 | return torch.nn.init.normal_(tensor, mean=mean, std=std)
69 |
70 | return init_
71 |
--------------------------------------------------------------------------------
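A usage sketch of split_tensor_along_last_dim, e.g. splitting a fused QKV projection as model-parallel attention layers typically do. The function is inlined here so the snippet runs standalone; it is illustrative, not repo code:

import torch

def split_tensor_along_last_dim(tensor, num_partitions, contiguous_split_chunks=False):
    # same logic as the repo function above
    last_dim = tensor.dim() - 1
    last_dim_size = tensor.size()[last_dim] // num_partitions
    tensor_list = torch.split(tensor, last_dim_size, dim=last_dim)
    if contiguous_split_chunks:
        return tuple(chunk.contiguous() for chunk in tensor_list)
    return tensor_list

mixed_qkv = torch.randn(2, 8, 3 * 64)  # (batch, seq, 3 * head_dim)
q, k, v = split_tensor_along_last_dim(mixed_qkv, 3)
print(q.shape, k.shape, v.shape)  # each (2, 8, 64)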
/examples/FAQ/2_test_bert_faq.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from bert_seq2seq import Tokenizer
3 | from bert_seq2seq import load_model
4 | from bert_seq2seq import Predictor
5 | import faiss
6 |
7 | faq_data_path = "../data/financezhidao_filter.csv"
8 | answer_save_path = "../data/finance_fqa.json"
9 | embeddings_save_path = "../data/finance_embeddings.json"
10 |
11 | maxlen = 256
12 | d = 768
13 | nlist = 5
14 |
15 | model_name = "bert" # 选择模型名字
16 | task_name = "embedding"
17 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
18 |
19 | vocab_path = "../state_dict/roberta/vocab.txt" # path to the roberta vocab
20 | model_path = "../state_dict/roberta/pytorch_model.bin" # path to the roberta weights
21 |
22 | tokenizer = Tokenizer(vocab_path, do_lower_case=True, max_len=maxlen)
23 | bert_model = load_model(tokenizer.vocab, model_name=model_name, task_name=task_name)
24 | bert_model.load_pretrain_params(model_path)
25 |
26 | predictor = Predictor(bert_model, tokenizer)
27 |
28 | class Search:
29 | def __init__(self, training_vectors, d, nlist=10, nprobe=1):
30 | quantizer = faiss.IndexFlatIP(d) # coarse quantizer: IndexIVFFlat must be built on top of another index
31 | self.index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_L2) # note: inner-product quantizer paired with an L2 search metric
32 | assert not self.index.is_trained
33 | self.index.train(training_vectors)
34 | assert self.index.is_trained
35 | self.index.nprobe = nprobe # default nprobe is 1, try a few more
36 | self.index.add(training_vectors) # add may be a bit slower as well
37 | self.d = d
38 |
39 | def search(self, answer, query, k=10):
40 | query = query.numpy().reshape(-1, self.d)
41 | D, I = self.index.search(query, k) # actual search
42 | result = []
43 | all_question = list(answer.keys())
44 | for s, i in zip(D[0], I[0]):
45 | print(i)
46 | if i != -1:
47 | result.append({all_question[i]: s})
48 |
49 | print(result)
50 |
51 | if __name__ == '__main__':
52 | # load data
53 | answer = torch.load(answer_save_path)
54 | embeddings = torch.load(embeddings_save_path)
55 |
56 | method = Search(training_vectors=embeddings, d=d, nlist=nlist, nprobe=2)
57 |
58 | while True:
59 | question = input("Enter a question (q to quit): ")
60 | if question == "q":
61 | break
62 | question_embedding = predictor.predict_embedding(question, maxlen=maxlen)
63 | method.search(answer, question_embedding, k=10)
64 |
65 |
66 |
67 |
68 |
69 |
--------------------------------------------------------------------------------
/bert_seq2seq/config.py:
--------------------------------------------------------------------------------
1 |
2 | max_length = 256
3 |
4 | yayun_list = [
5 | "东同铜桐筒童僮瞳中衷忠虫终戎崇嵩弓躬宫融雄熊穹穷冯风枫丰充隆空公功工攻蒙笼聋珑洪红鸿虹丛翁聪通蓬烘潼胧砻峒螽梦讧冻忡酆恫总侗窿懵庞种盅芎倥艨绒葱匆骢",
6 | "冬农宗钟龙舂松冲容蓉庸封胸雍浓重从逢缝踪茸峰锋烽蛩慵恭供淙侬松凶墉镛佣溶邛共憧喁邕壅纵龚枞脓淞匈汹禺蚣榕彤",
7 | "江扛窗邦缸降双庞逄腔撞幢桩淙豇",
8 | "支枝移为垂吹陂碑奇宜仪皮儿离施知驰池规危夷师姿迟眉悲之芝时诗棋旗辞词期祠基疑姬丝司葵医帷思滋持随痴维卮麋螭麾墀弥慈遗肌脂雌披嬉尸狸炊篱兹差疲茨卑亏蕤陲骑曦歧岐谁斯私窥熙欺疵赀笞羁彝颐资糜饥衰锥姨楣夔涯伊蓍追",
9 | "缁箕椎罴篪萎匙脾坻嶷治骊尸綦怡尼漪累牺饴而鸱推縻璃祁绥逵羲羸肢骐訾狮奇嗤咨堕其睢漓蠡噫馗辎胝鳍蛇陴淇淄丽筛厮氏痍貔比僖贻祺嘻鹂瓷琦嵋怩熹孜台蚩罹魑丕琪耆衰惟剂提禧居栀戏畸椅磁痿离佳虽仔寅委崎隋逶倭黎犁郦",
10 | "微薇晖徽挥韦围帏违霏菲妃绯飞非扉肥腓威畿机几讥矶稀希衣依沂巍归诽痱欷葳颀圻",
11 | "鱼渔初书舒居裾车渠余予誉舆胥狙锄疏蔬梳虚嘘徐猪闾庐驴诸除储如墟与畲疽苴于茹蛆且沮祛蜍榈淤好雎纾躇趄滁屠据匹咀衙涂虑",
12 | "虞愚娱隅刍无芜巫于盂衢儒濡襦须株诛蛛殊瑜榆谀愉腴区驱躯朱珠趋扶符凫雏敷夫肤纡输枢厨俱驹模谟蒲胡湖瑚乎壶狐弧孤辜姑觚菰徒途涂荼图屠奴呼吾七虞梧吴租卢鲈苏酥乌枯都铺禺诬竽吁瞿劬需俞逾觎揄萸臾渝岖镂娄夫孚桴俘迂姝拘摹糊鸪沽呱蛄驽逋舻垆徂孥泸栌嚅蚨诹扶母毋芙喁颅轳句邾洙麸机膜瓠恶芋呕驺喻枸侏龉葫懦帑拊",
13 | "齐蛴脐黎犁梨黧妻萋凄堤低氐诋题提荑缔折篦鸡稽兮奚嵇蹊倪霓西栖犀嘶撕梯鼙批挤迷泥溪圭闺睽奎携畦骊鹂儿",
14 | "佳街鞋牌柴钗差涯阶偕谐骸排乖怀淮豺侪埋霾斋娲蜗娃哇皆喈揩蛙楷槐俳",
15 | "灰恢魁隈回徊枚梅媒煤瑰雷催摧堆陪杯醅嵬推开哀埃台苔该才材财裁来莱栽哉灾猜胎孩虺崔裴培坏垓陔徕皑傀崃诙煨桅唉颏能茴酶偎隗咳",
16 | "真因茵辛新薪晨辰臣人仁神亲申伸绅身宾滨邻鳞麟珍尘陈春津秦频苹颦银垠筠巾民珉缗贫淳醇纯唇伦纶轮沦匀旬巡驯钧均臻榛姻寅彬鹑皴遵循振甄岷谆椿询恂峋莘堙屯呻粼磷辚濒闽豳逡填狺泯洵溱夤荀竣娠纫鄞抡畛嶙斌氤",
17 | "文闻纹云氛分纷芬焚坟群裙君军勤斤筋勋薰曛熏荤耘芸汾氲员欣芹殷昕贲郧雯蕲",
18 | "元原源园猿辕坦烦繁蕃樊翻萱喧冤言轩藩魂浑温孙门尊存蹲敦墩暾屯豚村盆奔论坤昏婚阍痕根恩吞沅媛援爰幡番反埙鸳宛掀昆琨鲲扪荪髡跟垠抡蕴犍袁怨蜿溷昆炖饨臀喷纯",
19 | "寒韩翰丹殚单安难餐滩坛檀弹残干肝竿乾阑栏澜兰看刊丸桓纨端湍酸团抟攒官观冠鸾銮栾峦欢宽盘蟠漫汗郸叹摊奸剜棺钻瘢谩瞒潘胖弁拦完莞獾拌掸萑倌繁曼馒鳗谰洹滦",
20 | "删潸关弯湾还环鹌鬟寰班斑颁般蛮颜菅攀顽山鳏艰闲娴悭孱潺殷扳讪患",
21 | "先前千阡笺天坚肩贤弦烟燕莲怜田填钿年颠巅牵妍研眠渊涓蠲编玄县泉迁仙鲜钱煎然延筵禅蝉缠连联涟篇偏便全宣镌穿川缘鸢铅捐旋娟船涎鞭专圆员乾虔愆骞权拳椽传焉跹溅舷咽零骈阗鹃翩扁平沿诠痊悛荃遄卷挛戋佃滇婵颛犍搴嫣癣澶单竣鄢扇键蜷棉",
22 | "萧箫挑貂刁凋雕迢条跳苕调枭浇聊辽寥撩僚寮尧幺宵消霄绡销超朝潮嚣樵谯骄娇焦蕉椒饶烧遥姚摇谣瑶韶昭招飚标杓镳瓢苗描猫要腰邀乔桥侨妖夭漂飘翘祧佻徼侥哨娆陶橇劭潇骁獠料硝灶鹞钊蛲峤轿荞嘹逍燎憔剽",
23 | "肴巢交郊茅嘲钞包胶爻苞梢蛟庖匏坳敲胞抛鲛崤铙炮哮捎茭淆泡跑咬啁教咆鞘剿刨佼抓姣唠",
24 | "豪毫操髦刀萄猱桃糟漕旄袍挠蒿涛皋号陶翱敖遭篙羔高嘈搔毛艘滔骚韬缫膏牢醪逃槽劳洮叨绸饕骜熬臊涝淘尻挑嚣捞嗥薅咎谣",
25 | "歌多罗河戈阿和波科柯陀娥蛾鹅萝荷过磨螺禾哥娑驼佗沱峨那苛诃珂轲莎蓑梭婆摩魔讹坡颇俄哦呵皤么涡窝茄迦伽磋跎番蹉搓驮献蝌箩锅倭罗嵯锣",
26 | "麻花霞家茶华沙车牙蛇瓜斜邪芽嘉瑕纱鸦遮叉葩奢楂琶衙赊涯夸巴加耶嗟遐笳差蟆蛙虾拿葭茄挝呀枷哑娲爬杷蜗爷芭鲨珈骅娃哇洼畲丫夸裟瘕些桠杈痂哆爹椰咤笆桦划迦揶吾佘",
27 | "阳杨扬香乡光昌堂章张王房芳长塘妆常凉霜藏场央泱鸯秧嫱床方浆觞梁娘庄黄仓皇装殇襄骧相湘箱缃创忘芒望尝偿樯枪坊囊郎唐狂强肠康冈苍匡荒遑行妨棠翔良航倡伥羌庆姜僵缰疆粮穰将墙桑刚祥详洋徉佯粱量羊伤汤鲂樟彰漳璋猖商防",
28 | "筐煌隍凰蝗惶璜廊浪裆沧纲亢吭潢钢丧盲簧忙茫傍汪臧琅当庠裳昂障糖疡锵杭邙赃滂禳攘瓤抢螳踉眶炀阊彭蒋亡殃蔷镶孀搪彷胱磅膀螃八庚更羹盲横觥彭棚亨英瑛烹平评京惊荆明盟鸣荣莹兵卿生甥笙牲檠擎鲸迎行衡耕萌氓宏闳茎莺樱泓橙筝争清情晴精睛菁旌晶盈瀛嬴营婴缨贞成盛城诚呈程声征正轻名令并倾萦琼赓撑瞠枪伧峥猩珩蘅铿嵘丁嘤鹦铮砰绷轰訇瞪侦顷榜抨趟坪请",
29 | "青经泾形刑邢型陉亭庭廷霆蜓停丁宁钉仃馨星腥醒惺娉灵棂龄铃苓伶零玲翎瓴囹聆听厅汀冥溟螟铭瓶屏萍荧萤荥扃町瞑暝",
30 | "蒸承丞惩陵凌绫冰膺鹰应蝇绳渑乘升胜兴缯凭仍兢矜征凝称登灯僧增曾憎层能棱朋鹏弘肱腾滕藤恒冯瞢扔誊",
31 | "尤邮优忧流留榴骝刘由油游猷悠攸牛修羞秋周州洲舟酬仇柔俦畴筹稠邱抽湫遒收鸠不愁休囚求裘球浮谋牟眸矛侯猴喉讴沤鸥瓯楼娄陬偷头投钩沟幽彪疣绸浏瘤犹啾酋售蹂揉搜叟邹貅泅球逑俅蜉桴罘欧搂抠髅蝼兜句妯惆呕缪繇偻篓馗区",
32 | "侵寻浔林霖临针箴斟沈深淫心琴禽擒钦衾吟今襟金音阴岑簪琳琛椹谌忱壬任黔歆禁喑森参淋郴妊湛",
33 | "覃潭谭参骖南男谙庵含涵函岚蚕探贪耽龛堪戡谈甘三酣篮柑惭蓝郯婪庵颔褴澹",
34 | "盐檐廉帘嫌严占髯谦奁纤签瞻蟾炎添兼缣尖潜阎镰粘淹箝甜恬拈暹詹渐歼黔沾苫占崦阉砭",
35 | "咸缄谗衔岩帆衫杉监凡馋芟喃嵌掺搀严"]
--------------------------------------------------------------------------------
/bert_seq2seq/model/layers/layer_norm.py:
--------------------------------------------------------------------------------
1 | # Copyright © 2022 BAAI. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License")
4 | # layer norm
5 |
6 | import torch
7 | import torch.nn as nn
8 |
9 | class LayerNorm(nn.Module):
10 | def __init__(self, hidden_size, eps=1e-6):
11 | super(LayerNorm, self).__init__()
12 | self.eps = eps
13 | self.gamma = nn.Parameter(torch.ones(hidden_size))
14 | self.beta = nn.Parameter(torch.zeros(hidden_size))
15 |
16 | def forward(self, x):
17 | """Perform layer normalization to input x, with two learnable variables gamma and beta"""
18 | mean = x.mean(-1, keepdim=True)
19 | std = x.std(-1, keepdim=True)
20 | hidden_states = self.gamma * (x - mean) / (std + self.eps)
21 |
22 | return hidden_states + self.beta
23 |
24 |
25 | class T5LayerNorm(nn.Module):
26 | def __init__(self, hidden_size, eps=1e-6):
27 | """
28 | Construct a layernorm module in the T5 style. No bias and no subtraction of the mean.
29 | """
30 | super().__init__()
31 | self.weight = nn.Parameter(torch.ones(hidden_size))
32 | self.variance_epsilon = eps
33 |
34 | def forward(self, hidden_states):
35 | # layer norm should always be calculated in float32
36 | variance = hidden_states.to(torch.float32).pow(2).mean(-1,
37 | keepdim=True)
38 | hidden_states = hidden_states * torch.rsqrt(variance +
39 | self.variance_epsilon)
40 |
41 | # convert into float16 if necessary
42 | if self.weight.dtype == torch.float16:
43 | hidden_states = hidden_states.to(torch.float16)
44 | return self.weight * hidden_states
45 |
46 |
47 | class BertLayerNorm(nn.Module):
48 | def __init__(self, hidden_size, eps=1e-12):
49 | """Construct a layernorm module in the TF style (epsilon inside the square root).
50 | """
51 | super(BertLayerNorm, self).__init__()
52 | self.weight = nn.Parameter(torch.ones(hidden_size))
53 | self.bias = nn.Parameter(torch.zeros(hidden_size))
54 | self.variance_epsilon = eps
55 |
56 | def forward(self, x):
57 | u = x.mean(-1, keepdim=True)
58 | s = (x - u).pow(2).mean(-1, keepdim=True)
59 | x = (x - u) / torch.sqrt(s + self.variance_epsilon)
60 | return self.weight * x + self.bias
61 |
--------------------------------------------------------------------------------
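A sanity-check sketch, not repo code: T5LayerNorm above is an RMSNorm (rescale by the root mean square, with no mean subtraction and no bias), while BertLayerNorm matches torch.nn.functional.layer_norm:

import torch

hidden = torch.randn(2, 5, 8)

# RMSNorm by hand, mirroring T5LayerNorm with weight = 1
variance = hidden.pow(2).mean(-1, keepdim=True)
rms_normed = hidden * torch.rsqrt(variance + 1e-6)
print(rms_normed.pow(2).mean(-1))  # ~1.0 per position: unit RMS, mean is not zeroed

# BertLayerNorm by hand vs the built-in layer norm
u = hidden.mean(-1, keepdim=True)
s = (hidden - u).pow(2).mean(-1, keepdim=True)
bert_normed = (hidden - u) / torch.sqrt(s + 1e-12)
torch_normed = torch.nn.functional.layer_norm(hidden, (8,), eps=1e-12)
print(torch.allclose(bert_normed, torch_normed, atol=1e-5))  # True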
/bert_seq2seq/task/seq2seq/bert_seq2seq_model.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | from bert_seq2seq.basic_bert import BasicBert
4 |
5 | class BertSeq2SeqModel(BasicBert):
6 | """
7 | """
8 | def __init__(self, vocab,
9 | model_name="roberta",
10 | size="base",
11 | **kwargs):
12 | super(BertSeq2SeqModel, self).__init__(word2ix=vocab, model_name=model_name, size=size)
13 |
14 | self.hidden_dim = self.config.hidden_size
15 | self.vocab_size = len(vocab)
16 |
17 | def compute_loss(self, predictions, labels, target_mask):
18 | """
19 | target_mask: 0 over sentence a and padding, 1 over sentence b (the part being generated)
20 | """
21 | predictions = predictions.view(-1, self.vocab_size)
22 | labels = labels.view(-1)
23 | target_mask = target_mask.view(-1).float()
24 | loss = nn.CrossEntropyLoss(ignore_index=0, reduction="none")
25 | return (loss(predictions, labels) * target_mask).sum() / target_mask.sum() ## the mask removes the loss from padding and sentence-a positions
26 |
27 | def forward(self, **data):
28 | input_ids = data["input_ids"]
29 | token_type_ids = data["token_type_ids"]
30 | labels = data.get("labels", None)
31 | device = input_ids.device
32 |
33 | input_shape = input_ids.shape
34 | seq_len = input_shape[1]
35 | ## build the UniLM-style seq2seq attention mask
36 | ones = torch.ones((1, 1, seq_len, seq_len), dtype=torch.float32, device=device)
37 | a_mask = ones.tril()
38 | s_ex12 = token_type_ids.unsqueeze(1).unsqueeze(2).float()
39 | s_ex13 = token_type_ids.unsqueeze(1).unsqueeze(3).float()
40 | a_mask = (1.0 - s_ex12) * (1.0 - s_ex13) + s_ex13 * a_mask
41 |
42 | enc_layers, _ = self.bert(input_ids, position_ids=None, token_type_ids=token_type_ids, attention_mask=a_mask,
43 | output_all_encoded_layers=True)
44 | sequence_out = enc_layers[-1] ## last encoder layer output, (batch, seq_len, hidden)
45 |
46 | tokens_hidden_state, predictions = self.cls(sequence_out)
47 | result_data = {"logits": predictions, "hidden_states": tokens_hidden_state}
48 |
49 | if labels is not None:
50 |
51 | predictions = predictions[:, :-1].contiguous()
52 | target_mask = token_type_ids[:, 1:].contiguous()
53 | loss = self.compute_loss(predictions, labels, target_mask)
54 | result_data["loss"] = loss
55 |
56 | return result_data
57 |
58 |
59 |
--------------------------------------------------------------------------------
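A toy demonstration, illustrative and not repo code, of the attention mask built in forward(): sentence-a positions (token_type_id 0) attend bidirectionally within sentence a, while sentence-b positions (token_type_id 1) attend to all of sentence a plus a causal prefix of sentence b, which is the UniLM scheme:

import torch

token_type_ids = torch.tensor([[0, 0, 0, 1, 1]])  # 3 source tokens, 2 target tokens
seq_len = token_type_ids.shape[1]

# same construction as in BertSeq2SeqModel.forward
ones = torch.ones((1, 1, seq_len, seq_len))
a_mask = ones.tril()
s_ex12 = token_type_ids.unsqueeze(1).unsqueeze(2).float()
s_ex13 = token_type_ids.unsqueeze(1).unsqueeze(3).float()
a_mask = (1.0 - s_ex12) * (1.0 - s_ex13) + s_ex13 * a_mask

print(a_mask[0, 0])
# tensor([[1., 1., 1., 0., 0.],
#         [1., 1., 1., 0., 0.],
#         [1., 1., 1., 0., 0.],
#         [1., 1., 1., 1., 0.],
#         [1., 1., 1., 1., 1.]])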
/examples/seq2seq/bert/test_roberta_auto_title.py:
--------------------------------------------------------------------------------
1 | from bert_seq2seq.tokenizer import Tokenizer
2 | from bert_seq2seq import Predictor
3 | from bert_seq2seq import load_model
4 | import torch
5 |
6 | model_name = "roberta" # 选择模型名字
7 | task_name = "seq2seq" # 任务名字
8 | model_path = "./roberta_auto_title_model.bin"
9 | vocab_path = "../state_dict/roberta/vocab.txt"
10 |
11 | tokenizer = Tokenizer(vocab_path)
12 | bert_model = load_model(tokenizer.vocab, model_name=model_name, task_name=task_name)
13 | bert_model.load_all_params(model_path)
14 | predictor = Predictor(bert_model, tokenizer)
15 |
16 | if __name__ == '__main__':
17 | textset = ["近期,美国国会众院通过法案,重申美国对台湾的承诺。对此,中国外交部发言人表示,有关法案严重违反一个中国原则和中美三个联合公报规定,粗暴干涉中国内政,中方对此坚决反对并已向美方提出严正交涉。\n" \
18 | "事实上,中国高度关注美国国内打“台湾牌”、挑战一中原则的危险动向。近年来,作为“亲台”势力大本营的美国国会动作不断,先后通过“与台湾交往法”“亚洲再保证倡议法”等一系列“挺台”法案,“2019" \
19 | "财年国防授权法案”也多处触及台湾问题。",
20 | "在推进“双一流”高校建设进程中,我们要紧紧围绕为党育人、为国育才,找准问题、破解难题,以一流意识和担当精神,大力推进高校的治理能力建设。",
21 | "增强政治引领力。坚持党对高校工作的全面领导,始终把政治建设摆在首位,增强校党委的政治领导力,全面推进党的建设各项工作。落实立德树人根本任务,把培养社会主义建设者和接班人放在中心位置。紧紧抓住思想政治工作这条生命线,全面加强师生思想政治工作,推进“三全育人”综合改革,将思想政治工作贯穿学校教育管理服务全过程,努力让学生成为德才兼备、全面发展的人才。",
22 | "提升人才聚集力。人才是创新的核心要素,创新驱动本质上是人才驱动。要坚持引育并举,建立绿色通道,探索知名专家举荐制,完善“一事一议”支持机制。在大力支持自然科学人才队伍建设的同时,实施哲学社会科学人才工程。立足实际,在条件成熟的学院探索“一院一策”改革。创新科研组织形式,为人才成长创设空间,建设更加崇尚学术、更加追求卓越、更加关爱学生、更加担当有为的学术共同体。",
23 | "培养学生竞争力。遵循学生成长成才的规律培育人才,着力培养具有国际竞争力的拔尖创新人才和各类专门人才,使优势学科、优秀教师、优质资源、优良环境围绕立德树人的根本任务配置。淘汰“水课”,打造“金课”,全力打造世界一流本科教育。深入推进研究生教育综合改革,加强事关国家重大战略的高精尖急缺人才培养,建设具有国际竞争力的研究生教育。",
24 | "激发科技创新力。在国家急需发展的领域挑大梁,就要更加聚焦科技前沿和国家需求,狠抓平台建设,包括加快牵头“武汉光源”建设步伐,积极参与国家实验室建设,建立校级大型科研仪器设备共享平台。关键核心技术领域“卡脖子”问题,归根结底是基础科学研究薄弱。要加大基础研究的支持力度,推进理论、技术和方法创新,鼓励支持重大原创和颠覆性技术创新,催生一批高水平、原创性研究成果。",
25 | "发展社会服务力。在贡献和服务中体现价值,推动合作共建、多元投入的格局,大力推进政产学研用结合,强化科技成果转移转化及产业化。探索校城融合发展、校地联动发展的新模式,深度融入地方创新发展网络,为地方经济社会发展提供人才支撑,不断拓展和优化社会服务网络。",
26 | "涵育文化软实力。加快体制机制改革,优化学校、学部、学院三级评审机制,充分发挥优秀学者特别是德才兼备的年轻学者在学术治理中的重要作用。牢固树立一流意识、紧紧围绕一流目标、认真执行一流标准,让成就一流事业成为普遍追求和行动自觉。培育具有强大凝聚力的大学文化,营造积极团结、向上向善、干事创业的氛围,让大学成为吸引和留住一大批优秀人才建功立业的沃土,让敢干事、肯干事、能干事的人有更多的荣誉感和获得感。",
27 | "建设中国特色、世界一流大学不是等得来、喊得来的,而是脚踏实地拼出来、干出来的。对标一流,深化改革,坚持按章程办学,构建以一流质量标准为核心的制度规范体系,扎实推进学校综合改革,探索更具活力、更富效率的管理体制和运行机制,我们就一定能构建起具有中国特色的现代大学治理体系,进一步提升管理服务水平和工作效能。"
28 | ]
29 | for text in textset:
30 | out = predictor.predict_generate_beamsearch(text, beam_size=3, input_max_length=200, out_max_length=40)
31 | print(out)
32 |
--------------------------------------------------------------------------------
/bert_seq2seq/task/relationship_extraction/bert_relationship_extraction.py:
--------------------------------------------------------------------------------
1 |
2 | import torch.nn as nn
3 | from bert_seq2seq.basic_bert import BasicBert
4 | from bert_seq2seq.layers import GlobalPointer
5 |
6 | class BertRelationshipExtraction(BasicBert):
7 | """
8 | """
9 | def __init__(self, vocab,
10 | target_size,
11 | inner_dim=64,
12 | size="base",
13 | model_name="roberta",
14 | **kwargs):
15 | super(BertRelationshipExtraction, self).__init__(word2ix=vocab, model_name=model_name, size=size)
16 | self.entity_output = GlobalPointer(self.config.hidden_size, 2,
17 | inner_dim, RoPE=True, trill_mask=True)
18 | self.head_output = GlobalPointer(self.config.hidden_size, target_size,
19 | inner_dim, RoPE=False, trill_mask=False)
20 | self.tail_output = GlobalPointer(self.config.hidden_size, target_size,
21 | inner_dim, RoPE=False, trill_mask=False)
22 | self.layer_norm_cond = None
23 | self.cls = None
24 |
25 | def forward(self, **data):
26 | input_ids = data["input_ids"]
27 | token_type_ids = data.get("token_type_ids", None)
28 | head_labels = data.get("head_labels", None)
29 | tail_labels = data.get("tail_labels", None)
30 | entity_labels = data.get("entity_labels", None)
31 |
32 | padding_mask = (input_ids > 0).float()
33 |
34 | all_layers, _ = self.bert(input_ids, token_type_ids=token_type_ids,
35 | output_all_encoded_layers=True)
36 | sequence_out = all_layers[-1]
37 |
38 | entity_output = self.entity_output(sequence_out, padding_mask)
39 | head_output = self.head_output(sequence_out, padding_mask)
40 | tail_output = self.tail_output(sequence_out, padding_mask)
41 |
42 | return_data = {"entity_output": entity_output, "head_output": head_output, "tail_output": tail_output}
43 | if entity_labels is not None:
44 | loss_entity = self.entity_output.compute_loss_sparse(entity_output, entity_labels, mask_zero=True)
45 | loss_head = self.head_output.compute_loss_sparse(head_output, head_labels, mask_zero=True)
46 | loss_tail = self.tail_output.compute_loss_sparse(tail_output, tail_labels, mask_zero=True)
47 |
48 | return_data["loss"] = (loss_entity + loss_head + loss_tail) / 3
49 | return return_data
--------------------------------------------------------------------------------
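A hedged decoding sketch for the three score tensors returned above. The shapes and channel meanings are assumptions inferred from the GlobalPointer/GPLinker scheme, not taken from this repo's predictor: entity_output scores (batch, 2, seq, seq) with channel 0 for subject spans and 1 for object spans; head_output and tail_output score (batch, num_relations, seq, seq) for (subject, object) start and end positions. A triple is kept when all three scores clear the 0 threshold:

import torch

def decode_triples(entity_output, head_output, tail_output, threshold=0.0):
    # assume batch size 1 for simplicity
    subjects, objects = [], []
    ent = entity_output[0]
    for tag, store in ((0, subjects), (1, objects)):
        for start, end in zip(*torch.where(ent[tag] > threshold)):
            store.append((int(start), int(end)))

    triples = []
    heads, tails = head_output[0], tail_output[0]
    for sh, st in subjects:
        for oh, ot in objects:
            # relations whose head AND tail scores both fire for this span pair
            rels = torch.where((heads[:, sh, oh] > threshold) & (tails[:, st, ot] > threshold))[0]
            for r in rels:
                triples.append(((sh, st), int(r), (oh, ot)))
    return triples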
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | no_push
12 |
13 | build/
14 | develop-eggs/
15 | dist/
16 | downloads/
17 | eggs/
18 | .eggs/
19 | lib/
20 | lib64/
21 | parts/
22 | sdist/
23 | var/
24 | wheels/
25 | pip-wheel-metadata/
26 | share/python-wheels/
27 | *.egg-info/
28 | .installed.cfg
29 | *.egg
30 | MANIFEST
31 |
32 | # PyInstaller
33 | # Usually these files are written by a python script from a template
34 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
35 | *.manifest
36 | *.spec
37 | help.txt
38 | examples/text_classification/train_roberta_emotion_analysis.py
39 | # Installer logs
40 | pip-log.txt
41 | pip-delete-this-directory.txt
42 |
43 | # Unit test / coverage reports
44 | htmlcov/
45 | .tox/
46 | .nox/
47 | .coverage
48 | .coverage.*
49 | .cache
50 | nosetests.xml
51 | coverage.xml
52 | *.cover
53 | *.py,cover
54 | .hypothesis/
55 | .pytest_cache/
56 |
57 | # Translations
58 | *.mo
59 | *.pot
60 |
61 | # Django stuff:
62 | *.log
63 | local_settings.py
64 | db.sqlite3
65 | db.sqlite3-journal
66 |
67 | # Flask stuff:
68 | instance/
69 | .webassets-cache
70 |
71 | # Scrapy stuff:
72 | .scrapy
73 |
74 | # Sphinx documentation
75 | docs/_build/
76 |
77 | # PyBuilder
78 | target/
79 |
80 | # Jupyter Notebook
81 | .ipynb_checkpoints
82 |
83 | # IPython
84 | profile_default/
85 | ipython_config.py
86 |
87 | # pyenv
88 | .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
98 | __pypackages__/
99 |
100 | # Celery stuff
101 | celerybeat-schedule
102 | celerybeat.pid
103 |
104 | # SageMath parsed files
105 | *.sage.py
106 |
107 | # Environments
108 | .env
109 | .venv
110 | env/
111 | venv/
112 | ENV/
113 | env.bak/
114 | venv.bak/
115 |
116 | # Spyder project settings
117 | .spyderproject
118 | .spyproject
119 |
120 | # Rope project settings
121 | .ropeproject
122 |
123 | # mkdocs documentation
124 | /site
125 |
126 | # mypy
127 | .mypy_cache/
128 | .dmypy.json
129 | dmypy.json
130 |
131 | # Pyre type checker
132 | .pyre/
133 |
--------------------------------------------------------------------------------
/bert_seq2seq/utils.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from bert_seq2seq.task.seq2seq.bert_seq2seq_model import BertSeq2SeqModel
3 | import os
4 | from bert_seq2seq.task.embedding.bert_embedding import BertEmbedding
5 | from bert_seq2seq.task.classification.bert_cls_classifier import BertClsClassifier
6 | from bert_seq2seq.task.sequence_labeling.bert_sequence_labeling import BertNERGP, BertNERCRF, BertSequenceLabling
7 | from bert_seq2seq.task.seq2seq.gpt2_seq2seq_model import GPT2
8 | from bert_seq2seq.task.seq2seq.t5_seq2seq_model import T5Model
9 | from bert_seq2seq.task.relationship_extraction.bert_relationship_extraction import BertRelationshipExtraction
10 | # from bert_seq2seq.GLM.model.modeling_glm import GLMModel
11 | # from GLM.model.modeling_glm import GLMModel
12 | from bert_seq2seq.task.seq2seq.GLM_seq2seq_model import GLMSeq2SeqModel
13 |
14 | ALL_TASK = {
15 | "bert_seq2seq": BertSeq2SeqModel,
16 | "roberta_seq2seq": BertSeq2SeqModel,
17 | "roberta-large_seq2seq": BertSeq2SeqModel,
18 | "bert_classification": BertClsClassifier,
19 | "roberta_classification": BertClsClassifier,
20 | "roberta-large_classification": BertClsClassifier,
21 | "bert_sequence_labeling_gp": BertNERGP,
22 | "roberta_sequence_labeling_gp": BertNERGP,
23 | "roberta-large_sequence_labeling_gp": BertNERGP,
24 | "bert_sequence_labeling_crf": BertNERCRF,
25 | "roberta_sequence_labeling_crf": BertNERCRF,
26 | "roberta-large_sequence_labeling_crf": BertNERCRF,
27 | "bert_sequence_labeling": BertSequenceLabling,
28 | "roberta_sequence_labeling": BertSequenceLabling,
29 | "roberta-large_sequence_labeling": BertSequenceLabling,
30 | "bert_embedding": BertEmbedding,
31 | "roberta_embedding": BertEmbedding,
32 | "roberta-large_embedding": BertEmbedding,
33 | "gpt2_seq2seq": GPT2,
34 | "t5_seq2seq": T5Model,
35 | "bert_relationship_extraction":BertRelationshipExtraction,
36 | "roberta_relationship_extraction":BertRelationshipExtraction,
37 | "nezha_relationship_extraction":BertRelationshipExtraction,
38 | "glm": GLMSeq2SeqModel,
39 | "glm_seq2seq": GLMSeq2SeqModel,
40 | "glm_lm": GLMSeq2SeqModel,
41 |
42 | }
43 |
44 | def load_model(vocab=None,
45 | model_name="roberta",
46 | task_name="seq2seq",
47 | target_size=0,
48 | ner_inner_dim=-1,
49 | size="base"):
50 | if model_name != "glm":
51 | assert vocab is not None, "vocab 字典不能为空"
52 | task_model = ALL_TASK.get(f"{model_name}_{task_name}", None)
53 | if task_model is None :
54 | print("no this task")
55 | os._exit(0)
56 |
57 | return task_model(vocab=vocab,
58 | model_name=model_name,
59 | size=size,
60 | target_size=target_size,
61 | ent_type_size=target_size,
62 | inner_dim=ner_inner_dim)
63 |
64 |
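65 | # Usage sketch (illustrative, not part of the original module; the paths are placeholders):
66 | #     from bert_seq2seq import Tokenizer
67 | #     tokenizer = Tokenizer("path/to/vocab.txt")
68 | #     model = load_model(tokenizer.vocab, model_name="roberta", task_name="seq2seq")
69 | #     model.load_pretrain_params("path/to/pytorch_model.bin")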
--------------------------------------------------------------------------------
/examples/README.md:
--------------------------------------------------------------------------------
1 | ## Example file guide
2 |
3 | ### bert embedding
4 | bert, roberta, and nezha models: feed in a sentence and get its embedding
5 | 1. [get_bert_embedding.py](https://github.com/920232796/bert_seq2seq_DDP/blob/master/examples/bert_embedding/get_bert_embedding.py)
6 |
7 | ### ner
8 | bert, roberta, and nezha models for named entity recognition, supporting both CRF and global pointer decoding
9 | 1. [train_bert_ner_crf_people_daily.py](https://github.com/920232796/bert_seq2seq_DDP/blob/master/examples/ner/train_bert_ner_crf_people_daily.py) NER with a CRF head
10 | 2. [train_roberta_ner_gp_people_daily.py](https://github.com/920232796/bert_seq2seq_DDP/blob/master/examples/ner/train_roberta_ner_gp_people_daily.py) NER with a global pointer head
11 |
12 | ### seq2seq
13 | Generation tasks, supporting bert, roberta, nezha, gpt2, t5, bart, and other models
14 | 1. [test_gpt2_text_writting.py](https://github.com/920232796/bert_seq2seq_DDP/blob/master/examples/seq2seq/gpt2/test_gpt2_text_writting.py) gpt2 text continuation test
15 | 2. [train_roberta_auto_title.py](https://github.com/920232796/bert_seq2seq_DDP/blob/master/examples/seq2seq/bert/train_roberta_auto_title.py) roberta auto-title training
16 | 3. [train_roberta_auto_title_multi_gpu.py](https://github.com/920232796/bert_seq2seq_DDP/blob/master/examples/seq2seq/bert/train_roberta_auto_title_multi_gpu.py) roberta auto-title training (multi-GPU version)
17 | 4. [train_gpt2_multi_chat.py](https://github.com/920232796/bert_seq2seq_DDP/blob/master/examples/seq2seq/gpt2/train_gpt2_multi_chat.py) gpt2 multi-turn dialogue training
18 | 5. [test_t5_auto_title.py](https://github.com/920232796/bert_seq2seq_DDP/blob/master/examples/seq2seq/t5/test_t5_auto_title.py) T5 auto-title test code
19 | 6. [test_gpt2_multi_chat.py](https://github.com/920232796/bert_seq2seq_DDP/blob/master/examples/seq2seq/gpt2/test_gpt2_multi_chat.py) gpt2 multi-turn dialogue test
20 | 7. [test_multi_processing_generate.py](https://github.com/920232796/bert_seq2seq_DDP/blob/master/examples/seq2seq/gpt2/test_multi_processing_generate.py) multi-process generation example
21 |
22 | ### text classification
23 | bert, roberta, and nezha models, supporting text classification, sentiment analysis, and semantic matching
24 | 1. [train_roberta_news_title_classification.py](https://github.com/920232796/bert_seq2seq_DDP/blob/master/examples/text_classification/train_roberta_news_title_classification.py) news title text classification training
25 | 2. [train_roberta_news_title_classification_multi_gpu.py](https://github.com/920232796/bert_seq2seq_DDP/blob/master/examples/text_classification/train_roberta_news_title_classification_multi_gpu.py) news title text classification training (multi-GPU version)
26 | 3. [train_roberta_semantic_matching.py](https://github.com/920232796/bert_seq2seq_DDP/blob/master/examples/text_classification/train_roberta_semantic_matching.py) semantic matching training
27 | 4. [test.py](https://github.com/920232796/bert_seq2seq_DDP/blob/master/examples/text_classification/test.py) load a trained model for testing
28 |
29 | ### FAQ retrieval-based question answering
30 | 1. [1_construct_data.py](https://github.com/920232796/bert_seq2seq_DDP/blob/master/examples/FAQ/1_construct_data.py) build the dataset and extract embedding features in advance
31 | 2. [2_test_bert_faq.py](https://github.com/920232796/bert_seq2seq_DDP/blob/master/examples/FAQ/2_test_bert_faq.py) load the built embeddings and retrieve similar questions with faiss
32 |
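33 | ### Common training skeleton
34 | All of the train_*.py examples above follow the same pattern. The sketch below is illustrative only (the paths, MyDataset, and the toy data are placeholders; each concrete example defines its own versions):
35 | ```python
36 | import torch
37 | from torch.utils.data import Dataset
38 | from bert_seq2seq import Tokenizer, load_model, Trainer
39 | from bert_seq2seq.dataset import bert_seq2seq_collate_fn
40 |
41 | tokenizer = Tokenizer("path/to/vocab.txt")  # placeholder vocab path
42 | model = load_model(tokenizer.vocab, model_name="roberta", task_name="seq2seq")
43 | model.load_pretrain_params("path/to/pytorch_model.bin")  # placeholder weights path
44 |
45 | class MyDataset(Dataset):  # hypothetical dataset over (src, tgt) string pairs
46 |     def __init__(self, srcs, tgts):
47 |         self.srcs, self.tgts = srcs, tgts
48 |     def __getitem__(self, i):
49 |         out = tokenizer.encode_plus(self.srcs[i], self.tgts[i], max_length=256)
50 |         return {"input_ids": out["input_ids"], "token_type_ids": out["token_type_ids"]}
51 |     def __len__(self):
52 |         return len(self.srcs)
53 |
54 | class Evaluator:  # on_validation runs every val_every_step steps
55 |     def on_validation(self, data):
56 |         print(data["loss"], data["iteration"])
57 |
58 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
59 | trainer = Trainer(env_type="pytorch", epoches=5, val_every_step=500,
60 |                   batch_size=16, device=device)
61 | optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
62 | trainer.train(model, optimizer, train_dataset=MyDataset(["一段长文本"], ["标题"]),
63 |               evaluator=Evaluator, collate_fn=bert_seq2seq_collate_fn)
64 | ```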
--------------------------------------------------------------------------------
/bert_seq2seq/model/prompt.py:
--------------------------------------------------------------------------------
1 | # Copyright © 2022 BAAI. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License")
4 | import random
5 | import torch
6 |
7 |
8 | class PromptSpell(torch.nn.Module):
9 | def __init__(self, spell_length, hidden_size, spell_func):
10 | super(PromptSpell, self).__init__()
11 | self.spell_length = spell_length
12 | self.hidden_size = hidden_size
13 | self.spell_embeddings = torch.nn.Embedding(self.spell_length, self.hidden_size)
14 | self.spell_func = spell_func
15 | if self.spell_func == "lstm":
16 | self.lstm_head = torch.nn.LSTM(input_size=self.hidden_size,
17 | hidden_size=self.hidden_size,
18 | num_layers=2,
19 | # dropout=self.lstm_dropout,
20 | bidirectional=True,
21 | batch_first=True) # .to(torch.device("cuda"))
22 | self.mlp_head = torch.nn.Sequential(torch.nn.Linear(2 * self.hidden_size, self.hidden_size),
23 | torch.nn.ReLU(),
24 | torch.nn.Linear(self.hidden_size, self.hidden_size))
25 | elif self.spell_func == "mlp":
26 | self.mlp_head = torch.nn.Sequential(torch.nn.Linear(self.hidden_size, self.hidden_size),
27 | torch.nn.ReLU(),
28 | torch.nn.Linear(self.hidden_size, self.hidden_size))
29 | elif self.spell_func != "none":
30 | raise NotImplementedError("Prompt function " + self.spell_func)
31 |
32 | def init_embedding(self, word_embeddings=None, task_tokens=None):
33 | num_words = 5000
34 | with torch.no_grad():
35 | for i in range(self.spell_length):
36 | rand_token = random.randrange(num_words)
37 | if task_tokens is None:
38 | target_embedding = word_embeddings[rand_token]
39 | else:
40 | word_embedding = word_embeddings[rand_token]
41 | task_token = random.choice(task_tokens)
42 | task_embedding = word_embeddings[task_token]
43 | ratio = random.random()
44 | target_embedding = word_embedding * ratio + task_embedding * (1 - ratio)
45 | self.spell_embeddings.weight.data[i] = target_embedding
46 |
47 | def forward(self):
48 | prompt_embeds = self.spell_embeddings.weight.unsqueeze(0)
49 | if self.spell_func == "lstm":
50 | prompt_embeds = self.lstm_head(prompt_embeds)[0]
51 | if self.spell_func == "lstm" or self.spell_func == "mlp":
52 | prompt_embeds = self.mlp_head(prompt_embeds)
53 | return prompt_embeds
54 |
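55 | # Usage sketch (illustrative, not part of the original module):
56 | #     spell = PromptSpell(spell_length=8, hidden_size=1024, spell_func="lstm")
57 | #     spell.init_embedding(word_embeddings=token_embedding_weight)  # e.g. a model's token embedding matrix
58 | #     prompt_embeds = spell()  # shape: (1, 8, 1024)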
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/Project_Default.xml:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/bert_seq2seq/mpu/utils.py:
--------------------------------------------------------------------------------
1 | # Copyright © 2022 BAAI. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License")
4 | # coding=utf-8
5 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
6 | #
7 | # Licensed under the Apache License, Version 2.0 (the "License");
8 | # you may not use this file except in compliance with the License.
9 | # You may obtain a copy of the License at
10 | #
11 | # http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 |
19 | import torch
20 |
21 |
22 | def ensure_divisibility(numerator, denominator):
23 | """Ensure that numerator is divisible by the denominator."""
24 | assert numerator % denominator == 0, '{} is not divisible by {}'.format(
25 | numerator, denominator)
26 |
27 |
28 | def divide(numerator, denominator):
29 | """Ensure that numerator is divisible by the denominator and return
30 | the division value."""
31 | ensure_divisibility(numerator, denominator)
32 | return numerator // denominator
33 |
34 |
35 | def split_tensor_along_last_dim(tensor,
36 | num_partitions,
37 | contiguous_split_chunks=False):
38 | """Split a tensor along its last dimension.
39 | Arguments:
40 | tensor: input tensor.
41 | num_partitions: number of partitions to split the tensor
42 | contiguous_split_chunks: If True, make each chunk contiguous
43 | in memory.
44 | """
45 | # Get the size and dimension.
46 | last_dim = tensor.dim() - 1
47 | last_dim_size = divide(tensor.size()[last_dim], num_partitions)
48 | # Split.
49 | tensor_list = torch.split(tensor, last_dim_size, dim=last_dim)
50 | # Note: torch.split does not create contiguous tensors by default.
51 | if contiguous_split_chunks:
52 | return tuple(chunk.contiguous() for chunk in tensor_list)
53 |
54 | return tensor_list
55 |
56 |
57 | class VocabUtility:
58 | """Split the vocabulary into `world_size` chunks amd return the
59 | first and last index of the vocabulary belonging to the `rank`
60 | partition: Note that indecies in [fist, last)"""
61 | @staticmethod
62 | def vocab_range_from_per_partition_vocab_size(per_partition_vocab_size,
63 | rank, world_size):
64 | index_f = rank * per_partition_vocab_size
65 | index_l = index_f + per_partition_vocab_size
66 | return index_f, index_l
67 |
68 | @staticmethod
69 | def vocab_range_from_global_vocab_size(global_vocab_size, rank,
70 | world_size):
71 | per_partition_vocab_size = divide(global_vocab_size, world_size)
72 | return VocabUtility.vocab_range_from_per_partition_vocab_size(
73 | per_partition_vocab_size, rank, world_size)
74 |
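75 | # Usage sketch (illustrative, not part of the original file):
76 | #     x = torch.randn(2, 6)
77 | #     chunks = split_tensor_along_last_dim(x, 3)  # three views, each of shape (2, 2)
78 | #     VocabUtility.vocab_range_from_global_vocab_size(30000, rank=1, world_size=4)
79 | #     # -> (7500, 15000)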
--------------------------------------------------------------------------------
/bert_seq2seq/mpu/grads.py:
--------------------------------------------------------------------------------
1 | # Copyright © 2022 BAAI. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License")
4 | # coding=utf-8
5 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
6 | #
7 | # Licensed under the Apache License, Version 2.0 (the "License");
8 | # you may not use this file except in compliance with the License.
9 | # You may obtain a copy of the License at
10 | #
11 | # http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 |
19 | # Parts of the code here are adapted from PyTorch
20 | # repo: https://github.com/pytorch/pytorch
21 |
22 | import torch
23 | from math import inf  # torch._six has been removed from recent PyTorch releases
24 |
25 | from .initialize import get_model_parallel_group
26 | from .initialize import get_model_parallel_rank
27 |
28 |
29 | def clip_grad_norm(parameters, max_norm, norm_type=2):
30 | """Clips gradient norm of an iterable of parameters.
31 |
32 | This is adapted from torch.nn.utils.clip_grad.clip_grad_norm_ and
33 | added functionality to handle model parallel parameters. Note that
34 | the gradients are modified in place.
35 |
36 | Arguments:
37 | parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a
38 | single Tensor that will have gradients normalized
39 | max_norm (float or int): max norm of the gradients
40 | norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for
41 | infinity norm.
42 |
43 | Returns:
44 | Total norm of the parameters (viewed as a single vector).
45 | """
46 | if isinstance(parameters, torch.Tensor):
47 | parameters = [parameters]
48 | parameters = list(filter(lambda p: p.grad is not None, parameters))
49 | max_norm = float(max_norm)
50 | norm_type = float(norm_type)
51 | if norm_type == inf:
52 | total_norm = max(p.grad.data.abs().max() for p in parameters)
53 | total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)])
54 | # Take max across all GPUs.
55 | torch.distributed.all_reduce(total_norm_cuda,
56 | op=torch.distributed.ReduceOp.MAX,
57 | group=get_model_parallel_group())
58 | total_norm = total_norm_cuda[0].item()
59 | else:
60 | total_norm = 0
61 | for p in parameters:
62 | if p.model_parallel or (get_model_parallel_rank() == 0):
63 | param_norm = p.grad.data.norm(norm_type)
64 | total_norm += param_norm.item()**norm_type
65 | # Sum across all model parallel GPUs.
66 | total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)])
67 | torch.distributed.all_reduce(total_norm_cuda,
68 | op=torch.distributed.ReduceOp.SUM,
69 | group=get_model_parallel_group())
70 | total_norm = total_norm_cuda[0].item()**(1. / norm_type)
71 | clip_coef = max_norm / (total_norm + 1e-6)
72 | if clip_coef < 1:
73 | for p in parameters:
74 | p.grad.data.mul_(clip_coef)
75 | return total_norm
76 |
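77 | # Usage sketch (illustrative; assumes torch.distributed and the model parallel group are initialized):
78 | #     loss.backward()
79 | #     grad_norm = clip_grad_norm(model.parameters(), max_norm=1.0)
80 | #     optimizer.step()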
--------------------------------------------------------------------------------
/data/relationship_extraction/all_50_schemas:
--------------------------------------------------------------------------------
1 | {"object_type": "地点", "predicate": "祖籍", "subject_type": "人物"}
2 | {"object_type": "人物", "predicate": "父亲", "subject_type": "人物"}
3 | {"object_type": "地点", "predicate": "总部地点", "subject_type": "企业"}
4 | {"object_type": "地点", "predicate": "出生地", "subject_type": "人物"}
5 | {"object_type": "目", "predicate": "目", "subject_type": "生物"}
6 | {"object_type": "Number", "predicate": "面积", "subject_type": "行政区"}
7 | {"object_type": "Text", "predicate": "简称", "subject_type": "机构"}
8 | {"object_type": "Date", "predicate": "上映时间", "subject_type": "影视作品"}
9 | {"object_type": "人物", "predicate": "妻子", "subject_type": "人物"}
10 | {"object_type": "音乐专辑", "predicate": "所属专辑", "subject_type": "歌曲"}
11 | {"object_type": "Number", "predicate": "注册资本", "subject_type": "企业"}
12 | {"object_type": "城市", "predicate": "首都", "subject_type": "国家"}
13 | {"object_type": "人物", "predicate": "导演", "subject_type": "影视作品"}
14 | {"object_type": "Text", "predicate": "字", "subject_type": "历史人物"}
15 | {"object_type": "Number", "predicate": "身高", "subject_type": "人物"}
16 | {"object_type": "企业", "predicate": "出品公司", "subject_type": "影视作品"}
17 | {"object_type": "Number", "predicate": "修业年限", "subject_type": "学科专业"}
18 | {"object_type": "Date", "predicate": "出生日期", "subject_type": "人物"}
19 | {"object_type": "人物", "predicate": "制片人", "subject_type": "影视作品"}
20 | {"object_type": "人物", "predicate": "母亲", "subject_type": "人物"}
21 | {"object_type": "人物", "predicate": "编剧", "subject_type": "影视作品"}
22 | {"object_type": "国家", "predicate": "国籍", "subject_type": "人物"}
23 | {"object_type": "Number", "predicate": "海拔", "subject_type": "地点"}
24 | {"object_type": "网站", "predicate": "连载网站", "subject_type": "网络小说"}
25 | {"object_type": "人物", "predicate": "丈夫", "subject_type": "人物"}
26 | {"object_type": "Text", "predicate": "朝代", "subject_type": "历史人物"}
27 | {"object_type": "Text", "predicate": "民族", "subject_type": "人物"}
28 | {"object_type": "Text", "predicate": "号", "subject_type": "历史人物"}
29 | {"object_type": "出版社", "predicate": "出版社", "subject_type": "书籍"}
30 | {"object_type": "人物", "predicate": "主持人", "subject_type": "电视综艺"}
31 | {"object_type": "Text", "predicate": "专业代码", "subject_type": "学科专业"}
32 | {"object_type": "人物", "predicate": "歌手", "subject_type": "歌曲"}
33 | {"object_type": "人物", "predicate": "作词", "subject_type": "歌曲"}
34 | {"object_type": "人物", "predicate": "主角", "subject_type": "网络小说"}
35 | {"object_type": "人物", "predicate": "董事长", "subject_type": "企业"}
36 | {"object_type": "Date", "predicate": "成立日期", "subject_type": "机构"}
37 | {"object_type": "学校", "predicate": "毕业院校", "subject_type": "人物"}
38 | {"object_type": "Number", "predicate": "占地面积", "subject_type": "机构"}
39 | {"object_type": "语言", "predicate": "官方语言", "subject_type": "国家"}
40 | {"object_type": "Text", "predicate": "邮政编码", "subject_type": "行政区"}
41 | {"object_type": "Number", "predicate": "人口数量", "subject_type": "行政区"}
42 | {"object_type": "城市", "predicate": "所在城市", "subject_type": "景点"}
43 | {"object_type": "人物", "predicate": "作者", "subject_type": "图书作品"}
44 | {"object_type": "Date", "predicate": "成立日期", "subject_type": "企业"}
45 | {"object_type": "人物", "predicate": "作曲", "subject_type": "歌曲"}
46 | {"object_type": "气候", "predicate": "气候", "subject_type": "行政区"}
47 | {"object_type": "人物", "predicate": "嘉宾", "subject_type": "电视综艺"}
48 | {"object_type": "人物", "predicate": "主演", "subject_type": "影视作品"}
49 | {"object_type": "作品", "predicate": "改编自", "subject_type": "影视作品"}
50 | {"object_type": "人物", "predicate": "创始人", "subject_type": "企业"}
--------------------------------------------------------------------------------
/examples/seq2seq/gpt2/train_gpt2_multi_chat.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import torch
4 | from torch.utils.data import Dataset
5 | from bert_seq2seq import Tokenizer
6 | from bert_seq2seq import load_model
7 | from bert_seq2seq import Trainer
8 | from bert_seq2seq.dataset import gpt_collate_fn
9 | from bert_seq2seq import Predictor
10 | import json
11 |
12 | model_name = "gpt2" # 选择模型名字
13 | task_name = "seq2seq" # 任务名字
14 |
15 | model_path = "../state_dict/gpt2/pytorch_model.bin"
16 | vocab_path = "../state_dict/gpt2/vocab.txt"
17 | model_save_path = "./gpt2_multi_chat_model.bin" # where to save the trained model
18 | lr = 2e-5
19 | maxlen = 1024
20 | data_path = '../data/LCCC-base-split/LCCC-base_train.json' # data location
21 |
22 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
23 | tokenizer = Tokenizer(vocab_path)
24 | model = load_model(tokenizer.vocab, model_name=model_name, task_name=task_name)
25 | model.load_pretrain_params(model_path)
26 | predictor = Predictor(model, tokenizer)
27 |
28 | trainer = Trainer(env_type="pytorch",
29 | epoches=5,
30 | val_every_step=500,
31 | device=device,
32 | batch_size=8,
33 | gradient_accmulation_step=8)
34 |
35 | def read_file():
36 |     ## To switch datasets, only this function needs to be reimplemented; it returns the src the framework expects
37 |
38 | with open(data_path) as f:
39 | data = json.loads(f.read())
40 |
41 | return data
42 |
43 | data = read_file()
44 | print(data[:5])
45 |
46 | class ChatDataset(Dataset):
47 | """
48 | 针对特定数据集,定义一个相关的取数据的方式
49 | """
50 | def __init__(self, data) :
51 | ## 一般init函数是加载所有数据
52 | super().__init__()
53 | self.data = data
54 |
55 | def __getitem__(self, i):
56 |         ## fetch a single example
57 | # print(i)
58 | data = self.data[i]
59 | input_ids = [tokenizer.token_start_id]
60 |
61 | for index, text in enumerate(data):
62 | if (index + 1) % 2 == 1:
63 | text = "A:" + text
64 | else :
65 | text = "B:" + text
66 |
67 | text_ids = tokenizer.encode_plus(text, max_length=maxlen)["input_ids"][1:]
68 | input_ids.extend(text_ids)
69 |
70 | output = {
71 | "input_ids": input_ids,
72 | }
73 | return output
74 |
75 | def __len__(self):
76 |
77 | return len(self.data)
78 |
79 | class Evaluator:
80 |
81 | def on_validation(self, data):
82 | loss = data["loss"]
83 | step = data["iteration"]
84 |         ## Implement your own validation logic here; it is very flexible.
85 | test_data = [["A:今天天气很好,你觉得呢?"],
86 | ["A:我去吃了火锅。"],
87 | ["A:我去吃了火锅。", "B:我也是,真不错,你吃的哪家?"]
88 | ]
89 | for text in test_data:
90 | print(predictor.predict_multi_response(text,
91 | input_max_length=200,
92 | out_max_length=40,
93 | top_k=30, top_p=0.9,
94 | repetition_penalty=1.2,
95 | temperature=1.2))
96 |
97 | torch.save(model.state_dict(), model_save_path)
98 | print(f"模型保存成功~")
99 |
100 | def main():
101 |     ## load the data
102 | data = read_file()
103 |
104 | optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=1e-3)
105 | train_dataset = ChatDataset(data)
106 |
107 | trainer.train(model, optimizer, train_dataset=train_dataset, evaluator=Evaluator,
108 | collate_fn=gpt_collate_fn)
109 |
110 | if __name__ == '__main__':
111 | main()
112 |
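113 | # Data format sketch (illustrative): each item read from the LCCC json is one dialogue,
114 | # i.e. a list of alternating utterances such as [["今天天气很好", "是啊,出去走走吧"], ...].
115 | # __getitem__ prefixes the turns with "A:"/"B:" and concatenates their token ids into a
116 | # single input_ids sequence that starts with tokenizer.token_start_id.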
--------------------------------------------------------------------------------
/data/relationship_extraction/dev_data.json:
--------------------------------------------------------------------------------
1 | {"postag": [{"word": "查尔斯", "pos": "nr"}, {"word": "·", "pos": "w"}, {"word": "阿兰基斯", "pos": "nr"}, {"word": "(", "pos": "w"}, {"word": "Charles Aránguiz", "pos": "nz"}, {"word": ")", "pos": "w"}, {"word": ",", "pos": "w"}, {"word": "1989年4月17日", "pos": "t"}, {"word": "出生", "pos": "v"}, {"word": "于", "pos": "p"}, {"word": "智利圣地亚哥", "pos": "ns"}, {"word": ",", "pos": "w"}, {"word": "智利", "pos": "ns"}, {"word": "职业", "pos": "n"}, {"word": "足球", "pos": "n"}, {"word": "运动员", "pos": "n"}, {"word": ",", "pos": "w"}, {"word": "司职", "pos": "v"}, {"word": "中场", "pos": "n"}, {"word": ",", "pos": "w"}, {"word": "效力", "pos": "v"}, {"word": "于", "pos": "p"}, {"word": "德国", "pos": "ns"}, {"word": "足球", "pos": "n"}, {"word": "甲级", "pos": "a"}, {"word": "联赛", "pos": "n"}, {"word": "勒沃库森足球俱乐部", "pos": "nt"}], "text": "查尔斯·阿兰基斯(Charles Aránguiz),1989年4月17日出生于智利圣地亚哥,智利职业足球运动员,司职中场,效力于德国足球甲级联赛勒沃库森足球俱乐部", "spo_list": [{"predicate": "出生地", "object_type": "地点", "subject_type": "人物", "object": "圣地亚哥", "subject": "查尔斯·阿兰基斯"}, {"predicate": "出生日期", "object_type": "Date", "subject_type": "人物", "object": "1989年4月17日", "subject": "查尔斯·阿兰基斯"}]}
2 | {"postag": [{"word": "《", "pos": "w"}, {"word": "离开", "pos": "nw"}, {"word": "》", "pos": "w"}, {"word": "是", "pos": "v"}, {"word": "由", "pos": "p"}, {"word": "张宇", "pos": "nr"}, {"word": "谱曲", "pos": "v"}, {"word": ",", "pos": "w"}, {"word": "演唱", "pos": "v"}], "text": "《离开》是由张宇谱曲,演唱", "spo_list": [{"predicate": "歌手", "object_type": "人物", "subject_type": "歌曲", "object": "张宇", "subject": "离开"}, {"predicate": "作曲", "object_type": "人物", "subject_type": "歌曲", "object": "张宇", "subject": "离开"}]}
3 | {"postag": [{"word": "《", "pos": "w"}, {"word": "愤怒的唐僧", "pos": "nw"}, {"word": "》", "pos": "w"}, {"word": "由", "pos": "p"}, {"word": "北京吴意波影视文化工作室", "pos": "nt"}, {"word": "与", "pos": "p"}, {"word": "优酷", "pos": "nt"}, {"word": "电视剧", "pos": "n"}, {"word": "频道", "pos": "n"}, {"word": "联合", "pos": "vd"}, {"word": "制作", "pos": "v"}, {"word": ",", "pos": "w"}, {"word": "故事", "pos": "n"}, {"word": "以", "pos": "p"}, {"word": "喜剧", "pos": "n"}, {"word": "元素", "pos": "n"}, {"word": "为主", "pos": "v"}, {"word": ",", "pos": "w"}, {"word": "讲述", "pos": "v"}, {"word": "唐僧", "pos": "nr"}, {"word": "与", "pos": "c"}, {"word": "佛祖", "pos": "n"}, {"word": "打牌", "pos": "v"}, {"word": ",", "pos": "w"}, {"word": "得罪", "pos": "v"}, {"word": "了", "pos": "u"}, {"word": "佛祖", "pos": "n"}, {"word": ",", "pos": "w"}, {"word": "被", "pos": "p"}, {"word": "踢", "pos": "v"}, {"word": "下", "pos": "v"}, {"word": "人间", "pos": "n"}, {"word": "再", "pos": "d"}, {"word": "渡", "pos": "v"}, {"word": "九九八十一难", "pos": "nz"}, {"word": "的", "pos": "u"}, {"word": "故事", "pos": "n"}], "text": "《愤怒的唐僧》由北京吴意波影视文化工作室与优酷电视剧频道联合制作,故事以喜剧元素为主,讲述唐僧与佛祖打牌,得罪了佛祖,被踢下人间再渡九九八十一难的故事", "spo_list": [{"predicate": "出品公司", "object_type": "企业", "subject_type": "影视作品", "object": "北京吴意波影视文化工作室", "subject": "愤怒的唐僧"}, {"predicate": "导演", "object_type": "人物", "subject_type": "影视作品", "object": "吴意波", "subject": "愤怒的唐僧"}]}
4 | {"postag": [{"word": "李治", "pos": "nr"}, {"word": "即位", "pos": "v"}, {"word": "后", "pos": "f"}, {"word": ",", "pos": "w"}, {"word": "萧淑妃", "pos": "nr"}, {"word": "受宠", "pos": "v"}, {"word": ",", "pos": "w"}, {"word": "王皇后", "pos": "nr"}, {"word": "为了", "pos": "p"}, {"word": "排挤", "pos": "v"}, {"word": "萧淑妃", "pos": "nr"}, {"word": ",", "pos": "w"}, {"word": "答应", "pos": "v"}, {"word": "李治", "pos": "nr"}, {"word": "让", "pos": "v"}, {"word": "身", "pos": "n"}, {"word": "在", "pos": "v"}, {"word": "感业寺", "pos": "ns"}, {"word": "的", "pos": "u"}, {"word": "武则天", "pos": "nr"}, {"word": "续", "pos": "v"}, {"word": "起", "pos": "v"}, {"word": "头发", "pos": "n"}, {"word": ",", "pos": "w"}, {"word": "重新", "pos": "d"}, {"word": "纳入", "pos": "v"}, {"word": "后宫", "pos": "n"}], "text": "李治即位后,萧淑妃受宠,王皇后为了排挤萧淑妃,答应李治让身在感业寺的武则天续起头发,重新纳入后宫", "spo_list": [{"predicate": "妻子", "object_type": "人物", "subject_type": "人物", "object": "萧淑妃", "subject": "李治"}, {"predicate": "丈夫", "object_type": "人物", "subject_type": "人物", "object": "李治", "subject": "萧淑妃"}]}
5 |
--------------------------------------------------------------------------------
/examples/seq2seq/bert/train_roberta_auto_title.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import torch
4 | from torch.utils.data import Dataset, DataLoader
5 | from bert_seq2seq import Tokenizer
6 | from bert_seq2seq import load_model
7 | from bert_seq2seq import Trainer
8 | from bert_seq2seq.dataset import bert_seq2seq_collate_fn
9 | from bert_seq2seq import Predictor
10 |
11 | model_name = "roberta" # 选择模型名字
12 | task_name = "seq2seq" # 任务名字
13 |
14 | vocab_path = "../state_dict/roberta/vocab.txt" # roberta模型字典的位置
15 | model_path = "../state_dict/roberta/pytorch_model.bin" # 预训练模型位置
16 | model_save_path = "./roberta_auto_title_model.bin" # 训练好的模型保存位置。
17 |
18 | lr = 1e-5
19 | maxlen=256
20 | src_dir = '../data/auto_title/train.src' # data location
21 | tgt_dir = '../data/auto_title/train.tgt'
22 |
23 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
24 | tokenizer = Tokenizer(vocab_path)
25 | bert_model = load_model(tokenizer.vocab, model_name=model_name, task_name=task_name)
26 | bert_model.load_pretrain_params(model_path)
27 | predictor = Predictor(bert_model, tokenizer)
28 |
29 | trainer = Trainer(env_type="pytorch",
30 | epoches=5,
31 | val_every_step=500,
32 | device=device,
33 | batch_size=16)
34 |
35 | def read_file():
36 |     ## To switch datasets, only this function needs to be reimplemented; it returns the src and tgt the framework expects
37 | src = []
38 | tgt = []
39 |
40 | with open(src_dir,'r',encoding='utf-8') as f:
41 | lines = f.readlines()
42 | for line in lines:
43 | src.append(line.strip('\n').lower())
44 |
45 | with open(tgt_dir,'r',encoding='utf-8') as f:
46 | lines = f.readlines()
47 | for line in lines:
48 | tgt.append(line.strip('\n').lower())
49 |
50 | return src,tgt
51 |
52 | class AutoTitleDataset(Dataset):
53 | """
54 | 针对特定数据集,定义一个相关的取数据的方式
55 | """
56 | def __init__(self, sents_src, sents_tgt) :
57 | ## 一般init函数是加载所有数据
58 | super().__init__()
59 | self.sents_src = sents_src
60 | self.sents_tgt = sents_tgt
61 |
62 | def __getitem__(self, i):
63 |         ## fetch a single example
64 | # print(i)
65 | src = self.sents_src[i]
66 | tgt = self.sents_tgt[i]
67 | tokenizer_out = tokenizer.encode_plus(src, tgt, max_length=maxlen)
68 |
69 | output = {
70 | "input_ids": tokenizer_out["input_ids"],
71 | "token_type_ids": tokenizer_out["token_type_ids"],
72 | }
73 | return output
74 |
75 | def __len__(self):
76 |
77 | return len(self.sents_src)
78 |
79 | class Evaluator:
80 |
81 | def on_validation(self, data):
82 | loss = data["loss"]
83 | step = data["iteration"]
84 |         ## Implement your own validation logic here; it is very flexible.
85 | test_data = ["本文总结了十个可穿戴产品的设计原则而这些原则同样也是笔者认为是这个行业最吸引人的地方1为人们解决重复性问题2从人开始而不是从机器开始3要引起注意但不要刻意4提升用户能力而不是取代人",
86 | "2007年乔布斯向人们展示iPhone并宣称它将会改变世界还有人认为他在夸大其词然而在8年后以iPhone为代表的触屏智能手机已经席卷全球各个角落未来智能手机将会成为真正的个人电脑为人类发展做出更大的贡献",
87 | "雅虎发布2014年第四季度财报并推出了免税方式剥离其持有的阿里巴巴集团15%股权的计划打算将这一价值约400亿美元的宝贵投资分配给股东截止发稿前雅虎股价上涨了大约7%至5145美元"]
88 | for text in test_data:
89 | print(predictor.predict_generate_beamsearch(text, beam_size=3, input_max_length=200, out_max_length=40))
90 |
91 | torch.save(bert_model.state_dict(), model_save_path)
92 | print(f"模型保存成功~")
93 |
94 | def main():
95 |     ## load the data
96 | all_src, all_tgt = read_file()
97 | train_size = int(len(all_src) * 0.9)
98 | train_src, train_tgt = all_src[:train_size], all_tgt[:train_size]
99 |     # declare the parameters to optimize
100 | optimizer = torch.optim.Adam(bert_model.parameters(), lr=lr, weight_decay=1e-3)
101 | train_dataset = AutoTitleDataset(train_src, train_tgt)
102 |
103 | trainer.train(bert_model, optimizer, train_dataset=train_dataset, evaluator=Evaluator,
104 | collate_fn=bert_seq2seq_collate_fn)
105 |
106 | if __name__ == '__main__':
107 | main()
108 |
--------------------------------------------------------------------------------
/examples/seq2seq/bert/train_roberta_auto_title_multi_gpu.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch.utils.data import Dataset, DataLoader
3 | from bert_seq2seq import Tokenizer
4 | from bert_seq2seq import load_model
5 | from bert_seq2seq import Trainer
6 | from bert_seq2seq.dataset import bert_seq2seq_collate_fn
7 | from bert_seq2seq import Predictor
8 | import os
9 |
10 | model_name = "roberta" # 选择模型名字
11 | task_name = "seq2seq" # 任务名字
12 |
13 | vocab_path = "../state_dict/roberta/vocab.txt" # roberta模型字典的位置
14 | model_path = "../state_dict/roberta/pytorch_model.bin" # 预训练模型位置
15 | model_save_path = "./roberta_auto_title_model.bin" # 训练好的模型保存位置。
16 | lr = 1e-5
17 | maxlen=256
18 | src_dir = '../data/auto_title/train.src' # data location
19 | tgt_dir = '../data/auto_title/train.tgt'
20 |
21 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
22 |
23 | # os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
24 |
25 | num_gpus = 4 # number of gpus
26 | num_nodes = 1 ## number of machines; currently only 1 is supported, multi-node is untested.
27 | trainer = Trainer(env_type="DDP",
28 | epoches=5,
29 | val_every_step=500,
30 | device=device,
31 | batch_size=16,
32 | num_gpus=num_gpus,
33 | num_nodes=num_nodes,
34 | training_script=__file__,
35 | )
36 |
37 | tokenizer = Tokenizer(vocab_path)
38 | bert_model = load_model(tokenizer.vocab, model_name=model_name, task_name=task_name)
39 | bert_model.load_pretrain_params(model_path)
40 | predictor = Predictor(bert_model, tokenizer)
41 |
42 | def read_file():
43 |     ## To switch datasets, only this function needs to be reimplemented; it returns the src and tgt the framework expects
44 | src = []
45 | tgt = []
46 |
47 | with open(src_dir,'r',encoding='utf-8') as f:
48 | lines = f.readlines()
49 | for line in lines:
50 | src.append(line.strip('\n').lower())
51 |
52 | with open(tgt_dir,'r',encoding='utf-8') as f:
53 | lines = f.readlines()
54 | for line in lines:
55 | tgt.append(line.strip('\n').lower())
56 |
57 | return src,tgt
58 |
59 | class AutoTitleDataset(Dataset):
60 | """
61 | 针对特定数据集,定义一个相关的取数据的方式
62 | """
63 | def __init__(self, sents_src, sents_tgt) :
64 | ## 一般init函数是加载所有数据
65 | super().__init__()
66 | self.sents_src = sents_src
67 | self.sents_tgt = sents_tgt
68 |
69 | def __getitem__(self, i):
70 |         ## fetch a single example
71 | # print(i)
72 | src = self.sents_src[i]
73 | tgt = self.sents_tgt[i]
74 | tokenizer_out = tokenizer.encode_plus(src, tgt, max_length=maxlen)
75 |
76 | output = {
77 | "input_ids": tokenizer_out["input_ids"],
78 | "token_type_ids": tokenizer_out["token_type_ids"],
79 | }
80 | return output
81 |
82 | def __len__(self):
83 |
84 | return len(self.sents_src)
85 |
86 | class Evaluator:
87 |
88 | def on_validation(self, data):
89 | loss = data["loss"]
90 | step = data["iteration"]
91 |         ## Implement your own validation logic here; it is very flexible.
92 | test_data = ["本文总结了十个可穿戴产品的设计原则而这些原则同样也是笔者认为是这个行业最吸引人的地方1为人们解决重复性问题2从人开始而不是从机器开始3要引起注意但不要刻意4提升用户能力而不是取代人",
93 | "2007年乔布斯向人们展示iPhone并宣称它将会改变世界还有人认为他在夸大其词然而在8年后以iPhone为代表的触屏智能手机已经席卷全球各个角落未来智能手机将会成为真正的个人电脑为人类发展做出更大的贡献",
94 | "雅虎发布2014年第四季度财报并推出了免税方式剥离其持有的阿里巴巴集团15%股权的计划打算将这一价值约400亿美元的宝贵投资分配给股东截止发稿前雅虎股价上涨了大约7%至5145美元"]
95 | for text in test_data:
96 | print(predictor.predict_generate_beamsearch(text, beam_size=3, input_max_length=200, out_max_length=40))
97 |
98 | torch.save(bert_model.state_dict(), model_save_path)
99 | print(f"模型保存成功~")
100 |
101 |
102 | def main():
103 |     ## load the data
104 | all_src, all_tgt = read_file()
105 | train_size = int(len(all_src) * 0.9)
106 | train_src, train_tgt = all_src[:train_size], all_tgt[:train_size]
107 |     # declare the parameters to optimize
108 | optimizer = torch.optim.Adam(bert_model.parameters(), lr=lr, weight_decay=1e-3)
109 | train_dataset = AutoTitleDataset(train_src, train_tgt)
110 |
111 | trainer.train(bert_model, optimizer, train_dataset=train_dataset, evaluator=Evaluator,
112 | collate_fn=bert_seq2seq_collate_fn)
113 |
114 | if __name__ == '__main__':
115 | main()
116 |
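117 | # Launch sketch: no special launcher is needed; with env_type="DDP" the Trainer spawns
118 | # one process per GPU, so this script is started the same way as a single-GPU run:
119 | #     python train_roberta_auto_title_multi_gpu.py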
--------------------------------------------------------------------------------
/bert_seq2seq/task/sequence_labeling/bert_sequence_labeling.py:
--------------------------------------------------------------------------------
1 | import torch.nn as nn
2 | from bert_seq2seq.basic_bert import BasicBert
3 | from bert_seq2seq.layers import GlobalPointer, CRFLayer
4 |
5 | class BertSequenceLabling(BasicBert):
6 | """
7 | """
8 | def __init__(self, vocab,
9 | target_size,
10 | model_name="roberta",
11 | size="base",
12 | **kwargs):
13 | super(BertSequenceLabling, self).__init__(word2ix=vocab, model_name=model_name, size=size)
14 | self.cls = None
15 | self.layer_norm_cond = None
16 | self.target_size = target_size
17 | self.final_dense = nn.Linear(self.config.hidden_size, target_size)
18 |
19 | def compute_loss(self, predictions, labels):
20 | """
21 | 计算loss
22 | predictions: (batch_size, 1)
23 | """
24 | predictions = predictions.view(-1, self.target_size)
25 | labels = labels.view(-1)
26 | loss = nn.CrossEntropyLoss(reduction="mean")
27 | return loss(predictions, labels)
28 |
29 | def forward(self, **data):
30 |
31 | input_ids = data["input_ids"]
32 | token_type_ids = data.get("token_type_ids", None)
33 | labels = data.get("labels", None)
34 |
35 | all_layers, pooled_out = self.bert(input_ids, token_type_ids=token_type_ids,
36 | output_all_encoded_layers=True)
37 |
38 | sequence_out = all_layers[-1]
39 | predictions = self.final_dense(sequence_out)
40 |
41 | return_data = {"logits": predictions, }
42 |
43 | if labels is not None:
44 |             ## compute the loss
45 | loss = self.compute_loss(predictions, labels)
46 | return_data["loss"] = loss
47 |
48 | return return_data
49 |
50 | class BertNERGP(BasicBert):
51 | """
52 | """
53 | def __init__(self, vocab, ent_type_size, inner_dim=64, size="base", model_name="roberta", **kwargs):
54 | super(BertNERGP, self).__init__(word2ix=vocab, model_name=model_name, size=size)
55 | self.gp = GlobalPointer(self.config.hidden_size, ent_type_size, inner_dim, RoPE=True)
56 | self.layer_norm_cond = None
57 | self.cls = None
58 | def compute_loss(self, logits, labels):
59 | pass
60 |
61 | def forward(self, **data):
62 | input_ids = data["input_ids"]
63 | token_type_ids = data.get("token_type_ids", None)
64 | padding_mask = (input_ids > 0).float()
65 | labels = data.get("labels", None)
66 |
67 | all_layers, _ = self.bert(input_ids, token_type_ids=token_type_ids,
68 | output_all_encoded_layers=True)
69 | sequence_out = all_layers[-1]
70 |
71 | gp_out = self.gp(sequence_out, padding_mask)
72 | return_data = {"logits": gp_out, }
73 |
74 | if labels is not None:
75 | return_data["loss"] = self.gp.compute_loss(gp_out, labels)
76 | return return_data
77 |
78 | class BertNERCRF(BasicBert):
79 | """
80 | """
81 | def __init__(self, vocab, target_size=-1, size="base", model_name="roberta", **kwargs):
82 | super(BertNERCRF, self).__init__(word2ix=vocab, model_name=model_name, size=size,)
83 | self.layer_norm_cond = None
84 | self.cls = None
85 | self.final_dense = nn.Linear(self.config.hidden_size, target_size)
86 | self.crf_layer = CRFLayer(target_size)
87 |
88 | def compute_loss(self, logits, labels, target_mask):
89 | loss = self.crf_layer(logits, labels, target_mask)
90 |
91 | return loss.mean()
92 |
93 | def forward(self, **data):
94 | input_ids = data["input_ids"]
95 | token_type_ids = data.get("token_type_ids", None)
96 | padding_mask = (input_ids > 0).float()
97 | labels = data.get("labels", None)
98 |
99 | all_layers, _ = self.bert(input_ids, token_type_ids=token_type_ids,
100 | output_all_encoded_layers=True)
101 | sequence_out = all_layers[-1]
102 |
103 | predictions = self.final_dense(sequence_out)
104 |
105 | return_data = {"logits": predictions, }
106 |
107 | if labels is not None:
108 |             ## compute the loss
109 | return_data["loss"] = self.compute_loss(predictions, labels, padding_mask)
110 |
111 | return return_data
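112 |
113 | # Construction sketch (illustrative; vocab is the token-to-id dict that load_model also receives):
114 | #     model = BertNERCRF(vocab, target_size=7, model_name="roberta")
115 | #     out = model(input_ids=input_ids, token_type_ids=token_type_ids, labels=labels)
116 | #     out["loss"].backward()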
--------------------------------------------------------------------------------
/examples/text_classification/train_roberta_large_news_title_classification.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch.utils.data import Dataset
3 | from bert_seq2seq import Tokenizer
4 | from bert_seq2seq import load_model
5 | from bert_seq2seq.dataset import bert_cls_collate_fn
6 | from bert_seq2seq.trainer import Trainer
7 | from sklearn.model_selection import train_test_split
8 | from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
9 | from bert_seq2seq import Predictor
10 | import os
11 |
12 | target = ["财经", "彩票", "房产", "股票", "家居", "教育", "科技", "社会", "时尚", "时政", "体育", "星座", "游戏", "娱乐"]
13 | train_path = "../data/新闻标题文本分类/Train.txt"
14 | model_name = "roberta" # 选择模型名字
15 | task_name = "classification"
16 | vocab_path = "../state_dict/roberta/vocab.txt" # roberta模型字典的位置
17 | model_path = "../state_dict/roberta/pytorch_model.bin" # roberta模型位置
18 | model_save_path = "./bert_news_title_classification.bin"
19 | batch_size = 16
20 | lr = 1e-5
21 | # load the vocab
22 | tokenizer = Tokenizer(vocab_path)
23 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
24 |
25 | trainer = Trainer(epoches=3, val_every_step=500, batch_size=16, env_type="pytorch",
26 | device=device,)
27 |
28 | bert_model = load_model(tokenizer.vocab,
29 | model_name=model_name,
30 | task_name=task_name,
31 | size="large",
32 | target_size=len(target))
33 | ## load the pretrained parameters~
34 | bert_model.load_pretrain_params(model_path)
35 | predictor = Predictor(bert_model, tokenizer)
36 |
37 | def read_corpus():
38 | """
39 |     Read the raw data.
40 | """
41 | sents_src = []
42 | sents_tgt = []
43 |
44 | with open(train_path) as f:
45 | lines = f.readlines()
46 | for line in lines:
47 | line = line.split("\t")
48 | sents_tgt.append(int(line[0]))
49 | sents_src.append(line[2])
50 |
51 | return sents_src, sents_tgt
52 |
53 | ## load the data
54 | all_input, all_label = read_corpus()
55 | train_input, val_input, train_label, val_label = train_test_split(all_input, all_label, train_size=0.8, random_state=123)
56 |
57 |
58 | ## custom dataset
59 | class ClassificationDataset(Dataset):
60 | """
61 | 针对特定数据集,定义一个相关的取数据的方式
62 | """
63 | def __init__(self, sents_src, sents_tgt) :
64 | super(ClassificationDataset, self).__init__()
65 | # 读原始数据
66 | self.sents_src = sents_src
67 | self.sents_tgt = sents_tgt
68 |
69 | def __getitem__(self, i):
70 |         ## fetch a single example
71 | src = self.sents_src[i]
72 | tgt = self.sents_tgt[i]
73 | tokenizer_out = tokenizer.encode_plus(src)
74 |
75 | output = {
76 | "input_ids": tokenizer_out["input_ids"],
77 | "token_type_ids": tokenizer_out["token_type_ids"],
78 | "labels": tgt
79 | }
80 | return output
81 |
82 | def __len__(self):
83 | return len(self.sents_src)
84 |
85 | def validate():
86 | res = []
87 | for data in val_input:
88 | pred = predictor.predict_cls_classifier(data)
89 |         pred = pred.argmax(dim=0).numpy()  # single-sample prediction, as in the other classification examples
90 | res.append(pred)
91 |
92 |     f1 = f1_score(val_label, res, average="macro")
93 |     accuracy = accuracy_score(val_label, res)
94 |     recall = recall_score(val_label, res, average="macro")
95 |     precision = precision_score(val_label, res, average="macro")
96 |
97 | print(f" f1 is {f1}, acc is {accuracy}, recall is {recall} precision is {precision}")
98 | return accuracy
99 |
100 | class Evaluator:
101 | def __init__(self):
102 | self.best_acc = 0.0
103 |
104 | def on_validation(self, data):
105 | loss = data["loss"]
106 | step = data["iteration"]
107 | acc = validate()
108 | if acc > self.best_acc:
109 | self.best_acc = acc
110 | torch.save(bert_model.state_dict(), model_save_path)
111 | print(f"模型保存成功~")
112 |
113 |
114 | def main():
115 |
116 |     # declare the parameters to optimize
117 | optimizer = torch.optim.Adam(bert_model.parameters(), lr=lr, weight_decay=1e-3)
118 | train_dataset = ClassificationDataset(train_input, train_label)
119 |
120 | trainer.train(bert_model, optimizer=optimizer,
121 | train_dataset=train_dataset,
122 | evaluator=Evaluator,
123 | collate_fn=bert_cls_collate_fn)
124 |
125 | if __name__ == '__main__':
126 | main()
127 |
--------------------------------------------------------------------------------
/examples/text_classification/train_roberta_semantic_matching.py:
--------------------------------------------------------------------------------
1 | # https://tianchi.aliyun.com/competition/entrance/531851/information
2 | import torch
3 | from torch.utils.data import Dataset, DataLoader
4 | from bert_seq2seq import Tokenizer
5 | from bert_seq2seq import load_model
6 | from bert_seq2seq.dataset import bert_cls_collate_fn
7 | from bert_seq2seq.trainer import Trainer
8 | from sklearn.model_selection import train_test_split
9 | from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
10 | from bert_seq2seq import Predictor
11 | import os
12 |
13 | target = [0, 1]
14 | train_path = "../data/语义匹配/train.tsv"
15 | model_name = "roberta" # 选择模型名字
16 | task_name = "classification"
17 | vocab_path = "../state_dict/roberta/vocab.txt" # roberta模型字典的位置
18 | model_path = "../state_dict/roberta/pytorch_model.bin" # roberta模型位置
19 | model_save_path = "./bert_semantic_matching.bin"
20 | batch_size = 16
21 | lr = 1e-5
22 | # load the vocab
23 | tokenizer = Tokenizer(vocab_path)
24 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
25 |
26 | trainer = Trainer(epoches=3,
27 | val_every_step=100,
28 | batch_size=16,
29 | env_type="pytorch",
30 | device=device)
31 |
32 | bert_model = load_model(tokenizer.vocab, model_name=model_name, task_name=task_name, target_size=len(target))
33 | ## load the pretrained parameters~
34 | bert_model.load_pretrain_params(model_path)
35 | # build the predictor
36 | predictor = Predictor(bert_model, tokenizer)
37 |
38 | def read_corpus(data_path):
39 | """
40 |     Read the raw data.
41 | """
42 | sents_src = []
43 | sents_tgt = []
44 |
45 | with open(data_path) as f:
46 | lines = f.readlines()
47 | for line in lines:
48 | line = line.split("\t")
49 | if len(line) == 3:
50 | sents_tgt.append(int(line[2]))
51 | sents_src.append(line[0] + "#" +line[1])
52 |
53 | return sents_src, sents_tgt
54 |
55 | ## load the data
56 | all_input, all_label = read_corpus(train_path)
57 | train_input, val_input, train_label, val_label = train_test_split(all_input, all_label, train_size=0.8, random_state=123)
58 |
59 |
60 | ## custom dataset
61 | class SemanticMatchingDataset(Dataset):
62 | """
63 | 针对特定数据集,定义一个相关的取数据的方式
64 | """
65 | def __init__(self, sents_src, sents_tgt) :
66 | ## 一般init函数是加载所有数据
67 | super(SemanticMatchingDataset, self).__init__()
68 |         # raw data
69 | self.sents_src = sents_src
70 | self.sents_tgt = sents_tgt
71 |
72 | def __getitem__(self, i):
73 |         ## fetch a single example
74 | # print(i)
75 | src = self.sents_src[i]
76 | tgt = self.sents_tgt[i]
77 | tokenizer_out = tokenizer.encode_plus(src)
78 |
79 | output = {
80 | "input_ids": tokenizer_out["input_ids"],
81 | "token_type_ids": tokenizer_out["token_type_ids"],
82 | "labels": tgt
83 | }
84 | return output
85 |
86 | def __len__(self):
87 | return len(self.sents_src)
88 |
89 | class Evaluator:
90 | def __init__(self):
91 | self.best_acc = 0.0
92 |
93 | def on_validation(self, data):
94 | loss = data["loss"]
95 | step = data["iteration"]
96 | res = []
97 |         for text in val_input:
98 |             pred = predictor.predict_cls_classifier(text)
99 | pred = pred.argmax(dim=0).numpy()
100 | res.append(pred)
101 |
102 | f1 = f1_score(val_label, res)
103 | accuracy = accuracy_score(val_label, res)
104 | recall = recall_score(val_label, res)
105 | precision = precision_score(val_label, res)
106 |
107 | print(f" f1 is {f1}, acc is {accuracy}, recall is {recall} precision is {precision}")
108 |
109 | if accuracy > self.best_acc:
110 | self.best_acc = accuracy
111 | torch.save(bert_model.state_dict(), model_save_path)
112 | print(f"模型保存成功~")
113 |
114 |
115 | def main():
116 |
117 | optimizer = torch.optim.Adam(bert_model.parameters(), lr=lr, weight_decay=1e-3)
118 | train_dataset = SemanticMatchingDataset(train_input, train_label)
119 |
120 | trainer.train(bert_model, optimizer=optimizer,
121 | train_dataset=train_dataset,
122 | evaluator=Evaluator,
123 | collate_fn=bert_cls_collate_fn)
124 |
125 | if __name__ == '__main__':
126 | main()
127 |
--------------------------------------------------------------------------------
/examples/text_classification/train_roberta_news_title_classification.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch.utils.data import Dataset
3 | from bert_seq2seq import Tokenizer
4 | from bert_seq2seq import load_model
5 | from bert_seq2seq.dataset import bert_cls_collate_fn
6 | from bert_seq2seq.trainer import Trainer
7 | from sklearn.model_selection import train_test_split
8 | from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
9 | from bert_seq2seq import Predictor
10 | import os
11 | from tqdm import tqdm
12 |
13 | target = ["财经", "彩票", "房产", "股票", "家居", "教育", "科技", "社会", "时尚", "时政", "体育", "星座", "游戏", "娱乐"]
14 | train_path = "../data/新闻标题文本分类/Train.txt"
15 | model_name = "roberta" # 选择模型名字
16 | task_name = "classification"
17 | vocab_path = "../state_dict/roberta/vocab.txt" # roberta模型字典的位置
18 | model_path = "../state_dict/roberta/pytorch_model.bin" # roberta模型位置
19 | model_save_path = "./bert_news_title_classification.bin"
20 | batch_size = 16
21 | lr = 1e-5
22 | # load the vocab
23 | tokenizer = Tokenizer(vocab_path)
24 | device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
25 |
26 | trainer = Trainer(epoches=3,
27 | val_every_step=10000,
28 | batch_size=16,
29 | env_type="pytorch",
30 | device=device)
31 | bert_model = load_model(tokenizer.vocab,
32 | model_name=model_name,
33 | task_name=task_name,
34 | target_size=len(target))
35 | ## load the pretrained parameters~
36 | bert_model.load_pretrain_params(model_path)
37 | predictor = Predictor(bert_model, tokenizer)
38 |
39 | def read_corpus():
40 | """
41 |     Read the raw data.
42 | """
43 | sents_src = []
44 | sents_tgt = []
45 |
46 | with open(train_path) as f:
47 | lines = f.readlines()
48 | for line in lines:
49 | line = line.split("\t")
50 | sents_tgt.append(int(line[0]))
51 | sents_src.append(line[2])
52 |
53 | return sents_src, sents_tgt
54 |
55 | ## load the data
56 | all_input, all_label = read_corpus()
57 | train_input, val_input, train_label, val_label = train_test_split(all_input, all_label, train_size=0.8, random_state=123)
58 |
59 |
60 | ## 自定义dataset
61 | class ClassificationDataset(Dataset):
62 | """
63 | 针对特定数据集,定义一个相关的取数据的方式
64 | """
65 | def __init__(self, sents_src, sents_tgt) :
66 | super(ClassificationDataset, self).__init__()
67 | # 读原始数据
68 | self.sents_src = sents_src
69 | self.sents_tgt = sents_tgt
70 |
71 | def __getitem__(self, i):
72 |         ## fetch a single example
73 | src = self.sents_src[i]
74 | tgt = self.sents_tgt[i]
75 | tokenizer_out = tokenizer.encode_plus(src)
76 |
77 | output = {
78 | "input_ids": tokenizer_out["input_ids"],
79 | "token_type_ids": tokenizer_out["token_type_ids"],
80 | "labels": tgt
81 | }
82 | return output
83 |
84 | def __len__(self):
85 | return len(self.sents_src)
86 |
87 | def validate():
88 | res = []
89 | for data in tqdm(val_input, total=len(val_input)):
90 | pred = predictor.predict_cls_classifier(data)
91 | pred = pred.argmax(dim=0).numpy()
92 | res.append(pred)
93 |
94 | f1 = f1_score(val_label, res, average="macro")
95 | accuracy = accuracy_score(val_label, res)
96 | recall = recall_score(val_label, res, average="macro")
97 | precision = precision_score(val_label, res, average="macro")
98 |
99 | print(f" f1 is {f1}, acc is {accuracy}, recall is {recall} precision is {precision}")
100 | return accuracy
101 |
102 | class Evaluator:
103 | def __init__(self):
104 | self.best_acc = 0.0
105 |
106 | def on_epoch_end(self):
107 | acc = validate()
108 | if acc > self.best_acc:
109 | self.best_acc = acc
110 | torch.save(bert_model.state_dict(), model_save_path)
111 | print(f"模型保存成功~")
112 |
113 | def on_validation(self, data):
114 | loss = data["loss"]
115 | step = data["iteration"]
116 |         # per-step validation is unused here; see on_epoch_end
117 |
118 | def main():
119 |
120 |     # declare the parameters to optimize
121 | optimizer = torch.optim.Adam(bert_model.parameters(), lr=lr, weight_decay=1e-3)
122 | train_dataset = ClassificationDataset(train_input, train_label)
123 |
124 | trainer.train(bert_model, optimizer=optimizer,
125 | train_dataset=train_dataset,
126 | evaluator=Evaluator,
127 | collate_fn=bert_cls_collate_fn)
128 |
129 | if __name__ == '__main__':
130 | main()
131 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # bert_seq2seq_DDP
2 | The DDP (distributed training) version of bert_seq2seq.
3 | This project is a refactor of the bert_seq2seq project with solid support for PyTorch DDP multi-GPU training. examples contains various training examples, and data contains sample data.
4 |
5 | This project makes it easy to call transformer models of different architectures (Bert, Roberta, T5, Nezha, Bart, etc.) for different tasks (generation, sequence labeling, text classification, relation extraction, named entity recognition, etc.), to train and predict quickly, and to move to distributed (DDP) training seamlessly.
6 |
7 | **For a new dataset, 5-10 minutes spent adapting the input/output construction function is enough to start training!**
8 | #### Welcome to join the discussion group~ You can ask questions, make suggestions, and exchange ideas; some data and model downloads are also provided there. QQ group: 975907202 WeChat group: w11267191 (add as a friend to be invited)~
9 |
10 |
11 | For more about bert_seq2seq, see: https://github.com/920232796/bert_seq2seq
12 |
13 | ### Project feature 1:
14 | Single-GPU and multi-GPU training are run the same way; no extra code or special launch command is needed.
15 |
16 | Both single-GPU and multi-GPU runs use:
17 | ```shell
18 | python "./train.py" ## train.py is any training script in examples whose name starts with train
19 | ```
20 | To switch to multi-GPU training, just change the environment settings in the ```train.py``` file:
21 |
22 | ```python
23 | num_gpus = 4 # number of gpus
24 | num_nodes = 1 ## number of machines; currently only 1 is supported, multi-node is untested.
25 | trainer = Trainer(env_type="DDP", ## DDP is PyTorch's distributed data parallel training
26 |                   epoches=5, model_save_dir=model_save_dir,
27 |                   val_every_step=500, device=device,
28 |                   batch_size=16, num_gpus=num_gpus, num_nodes=num_nodes,
29 |                   training_script=__file__,
30 |                   )
31 | ```
32 | For concrete example code, see:
33 |
34 | [train_roberta_auto_title_multi_gpu.py](https://github.com/920232796/bert_seq2seq_DDP/blob/master/examples/seq2seq/bert/train_roberta_auto_title_multi_gpu.py) auto-title task, multi-GPU training.
35 |
36 | ### Project feature 2:
37 | Although training is wrapped in a Trainer class, evaluation can still be done fairly flexibly.
38 |
39 | #### Define a custom Evaluator class to run validation however you like
40 |
41 | [train_roberta_auto_title.py](https://github.com/920232796/bert_seq2seq_DDP/blob/master/examples/seq2seq/bert/train_roberta_auto_title.py) auto-title task; generated output is easily printed during training.
42 |
43 | [train_roberta_semantic_matching.py](https://github.com/920232796/bert_seq2seq_DDP/blob/master/examples/text_classification/train_roberta_semantic_matching.py) semantic matching task; a validation step is easily inserted during training.
44 |
45 | ### Project feature 3:
46 | Sample data is provided in the data directory to help you follow how the code runs, for reference (the full data for some tasks is available in the QQ group files).
47 | ### Environment setup
48 | #### Install pytorch; any version that is not too old will do.
49 | https://pytorch.org/
50 | #### Install the extra packages
51 | ```commandline
52 | pip install bert_seq2seq_DDP
53 | pip install tqdm
54 | pip install scikit-learn //optional
55 | ```
56 | If your network is slow, switch to a domestic (China) mirror to install.
57 |
58 | #### Downloading pretrained model weights and vocab files
59 | 1. roberta model (base and large supported); download the model and vocab files from https://drive.google.com/file/d/1iNeYFhCBJWeUsIlnW_2K6SMwXkM4gLb_/view . Reference github repo: https://github.com/ymcui/Chinese-BERT-wwm ; the roberta-large model can be downloaded there as well.
60 | 2. bert model (base and large supported); download the Chinese pretrained bert weights "bert-base-chinese": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-pytorch_model.bin" and the Chinese bert vocab "bert-base-chinese": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt".
61 | 3. nezha model, weights and vocab (base and large supported); nezha-base model download: link: https://pan.baidu.com/s/1Z0SJbISsKzAgs0lT9hFyZQ extraction code: 4awe
62 | 4. Chinese gpt2 model; the general-purpose Chinese gpt2 model and vocab can be downloaded from https://pan.baidu.com/s/1vTYc8fJUmlQrre5p0JRelw password: f5un; once downloaded, you can run text continuation tests with [examples/seq2seq/gpt2/test_gpt2_text_writting.py](https://github.com/920232796/bert_seq2seq_DDP/blob/master/examples/seq2seq/gpt2/test_gpt2_text_writting.py).
63 | 5. Chinese t5 model (base and small supported); pretrained weights download: https://github.com/renmada/t5-pegasus-pytorch
64 | 6. SimBert model, for generating similar sentences; bert, roberta, or nezha pretrained weights all work.
65 | 7. Chinese bart model download: https://huggingface.co/fnlp/bart-base-chinese
66 |
67 | #### Parameter walkthrough, using the text classification task as an example
68 | ```python
69 | import torch
70 | from bert_seq2seq import Tokenizer, Predictor, load_model, Trainer
71 | target = ["财经", "彩票", "房产", "股票", "家居", "教育", "科技", "社会", "时尚", "时政", "体育", "星座", "游戏", "娱乐"] # all labels
72 | train_path = "../data/新闻标题文本分类/Train.txt" # data location
73 | model_name = "roberta" # model name
74 | task_name = "classification" # task name
75 | vocab_path = "../state_dict/roberta-large/vocab.txt" # path to the roberta vocab file
76 | model_path = "../state_dict/roberta-large/pytorch_model.bin" # path to the roberta model
77 | model_save_path = "./bert_news_title_classification.bin" ## where to save the trained model
78 | batch_size = 16
79 | lr = 1e-5
80 | # load the vocab
81 | tokenizer = Tokenizer(vocab_path)
82 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
83 | # load the roberta large model for classification using the cls vector.
84 | bert_model = load_model(tokenizer.vocab,
85 |                         model_name=model_name,
86 |                         size="large", ## load large model
87 |                         task_name=task_name,
88 |                         target_size=len(target))
89 | ## load the pretrained parameters~
90 | bert_model.load_pretrain_params(model_path)
91 | # trainer setup
92 | trainer = Trainer(epoches=3, val_every_step=500, # validate every 500 steps
93 |                   batch_size=batch_size,
94 |                   env_type="pytorch", # single-GPU training
95 |                   device=device,
96 |                   )
97 | ```
98 | #### Running
99 | Decide which task you want, find the corresponding train_*.py file in examples, download the model and vocab, understand how the data is built, and run it (the sample data in the data directory helps you follow the code, for reference).
100 |
101 |
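102 | #### Prediction sketch
103 | A minimal prediction sketch for the classification setup above. It simply mirrors the predictor calls used in the examples/text_classification scripts (`predict_cls_classifier` is the method used there), so treat it as illustrative rather than a full API reference:
104 | ```python
105 | predictor = Predictor(bert_model, tokenizer)
106 | text = "雅虎发布2014年第四季度财报"  # any news title
107 | logits = predictor.predict_cls_classifier(text)
108 | print(target[logits.argmax(dim=0).numpy()])
109 | ```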
--------------------------------------------------------------------------------
/bert_seq2seq/mpu/data.py:
--------------------------------------------------------------------------------
1 | # Copyright © 2022 BAAI. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License")
4 | # coding=utf-8
5 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
6 | #
7 | # Licensed under the Apache License, Version 2.0 (the "License");
8 | # you may not use this file except in compliance with the License.
9 | # You may obtain a copy of the License at
10 | #
11 | # http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 |
19 | import torch
20 |
21 | from .initialize import get_model_parallel_group
22 | from .initialize import get_model_parallel_rank
23 | from .initialize import get_model_parallel_src_rank
24 |
25 | _MAX_DATA_DIM = 5
26 |
27 |
28 | def _check_data_types(keys, data, target_dtype):
29 | """Check that all the keys have the same target data type."""
30 | for key in keys:
31 | assert data[key].dtype == target_dtype, '{} has data type {} which '\
32 | 'is different than {}'.format(key, data[key].dtype, target_dtype)
33 |
34 |
35 | def _build_key_size_numel_dictionaries(keys, data):
36 | """Build the size on rank 0 and broadcast."""
37 | max_dim = _MAX_DATA_DIM
38 | sizes = [0 for _ in range(max_dim) for _ in keys]
39 |
40 | # Pack the sizes on rank zero.
41 | if get_model_parallel_rank() == 0:
42 | offset = 0
43 | for key in keys:
44 | assert data[key].dim(
45 | ) < max_dim, 'you should increase MAX_DATA_DIM'
46 | size = data[key].size()
47 | for i, s in enumerate(size):
48 | sizes[i + offset] = s
49 | offset += max_dim
50 |
51 | # Move to GPU and broadcast.
52 | sizes_cuda = torch.cuda.LongTensor(sizes)
53 | torch.distributed.broadcast(sizes_cuda,
54 | get_model_parallel_src_rank(),
55 | group=get_model_parallel_group())
56 |
57 | # Move back to cpu and unpack.
58 | sizes_cpu = sizes_cuda.cpu()
59 | key_size = {}
60 | key_numel = {}
61 | total_numel = 0
62 | offset = 0
63 | for key in keys:
64 | i = 0
65 | size = []
66 | numel = 1
67 | while sizes_cpu[offset + i] > 0:
68 | this_size = sizes_cpu[offset + i]
69 | size.append(this_size)
70 | numel *= this_size
71 | i += 1
72 | key_size[key] = size
73 | key_numel[key] = numel
74 | total_numel += numel
75 | offset += max_dim
76 |
77 | return key_size, key_numel, total_numel
78 |
79 |
80 | def broadcast_data(keys, data, datatype):
81 | """Broadcast data from rank zero of each model parallel group to the
82 | members of the same model parallel group.
83 |
84 | Arguments:
85 |         keys: list of keys in the data dictionary to broadcast
86 | data: data dictionary of string keys and cpu tensor values.
87 | datatype: torch data type of all tensors in data associated
88 | with keys.
89 | """
90 | # Build (key, size) and (key, number of elements) dictionaries along
91 | # with the total number of elements on all ranks.
92 | key_size, key_numel, total_numel = _build_key_size_numel_dictionaries(
93 | keys, data)
94 |
95 | # Pack on rank zero.
96 | if get_model_parallel_rank() == 0:
97 | # Check that all keys have the same data type.
98 | _check_data_types(keys, data, datatype)
99 | # Flatten the data associated with the keys
100 | flatten_data = torch.cat(
101 | [data[key].contiguous().view(-1) for key in keys], dim=0).cuda()
102 | else:
103 | flatten_data = torch.empty(total_numel,
104 | device=torch.cuda.current_device(),
105 | dtype=datatype)
106 |
107 |     # Broadcast
108 | torch.distributed.broadcast(flatten_data,
109 | get_model_parallel_src_rank(),
110 | group=get_model_parallel_group())
111 |
112 | # Unpack
113 | output = {}
114 | offset = 0
115 | for key in keys:
116 | size = key_size[key]
117 | numel = key_numel[key]
118 | output[key] = flatten_data.narrow(0, offset, numel).view(size)
119 | offset += numel
120 |
121 | return output
122 |
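123 | # Usage sketch (an illustrative addition, not part of the original module):
124 | # in a training loop, rank zero of each model parallel group builds the batch
125 | # and every other rank receives it. Only rank zero ever reads `data`, so the
126 | # non-src ranks may pass None:
127 | #
128 | #   keys = ["input_ids", "labels"]
129 | #   if get_model_parallel_rank() == 0:
130 | #       data = {k: batch[k] for k in keys}   # cpu LongTensors
131 | #   else:
132 | #       data = None
133 | #   batch = broadcast_data(keys, data, torch.int64)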
--------------------------------------------------------------------------------
/bert_seq2seq/task/seq2seq/GLM_seq2seq_model.py:
--------------------------------------------------------------------------------
1 |
2 | from bert_seq2seq.basic_bert import BasicGLM
3 | from bert_seq2seq.model.glm_model import GLMModel
4 | import os
5 |
6 | large_ch_config = {
7 | "num_layers": 24,
8 | "vocab_size": 50048,
9 | "hidden_size": 1024,
10 | "num_attention_heads":16,
11 | "embedding_dropout_prob":0.1,
12 | "attention_dropout_prob":0.1,
13 | "output_dropout_prob":0.1,
14 | "max_sequence_length":1024,
15 | "max_memory_length":511,
16 | "checkpoint_activations": False ,
17 | "checkpoint_num_layers":1 ,
18 | "parallel_output": True,
19 | "relative_encoding": False,
20 | "block_position_encoding": True,
21 | "output_predict": True,
22 | "spell_length": None,
23 | "spell_func": "lstm",
24 | "attention_scale":1.0
25 | }
26 | class GLMLargeChConfig:
27 | def __init__(self):
28 | config = large_ch_config
29 | self.num_layers = config["num_layers"]
30 | self.vocab_size = config["vocab_size"]
31 | self.hidden_size = config["hidden_size"]
32 | self.num_attention_heads = config["num_attention_heads"]
33 | self.embedding_dropout_prob = config["embedding_dropout_prob"]
34 | self.attention_dropout_prob = config["attention_dropout_prob"]
35 | self.output_dropout_prob = config["output_dropout_prob"]
36 | self.max_sequence_length = config["max_sequence_length"]
37 | self.max_memory_length = config["max_memory_length"]
38 | self.checkpoint_activations = config["checkpoint_activations"]
39 | self.checkpoint_num_layers = config["checkpoint_num_layers"]
40 | self.parallel_output = config["parallel_output"]
41 | self.relative_encoding = config["relative_encoding"]
42 | self.block_position_encoding = config["block_position_encoding"]
43 | self.output_predict = config["output_predict"]
44 | self.spell_length = config["spell_length"]
45 | self.spell_func = config["spell_func"]
46 | self.attention_scale = config["attention_scale"]
47 |
48 | class GLMSeq2SeqModel(BasicGLM):
49 | """
50 | """
51 | def __init__(self,
52 | size="base", **kwargs):
53 | super(GLMSeq2SeqModel, self).__init__()
54 | if size == "base":
55 | pass
56 | print("不支持GLM base模型")
57 | os._exit(0)
58 | elif size == "large":
59 | config = GLMLargeChConfig()
60 |
61 | else :
62 | print("不支持的size")
63 | os._exit(0)
64 |
65 | self.config = config
66 | self.model = GLMModel(num_layers=config.num_layers,
67 | vocab_size=config.vocab_size,
68 | hidden_size=config.hidden_size,
69 | num_attention_heads=config.num_attention_heads,
70 | embedding_dropout_prob=config.embedding_dropout_prob,
71 | attention_dropout_prob=config.attention_dropout_prob,
72 | output_dropout_prob=config.output_dropout_prob,
73 | max_sequence_length=config.max_sequence_length,
74 | max_memory_length=config.max_memory_length,
75 | checkpoint_activations=config.checkpoint_activations,
76 | checkpoint_num_layers=config.checkpoint_num_layers,
77 | output_predict=config.output_predict,
78 | parallel_output=config.parallel_output,
79 | relative_encoding=config.relative_encoding,
80 | block_position_encoding=config.block_position_encoding,
81 | spell_length=config.spell_length,
82 | spell_func=config.spell_func,
83 | attention_scale=config.attention_scale)
84 |
85 | self.hidden_dim = self.config.hidden_size
86 | self.vocab_size = self.config.vocab_size
87 |
88 | def forward(self, **data):
89 | input_ids = data["input_ids"]
90 | labels = data.get("labels", None)
91 | position_ids = data["position_ids"]
92 | attention_mask = data["attention_mask"]
93 | return_memory = data.get("return_memory", False)
94 | mems = data.get("mems", None)
95 |
96 | return self.model(input_ids=input_ids, position_ids=position_ids,
97 | attention_mask=attention_mask, labels=labels,
98 | return_memory=return_memory, mems=mems)
99 |
100 | def load_weights(self, checkpoints_path):
101 | self.model.load_weights_glm(checkpoints_path)
102 |
103 |
104 |
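105 | # Usage sketch (an addition mirroring examples/seq2seq/GLM/train_glm_auto_title.py):
106 | #
107 | #   from bert_seq2seq import load_model
108 | #   model = load_model(model_name="glm", task_name="seq2seq", size="large")
109 | #   model.load_pretrain_params("../state_dict/GLM-large-ch/pytorch_model.bin")
110 | #
111 | # Only size="large" builds a model here; any other size exits in __init__.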
--------------------------------------------------------------------------------
/examples/text_classification/train_roberta_news_title_classification_multi_gpu.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch.utils.data import Dataset
3 | from bert_seq2seq import Tokenizer
4 | from bert_seq2seq import load_model
5 | from bert_seq2seq.dataset import bert_cls_collate_fn
6 | from bert_seq2seq.trainer import Trainer
7 | from sklearn.model_selection import train_test_split
8 | from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
9 | from bert_seq2seq import Predictor
10 | import os
11 | from tqdm import tqdm
12 |
13 | target = ["财经", "彩票", "房产", "股票", "家居", "教育", "科技", "社会", "时尚", "时政", "体育", "星座", "游戏", "娱乐"]
14 | train_path = "../data/新闻标题文本分类/Train.txt"
15 | model_name = "roberta" # 选择模型名字
16 | task_name = "classification"
17 | vocab_path = "../state_dict/roberta/vocab.txt"  # path to the roberta vocab
18 | model_path = "../state_dict/roberta/pytorch_model.bin"  # path to the roberta weights
19 | model_save_path = "./bert_news_title_classification.bin"
20 | batch_size = 16
21 | lr = 1e-5
22 | os.environ['CUDA_VISIBLE_DEVICES'] = "1,2"
23 | num_gpus = 2
24 |
25 | # load the vocabulary
26 | tokenizer = Tokenizer(vocab_path)
27 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
28 |
29 | trainer = Trainer(epoches=3,
30 | val_every_step=500,
31 |                   batch_size=batch_size,
32 | env_type="DDP",
33 | device=device,
34 | num_nodes=1,
35 | num_gpus=num_gpus,
36 | training_script=__file__)
37 |
38 | bert_model = load_model(tokenizer.vocab,
39 | model_name=model_name,
40 | task_name=task_name,
41 | target_size=len(target),)
42 | ## load the pretrained parameters
43 | bert_model.load_pretrain_params(model_path)
44 | predictor = Predictor(bert_model, tokenizer)
45 |
46 | def read_corpus():
47 | """
48 |     Read the raw data.
49 | """
50 | sents_src = []
51 | sents_tgt = []
52 |
53 |     with open(train_path, encoding="utf-8") as f:
54 | lines = f.readlines()
55 | for line in lines:
56 | line = line.split("\t")
57 | sents_tgt.append(int(line[0]))
58 | sents_src.append(line[2])
59 |
60 | return sents_src, sents_tgt
61 |
62 | ## load the data
63 | all_input, all_label = read_corpus()
64 | train_input, val_input, train_label, val_label = train_test_split(all_input, all_label, train_size=0.8, random_state=123)
65 |
66 |
67 | ## custom dataset
68 | class ClassificationDataset(Dataset):
69 | """
70 |     Defines how to fetch one example from this particular dataset.
71 | """
72 | def __init__(self, sents_src, sents_tgt) :
73 | super(ClassificationDataset, self).__init__()
74 |         # raw data
75 | self.sents_src = sents_src
76 | self.sents_tgt = sents_tgt
77 |
78 | def __getitem__(self, i):
79 |         ## fetch a single example
80 | src = self.sents_src[i]
81 | tgt = self.sents_tgt[i]
82 | tokenizer_out = tokenizer.encode_plus(src)
83 |
84 | output = {
85 | "input_ids": tokenizer_out["input_ids"],
86 | "token_type_ids": tokenizer_out["token_type_ids"],
87 | "labels": tgt
88 | }
89 | return output
90 |
91 | def __len__(self):
92 | return len(self.sents_src)
93 |
94 | def validate():
95 | res = []
96 | for data in tqdm(val_input, total=len(val_input)):
97 | pred = predictor.predict_cls_classifier(data)
98 | pred = pred.argmax(dim=0).numpy()
99 | res.append(pred)
100 |
101 | f1 = f1_score(val_label, res, average="macro")
102 | accuracy = accuracy_score(val_label, res)
103 | recall = recall_score(val_label, res, average="macro")
104 | precision = precision_score(val_label, res, average="macro")
105 |
106 | print(f" f1 is {f1}, acc is {accuracy}, recall is {recall} precision is {precision}")
107 | return accuracy
108 |
109 | class Evaluator:
110 | def __init__(self):
111 | self.best_acc = 0.0
112 |
113 | def on_epoch_end(self):
114 | acc = validate()
115 | if acc > self.best_acc:
116 | self.best_acc = acc
117 | torch.save(bert_model.state_dict(), model_save_path)
118 | print(f"模型保存成功~")
119 |
120 | def on_validation(self, data):
121 | loss = data["loss"]
122 | step = data["iteration"]
123 | pass
124 |
125 | def main():
126 |
127 |     # parameters to optimize
128 | optimizer = torch.optim.Adam(bert_model.parameters(), lr=lr, weight_decay=1e-3)
129 | train_dataset = ClassificationDataset(train_input, train_label)
130 |
131 | trainer.train(bert_model, optimizer=optimizer,
132 | train_dataset=train_dataset,
133 | evaluator=Evaluator,
134 | collate_fn=bert_cls_collate_fn)
135 |
136 | if __name__ == '__main__':
137 | main()
138 |
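139 | # How this script launches (an explanatory addition): with env_type="DDP" and
140 | # training_script=__file__, the Trainer (judging by bert_seq2seq/launch.py)
141 | # re-invokes this file once per GPU via torch.distributed.launch, appending
142 | # --not_call_launch to the worker processes, so
143 | #
144 | #   python train_roberta_news_title_classification_multi_gpu.py
145 | #
146 | # starts a 2-GPU data parallel run on the devices picked by CUDA_VISIBLE_DEVICES.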
--------------------------------------------------------------------------------
/bert_seq2seq/mpu/mappings.py:
--------------------------------------------------------------------------------
1 | # Copyright © 2022 BAAI. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License")
4 | # coding=utf-8
5 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
6 | #
7 | # Licensed under the Apache License, Version 2.0 (the "License");
8 | # you may not use this file except in compliance with the License.
9 | # You may obtain a copy of the License at
10 | #
11 | # http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 |
19 | import torch
20 |
21 | from .initialize import get_model_parallel_group
22 | from .utils import split_tensor_along_last_dim
23 |
24 |
25 | def _reduce(input_):
26 | """All-reduce the the input tensor across model parallel group."""
27 | group = get_model_parallel_group()
28 |
29 | # Bypass the function if we are using only 1 GPU.
30 | if torch.distributed.get_world_size(group=group) == 1:
31 | return input_
32 |
33 | # All-reduce.
34 | torch.distributed.all_reduce(input_, group=group)
35 |
36 | return input_
37 |
38 |
39 | def _split(input_):
40 | """Split the tensor along its last dimension and keep the
41 | corresponding slice."""
42 | group = get_model_parallel_group()
43 |
44 | # Bypass the function if we are using only 1 GPU.
45 | if torch.distributed.get_world_size(group=group) == 1:
46 | return input_
47 |
48 | # Split along last dimension.
49 | world_size = torch.distributed.get_world_size(group=group)
50 | input_list = split_tensor_along_last_dim(input_, world_size)
51 |
52 | # Note: torch.split does not create contiguous tensors by default.
53 | rank = torch.distributed.get_rank(group=group)
54 | output = input_list[rank].contiguous()
55 |
56 | return output
57 |
58 |
59 | def _gather(input_):
60 | """Gather tensors and concatinate along the last dimension."""
61 | group = get_model_parallel_group()
62 |
63 | # Bypass the function if we are using only 1 GPU.
64 | if torch.distributed.get_world_size(group=group) == 1:
65 | return input_
66 |
67 | # Size and dimension.
68 | last_dim = input_.dim() - 1
69 | rank = torch.distributed.get_rank(group=group)
70 | world_size = torch.distributed.get_world_size(group=group)
71 |
72 | tensor_list = [torch.empty_like(input_) for _ in range(world_size)]
73 | tensor_list[rank] = input_
74 | torch.distributed.all_gather(tensor_list, input_, group=group)
75 |
76 | # Note: torch.cat already creates a contiguous tensor.
77 | output = torch.cat(tensor_list, dim=last_dim).contiguous()
78 |
79 | return output
80 |
81 |
82 | class _CopyToModelParallelRegion(torch.autograd.Function):
83 | """Pass the input to the model parallel region."""
84 | @staticmethod
85 | def forward(ctx, input_):
86 | return input_
87 |
88 | @staticmethod
89 | def backward(ctx, grad_output):
90 | return _reduce(grad_output)
91 |
92 |
93 | class _ReduceFromModelParallelRegion(torch.autograd.Function):
94 | """All-redcue the input from the model parallel region."""
95 | @staticmethod
96 | def forward(ctx, input_):
97 | return _reduce(input_)
98 |
99 | @staticmethod
100 | def backward(ctx, grad_output):
101 | return grad_output
102 |
103 |
104 | class _ScatterToModelParallelRegion(torch.autograd.Function):
105 | """Split the input and keep only the corresponding chuck to the rank."""
106 | @staticmethod
107 | def forward(ctx, input_):
108 | return _split(input_)
109 |
110 | @staticmethod
111 | def backward(ctx, grad_output):
112 | return _gather(grad_output)
113 |
114 |
115 | class _GatherFromModelParallelRegion(torch.autograd.Function):
116 | """Gather the input from model parallel region and concatinate."""
117 | @staticmethod
118 | def forward(ctx, input_):
119 | return _gather(input_)
120 |
121 | @staticmethod
122 | def backward(ctx, grad_output):
123 | return _split(grad_output)
124 |
125 |
126 | # -----------------
127 | # Helper functions.
128 | # -----------------
129 |
130 |
131 | def copy_to_model_parallel_region(input_):
132 | return _CopyToModelParallelRegion.apply(input_)
133 |
134 |
135 | def reduce_from_model_parallel_region(input_):
136 | return _ReduceFromModelParallelRegion.apply(input_)
137 |
138 |
139 | def scatter_to_model_parallel_region(input_):
140 | return _ScatterToModelParallelRegion.apply(input_)
141 |
142 |
143 | def gather_from_model_parallel_region(input_):
144 | return _GatherFromModelParallelRegion.apply(input_)
145 |
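146 | if __name__ == "__main__":
147 |     # Smoke-test sketch (an addition, not part of the original module): with a
148 |     # single-process gloo group the world size is 1, so every region helper is
149 |     # an identity and this only exercises the autograd plumbing end to end.
150 |     # Run as: python -m bert_seq2seq.mpu.mappings
151 |     import os
152 |     from .initialize import initialize_model_parallel
153 |
154 |     os.environ.setdefault("MASTER_ADDR", "localhost")
155 |     os.environ.setdefault("MASTER_PORT", "29500")
156 |     torch.distributed.init_process_group("gloo", rank=0, world_size=1)
157 |     initialize_model_parallel(1)
158 |
159 |     x = torch.randn(2, 4, requires_grad=True)
160 |     y = gather_from_model_parallel_region(copy_to_model_parallel_region(x))
161 |     y.sum().backward()
162 |     assert torch.equal(x.grad, torch.ones_like(x))
163 |     print("mappings smoke test passed")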
--------------------------------------------------------------------------------
/examples/seq2seq/GLM/train_glm_auto_title.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import torch
4 | from torch.utils.data import Dataset, DataLoader
5 | from bert_seq2seq import GLMTokenizer
6 | from bert_seq2seq import load_model
7 | from bert_seq2seq import Trainer
8 | from bert_seq2seq.dataset import glm_generation_collate_fn
9 | from bert_seq2seq import Predictor
10 |
11 | model_name = "glm" # 选择模型名字
12 | task_name = "seq2seq" # 任务名字
13 |
14 | vocab_path = "../state_dict/GLM-large-ch/cog-pretrain.model"  # path to the GLM sentencepiece vocab
15 | model_path = "../state_dict/GLM-large-ch/pytorch_model.bin"  # path to the pretrained weights
16 | model_save_path = "./GLM_auto_title_model.bin"  # where the fine-tuned model is saved
17 |
18 | lr = 1e-5
19 | maxlen = 1024
20 |
21 | src_dir = '../data/auto_title/train.src'  # data paths
22 | tgt_dir = '../data/auto_title/train.tgt'
23 |
24 | device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
25 | tokenizer = GLMTokenizer(vocab_path)
26 |
27 | model = load_model(model_name=model_name,
28 | task_name=task_name,
29 | size="large")
30 |
31 | model.load_pretrain_params(model_path)
32 | predictor = Predictor(model, tokenizer)
33 |
34 | trainer = Trainer(env_type="pytorch",
35 | epoches=5,
36 | val_every_step=500,
37 | device=device,
38 | batch_size=16)
39 |
40 | def read_file():
41 |     ## To swap in a different dataset, only this function needs to change; it returns the src and tgt lists the framework expects.
42 | src = []
43 | tgt = []
44 |
45 | with open(src_dir,'r',encoding='utf-8') as f:
46 | lines = f.readlines()
47 | for line in lines:
48 | src.append(line.strip('\n').lower())
49 |
50 | with open(tgt_dir,'r',encoding='utf-8') as f:
51 | lines = f.readlines()
52 | for line in lines:
53 | tgt.append(line.strip('\n').lower())
54 |
55 |     return src, tgt
56 |
57 | class AutoTitleDataset(Dataset):
58 | """
59 |     Defines how to fetch one example from this particular dataset.
60 | """
61 | def __init__(self, sents_src, sents_tgt) :
62 |         ## __init__ usually loads all of the data
63 | super().__init__()
64 | self.sents_src = sents_src
65 | self.sents_tgt = sents_tgt
66 |
67 | def __getitem__(self, i):
68 |         ## fetch a single example
70 | src = self.sents_src[i]
71 | tgt = self.sents_tgt[i]
72 |
73 | tokenizer_out = tokenizer.encode_plus(src,
74 | tgt,
75 | max_length=maxlen,
76 | # mask_token="sMASK",
77 | prefix_flag="标题生成:",
78 | post_flag=" 回答:")
79 |
80 | output = {
81 | "input_ids": tokenizer_out["input_ids"],
82 | "attention_mask": tokenizer_out["attention_mask"],
83 | "position_ids": tokenizer_out["position_ids"],
84 | "loss_mask": tokenizer_out["loss_mask"],
85 | }
86 | return output
87 |
88 | def __len__(self):
89 |
90 | return len(self.sents_src)
91 |
92 | class Evaluator:
93 |
94 | def prompt(self, text):
95 | text = "标题生成:" + text + "回答:[gMASK]"
96 | return text
97 |
98 | def on_validation(self, data):
99 | loss = data["loss"]
100 | step = data["iteration"]
101 |         ## Define your own validation logic here; this hook is very flexible.
102 | test_data = ["本文总结了十个可穿戴产品的设计原则而这些原则同样也是笔者认为是这个行业最吸引人的地方1为人们解决重复性问题2从人开始而不是从机器开始3要引起注意但不要刻意4提升用户能力而不是取代人",
103 | "2007年乔布斯向人们展示iPhone并宣称它将会改变世界还有人认为他在夸大其词然而在8年后以iPhone为代表的触屏智能手机已经席卷全球各个角落未来智能手机将会成为真正的个人电脑为人类发展做出更大的贡献",
104 | "雅虎发布2014年第四季度财报并推出了免税方式剥离其持有的阿里巴巴集团15%股权的计划打算将这一价值约400亿美元的宝贵投资分配给股东截止发稿前雅虎股价上涨了大约7%至5145美元"]
105 | for text in test_data:
106 | text = self.prompt(text)
107 | print(predictor.predict_generate_randomsample(text,
108 | top_k=50,
109 | top_p=0.9,
110 | repetition_penalty=1.0,
111 | input_max_length=600,
112 | out_max_length=100,
113 | ))
114 |
115 | torch.save(model.state_dict(), model_save_path)
116 | print(f"模型保存成功~")
117 |
118 | def main():
119 |     ## load the data
120 | all_src, all_tgt = read_file()
121 | train_size = int(len(all_src) * 0.9)
122 | train_src, train_tgt = all_src[:train_size], all_tgt[:train_size]
123 |     # parameters to optimize
124 | optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=1e-3)
125 | train_dataset = AutoTitleDataset(train_src, train_tgt)
126 |
127 | trainer.train(model, optimizer, train_dataset=train_dataset, evaluator=Evaluator,
128 | collate_fn=glm_generation_collate_fn)
129 |
130 | if __name__ == '__main__':
131 | main()
132 |
--------------------------------------------------------------------------------
/bert_seq2seq/mpu/cross_entropy.py:
--------------------------------------------------------------------------------
1 | # Copyright © 2022 BAAI. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License")
4 | # coding=utf-8
5 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
6 | #
7 | # Licensed under the Apache License, Version 2.0 (the "License");
8 | # you may not use this file except in compliance with the License.
9 | # You may obtain a copy of the License at
10 | #
11 | # http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 |
19 | import torch
20 |
21 | from .initialize import get_model_parallel_group
22 | from .initialize import get_model_parallel_rank
23 | from .initialize import get_model_parallel_world_size
24 | from .utils import VocabUtility
25 |
26 |
27 | class _VocabParallelCrossEntropy(torch.autograd.Function):
28 | @staticmethod
29 | def forward(ctx, vocab_parallel_logits, target):
30 |
31 | # Copy so the input remains unchanged.
32 | logits = vocab_parallel_logits.clone()
33 | # Maximum value along vocab dimension across all GPUs.
34 | logits_max = torch.max(logits, dim=-1)[0]
35 | torch.distributed.all_reduce(logits_max,
36 | op=torch.distributed.ReduceOp.MAX,
37 | group=get_model_parallel_group())
38 | # Subtract the maximum value.
39 | logits.sub_(logits_max.unsqueeze(dim=-1))
40 | # Sum of exponential of logits along vocab dimension across all GPUs.
41 | exp_logits = logits.exp()
42 | sum_exp_logits = exp_logits.sum(dim=-1)
43 | torch.distributed.all_reduce(sum_exp_logits,
44 | op=torch.distributed.ReduceOp.SUM,
45 | group=get_model_parallel_group())
46 |
47 |         # Get the partition's vocab indices.
48 | get_vocab_range = VocabUtility.vocab_range_from_per_partition_vocab_size
49 | partition_vocab_size = vocab_parallel_logits.size()[-1]
50 | rank = get_model_parallel_rank()
51 | world_size = get_model_parallel_world_size()
52 | vocab_start_index, vocab_end_index = get_vocab_range(
53 | partition_vocab_size, rank, world_size)
54 |
55 |         # Mask target ids outside this partition's vocab range (True means masked out).
56 | target_mask = (target < vocab_start_index) | (target >=
57 | vocab_end_index)
58 | masked_target = target.clone() - vocab_start_index
59 | masked_target[target_mask] = 0
60 |
61 | # Get predicted-logits = logits[target].
62 |         # For simplicity, we convert logits to a 2-D tensor with size
63 | # [*, partition-vocab-size] and target to a 1-D tensor of size [*].
64 | logits_2d = logits.view(-1, partition_vocab_size)
65 | masked_target_1d = masked_target.view(-1)
66 | arange_1d = torch.arange(start=0,
67 | end=logits_2d.size()[0],
68 | device=logits_2d.device)
69 | predicted_logits_1d = logits_2d[arange_1d, masked_target_1d]
70 | predicted_logits = predicted_logits_1d.view_as(target)
71 | predicted_logits[target_mask] = 0.0
72 | # All reduce is needed to get the chunks from other GPUs.
73 | torch.distributed.all_reduce(predicted_logits,
74 | op=torch.distributed.ReduceOp.SUM,
75 | group=get_model_parallel_group())
76 |
77 | # Loss = log(sum(exp(logits))) - predicted-logit.
78 | loss = torch.log(sum_exp_logits) - predicted_logits
79 |
80 | # Store softmax, target-mask and masked-target for backward pass.
81 | exp_logits.div_(sum_exp_logits.unsqueeze(dim=-1))
82 | ctx.save_for_backward(exp_logits, target_mask, masked_target_1d)
83 |
84 | return loss
85 |
86 | @staticmethod
87 | def backward(ctx, grad_output):
88 |
89 |         # Retrieve tensors from the forward pass.
90 | softmax, target_mask, masked_target_1d = ctx.saved_tensors
91 |
92 |         # All the inputs have softmax as their gradient.
93 | grad_input = softmax
94 | # For simplicity, work with the 2D gradient.
95 | partition_vocab_size = softmax.size()[-1]
96 | grad_2d = grad_input.view(-1, partition_vocab_size)
97 |
98 | # Add the gradient from matching classes.
99 | arange_1d = torch.arange(start=0,
100 | end=grad_2d.size()[0],
101 | device=grad_2d.device)
102 | grad_2d[arange_1d,
103 | masked_target_1d] -= (1.0 - target_mask.view(-1).float())
104 |
105 | # Finally elementwise multiplication with the output gradients.
106 | grad_input.mul_(grad_output.unsqueeze(dim=-1))
107 |
108 | return grad_input, None
109 |
110 |
111 | def vocab_parallel_cross_entropy(vocab_parallel_logits, target):
112 | """Helper function for the cross entropy."""
113 | return _VocabParallelCrossEntropy.apply(vocab_parallel_logits, target)
114 |
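115 | if __name__ == "__main__":
116 |     # Consistency-check sketch (an addition, not from the original file): with a
117 |     # single-process gloo group the logits are not actually partitioned, so the
118 |     # result must match torch's unreduced cross entropy.
119 |     # Run as: python -m bert_seq2seq.mpu.cross_entropy
120 |     import os
121 |     import torch.nn.functional as F
122 |     from .initialize import initialize_model_parallel
123 |
124 |     os.environ.setdefault("MASTER_ADDR", "localhost")
125 |     os.environ.setdefault("MASTER_PORT", "29501")
126 |     torch.distributed.init_process_group("gloo", rank=0, world_size=1)
127 |     initialize_model_parallel(1)
128 |
129 |     logits = torch.randn(4, 11)          # [tokens, vocab]
130 |     target = torch.randint(0, 11, (4,))  # gold token ids
131 |     loss = vocab_parallel_cross_entropy(logits, target)
132 |     assert torch.allclose(loss, F.cross_entropy(logits, target, reduction="none"), atol=1e-5)
133 |     print("vocab parallel cross entropy matches F.cross_entropy")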
--------------------------------------------------------------------------------
/bert_seq2seq/mpu/initialize.py:
--------------------------------------------------------------------------------
1 | # Copyright © 2022 BAAI. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License")
4 | # coding=utf-8
5 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
6 | #
7 | # Licensed under the Apache License, Version 2.0 (the "License");
8 | # you may not use this file except in compliance with the License.
9 | # You may obtain a copy of the License at
10 | #
11 | # http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | """Model and data parallel groups."""
19 |
20 | import torch
21 |
22 | from .utils import ensure_divisibility
23 |
24 | # Model parallel group that the current rank belongs to.
25 | _MODEL_PARALLEL_GROUP = None
26 | # Data parallel group that the current rank belongs to.
27 | _DATA_PARALLEL_GROUP = None
28 |
29 |
30 | def initialize_model_parallel(model_parallel_size_):
31 | """
32 |     Initialize model and data parallel groups.
33 |
34 | Arguments:
35 | model_parallel_size: number of GPUs used to parallelize model.
36 |
37 | Let's say we have a total of 8 GPUs denoted by g0 ... g7 and we
38 | use 2 GPUs to parallelize the model. The present function will
39 | create 4 model parallel groups and 2 data parallel groups as:
40 | 4 model parallel groups:
41 | [g0, g1], [g2, g3], [g4, g5], [g6, g7]
42 | 2 data parallel groups:
43 | [g0, g2, g4, g6], [g1, g3, g5, g7]
44 | Note that for efficiency, the caller should make sure adjacent ranks
45 | are on the same DGX box. For example if we are using 2 DGX-1 boxes
46 | with a total of 16 GPUs, rank 0 to 7 belong to the first box and
47 | ranks 8 to 15 belong to the second box.
48 | """
49 | if torch.distributed.get_rank() == 0:
50 | print('> initializing model parallel with size {}'.format(
51 | model_parallel_size_))
52 | # Get world size and rank. Ensure some consistencies.
53 | assert torch.distributed.is_initialized()
54 | world_size = torch.distributed.get_world_size()
55 | model_parallel_size = min(model_parallel_size_, world_size)
56 | ensure_divisibility(world_size, model_parallel_size)
57 | rank = torch.distributed.get_rank()
58 |
59 | # Build the data parallel groups.
60 | global _DATA_PARALLEL_GROUP
61 | assert _DATA_PARALLEL_GROUP is None, \
62 | 'data parallel group is already initialized'
63 | for i in range(model_parallel_size):
64 | ranks = range(i, world_size, model_parallel_size)
65 | group = torch.distributed.new_group(ranks)
66 | if i == (rank % model_parallel_size):
67 | _DATA_PARALLEL_GROUP = group
68 |
69 | # Build the model parallel groups.
70 | global _MODEL_PARALLEL_GROUP
71 | assert _MODEL_PARALLEL_GROUP is None, \
72 | 'model parallel group is already initialized'
73 | for i in range(world_size // model_parallel_size):
74 | ranks = range(i * model_parallel_size, (i + 1) * model_parallel_size)
75 | group = torch.distributed.new_group(ranks)
76 | if i == (rank // model_parallel_size):
77 | _MODEL_PARALLEL_GROUP = group
78 |
79 |
80 | def model_parallel_is_initialized():
81 | """Check if model and data parallel groups are initialized."""
82 | if _MODEL_PARALLEL_GROUP is None or _DATA_PARALLEL_GROUP is None:
83 | return False
84 | return True
85 |
86 |
87 | def get_model_parallel_group():
88 | """Get the model parallel group the caller rank belongs to."""
89 | assert _MODEL_PARALLEL_GROUP is not None, \
90 | 'model parallel group is not initialized'
91 | return _MODEL_PARALLEL_GROUP
92 |
93 |
94 | def get_data_parallel_group():
95 | """Get the data parallel group the caller rank belongs to."""
96 | assert _DATA_PARALLEL_GROUP is not None, \
97 | 'data parallel group is not initialized'
98 | return _DATA_PARALLEL_GROUP
99 |
100 |
101 | def get_model_parallel_world_size():
102 | """Return world size for the model parallel group."""
103 | return torch.distributed.get_world_size(group=get_model_parallel_group())
104 |
105 |
106 | def get_model_parallel_rank():
107 | """Return my rank for the model parallel group."""
108 | return torch.distributed.get_rank(group=get_model_parallel_group())
109 |
110 |
111 | def get_model_parallel_src_rank():
112 | """Calculate the global rank corresponding to a local rank zeor
113 | in the model parallel group."""
114 | global_rank = torch.distributed.get_rank()
115 | local_world_size = get_model_parallel_world_size()
116 | return (global_rank // local_world_size) * local_world_size
117 |
118 |
119 | def get_data_parallel_world_size():
120 | """Return world size for the data parallel group."""
121 | return torch.distributed.get_world_size(group=get_data_parallel_group())
122 |
123 |
124 | def get_data_parallel_rank():
125 | """Return my rank for the data parallel group."""
126 | return torch.distributed.get_rank(group=get_data_parallel_group())
127 |
128 |
129 | def destroy_model_parallel():
130 | """Set the groups to none."""
131 | global _MODEL_PARALLEL_GROUP
132 | _MODEL_PARALLEL_GROUP = None
133 | global _DATA_PARALLEL_GROUP
134 | _DATA_PARALLEL_GROUP = None
135 |
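136 | if __name__ == "__main__":
137 |     # Worked example (an addition for illustration) of the grouping rule in
138 |     # initialize_model_parallel, computed in plain Python so no process group
139 |     # is needed: 8 ranks with model_parallel_size = 2 reproduce the layout
140 |     # from the docstring above.
141 |     world_size, mp = 8, 2
142 |     model_groups = [list(range(i * mp, (i + 1) * mp)) for i in range(world_size // mp)]
143 |     data_groups = [list(range(i, world_size, mp)) for i in range(mp)]
144 |     print(model_groups)  # [[0, 1], [2, 3], [4, 5], [6, 7]]
145 |     print(data_groups)   # [[0, 2, 4, 6], [1, 3, 5, 7]]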
--------------------------------------------------------------------------------
/examples/ner/train_bert_ner_people_daily.py:
--------------------------------------------------------------------------------
1 | ## People's Daily NER data
2 | import torch
3 | from tqdm import tqdm
4 | from torch.utils.data import Dataset
5 | from bert_seq2seq import Tokenizer
6 | from bert_seq2seq import load_model
7 | from bert_seq2seq.dataset import bert_sequence_label_collate_fn
8 | from bert_seq2seq import Trainer
9 | from bert_seq2seq import Predictor
10 |
11 | train_path = "../data/china-people-daily-ner-corpus/example.train"
12 | valid_path = '../data/china-people-daily-ner-corpus/example.dev'
13 | test_path = '../data/china-people-daily-ner-corpus/example.test'
14 |
15 | model_name = "roberta" # 选择模型名字
16 | task_name = "sequence_labeling"
17 |
18 | vocab_path = "../state_dict/roberta/vocab.txt"  # path to the roberta vocab
19 | model_path = "../state_dict/roberta/pytorch_model.bin"  # path to the roberta weights
20 |
21 | model_save_path = "./bert_sequence_labeling.bin"
22 |
23 | batch_size = 16
24 | lr = 1e-5
25 | # load the vocabulary
26 | maxlen = 256
27 | device = torch.device("cuda:5" if torch.cuda.is_available() else "cpu")
28 | tokenizer = Tokenizer(vocab_path, do_lower_case=True, max_len=maxlen)
29 |
30 | trainer = Trainer(epoches=10,
31 | env_type="pytorch",
32 | val_every_step=500,
33 | batch_size=batch_size,
34 | device=device,
35 | )
36 |
37 | target = ["O", "B-LOC", "I-LOC", "B-ORG", "I-ORG", "B-PER", "I-PER"]
38 |
39 | def load_data(filename):
40 | """加载数据
41 | 单条格式:[text, (start, end, label), (start, end, label), ...],
42 | 意味着text[start:end + 1]是类型为label的实体。
43 | """
44 | D = []
45 | with open(filename, encoding='utf-8') as f:
46 | f = f.read()
47 | for l in f.split('\n\n'):
48 | if not l:
49 | continue
50 | d = ['']
51 | for i, c in enumerate(l.split('\n')):
52 | char, flag = c.split(' ')
53 | d[0] += char
54 | if flag[0] == 'B':
55 | d.append([i, i, flag[2:]])
56 | elif flag[0] == 'I':
57 | d[-1][1] = i
58 |
59 | D.append(d)
60 | return D
61 |
62 | train_data = load_data(train_path)
63 | val_data = load_data(valid_path)
64 | test_data = load_data(test_path)
65 |
66 | print(f"all target is {target}")
67 |
68 | bert_model = load_model(tokenizer.vocab, model_name=model_name, task_name=task_name,
69 | target_size=len(target))
70 | bert_model.load_pretrain_params(model_path)
71 |
72 | predictor = Predictor(bert_model, tokenizer)
73 |
74 | ## custom dataset
75 | class NERDataset(Dataset):
76 | """
77 |     Defines how to fetch one example from this particular dataset.
78 | """
79 | def __init__(self, data) :
80 |         ## __init__ usually loads all of the data
81 | super(NERDataset, self).__init__()
82 |         # raw data
83 | self.data = data
84 | def __getitem__(self, i):
85 |         ## fetch a single example
87 | data = self.data[i]
88 |
89 | tokens = tokenizer.tokenize(data[0], maxlen=maxlen, add_spatial_tokens=True)
90 | input_ids = tokenizer.convert_tokens_to_ids(tokens)
91 |
92 | mapping = tokenizer.rematch(data[0], tokens)
93 | start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
94 | end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
95 | length = len(tokens)
96 | labels = [0] * length
97 |
98 | for start, end, label in data[1:]:
99 | if start in start_mapping and end in end_mapping:
100 |                 # the character span maps cleanly onto these tokens
101 | start = start_mapping[start]
102 | end = end_mapping[end]
103 |
104 | labels[start] = target.index(f"B-{label}")
105 | for j in range(start + 1, end + 1):
106 | labels[j] = target.index(f"I-{label}")
107 |
108 | output = {
109 | "input_ids": input_ids,
110 | "labels": labels
111 | }
112 | return output
113 |
114 | def __len__(self):
115 | return len(self.data)
116 |
117 | def evaluate(data):
118 | """评测函数
119 | """
120 | X, Y, Z = 1e-10, 1e-10, 1e-10
121 | for d in tqdm(data, ncols=100):
122 | R = set(predictor.predict_ner(d[0], target, maxlen=maxlen))
123 | T = set([tuple(i) for i in d[1:]])
124 | X += len(R & T)
125 | Y += len(R)
126 | Z += len(T)
127 | f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
128 | return f1, precision, recall
129 |
130 | class Evaluator:
131 |
132 | def __init__(self):
133 | self.best_val_f1 = 0.0
134 |
135 | def on_epoch_end(self):
136 |
137 | text = ["6月15日,河南省文物考古研究所曹操高陵文物队公开发表声明承认:“从来没有说过出土的珠子是墓主人的",
138 | "4月8日,北京冬奥会、冬残奥会总结表彰大会在人民大会堂隆重举行。习近平总书记出席大会并发表重要讲话。在讲话中,总书记充分肯定了北京冬奥会、冬残奥会取得的优异成绩,全面回顾了7年筹办备赛的不凡历程,深入总结了筹备举办北京冬奥会、冬残奥会的宝贵经验,深刻阐释了北京冬奥精神,对运用好冬奥遗产推动高质量发展提出明确要求。",
139 | "当地时间8日,欧盟委员会表示,欧盟各成员国政府现已冻结共计约300亿欧元与俄罗斯寡头及其他被制裁的俄方人员有关的资产。",
140 | "这一盘口状态下英国必发公司亚洲盘交易数据显示博洛尼亚热。而从欧赔投注看,也是主队热。巴勒莫两连败,",
141 | ]
142 | for t in text:
143 | entities = predictor.predict_ner(t, target, maxlen=maxlen)
144 | result = {}
145 | for e in entities:
146 | if e[2] not in result:
147 | result[e[2]] = [t[e[0]: e[1]+1]]
148 | else :
149 | result[e[2]].append(t[e[0]: e[1]+1])
150 | print(f"result is {result}")
151 |
152 | f1, precision, recall = evaluate(val_data)
153 |         # save the best model
154 | if f1 >= self.best_val_f1:
155 | self.best_val_f1 = f1
156 | torch.save(bert_model.state_dict(), model_save_path)
157 | print(f"模型保存成功~")
158 | print(
159 | 'valid: f1: %.5f, precision: %.5f, recall: %.5f, best f1: %.5f\n' %
160 | (f1, precision, recall, self.best_val_f1)
161 | )
162 |
163 | f1, precision, recall = evaluate(test_data)
164 | print(
165 | 'test: f1: %.5f, precision: %.5f, recall: %.5f\n' %
166 | (f1, precision, recall)
167 | )
168 |
169 | def main():
170 |
171 |
172 | optimizer = torch.optim.Adam(bert_model.parameters(), lr=lr, weight_decay=1e-5)
173 | train_dataset = NERDataset(train_data)
174 |
175 | trainer.train(model=bert_model, optimizer=optimizer,
176 | train_dataset=train_dataset, evaluator=Evaluator,
177 | collate_fn=bert_sequence_label_collate_fn)
178 |
179 | if __name__ == '__main__':
180 | main()
181 |
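182 | # Worked example (an illustrative addition) of the metric in evaluate() with
183 | # hypothetical counts: if the predictor returns 8 entities in total (Y), the
184 | # gold annotations hold 10 (Z) and 6 of the predictions match a gold span and
185 | # label exactly (X), then
186 | #
187 | #   precision = X / Y = 6 / 8  = 0.75
188 | #   recall    = X / Z = 6 / 10 = 0.60
189 | #   f1        = 2 * X / (Y + Z) = 12 / 18 ≈ 0.667
190 | #
191 | # i.e. the micro-averaged exact-match entity F1.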
--------------------------------------------------------------------------------
/examples/ner/train_roberta_ner_gp_people_daily.py:
--------------------------------------------------------------------------------
1 | # People's Daily NER data
2 | import torch
3 | from tqdm import tqdm
4 | from torch.utils.data import Dataset, DataLoader
5 | from bert_seq2seq import Tokenizer
6 | from bert_seq2seq import load_model
7 | from bert_seq2seq.dataset import bert_sequence_label_gp_collate_fn
8 | from bert_seq2seq import Trainer
9 | import numpy as np
10 | import os
11 | from bert_seq2seq import Predictor
12 |
13 | train_path = "../data/china-people-daily-ner-corpus/example.train"
14 | valid_path = '../data/china-people-daily-ner-corpus/example.dev'
15 | test_path = '../data/china-people-daily-ner-corpus/example.test'
16 |
17 | model_name = "roberta" # 选择模型名字
18 | task_name = "sequence_labeling_gp"
19 |
20 | vocab_path = "../state_dict/roberta/vocab.txt"  # path to the roberta vocab
21 | model_path = "../state_dict/roberta/pytorch_model.bin"  # path to the roberta weights
22 |
23 | model_save_path = "./bert_sequence_labeling_gp.bin"
24 |
25 | batch_size = 16
26 | lr = 2e-5
27 | # load the vocabulary
28 | tokenizer = Tokenizer(vocab_path)
29 | maxlen = 256
30 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
31 |
32 | trainer = Trainer(epoches=10,
33 | env_type="pytorch",
34 | val_every_step=500,
35 | batch_size=batch_size,
36 | device=device,
37 | )
38 | target = set()
39 |
40 | def load_data(filename):
41 | """加载数据
42 | 单条格式:[text, (start, end, label), (start, end, label), ...],
43 | 意味着text[start:end + 1]是类型为label的实体。
44 | """
45 | D = []
46 | with open(filename, encoding='utf-8') as f:
47 | f = f.read()
48 | for l in f.split('\n\n'):
49 | if not l:
50 | continue
51 | d = ['']
52 | for i, c in enumerate(l.split('\n')):
53 | char, flag = c.split(' ')
54 | d[0] += char
55 | if flag[0] == 'B':
56 | d.append([i, i, flag[2:]])
57 | target.add(flag[2:])
58 | elif flag[0] == 'I':
59 | d[-1][1] = i
60 |
61 | D.append(d)
62 | return D
63 |
64 | train_data = load_data(train_path)
65 | val_data = load_data(valid_path)
66 | test_data = load_data(test_path)
67 | target = list(sorted(target))
68 |
69 | bert_model = load_model(tokenizer.vocab, model_name=model_name, task_name=task_name,
70 | target_size=len(target), ner_inner_dim=64)
71 | ## load the pretrained parameters
72 | bert_model.load_pretrain_params(model_path)
73 | predictor = Predictor(bert_model, tokenizer)
74 |
75 | ## custom dataset
76 | class NERDataset(Dataset):
77 | """
78 |     Defines how to fetch one example from this particular dataset.
79 | """
80 | def __init__(self, data) :
81 |         ## __init__ usually loads all of the data
82 | super(NERDataset, self).__init__()
83 |         # raw data
84 | self.data = data
85 | def __getitem__(self, i):
86 |         ## fetch a single example
88 | data = self.data[i]
89 |
90 | tokens = tokenizer.tokenize(data[0], maxlen=maxlen, add_spatial_tokens=True)
91 | input_ids = tokenizer.convert_tokens_to_ids(tokens)
92 | mapping = tokenizer.rematch(data[0], tokens)
93 | start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
94 | end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
95 | length = len(tokens)
96 | labels = np.zeros((len(target), length, length))
97 |
98 | for start, end, label in data[1:]:
99 | if start in start_mapping and end in end_mapping:
100 |                 # the character span maps cleanly onto these tokens
101 | start = start_mapping[start]
102 | end = end_mapping[end]
103 |
104 | label_index = target.index(label)
105 | labels[label_index, start, end] = 1
106 |
107 | output = {
108 | "input_ids": input_ids,
109 | "labels": labels
110 | }
111 | return output
112 |
113 | def __len__(self):
114 | return len(self.data)
115 |
116 |
117 | def evaluate(data):
118 | """评测函数
119 | """
120 | X, Y, Z = 1e-10, 1e-10, 1e-10
121 | for d in tqdm(data, ncols=100):
122 | R = set(predictor.predict_ner(d[0], target, maxlen=maxlen))
123 | T = set([tuple(i) for i in d[1:]])
124 | X += len(R & T)
125 | Y += len(R)
126 | Z += len(T)
127 | f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
128 | return f1, precision, recall
129 |
130 | class Evaluator:
131 |
132 | def __init__(self):
133 | self.best_val_f1 = 0.0
134 |
135 | def on_validation(self, data):
136 | loss = data["loss"]
137 | step = data["iteration"]
138 |
139 | text = ["6月15日,河南省文物考古研究所曹操高陵文物队公开发表声明承认:“从来没有说过出土的珠子是墓主人的",
140 | "4月8日,北京冬奥会、冬残奥会总结表彰大会在人民大会堂隆重举行。习近平总书记出席大会并发表重要讲话。在讲话中,总书记充分肯定了北京冬奥会、冬残奥会取得的优异成绩,全面回顾了7年筹办备赛的不凡历程,深入总结了筹备举办北京冬奥会、冬残奥会的宝贵经验,深刻阐释了北京冬奥精神,对运用好冬奥遗产推动高质量发展提出明确要求。",
141 | "当地时间8日,欧盟委员会表示,欧盟各成员国政府现已冻结共计约300亿欧元与俄罗斯寡头及其他被制裁的俄方人员有关的资产。",
142 | "这一盘口状态下英国必发公司亚洲盘交易数据显示博洛尼亚热。而从欧赔投注看,也是主队热。巴勒莫两连败,",
143 | ]
144 | for t in text:
145 | entities = predictor.predict_ner(t, target, maxlen=maxlen)
146 | result = {}
147 | for e in entities:
148 | if e[2] not in result:
149 | result[e[2]] = [t[e[0]: e[1]+1]]
150 | else :
151 | result[e[2]].append(t[e[0]: e[1]+1])
152 | print(f"result is {result}")
153 |
154 | f1, precision, recall = evaluate(val_data)
155 |         # save the best model
156 | if f1 >= self.best_val_f1:
157 | self.best_val_f1 = f1
158 | torch.save(bert_model.state_dict(), model_save_path)
159 | print(f"模型保存成功~")
160 |
161 | print(
162 | 'valid: f1: %.5f, precision: %.5f, recall: %.5f, best f1: %.5f\n' %
163 | (f1, precision, recall, self.best_val_f1)
164 | )
165 |
166 | f1, precision, recall = evaluate(test_data)
167 | print(
168 | 'test: f1: %.5f, precision: %.5f, recall: %.5f\n' %
169 | (f1, precision, recall)
170 | )
171 |
172 |
173 | def main():
174 |
175 | optimizer = torch.optim.Adam(bert_model.parameters(), lr=lr, weight_decay=1e-5)
176 | train_dataset = NERDataset(train_data)
177 | trainer.train(model=bert_model, optimizer=optimizer, evaluator=Evaluator,
178 | train_dataset=train_dataset, collate_fn=bert_sequence_label_gp_collate_fn,
179 | )
180 |
181 | if __name__ == '__main__':
182 | main()
183 |
--------------------------------------------------------------------------------
/bert_seq2seq/launch.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The Microsoft DeepSpeed Team
2 | """
3 | This runner is the main front-end for launching multi-worker
4 | training jobs with DeepSpeed. By default it uses pdsh to ssh into
5 | multiple worker nodes in parallel and launch all the necessary
6 | processes per rank for training.
7 | """
8 |
9 | import os
10 | import sys
11 | import json
12 | import subprocess
13 | import collections
14 | import socket
15 | import signal
16 | import logging
17 |
18 | import torch.distributed as dist
19 |
20 |
21 | def fetch_hostfile(hostfile_path):
22 | if not os.path.isfile(hostfile_path):
23 | print("Unable to find hostfile, will proceed with training "
24 | "with local resources only.")
25 | return None
26 | # e.g., worker-0 slots=16
27 | with open(hostfile_path, 'r') as fd:
28 | resource_pool = collections.OrderedDict()
29 | for line in fd.readlines():
30 | line = line.strip()
31 | if line == '':
32 | # skip empty lines
33 | continue
34 | try:
35 | hostname, slots = line.split()
36 | _, slot_count = slots.split("=")
37 | slot_count = int(slot_count)
38 | except ValueError as err:
39 | raise err
40 | if hostname in resource_pool:
41 | raise ValueError(f"host {hostname} is already defined")
42 | resource_pool[hostname] = slot_count
43 |
44 | return resource_pool
45 |
46 |
47 | def cmd_load_hyperparam(config_path=None, format="json", encoding="utf-8"):
48 | """
49 |     Load hyperparameters from a config file and flatten them into a list of command-line arguments.
50 | """
51 | # config_path='config/config_block_large_chinese.json'
52 | format = config_path.rsplit('.')[-1]
53 | with open(config_path, 'r', encoding=encoding) as f:
54 | if format == "json":
55 | config_dict = json.load(f)
56 | else:
57 | raise NameError("current format%s for hyperparam file is invalid" %
58 | format)
59 | config_cmd = []
60 | for key in config_dict:
61 | if len(str(config_dict[key])) == 0:
62 | config_cmd.append('--' + key)
63 | else:
64 | config_cmd.append('--' + key)
65 | config_cmd.append(str(config_dict[key]))
66 | return config_cmd
67 |
68 |
69 | def launch_dist(
70 | env_type="DDP",
71 | num_nodes=1,
72 | gpus_per_node=1,
73 | master_addr='localhost',
74 | master_port=17500,
75 | training_script='train.py',
76 | hostfile=None,
77 | ):
78 |
79 | if num_nodes != 1:
80 | print("多机多卡待测试。暂不支持。")
81 | os._exit(0)
82 | if env_type == "DDP":
83 | cmd_launch = []
84 | cmd_launch.extend([
85 | # 'export NUM_NODES=' + str(num_nodes) + ';',
86 | # 'export GPUS_PER_NODE=' + str(gpus_per_node) + ';',
87 | sys.executable,
88 | # "python",
89 | '-m', 'torch.distributed.launch'
90 | ])
91 | torch_distributed_args = [
92 | '--nproc_per_node',
93 | str(gpus_per_node),
94 | '--nnodes',
95 | str(num_nodes),
96 | '--node_rank',
97 | str(0),
98 | '--master_addr',
99 | master_addr,
100 | '--master_port',
101 | str(master_port),
102 | ]
103 | cmd_launch.extend(torch_distributed_args)
104 | cmd_launch.append(training_script)
105 | cmd_launch.append('--not_call_launch')
106 | run_cmd = ' '.join(cmd_launch)
107 | p = subprocess.Popen(run_cmd, shell=True, preexec_fn=os.setsid)
108 | def signal_handler(signal, frame):
109 | os.killpg(os.getpgid(p.pid), 9)
110 | signal.signal(signal.SIGINT, signal_handler)
111 | p.wait()
112 |         print('finish')
113 |
114 | elif env_type == "deepspeed":
115 |
116 | # if hostfile is None:
117 | # print(
118 | # 'Unable to find hostfile, will proceed with training with local resources only.'
119 | # )
120 | # os.makedirs("./tmp", exist_ok=True)
121 | #
122 | # with open('./tmp/hostfile', 'w') as w:
123 | # w.write(socket.gethostname() + ' slots=2')
124 | # hostfile = './tmp/hostfile'
125 |
126 | cmd_launch = ['deepspeed']
127 |
128 | cmd_launch.extend([
129 | '--master_port',
130 | str(master_port),
131 | '--num_nodes',
132 | str(num_nodes),
133 | '--num_gpus',
134 | str(gpus_per_node),
135 | # '--hostfile',
136 | # hostfile,
137 | ])
138 |
139 | cmd_launch.append(training_script)
140 |
141 | cmd_launch.append('--not_call_launch')
142 | run_cmd = ' '.join(cmd_launch)
143 | p = subprocess.Popen(run_cmd, shell=True, preexec_fn=os.setsid)
144 | def signal_handler(signal, frame):
145 | os.killpg(os.getpgid(p.pid), 9)
146 | signal.signal(signal.SIGINT, signal_handler)
147 | p.wait()
148 |         print('finish')
149 |
150 | elif env_type == "deepspeed+mpu":
151 |
152 | # if hostfile is None:
153 | # print(
154 | # 'Unable to find hostfile, will proceed with training with local resources only.'
155 | # )
156 | # os.makedirs("./tmp", exist_ok=True)
157 | #
158 | # with open('./tmp/hostfile', 'w') as w:
159 | # w.write(socket.gethostname() + ' slots=2')
160 | # hostfile = './tmp/hostfile'
161 |
162 | cmd_launch = ["export ENV_TYPE=deepspeed+mpu;",'deepspeed']
163 |
164 | cmd_launch.extend([
165 | '--master_port',
166 | str(master_port),
167 | '--num_nodes',
168 | str(num_nodes),
169 | '--num_gpus',
170 | str(gpus_per_node),
171 | # '--hostfile',
172 | # hostfile,
173 | ])
174 |
175 | cmd_launch.append(training_script)
176 |
177 | cmd_launch.append('--not_call_launch')
178 | run_cmd = ' '.join(cmd_launch)
179 | p = subprocess.Popen(run_cmd, shell=True, preexec_fn=os.setsid)
180 | def signal_handler(signal, frame):
181 | os.killpg(os.getpgid(p.pid), 9)
182 | signal.signal(signal.SIGINT, signal_handler)
183 | p.wait()
184 |         print('finish')
185 |
186 |     else:
187 |         print("Unsupported env_type.")
188 | os._exit(0)
189 |
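190 | if __name__ == "__main__":
191 |     # Usage sketch (an addition for illustration). cmd_load_hyperparam turns a
192 |     # JSON config such as {"batch_size": 16, "fp16": ""} into the argv list
193 |     # ['--batch_size', '16', '--fp16'] (an empty value becomes a bare flag).
194 |     # launch_dist below spawns a single-node, 2-GPU DDP job; train.py is a
195 |     # hypothetical script that must tolerate the appended --not_call_launch flag.
196 |     launch_dist(env_type="DDP",
197 |                 num_nodes=1,
198 |                 gpus_per_node=2,
199 |                 master_addr="localhost",
200 |                 master_port=17500,
201 |                 training_script="train.py")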
--------------------------------------------------------------------------------
/examples/ner/train_bert_ner_crf_people_daily.py:
--------------------------------------------------------------------------------
1 | ## People's Daily NER data
2 | import torch
3 | from tqdm import tqdm
4 | from torch.utils.data import Dataset
5 | from bert_seq2seq import Tokenizer
6 | from bert_seq2seq import load_model
7 | from bert_seq2seq.dataset import bert_sequence_label_collate_fn
8 | from bert_seq2seq import Trainer
9 | from bert_seq2seq import Predictor
10 |
11 | train_path = "../data/china-people-daily-ner-corpus/example.train"
12 | valid_path = '../data/china-people-daily-ner-corpus/example.dev'
13 | test_path = '../data/china-people-daily-ner-corpus/example.test'
14 |
15 | model_name = "roberta" # 选择模型名字
16 | task_name = "sequence_labeling_crf"
17 |
18 | vocab_path = "../state_dict/roberta/vocab.txt"  # path to the roberta vocab
19 | model_path = "../state_dict/roberta/pytorch_model.bin"  # path to the roberta weights
20 |
21 | model_save_path = "./bert_sequence_labeling_crf.bin"
22 |
23 | batch_size = 16
24 | lr = 1e-5
25 | crf_lr = 0.01
26 | # load the vocabulary
27 | maxlen = 256
28 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
29 | tokenizer = Tokenizer(vocab_path, do_lower_case=True, max_len=maxlen)
30 |
31 | trainer = Trainer(epoches=10,
32 | env_type="pytorch",
33 | val_every_step=500,
34 | batch_size=batch_size,
35 | device=device,
36 | )
37 |
38 | target = ["O", "B-LOC", "I-LOC", "B-ORG", "I-ORG", "B-PER", "I-PER"]
39 |
40 | def load_data(filename):
41 | """加载数据
42 | 单条格式:[text, (start, end, label), (start, end, label), ...],
43 | 意味着text[start:end + 1]是类型为label的实体。
44 | """
45 | D = []
46 | with open(filename, encoding='utf-8') as f:
47 | f = f.read()
48 | for l in f.split('\n\n'):
49 | if not l:
50 | continue
51 | d = ['']
52 | for i, c in enumerate(l.split('\n')):
53 | char, flag = c.split(' ')
54 | d[0] += char
55 | if flag[0] == 'B':
56 | d.append([i, i, flag[2:]])
57 | elif flag[0] == 'I':
58 | d[-1][1] = i
59 |
60 | D.append(d)
61 | return D
62 |
63 | train_data = load_data(train_path)
64 | val_data = load_data(valid_path)
65 | test_data = load_data(test_path)
66 |
67 | print(f"all target is {target}")
68 |
69 | bert_model = load_model(tokenizer.vocab, model_name=model_name, task_name=task_name,
70 | target_size=len(target))
71 | bert_model.load_pretrain_params(model_path, strict=False)
72 |
73 | predictor = Predictor(bert_model, tokenizer)
74 |
75 | ## custom dataset
76 | class NERDataset(Dataset):
77 | """
78 |     Defines how to fetch one example from this particular dataset.
79 | """
80 | def __init__(self, data) :
81 |         ## __init__ usually loads all of the data
82 | super(NERDataset, self).__init__()
83 |         # raw data
84 | self.data = data
85 |
86 | def __getitem__(self, i):
87 |         ## fetch a single example
89 | data = self.data[i]
90 |
91 | tokens = tokenizer.tokenize(data[0], maxlen=maxlen, add_spatial_tokens=True)
92 | input_ids = tokenizer.convert_tokens_to_ids(tokens)
93 |
94 | mapping = tokenizer.rematch(data[0], tokens)
95 | start_mapping = {j[0]: i for i, j in enumerate(mapping) if j}
96 | end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j}
97 | length = len(tokens)
98 | labels = [0] * length
99 |
100 | for start, end, label in data[1:]:
101 | if start in start_mapping and end in end_mapping:
102 |                 # the character span maps cleanly onto these tokens
103 | start = start_mapping[start]
104 | end = end_mapping[end]
105 |
106 | labels[start] = target.index(f"B-{label}")
107 | for j in range(start + 1, end + 1):
108 | labels[j] = target.index(f"I-{label}")
109 |
110 | output = {
111 | "input_ids": input_ids,
112 | "labels": labels
113 | }
114 | return output
115 |
116 | def __len__(self):
117 | return len(self.data)
118 |
119 | def evaluate(data):
120 | """评测函数
121 | """
122 | X, Y, Z = 1e-10, 1e-10, 1e-10
123 | for d in tqdm(data, ncols=100):
124 | R = set(predictor.predict_ner(d[0], target, maxlen=maxlen))
125 | T = set([tuple(i) for i in d[1:]])
126 | X += len(R & T)
127 | Y += len(R)
128 | Z += len(T)
129 | f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
130 | return f1, precision, recall
131 |
132 | class Evaluator:
133 |
134 | def __init__(self):
135 | self.best_val_f1 = 0.0
136 |
137 | def on_epoch_end(self):
138 |
139 | text = ["6月15日,河南省文物考古研究所曹操高陵文物队公开发表声明承认:“从来没有说过出土的珠子是墓主人的",
140 | "4月8日,北京冬奥会、冬残奥会总结表彰大会在人民大会堂隆重举行。习近平总书记出席大会并发表重要讲话。在讲话中,总书记充分肯定了北京冬奥会、冬残奥会取得的优异成绩,全面回顾了7年筹办备赛的不凡历程,深入总结了筹备举办北京冬奥会、冬残奥会的宝贵经验,深刻阐释了北京冬奥精神,对运用好冬奥遗产推动高质量发展提出明确要求。",
141 | "当地时间8日,欧盟委员会表示,欧盟各成员国政府现已冻结共计约300亿欧元与俄罗斯寡头及其他被制裁的俄方人员有关的资产。",
142 | "这一盘口状态下英国必发公司亚洲盘交易数据显示博洛尼亚热。而从欧赔投注看,也是主队热。巴勒莫两连败,",
143 | ]
144 | for t in text:
145 | entities = predictor.predict_ner(t, target, maxlen=maxlen)
146 | result = {}
147 | for e in entities:
148 | if e[2] not in result:
149 | result[e[2]] = [t[e[0]: e[1]+1]]
150 | else :
151 | result[e[2]].append(t[e[0]: e[1]+1])
152 | print(f"result is {result}")
153 |
154 | f1, precision, recall = evaluate(val_data)
155 |         # save the best model
156 | if f1 >= self.best_val_f1:
157 | self.best_val_f1 = f1
158 | torch.save(bert_model.state_dict(), model_save_path)
159 | print(f"模型保存成功~")
160 | print(
161 | 'valid: f1: %.5f, precision: %.5f, recall: %.5f, best f1: %.5f\n' %
162 | (f1, precision, recall, self.best_val_f1)
163 | )
164 |
165 | f1, precision, recall = evaluate(test_data)
166 | print(
167 | 'test: f1: %.5f, precision: %.5f, recall: %.5f\n' %
168 | (f1, precision, recall)
169 | )
170 |
171 | def main():
172 |
173 |     crf_params = list(map(id, bert_model.crf_layer.parameters()))  ## collect the CRF-layer parameter ids separately
174 | base_params = filter(lambda p: id(p) not in crf_params, bert_model.parameters())
175 |
176 | optimizer = torch.optim.Adam([
177 | {"params": base_params},
178 | {"params": bert_model.crf_layer.parameters(), "lr": crf_lr}], lr=lr, weight_decay=1e-3)
179 |
180 | train_dataset = NERDataset(train_data)
181 |
182 | trainer.train(model=bert_model, optimizer=optimizer,
183 | train_dataset=train_dataset, evaluator=Evaluator,
184 | collate_fn=bert_sequence_label_collate_fn)
185 |
186 | if __name__ == '__main__':
187 | main()
188 |
--------------------------------------------------------------------------------
/bert_seq2seq/model/layers/activations.py:
--------------------------------------------------------------------------------
1 | # Copyright © 2022 BAAI. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License")
4 | # Copyright 2020 The HuggingFace Team. All rights reserved.
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | import math
19 | import torch
20 | from packaging import version
21 | from torch import Tensor, nn
22 |
23 |
24 | class NewGELUActivation(nn.Module):
25 | """
26 | Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT). Also see
27 | the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415
28 | """
29 | def forward(self, input: Tensor) -> Tensor:
30 | return 0.5 * input * (1.0 + torch.tanh(
31 | math.sqrt(2.0 / math.pi) *
32 | (input + 0.044715 * torch.pow(input, 3.0))))
33 |
34 |
35 | class GELUActivation(nn.Module):
36 | """
37 | Original Implementation of the GELU activation function in Google BERT repo when initially created. For
38 | information: OpenAI GPT's GELU is slightly different (and gives slightly different results): 0.5 * x * (1 +
39 | torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) This is now written in C in nn.functional
40 | Also see the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415
41 | """
42 | def __init__(self, use_gelu_python: bool = False):
43 | super().__init__()
44 | if version.parse(
45 | torch.__version__) < version.parse("1.4") or use_gelu_python:
46 | self.act = self._gelu_python
47 | else:
48 | self.act = nn.functional.gelu
49 |
50 | def _gelu_python(self, input: Tensor) -> Tensor:
51 | return input * 0.5 * (1.0 + torch.erf(input / math.sqrt(2.0)))
52 |
53 | def forward(self, input: Tensor) -> Tensor:
54 | return self.act(input)
55 |
56 |
57 | class FastGELUActivation(nn.Module):
58 | """
59 | Applies GELU approximation that is slower than QuickGELU but more accurate. See: https://github.com/hendrycks/GELUs
60 | """
61 | def forward(self, input: Tensor) -> Tensor:
62 | return 0.5 * input * (1.0 +
63 | torch.tanh(input * 0.7978845608 *
64 | (1.0 + 0.044715 * input * input)))
65 |
66 |
67 | class QuickGELUActivation(nn.Module):
68 | """
69 | Applies GELU approximation that is fast but somewhat inaccurate. See: https://github.com/hendrycks/GELUs
70 | """
71 | def forward(self, input: Tensor) -> Tensor:
72 | return input * torch.sigmoid(1.702 * input)
73 |
74 |
75 | class ClippedGELUActivation(nn.Module):
76 | """
77 |     Clip the range of possible GeLU outputs between [min, max]. This is especially useful for quantization purposes, as
78 |     it allows mapping negative values in the GeLU spectrum. For more information on this trick, please refer to
79 | https://arxiv.org/abs/2004.09602.
80 | Gaussian Error Linear Unit. Original Implementation of the gelu activation function in Google Bert repo when
81 | initially created.
82 | For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): 0.5 * x * (1 +
83 | torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))). See https://arxiv.org/abs/1606.08415
84 | """
85 | def __init__(self, min: float, max: float):
86 | if min > max:
87 | raise ValueError(
88 | f"min should be < max (got min: {min}, max: {max})")
89 |
90 | super().__init__()
91 | self.min = min
92 | self.max = max
93 |
94 | def forward(self, x: Tensor) -> Tensor:
95 | return torch.clip(gelu(x), self.min, self.max)
96 |
97 |
98 | class SiLUActivation(nn.Module):
99 | """
100 | See Gaussian Error Linear Units (Hendrycks et al., https://arxiv.org/abs/1606.08415) where the SiLU (Sigmoid Linear
101 | Unit) was originally introduced and coined, and see Sigmoid-Weighted Linear Units for Neural Network Function
102 | Approximation in Reinforcement Learning (Elfwing et al., https://arxiv.org/abs/1702.03118) and Swish: a Self-Gated
103 | Activation Function (Ramachandran et al., https://arxiv.org/abs/1710.05941v1) where the SiLU was experimented with
104 | later.
105 | """
106 | def __init__(self):
107 | super().__init__()
108 | if version.parse(torch.__version__) < version.parse("1.7"):
109 | self.act = self._silu_python
110 | else:
111 | self.act = nn.functional.silu
112 |
113 | def _silu_python(self, input: Tensor) -> Tensor:
114 | return input * torch.sigmoid(input)
115 |
116 | def forward(self, input: Tensor) -> Tensor:
117 | return self.act(input)
118 |
119 |
120 | class MishActivation(nn.Module):
121 | """
122 | See Mish: A Self-Regularized Non-Monotonic Activation Function (Misra., https://arxiv.org/abs/1908.08681). Also
123 | visit the official repository for the paper: https://github.com/digantamisra98/Mish
124 | """
125 | def __init__(self):
126 | super().__init__()
127 | if version.parse(torch.__version__) < version.parse("1.9"):
128 | self.act = self._mish_python
129 | else:
130 | self.act = nn.functional.mish
131 |
132 | def _mish_python(self, input: Tensor) -> Tensor:
133 | return input * torch.tanh(nn.functional.softplus(input))
134 |
135 | def forward(self, input: Tensor) -> Tensor:
136 | return self.act(input)
137 |
138 |
139 | class LinearActivation(nn.Module):
140 | """
141 | Applies the linear activation function, i.e. forwarding input directly to output.
142 | """
143 | def forward(self, input: Tensor) -> Tensor:
144 | return input
145 |
146 |
147 | @torch.jit.script
148 | def gelu_impl(x):
149 | """OpenAI's gelu implementation."""
150 | return 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * x *
151 | (1.0 + 0.044715 * x * x)))
152 |
153 |
154 | ACT2FN = {
155 | "relu": nn.ReLU(),
156 | "gelu_impl": gelu_impl,
157 | "silu": SiLUActivation(),
158 | "swish": SiLUActivation(),
159 | "gelu": GELUActivation(),
160 | "tanh": nn.Tanh(),
161 | "gelu_python": GELUActivation(use_gelu_python=True),
162 | "gelu_new": NewGELUActivation(),
163 | "gelu_fast": FastGELUActivation(),
164 | "quick_gelu": QuickGELUActivation(),
165 | "gelu_10": ClippedGELUActivation(-10, 10),
166 | "mish": MishActivation(),
167 | "linear": LinearActivation(),
168 | "sigmoid": nn.Sigmoid(),
169 | }
170 |
171 |
172 | def get_activation(activation_string):
173 | if activation_string in ACT2FN:
174 | return ACT2FN[activation_string]
175 | else:
176 | raise KeyError(
177 | f"function {activation_string} not found in ACT2FN mapping {list(ACT2FN.keys())}"
178 | )
179 |
180 |
181 | # For backwards compatibility with: from activations import gelu_python
182 | gelu_python = get_activation("gelu_python")
183 | gelu_new = get_activation("gelu_new")
184 | gelu = get_activation("gelu")
185 | gelu_fast = get_activation("gelu_fast")
186 | quick_gelu = get_activation("quick_gelu")
187 | silu = get_activation("silu")
188 | mish = get_activation("mish")
189 | linear_act = get_activation("linear")
190 | gelu_impl = get_activation("gelu_impl")
191 | relu = get_activation("relu")
192 |
--------------------------------------------------------------------------------
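
A minimal usage sketch for the activation registry defined above (the import path follows the repository tree; tensor values are purely illustrative):

import torch
from bert_seq2seq.model.layers.activations import get_activation

act = get_activation("gelu_new")   # returns the NewGELUActivation() instance from ACT2FN
x = torch.randn(2, 8)
y = act(x)                         # applied element-wise; output shape equals input shape
assert y.shape == x.shape

try:
    get_activation("gelu_typo")    # unknown names raise KeyError listing ACT2FN's keys
except KeyError as err:
    print(err)
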
/bert_seq2seq/dataset.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch.utils.data import DataLoader, Dataset
3 | import numpy as np
4 |
5 | def padding(indice, max_length, pad_idx=0):
6 | """
7 |     Pad each sequence in `indice` to `max_length` with `pad_idx` and return a tensor.
8 | """
9 |
10 | pad_indice = [item + [pad_idx] * max(0, max_length - len(item)) for item in indice]
11 | return torch.tensor(pad_indice)
12 |
13 | def sequence_padding(inputs, length=None, value=0, seq_dims=1, mode='post'):  # pad the leading seq_dims dimensions of each array to a common shape
14 |
15 | if length is None:
16 | length = np.max([np.shape(x)[:seq_dims] for x in inputs], axis=0)
17 | elif not hasattr(length, '__getitem__'):
18 | length = [length]
19 |
20 | slices = [np.s_[:length[i]] for i in range(seq_dims)]
21 | slices = tuple(slices) if len(slices) > 1 else slices[0]
22 | pad_width = [(0, 0) for _ in np.shape(inputs[0])]
23 |
24 | outputs = []
25 | for x in inputs:
26 | x = x[slices]
27 | for i in range(seq_dims):
28 | if mode == 'post':
29 | pad_width[i] = (0, length[i] - np.shape(x)[i])
30 | elif mode == 'pre':
31 | pad_width[i] = (length[i] - np.shape(x)[i], 0)
32 | else:
33 | raise ValueError('"mode" argument must be "post" or "pre".')
34 | x = np.pad(x, pad_width, 'constant', constant_values=value)
35 | outputs.append(x)
36 |
37 | return np.array(outputs)
38 |
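# Illustrative behaviour of sequence_padding on ragged inputs:
#     sequence_padding([[1, 2, 3], [4, 5]])
#     -> array([[1, 2, 3],
#               [4, 5, 0]])
# With seq_dims > 1 the leading seq_dims dimensions are all padded to the batch
# maximum, which is how the GPLinker collate function below aligns label arrays
# whose span counts differ from sample to sample.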
39 | def gpt_collate_fn(batch):
40 |
41 | token_ids = [data["input_ids"] for data in batch]
42 | max_length = max([len(t) for t in token_ids])
43 |
44 | token_ids_padded = padding(token_ids, max_length)
45 | target_ids_padded = token_ids_padded.clone()
46 | target_ids_padded[target_ids_padded == 0] = -100
47 |
48 | return {
49 | "input_ids": token_ids_padded,
50 | "labels": target_ids_padded
51 | }
52 |
53 | def t5_seq2seq_collate_fn(batch):
54 |
55 | token_ids_src = [data["input_ids"] for data in batch]
56 | max_length_src = max([len(t) for t in token_ids_src])
57 | token_ids_tgt = [data["target_ids"] for data in batch]
58 | max_length_tgt = max([len(t) for t in token_ids_tgt])
59 |
60 | token_ids_padded = padding(token_ids_src, max_length_src)
61 | target_ids_padded = padding(token_ids_tgt, max_length_tgt)
62 | labels_ids = target_ids_padded.clone()
63 | labels_ids[labels_ids == 0] = -100
64 | target_ids_padded = target_ids_padded[:, :-1].contiguous()
65 | labels_ids = labels_ids[:, 1:].contiguous()
66 |
67 | return {
68 | "input_ids": token_ids_padded,
69 | "decoder_input_ids": target_ids_padded,
70 | "labels": labels_ids
71 | }
72 |
73 | def bert_seq2seq_collate_fn(batch):
74 |
75 | token_ids = [data["input_ids"] for data in batch]
76 | max_length = max([len(t) for t in token_ids])
77 | token_type_ids = [data["token_type_ids"] for data in batch]
78 |
79 | token_ids_padded = padding(token_ids, max_length)
80 | token_type_ids_padded = padding(token_type_ids, max_length)
81 | target_ids_padded = token_ids_padded[:, 1:].contiguous()
82 |
83 | return {
84 | "input_ids": token_ids_padded,
85 | "token_type_ids": token_type_ids_padded,
86 | "labels": target_ids_padded
87 | }
88 |
89 | def bert_cls_collate_fn(batch):
90 |
91 | token_ids = [data["input_ids"] for data in batch]
92 | max_length = max([len(t) for t in token_ids])
93 | token_type_ids = [data["token_type_ids"] for data in batch]
94 | target_ids = [data["labels"] for data in batch]
95 | target_ids = torch.tensor(target_ids, dtype=torch.long)
96 |
97 | token_ids_padded = padding(token_ids, max_length)
98 | token_type_ids_padded = padding(token_type_ids, max_length)
99 |
100 | return {
101 | "input_ids": token_ids_padded,
102 | "token_type_ids": token_type_ids_padded,
103 | "labels": target_ids
104 | }
105 |
106 | def bert_sequence_label_collate_fn(batch):
107 |
108 | token_ids = [data["input_ids"] for data in batch]
109 |
110 | max_length = max([len(t) for t in token_ids])
111 | target_ids = [data["labels"] for data in batch]
112 |
113 | token_ids_padded = padding(token_ids, max_length)
114 | target_ids_padded = padding(target_ids, max_length)
115 |
116 | return {
117 | "input_ids": token_ids_padded,
118 | "token_type_ids": None,
119 | "labels": target_ids_padded
120 | }
121 |
122 | def bert_sequence_label_gp_collate_fn(batch):
123 |
124 | token_ids = [data["input_ids"] for data in batch]
125 | labels = [data["labels"] for data in batch]
126 | token_ids_padded = sequence_padding(token_ids)
127 | labels_padded = sequence_padding(labels, seq_dims=3)
128 | token_ids_padded = torch.from_numpy(token_ids_padded)
129 | labels_padded = torch.from_numpy(labels_padded)
130 |
131 | return {
132 | "input_ids": token_ids_padded,
133 | "token_type_ids": None,
134 | "labels": labels_padded
135 | }
136 |
137 | def bert_gplinker_collate_fn(batch):
138 | input_ids = [data["input_ids"] for data in batch]
139 | token_type_ids = [data["token_type_ids"] for data in batch]
140 | entity_labels = [data["entity_labels"] for data in batch]
141 | head_labels = [data["head_labels"] for data in batch]
142 | tail_labels = [data["tail_labels"] for data in batch]
143 |
144 | input_ids = sequence_padding(input_ids)
145 | token_type_ids = sequence_padding(token_type_ids)
146 | entity_labels = sequence_padding(entity_labels, seq_dims=2)
147 | head_labels = sequence_padding(head_labels, seq_dims=2)
148 | tail_labels = sequence_padding(tail_labels, seq_dims=2)
149 |
150 | input_ids = torch.from_numpy(input_ids).long()
151 | token_type_ids = torch.from_numpy(token_type_ids).long()
152 | entity_labels = torch.from_numpy(entity_labels).long()
153 | head_labels = torch.from_numpy(head_labels).long()
154 | tail_labels = torch.from_numpy(tail_labels).long()
155 |
156 | return {
157 | "input_ids": input_ids,
158 | "token_type_ids": token_type_ids,
159 | "entity_labels": entity_labels,
160 | "head_labels": head_labels,
161 | "tail_labels": tail_labels
162 | }
163 |
164 | def pad_token(tokens, max_length):
165 | pad_len = max_length - len(tokens)
166 | # pad id is 50000
167 | tokens += [50000] * pad_len
168 | return tokens
169 |
170 | def pad_position_ids(position_ids, max_length):
171 | pad_len = max_length - len(position_ids[0])
172 | position_ids[0] += [len(position_ids[0]) + x for x in range(pad_len)]
173 | position_ids[1] += [1] * pad_len
174 | return position_ids
175 |
176 | def pad_loss_mask(loss_mask, max_length):
177 | pad_len = max_length - len(loss_mask)
178 | loss_mask += [0] * pad_len
179 | return loss_mask
180 |
181 | def glm_generation_collate_fn(batch):  # pad every field to the max length within the batch
182 |
183 | input_ids = [data["input_ids"] for data in batch]
184 | position_ids = [data["position_ids"] for data in batch]
185 | attention_mask = [data['attention_mask'] for data in batch]
186 | loss_mask = [data['loss_mask'] for data in batch]
187 | labels = [data['labels'] for data in batch]
188 |
189 | max_length = max([len(t) for t in input_ids])
190 | for i in range(len(input_ids)):
191 | input_ids[i] = pad_token(input_ids[i], max_length)
192 | labels[i] = pad_token(labels[i], max_length)
193 | position_ids[i] = pad_position_ids(position_ids[i],
194 | max_length)
195 | loss_mask[i] = pad_loss_mask(loss_mask[i], max_length)
196 | return {
197 | 'input_ids': torch.LongTensor(input_ids),
198 | 'position_ids': torch.LongTensor(position_ids),
199 | 'attention_mask': torch.LongTensor(attention_mask),
200 | 'loss_mask': torch.LongTensor(loss_mask),
201 | 'labels': torch.LongTensor(labels),
202 | }
203 |
--------------------------------------------------------------------------------
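
A short sketch of how these collate functions plug into a torch DataLoader; the toy dataset and token ids below are invented for illustration:

from torch.utils.data import DataLoader, Dataset
from bert_seq2seq.dataset import gpt_collate_fn

class ToyDataset(Dataset):  # hypothetical stand-in for a real tokenized dataset
    samples = [[101, 7, 8, 102], [101, 9, 102]]

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, i):
        return {"input_ids": self.samples[i]}

loader = DataLoader(ToyDataset(), batch_size=2, collate_fn=gpt_collate_fn)
batch = next(iter(loader))
print(batch["input_ids"])  # padded with 0 up to the longest sequence in the batch
print(batch["labels"])     # pad positions replaced by -100, CrossEntropyLoss's ignore_index
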
/bert_seq2seq/mpu/mp_tools.py:
--------------------------------------------------------------------------------
1 | # Copyright © 2022 BAAI. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License")
4 | import sys
5 | import os
6 | import torch
7 | import copy
8 |
9 | from_1_to_n_models = {  # value = split dimension (0 or 1); 30 marks a fused QKV tensor split block-wise along dim 0
10 | "gpt": {
11 | "wte.weight": 0,
12 | "attn.c_attn.weight": 30,
13 | "attn.c_attn.bias": 30,
14 | "attn.c_proj.weight": 1,
15 | "mlp.c_fc.weight": 0,
16 | "mlp.c_fc.bias": 0,
17 | "mlp.c_proj.weight": 1,
18 | },
19 | "opt": {
20 | "decoder.embed_tokens.weight": 0,
21 | "self_attn.k_proj.weight": 0,
22 | "self_attn.k_proj.bias": 0,
23 | "self_attn.q_proj.weight": 0,
24 | "self_attn.q_proj.bias": 0,
25 | "self_attn.v_proj.weight": 0,
26 | "self_attn.v_proj.bias": 0,
27 |
28 | "self_attn.out_proj.weight": 1,
29 | "fc1.weight": 0,
30 | "fc1.bias": 0,
31 | "fc2.weight": 1,
32 | },
33 | "glm": {
34 | "word_embeddings.weight": 0,
35 | "attention.query_key_value.weight": 30,
36 | "attention.query_key_value.bias": 30,
37 | "attention.dense.weight": 1,
38 | "mlp.dense_h_to_4h.weight": 0,
39 | "mlp.dense_h_to_4h.bias": 0,
40 | "mlp.dense_4h_to_h.weight": 1,
41 | },
42 | "t5": {
43 |         # no split rules defined for t5 yet
44 | },
45 | }
46 |
47 | def check_pytorch_model_mp_size(checkpoint: str, target_mp: int):
48 | """
49 | check the checkpoints contains the weights for mp_size = target_mp
50 | """
51 | assert os.path.isdir(checkpoint)
52 | filenames = os.listdir(checkpoint)
53 | filenames = [
54 | filename for filename in filenames
55 | if filename.startswith("pytorch_model")
56 | ]
57 |     if 'pytorch_model.bin' in filenames and target_mp == 1:
58 |         return True
59 |     elif 'pytorch_model.bin' in filenames:
60 |         filenames.remove('pytorch_model.bin')
61 | print(
62 | "check the weight files in {}, the number of mp_size({}) {} num_of_files({})"
63 | .format(checkpoint, target_mp,
64 | "=" if target_mp == len(filenames) else "!=", len(filenames)))
65 | return target_mp == len(filenames)
66 |
67 | def change_pytorch_model_mp_from_1_to_n(model_name_brief, checkpoint: str, target_mp: int):
68 | trans_keys = from_1_to_n_models.get(model_name_brief, None)
69 | if trans_keys is None:
70 |         print(f"Unsupported model_name: {model_name_brief}")
71 | os._exit(0)
72 |
73 | if check_pytorch_model_mp_size(checkpoint, target_mp):
74 | return
75 | assert os.path.isdir(checkpoint)
76 | filenames = os.listdir(checkpoint)
77 | filenames = [
78 | filename for filename in filenames
79 | if filename.startswith("pytorch_model")
80 | ]
81 | if 'pytorch_model.bin' in filenames and target_mp > 1:
82 | filenames = ['pytorch_model.bin']
83 | filenames = [os.path.join(checkpoint, x) for x in filenames]
84 |
85 | if target_mp == len(filenames):
86 |         print("MP size is unchanged.")
87 | exit(0)
88 |
89 | if checkpoint[-1] == '/':
90 | new_checkpoint = checkpoint[:-1]
91 | else:
92 | new_checkpoint = checkpoint
93 | preserve_keys = [
94 | "lr_scheduler",
95 | "skipped_steps",
96 | "global_steps",
97 | "global_samples",
98 | "dp_world_size",
99 | "iteration",
100 | "client_lr_scheduler",
101 | "np_rng_state",
102 | "random_rng_state",
103 | "torch_rng_state",
104 | "cuda_rng_state",
105 | "rng_tracker_states",
106 | ]
107 |
108 | if target_mp > len(filenames):
109 | print("Increase MP size.")
110 | assert target_mp % len(filenames) == 0
111 | ratio = target_mp // len(filenames)
112 | for i in range(len(filenames)):
113 | start = ratio * i
114 | end = ratio * (i + 1)
115 | d = torch.load(filenames[i], map_location='cpu')
116 | # if d.get("module", None) is None:
117 | # d["module"] = d
118 |
119 | for j in range(start, end):
120 | d_new = {}
121 | shift = j - start
122 | for k, v in d.items():
123 | if k != 'module':
124 | if k in preserve_keys:
125 | d_new[k] = copy.deepcopy(d[k])
126 | elif k == "mp_world_size":
127 | d_new[k] = target_mp
128 | else:
129 | d_new[k] = None
130 | d_new['module'] = {}
131 | with torch.no_grad():
132 | if "module" in d:
133 | d = d["module"]
134 |
135 | for k, v in d.items():
136 | assert len(v.shape) < 3
137 | flag = 0
138 | for keys in trans_keys:
139 | if keys in k:
140 | flag = 1
141 | # find a key to cut
142 | dim = trans_keys[keys]
143 |
144 | if len(v.shape) == 2:
145 | if dim == 30:
146 | part = v.shape[0] // ratio // 3
147 | d_new['module'][k] = torch.cat([
148 | v[shift * part:(shift + 1) *
149 | part, :].clone(),
150 | v[(shift + ratio) *
151 | part:(shift + 1 + ratio) *
152 | part, :].clone(),
153 | v[(shift + 2 * ratio) *
154 | part:(shift + 1 + 2 * ratio) *
155 | part, :].clone()
156 | ], 0)
157 | break
158 |
159 | elif dim == 0:
160 | part = v.shape[dim] // ratio
161 | d_new['module'][k] = v[shift *
162 | part:(shift + 1) *
163 | part, :].clone()
164 | break
165 |
166 | elif dim == 1:
167 | part = v.shape[dim] // ratio
168 | d_new['module'][k] = v[:, shift *
169 | part:(shift + 1) *
170 | part].clone()
171 | break
172 |
173 | elif len(v.shape) == 1:
174 | if dim == 30:
175 | part = v.shape[0] // ratio // 3
176 | d_new['module'][k] = torch.cat([
177 | v[shift * part:(shift + 1) *
178 | part].clone(),
179 | v[(shift + ratio) *
180 | part:(shift + 1 + ratio) *
181 | part].clone(),
182 | v[(shift + 2 * ratio) *
183 | part:(shift + 1 + 2 * ratio) *
184 | part].clone()
185 | ], 0)
186 | break
187 |
188 |                         else:
189 |                             part = v.shape[0] // ratio
190 |                             d_new['module'][k] = v[shift * part:(shift + 1) * part].clone()
191 |                             break
192 |
193 | if flag == 0:
194 | d_new['module'][k] = v.clone()
195 |
196 |
197 |             print("saving model parallel shard {:02d}".format(j))
198 | filename = os.path.join(new_checkpoint,
199 | "pytorch_model_{:02d}.bin".format(j))
200 | torch.save(d_new, filename)
201 |
--------------------------------------------------------------------------------
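
The dictionary values in from_1_to_n_models encode how each weight is partitioned across model-parallel ranks: 0 and 1 give the dimension to slice, while the sentinel 30 marks a fused query/key/value tensor whose three stacked blocks must each be sliced separately. A toy illustration of the dim = 30 case (tensor contents invented):

import torch

ratio, shift = 2, 0                  # split mp_size 1 -> 2 and look at rank 0
v = torch.arange(12).reshape(12, 1)  # pretend rows 0-3 = Q, 4-7 = K, 8-11 = V
part = v.shape[0] // ratio // 3      # rows per block per rank = 2
rank0 = torch.cat([
    v[shift * part:(shift + 1) * part],                            # Q rows 0-1
    v[(shift + ratio) * part:(shift + 1 + ratio) * part],          # K rows 4-5
    v[(shift + 2 * ratio) * part:(shift + 1 + 2 * ratio) * part],  # V rows 8-9
], 0)
print(rank0.flatten().tolist())      # [0, 1, 4, 5, 8, 9]
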
/bert_seq2seq/layers.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | import numpy as np
5 |
6 | class GlobalPointer(nn.Module):
7 | def __init__(self, hidden_size, ent_type_size, inner_dim, RoPE=True, trill_mask=True):
8 | super().__init__()
9 | self.ent_type_size = ent_type_size
10 | self.inner_dim = inner_dim
11 | self.hidden_size = hidden_size
12 | self.dense = nn.Linear(self.hidden_size, self.ent_type_size * self.inner_dim * 2)
13 | self.trill_mask = trill_mask
14 | self.RoPE = RoPE
15 |
16 | def sinusoidal_position_embedding(self, batch_size, seq_len, output_dim):
17 | position_ids = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(-1)
18 |
19 | indices = torch.arange(0, output_dim // 2, dtype=torch.float)
20 | indices = torch.pow(10000, -2 * indices / output_dim)
21 | embeddings = position_ids * indices
22 | embeddings = torch.stack([torch.sin(embeddings), torch.cos(embeddings)], dim=-1)
23 | embeddings = embeddings.repeat((batch_size, *([1]*len(embeddings.shape))))
24 | embeddings = torch.reshape(embeddings, (batch_size, seq_len, output_dim))
25 | embeddings = embeddings.to(self.device)
26 | return embeddings
27 |
28 | def rope(self, batch_size, seq_len, dim, qw, kw):
29 | # pos_emb:(batch_size, seq_len, inner_dim)
30 | pos_emb = self.sinusoidal_position_embedding(batch_size, seq_len, dim)
31 | # cos_pos,sin_pos: (batch_size, seq_len, 1, inner_dim)
32 | cos_pos = pos_emb[..., None, 1::2].repeat_interleave(2, dim=-1)
33 | sin_pos = pos_emb[..., None,::2].repeat_interleave(2, dim=-1)
34 | qw2 = torch.stack([-qw[..., 1::2], qw[...,::2]], -1)
35 | qw2 = qw2.reshape(qw.shape)
36 | qw = qw * cos_pos + qw2 * sin_pos
37 | kw2 = torch.stack([-kw[..., 1::2], kw[...,::2]], -1)
38 | kw2 = kw2.reshape(kw.shape)
39 | kw = kw * cos_pos + kw2 * sin_pos
40 | return qw, kw
41 |
42 | def forward(self, last_hidden_state, padding_mask):
43 | self.device = last_hidden_state.device
44 | batch_size = last_hidden_state.size()[0]
45 | seq_len = last_hidden_state.size()[1]
46 |
47 | # outputs:(batch_size, seq_len, ent_type_size*inner_dim*2)
48 | outputs = self.dense(last_hidden_state)
49 | outputs = torch.split(outputs, self.inner_dim * 2, dim=-1)
50 | # outputs:(batch_size, seq_len, ent_type_size, inner_dim*2)
51 | outputs = torch.stack(outputs, dim=-2)
52 | # qw,kw:(batch_size, seq_len, ent_type_size, inner_dim)
53 |         qw, kw = outputs[..., :self.inner_dim], outputs[..., self.inner_dim:]  # TODO: switch to separate Linear projections?
54 |
55 | if self.RoPE:
56 | qw, kw = self.rope(batch_size, seq_len, self.inner_dim, qw, kw)
57 |
58 | # logits:(batch_size, ent_type_size, seq_len, seq_len)
59 | logits = torch.einsum('bmhd,bnhd->bhmn', qw, kw)
60 |
61 | # padding mask
62 | pad_mask = padding_mask.unsqueeze(1).unsqueeze(1).expand(batch_size, self.ent_type_size, seq_len, seq_len)
63 | # pad_mask_h = attention_mask.unsqueeze(1).unsqueeze(-1).expand(batch_size, self.ent_type_size, seq_len, seq_len)
64 | # pad_mask = pad_mask_v&pad_mask_h
65 | logits = logits*pad_mask - (1-pad_mask)*1e12
66 |
67 |         # mask out the lower triangle (valid spans need start <= end)
68 | if self.trill_mask:
69 | mask = torch.tril(torch.ones_like(logits), -1)
70 | logits = logits - mask * 1e12
71 |
72 | return logits/self.inner_dim**0.5
73 |
74 | def compute_loss(self, logits, labels):
75 |         # logits: (batch_size, ent_type_size, seq_len, seq_len)
76 |         # labels: multi-hot tensor with the same shape as logits
77 | 
78 |
79 | bh = logits.shape[0] * logits.shape[1]
80 | labels = torch.reshape(labels, shape=(bh, -1))
81 | logits = torch.reshape(logits, shape=(bh, -1))
82 | return multilabel_crossentropy(logits, labels)
83 |
84 | def compute_loss_sparse(self, logits, labels, mask_zero=False):
85 | return sparse_multilabel_categorical_crossentropy(y_pred=logits, y_true=labels, mask_zero=mask_zero)
86 |
87 |
88 | def multilabel_crossentropy(y_pred, y_true):
89 | """
90 | https://kexue.fm/archives/7359
91 | """
92 | y_pred = (1 - 2 * y_true) * y_pred # -1 -> pos classes, 1 -> neg classes
93 | y_pred_neg = y_pred - y_true * 1e12 # mask the pred outputs of pos classes
94 | y_pred_pos = (y_pred - (1 - y_true) * 1e12) # mask the pred outputs of neg classes
95 | zeros = torch.zeros_like(y_pred[..., :1])
96 | y_pred_neg = torch.cat([y_pred_neg, zeros], dim=-1)
97 | y_pred_pos = torch.cat([y_pred_pos, zeros], dim=-1)
98 | neg_loss = torch.logsumexp(y_pred_neg, dim=-1)
99 | pos_loss = torch.logsumexp(y_pred_pos, dim=-1)
100 |
101 | return (neg_loss + pos_loss).mean()
102 |
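# The function above implements the multi-label softmax loss from
# https://kexue.fm/archives/7359:
#
#     loss = log(1 + sum_{i in neg} exp(s_i)) + log(1 + sum_{j in pos} exp(-s_j))
#
# The appended zeros column supplies the "+1" inside each logsumexp, and the
# 1e12 offsets remove positive scores from the negative term and vice versa.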
103 | def sparse_multilabel_categorical_crossentropy(y_true=None, y_pred=None, mask_zero=False):
104 | '''
105 |     PyTorch implementation of sparse multi-label categorical cross-entropy.
106 | '''
107 | shape = y_pred.shape
108 | y_true = y_true[..., 0] * shape[2] + y_true[..., 1]
109 | y_pred = y_pred.reshape(shape[0], -1, np.prod(shape[2:]))
110 | zeros = torch.zeros_like(y_pred[...,:1])
111 | y_pred = torch.cat([y_pred, zeros], dim=-1)
112 | if mask_zero:
113 | infs = zeros + 1e12
114 | y_pred = torch.cat([infs, y_pred[..., 1:]], dim=-1)
115 | y_pos_2 = torch.gather(y_pred, index=y_true, dim=-1)
116 | y_pos_1 = torch.cat([y_pos_2, zeros], dim=-1)
117 | if mask_zero:
118 | y_pred = torch.cat([-infs, y_pred[..., 1:]], dim=-1)
119 | y_pos_2 = torch.gather(y_pred, index=y_true, dim=-1)
120 | pos_loss = torch.logsumexp(-y_pos_1, dim=-1)
121 | all_loss = torch.logsumexp(y_pred, dim=-1)
122 | aux_loss = torch.logsumexp(y_pos_2, dim=-1) - all_loss
123 | aux_loss = torch.clip(1 - torch.exp(aux_loss), 1e-10, 1)
124 | neg_loss = all_loss + torch.log(aux_loss)
125 | loss = torch.mean(torch.sum(pos_loss + neg_loss))
126 | return loss
127 |
128 | class CRFLayer(nn.Module):
129 |     """Conditional random field layer with a learnable transition matrix;
130 |     computes the sequence-level negative log-likelihood."""
131 | def __init__(self, output_dim):
132 | super(CRFLayer, self).__init__()
133 |
134 | self.output_dim = output_dim
135 | self.trans = nn.Parameter(torch.Tensor(output_dim, output_dim))
136 | self.trans.data.uniform_(-0.1, 0.1)
137 |
138 | def compute_loss(self, y_pred, y_true, mask):
139 | """
140 |         Compute the CRF loss (log-partition minus gold-path score).
141 | """
142 | y_pred = y_pred * mask
143 | y_true = y_true * mask
144 | target_score = self.target_score(y_pred, y_true)
145 | log_norm = self.log_norm_step(y_pred, mask)
146 |         log_norm = self.logsumexp(log_norm, dim=1)  # reduce to one scalar per sample
147 | return log_norm - target_score
148 |
149 | def forward(self, y_pred, y_true, mask):
150 | """
151 | y_true: [[1, 2, 3], [2, 3, 0] ]
152 | mask: [[1, 1, 1], [1, 1, 0]]
153 | """
154 | if y_pred.shape[0] != mask.shape[0] or y_pred.shape[1] != mask.shape[1]:
155 |             raise ValueError("mask shape does not match y_pred shape")
156 | mask = mask.reshape((mask.shape[0], mask.shape[1], 1))
157 | mask = mask.float()
158 | y_true = y_true.reshape(y_pred.shape[:-1])
159 | y_true = y_true.long()
160 | y_true_onehot = F.one_hot(y_true, self.output_dim)
161 | y_true_onehot = y_true_onehot.float()
162 |
163 | return self.compute_loss(y_pred, y_true_onehot, mask)
164 |
165 | def target_score(self, y_pred, y_true):
166 | """
167 |         Emission score plus transition score of the gold path.
168 | y_true: (batch, seq_len, out_dim)
169 | y_pred: (batch, seq_len, out_dim)
170 | """
171 | # print(y_pred.shape)
172 | # print(y_true.shape)
173 | point_score = torch.einsum("bni,bni->b", y_pred, y_true)
174 | trans_score = torch.einsum("bni,ij,bnj->b", y_true[:, :-1], self.trans, y_true[:, 1: ])
175 |
176 | return point_score + trans_score
177 |
178 | def log_norm_step(self, y_pred, mask):
179 | """
180 |         Compute the log partition function log Z(X) by forward recursion.
181 | """
182 |         state = y_pred[:, 0]  # initial forward scores
183 | y_pred = y_pred[:, 1: ].contiguous()
184 | mask = mask[:, 1:].contiguous()
185 | batch, seq_len, out_dim = y_pred.shape
186 | for t in range(seq_len):
187 | cur_mask = mask[:, t]
188 | state = torch.unsqueeze(state, 2) # (batch, out_dim, 1)
189 | g = torch.unsqueeze(self.trans, 0) # (1, out_dim, out_dim)
190 | outputs = self.logsumexp(state + g, dim=1) # batch, out_dim
191 | outputs = outputs + y_pred[:, t]
192 | outputs = cur_mask * outputs + (1 - cur_mask) * state.squeeze(-1)
193 | state = outputs
194 |
195 | return outputs
196 |
197 | def logsumexp(self, x, dim=None, keepdim=False):
198 | """
199 |         Numerically stable log-sum-exp (subtracts the max to avoid overflow).
200 | """
201 | if dim is None:
202 | x, dim = x.view(-1), 0
203 | xm, _ = torch.max(x, dim, keepdim=True)
204 | out = xm + torch.log(torch.sum(torch.exp(x - xm), dim=dim, keepdim=True))
205 | return out if keepdim else out.squeeze(dim)
206 |
--------------------------------------------------------------------------------
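
A minimal forward-pass sketch for the GlobalPointer head above; the hidden states are random, so only the shapes are meaningful:

import torch
from bert_seq2seq.layers import GlobalPointer

batch, seq_len, hidden = 2, 16, 768
gp = GlobalPointer(hidden_size=hidden, ent_type_size=3, inner_dim=64)
hidden_states = torch.randn(batch, seq_len, hidden)  # e.g. BERT's last_hidden_state
padding_mask = torch.ones(batch, seq_len)            # 1 = real token, 0 = padding
logits = gp(hidden_states, padding_mask)
print(logits.shape)  # torch.Size([2, 3, 16, 16]): a score per (entity type, start, end)
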
/examples/relationship_extraction/train_bert_relationship_extraction.py:
--------------------------------------------------------------------------------
1 |
2 | import json
3 | import numpy as np
4 | import torch
5 | import os
6 | from tqdm import tqdm
7 | from bert_seq2seq import Trainer
8 | from bert_seq2seq import Tokenizer
9 | from torch.utils.data import Dataset
10 | from bert_seq2seq.dataset import bert_gplinker_collate_fn, sequence_padding
11 | from bert_seq2seq.utils import load_model
12 |
13 | vocab_path = "../state_dict/roberta/vocab.txt"
14 | model_path = "../state_dict/roberta/pytorch_model.bin"
15 | model_save_path = "./bert_relation_extraction.bin"
16 | task_name = "relationship_extraction"
17 | model_name = "roberta"
18 | epoches = 5
19 | data_dir = "../data/三元组抽取"
20 | train_path = os.path.join(data_dir, "train_data.json")
21 | val_path = os.path.join(data_dir, "dev_data.json")
22 |
23 | batch_size = 8
24 | maxlen = 128
25 | lr = 1e-5
26 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
27 | trainer = Trainer(epoches=epoches, env_type="pytorch",
28 | val_every_step=1000, batch_size=batch_size,
29 | device=device,
30 | # num_nodes=1,
31 | # num_gpus=4,
32 | # training_script=__file__,
33 | )
34 |
35 | def load_data(filename):
36 |     """Load the data.
37 |     One record per line: {'text': text, 'spo_list': [(s, p, o)]}
38 | """
39 | D = []
40 | with open(filename, encoding='utf-8') as f:
41 | for l in f:
42 | l = json.loads(l)
43 | D.append({
44 | 'text': l['text'],
45 | 'spo_list': [(spo['subject'], spo['predicate'], spo['object'])
46 | for spo in l['spo_list']]
47 | })
48 | return D
49 |
50 | def load_target():
51 | target = []
52 | with open(os.path.join(data_dir, 'all_50_schemas')) as f:
53 | for l in f:
54 | l = json.loads(l)
55 | if l['predicate'] not in target:
56 | target.append(l['predicate'])
57 | return target
58 |
59 | def search(pattern, sequence):
60 |     """Find the subsequence `pattern` in `sequence`.
61 |     Return the index of the first match, or -1 if not found.
62 | """
63 | n = len(pattern)
64 | for i in range(len(sequence)):
65 | if sequence[i:i + n] == pattern:
66 | return i
67 | return -1
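# e.g. search([3, 4], [1, 2, 3, 4, 5]) -> 2;  search([9], [1, 2]) -> -1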
68 |
69 | # build the tokenizer
70 | tokenizer = Tokenizer(vocab_path)
71 | train_data = load_data(train_path)
72 | valid_data = load_data(val_path)
73 | target = load_target()
74 | model = load_model(tokenizer.vocab, model_name=model_name,
75 | task_name=task_name, target_size=len(target),
76 |                    ner_inner_dim=64)
77 |
78 | class RelationshipDataset(Dataset):
79 | def __init__(self, data):
80 |         super().__init__()
81 | self.data = data
82 |
83 | def __getitem__(self, i):
84 | data = self.data[i]
85 |
86 | tokenizer_out = tokenizer.encode_plus(data["text"], max_length=maxlen, truncation=True)
87 | input_ids = tokenizer_out["input_ids"]
88 | token_type_ids = tokenizer_out["token_type_ids"]
89 |
90 | spoes = set()
91 | for s, p, o in data['spo_list']:
92 | s = tokenizer.encode_plus(s)["input_ids"][1:-1]
93 | p = target.index(p)
94 |
95 | o = tokenizer.encode_plus(o)["input_ids"][1:-1]
96 | sh = search(s, input_ids)
97 | oh = search(o, input_ids)
98 | if sh != -1 and oh != -1:
99 | spoes.add((sh, sh + len(s) - 1, p, oh, oh + len(o) - 1))
100 |
101 |         # build the GPLinker labels
102 | entity_labels = [set() for _ in range(2)]
103 | head_labels = [set() for _ in range(len(target))]
104 | tail_labels = [set() for _ in range(len(target))]
105 | for sh, st, p, oh, ot in spoes:
106 | entity_labels[0].add((sh, st))
107 | entity_labels[1].add((oh, ot))
108 | head_labels[p].add((sh, oh))
109 | tail_labels[p].add((st, ot))
110 |
111 | for label in entity_labels + head_labels + tail_labels:
112 |             if not label:  # every label set needs at least one entry
113 |                 label.add((0, 0))  # pad empty sets with (0, 0)
114 |
115 | entity_labels = sequence_padding([list(l) for l in entity_labels])
116 | head_labels = sequence_padding([list(l) for l in head_labels])
117 | tail_labels = sequence_padding([list(l) for l in tail_labels])
118 |
119 | output = {
120 | "input_ids": input_ids,
121 | "token_type_ids": token_type_ids,
122 | "entity_labels": entity_labels,
123 | "head_labels": head_labels,
124 | "tail_labels": tail_labels,
125 | }
126 | return output
127 |
128 | def __len__(self):
129 | return len(self.data)
130 |
131 | class SPO(tuple):
132 |     """Container class for (s, p, o) triples.
133 |     Behaves like a tuple, but overrides __hash__ and __eq__ so that
134 |     equality checks between two triples tolerate tokenization differences.
135 | """
136 | def __init__(self, spo):
137 | self.spox = (
138 | tuple(tokenizer.tokenize(spo[0])),
139 | spo[1],
140 | tuple(tokenizer.tokenize(spo[2])),
141 | )
142 |
143 | def __hash__(self):
144 | return self.spox.__hash__()
145 |
146 | def __eq__(self, spo):
147 | return self.spox == spo.spox
148 |
149 | def extract_spoes(text, threshold=0):
150 |     """Extract the (subject, predicate, object) triples contained in `text`.
151 | """
152 | tokens = tokenizer.tokenize(text, maxlen=maxlen, add_spatial_tokens=True)
153 | mapping = tokenizer.rematch(text, tokens)
154 | tokenizer_out = tokenizer.encode_plus(text, max_length=maxlen)
155 | input_ids = tokenizer_out["input_ids"]
156 | token_type_ids = tokenizer_out["token_type_ids"]
157 |
158 | input_ids = torch.tensor(input_ids, device=device)
159 | token_type_ids = torch.tensor(token_type_ids, device=device)
160 | if input_ids.ndim == 1:
161 | input_ids = input_ids.view(1, -1)
162 | token_type_ids = token_type_ids.view(1, -1)
163 | with torch.no_grad():
164 | model_out = model(**{"input_ids": input_ids, "token_type_ids": token_type_ids})
165 |
166 | outputs = [model_out["entity_output"].cpu().numpy(),
167 | model_out["head_output"].cpu().numpy(),
168 | model_out["tail_output"].cpu().numpy()]
169 |
170 | outputs = [o[0] for o in outputs]
171 |     # extract subject and object spans
172 | subjects, objects = set(), set()
173 | outputs[0][:, [0, -1]] -= np.inf
174 | outputs[0][:, :, [0, -1]] -= np.inf
175 | for l, h, t in zip(*np.where(outputs[0] > threshold)):
176 | if l == 0:
177 | subjects.add((h, t))
178 | else:
179 | objects.add((h, t))
180 |     # identify the corresponding predicates
181 | spoes = set()
182 | for sh, st in subjects:
183 | for oh, ot in objects:
184 | p1s = np.where(outputs[1][:, sh, oh] > threshold)[0]
185 | p2s = np.where(outputs[2][:, st, ot] > threshold)[0]
186 | ps = set(p1s) & set(p2s)
187 | for p in ps:
188 | try:
189 | spoes.add((
190 | text[mapping[sh][0]:mapping[st][-1] + 1], target[p],
191 | text[mapping[oh][0]:mapping[ot][-1] + 1]
192 | ))
193 |             except IndexError:  # span falls outside the rematch mapping
194 | continue
195 |
196 | return list(spoes)
197 |
198 | def evaluate(data):
199 |     """Evaluate: compute f1, precision and recall over the dataset.
200 | """
201 | X, Y, Z = 1e-10, 1e-10, 1e-10
202 | f = open('dev_pred.json', 'w', encoding='utf-8')
203 |
204 | for d in tqdm(data, total=len(data)):
205 | R = set([SPO(spo) for spo in extract_spoes(d['text'])])
206 | T = set([SPO(spo) for spo in d['spo_list']])
207 | X += len(R & T)
208 | Y += len(R)
209 | Z += len(T)
210 | f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
211 | s = json.dumps({
212 | 'text': d['text'],
213 | 'spo_list': list(T),
214 | 'spo_list_pred': list(R),
215 | 'new': list(R - T),
216 | 'lack': list(T - R),
217 | },
218 | ensure_ascii=False,
219 | indent=4)
220 | f.write(s + '\n')
221 | f.close()
222 | return f1, precision, recall
223 |
224 | def validate():
225 | text = "南京京九思新能源有限公司于2015年05月15日在南京市江宁区市场监督管理局登记成立"
226 | spo_list = extract_spoes(text)
227 | print(f"spo_list is {spo_list}")
228 | f1, precision, recall = evaluate(valid_data)
229 | print(f"f1 is {f1}, precision is {precision}, recall is {recall}")
230 | return f1
231 |
232 | class Evaluator:
233 | def __init__(self):
234 | self.best_f1 = 0.0
235 |
236 | def on_validation(self, data):
237 | loss = data["loss"]
238 | step = data["iteration"]
239 |         # loss and iteration are available here for custom logging
240 |
241 | def on_epoch_end(self):
242 | f1 = validate()
243 |         if f1 > self.best_f1:
244 |             self.best_f1 = f1
245 |             torch.save(model.state_dict(), model_save_path)
246 |             print(f"model saved: {model_save_path}")
247 |
248 | if __name__ == "__main__":
249 |
250 | train_dataset = RelationshipDataset(train_data)
251 |     optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=1e-5)
252 |
253 | trainer.train(model=model, optimizer=optimizer,
254 | train_dataset=train_dataset,
255 | evaluator=Evaluator,
256 | collate_fn=bert_gplinker_collate_fn)
--------------------------------------------------------------------------------
/bert_seq2seq/predictor/predictor.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from typing import List
3 | import torch
4 | import os
5 | import math
6 | from bert_seq2seq.predictor.utils import viterbi_decode, decode_labels, \
7 | bert_beamsearch, t5_random_sample, gpt_random_sample, \
8 | t5_beamsearch, gpt_beamsearch, bert_random_sample, \
9 | gpt_random_sample_from_ids, glm_random_sample
10 | class Predictor:
11 |
12 | def __init__(self, model, tokenizer):
13 | self.tokenizer = tokenizer
14 | self.model = model
15 | self.model.eval()
16 | self.class_name = type(model).__name__
17 |
18 | def predict_embedding(self, text, maxlen=256, pred_type="cls"):
19 | device = next(self.model.parameters()).device
20 | tokenizer_out = self.tokenizer.encode_plus(text, max_length=maxlen, truncation=True)
21 |
22 | input_ids = tokenizer_out["input_ids"]
23 | token_type_ids = tokenizer_out["token_type_ids"]
24 | input_ids = torch.tensor(input_ids, device=device)
25 | token_type_ids = torch.tensor(token_type_ids, device=device)
26 | if input_ids.ndim == 1:
27 | input_ids = input_ids.view(1, -1)
28 | token_type_ids = token_type_ids.view(1, -1)
29 | with torch.no_grad():
30 | if pred_type == "cls":
31 | score = self.model(**{"input_ids": input_ids, "token_type_ids": token_type_ids})["logits"].cpu()[0, 0]
32 | elif pred_type == "mean":
33 | score = self.model(**{"input_ids": input_ids, "token_type_ids": token_type_ids})["logits"].cpu().mean(dim=1)[0]
34 |         # note: pred_type must be "cls" or "mean"
35 | return score
36 |
37 | def predict_cls_classifier(self, text, max_len=512):
38 | ## text is text or text-pair
39 | device = next(self.model.parameters()).device
40 | if type(text) is str:
41 | tokenizer_out = self.tokenizer.encode_plus(text, max_length=max_len, truncation=True)
42 | else :
43 | assert len(text) == 2
44 | tokenizer_out = self.tokenizer.encode_plus(text[0], text[1], max_length=max_len, truncation=True)
45 |
46 | input_ids = tokenizer_out["input_ids"]
47 | token_type_ids = tokenizer_out["token_type_ids"]
48 | input_ids = torch.tensor(input_ids, device=device)
49 | token_type_ids = torch.tensor(token_type_ids, device=device)
50 | if input_ids.ndim == 1:
51 | input_ids = input_ids.view(1, -1)
52 | token_type_ids = token_type_ids.view(1, -1)
53 | with torch.no_grad():
54 | score = self.model(**{"input_ids": input_ids, "token_type_ids": token_type_ids})["logits"].cpu()[0]
55 | return score
56 |
57 | def predict_masklm(self, text, max_len=512):
58 | device = next(self.model.parameters()).device
59 | tokenizer_out = self.tokenizer.encode_plus(text, max_length=max_len, truncation=True)
60 |
61 | input_ids = tokenizer_out["input_ids"]
62 | token_type_ids = tokenizer_out["token_type_ids"]
63 | input_ids = torch.tensor(input_ids, device=device)
64 | token_type_ids = torch.tensor(token_type_ids, device=device)
65 | if input_ids.ndim == 1:
66 | input_ids = input_ids.view(1, -1)
67 | token_type_ids = token_type_ids.view(1, -1)
68 | with torch.no_grad():
69 | score = self.model(**{"input_ids": input_ids, "token_type_ids": token_type_ids})["logits"].cpu()
70 | score = score.argmax(dim=-1).numpy()[0]
71 | return self.tokenizer.decode(score)
72 |
73 | def predict_ner(self, text, target, maxlen=256):
74 | model = self.model
75 | model.eval()
76 | device = next(model.parameters()).device
77 | tokenizer = self.tokenizer
78 | tokens = tokenizer.tokenize(text, maxlen=maxlen, add_spatial_tokens=True)
79 | mapping = tokenizer.rematch(text, tokens)
80 | token_ids = tokenizer.convert_tokens_to_ids(tokens)
81 | token_ids = torch.tensor([token_ids], dtype=torch.long, device=device)
82 |
83 | trans = model.state_dict().get("crf_layer.trans", None)
84 | if trans is not None:
85 | ## crf
86 | trans = trans.cpu()
87 | with torch.no_grad():
88 | out = model(**{"input_ids": token_ids})["logits"][0].cpu()
89 | labels = viterbi_decode(out, trans)
90 | entities = decode_labels(labels, target)
91 | return [(mapping[w[0]][0], mapping[w[-1]][-1], l) for w, l in entities if mapping[w[0]] and mapping[w[-1]]]
92 |
93 | elif getattr(model, "gp", None) is not None :
94 | entities = []
95 | with torch.no_grad():
96 | scores = model(**{"input_ids": token_ids})["logits"].cpu().numpy()[0]
97 | ## global pointer
98 | scores[:, [0, -1]] -= np.inf
99 | scores[:, :, [0, -1]] -= np.inf
100 | for l, start, end in zip(*np.where(scores > 0)):
101 | if mapping[start] and mapping[end]:
102 | entities.append(
103 | (mapping[start][0], mapping[end][-1], target[l])
104 | )
105 | return entities
106 |
107 | else :
108 | with torch.no_grad():
109 | scores = model(**{"input_ids": token_ids})["logits"].cpu()[0]
110 | labels = scores.argmax(dim=-1)
111 | entities = decode_labels(labels, target)
112 | return [(mapping[w[0]][0], mapping[w[-1]][-1], l) for w, l in entities if mapping[w[0]] and mapping[w[-1]]]
113 |
114 | def predict_generate_beamsearch(self, text, input_max_length=256, out_max_length=100, beam_size=1, ):
115 | self.model.eval()
116 | if "bert" in self.class_name.lower():
117 |             assert "seq2seq" in self.class_name.lower(), "this function only supports seq2seq tasks"
118 | return bert_beamsearch(self.model, self.tokenizer, text, input_max_length=input_max_length,
119 | out_max_length=out_max_length, beam_size=beam_size)
120 | elif "t5" in self.class_name.lower():
121 | return t5_beamsearch(self.model, self.tokenizer, text, input_max_length=input_max_length,
122 | out_max_length=out_max_length, beam_size=beam_size)
123 |
124 | elif "gpt" in self.class_name.lower():
125 | return gpt_beamsearch(self.model, self.tokenizer, text, input_max_length=input_max_length,
126 | out_max_length=out_max_length, beam_size=beam_size)
127 |
128 | else :
129 |             print("Decoding is not supported for this model type")
130 |             # os is already imported at module level
131 |             os._exit(0)
132 |
133 | def predict_generate_randomsample(self, text, input_max_length=256,
134 | out_max_length=200, top_k=30, top_p=1.0,
135 | repetition_penalty=1.0, temperature=1.0, add_sep=False,
136 | ):
137 | device = next(self.model.parameters()).device
138 | if "t5" in self.class_name.lower():
139 | return t5_random_sample(self.model, self.tokenizer, text, input_max_length,
140 | out_max_length, top_k, top_p, repetition_penalty, temperature, device)
141 |
142 | elif "gpt" in self.class_name.lower():
143 | return gpt_random_sample(self.model, self.tokenizer, text, input_max_length,
144 | out_max_length, top_k, top_p, repetition_penalty, temperature, device, add_sep=add_sep)
145 |
146 | elif "bert" in self.class_name.lower():
147 | return bert_random_sample(self.model, self.tokenizer, text, input_max_length,
148 | out_max_length, top_k, top_p, repetition_penalty, temperature, device)
149 |
150 | elif "glm" in self.class_name.lower():
151 | return glm_random_sample(self.model, self.tokenizer, text, input_max_length,
152 | out_max_length, top_k, top_p, repetition_penalty,
153 | temperature, device)
154 |
155 | else:
156 |             print("Decoding is not supported for this model type")
157 | import os
158 | os._exit(0)
159 |
160 | def predict_multi_response(self, sentences: List[str], top_k, top_p,
161 | repetition_penalty, temperature, input_max_length=1024,
162 | out_max_length=100):
163 |         # build a multi-turn prompt: utterances alternate "A:" / "B:" prefixes
164 |
165 | length = sum([len(text) for text in sentences])
166 | if length > input_max_length:
167 |             print(f"dialogue too long: {length}")
168 | os._exit(0)
169 | device = next(self.model.parameters()).device
170 | input_ids = [self.tokenizer.token_start_id]
171 | for index, text in enumerate(sentences):
172 | if (index + 1) % 2 == 1:
173 | input_ids += self.tokenizer.encode_plus("A:" + text, max_length=input_max_length)["input_ids"][1:]
174 | else :
175 | input_ids += self.tokenizer.encode_plus("B:" + text, max_length=input_max_length)["input_ids"][1:]
176 |
177 | if "gpt" in self.class_name.lower():
178 | return gpt_random_sample_from_ids(self.model, self.tokenizer, input_ids,
179 | out_max_length, top_k, top_p, repetition_penalty,
180 | temperature, device)
181 |
182 | else :
183 |             print(f"Decoding is not supported for this model type: {self.class_name}")
184 | os._exit(0)
185 |
186 |
187 |
188 |
189 |
--------------------------------------------------------------------------------
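
An end-to-end sketch of the Predictor API above; the checkpoint paths and the exact load_model arguments are assumptions modelled on the training examples in this repository:

import torch
from bert_seq2seq import Tokenizer
from bert_seq2seq.utils import load_model
from bert_seq2seq.predictor.predictor import Predictor

tokenizer = Tokenizer("./state_dict/roberta/vocab.txt")  # hypothetical path
model = load_model(tokenizer.vocab, model_name="roberta", task_name="seq2seq")
model.load_state_dict(torch.load("./roberta_auto_title.bin",  # hypothetical checkpoint
                                 map_location="cpu"))

predictor = Predictor(model, tokenizer)
print(predictor.predict_generate_beamsearch("some input article", beam_size=3))
print(predictor.predict_generate_randomsample("some input article", top_k=30, top_p=0.9))
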