├── pytorch_nlu ├── corpus │ ├── text_regression │ │ ├── __init__.py │ │ └── negative_sentence │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── dev.json │ │ │ └── train.json │ ├── __init__.py │ ├── text_classification │ │ ├── tnews │ │ │ ├── README.md │ │ │ ├── dev.json │ │ │ └── train.json │ │ └── school │ │ │ ├── README.md │ │ │ ├── test.json │ │ │ ├── dev.json │ │ │ └── train.json │ ├── sequence_labeling │ │ ├── ner_china_people_daily_1998_conll │ │ │ └── README.md │ │ └── ner_china_people_daily_1998_span │ │ │ ├── README.md │ │ │ ├── test.span │ │ │ ├── train.span │ │ │ └── dev.span │ └── text_summary │ │ └── maths_toy │ │ └── README.md ├── __init__.py ├── output │ └── __init__.py ├── pytorch_textclassification │ ├── __init__.py │ ├── tcGraph.py │ ├── tcPredict.py │ └── tcConfig.py ├── pytorch_textregression │ ├── __init__.py │ ├── README.md │ ├── trPredict.py │ ├── trConfig.py │ ├── trGraph.py │ └── trRun.py ├── pytorch_textsummary │ ├── __init__.py │ ├── tsGraph.py │ ├── tsPredict.py │ ├── tsConfig.py │ ├── README.md │ └── tsRun.py ├── pytorch_sequencelabeling │ ├── __init__.py │ ├── slPredict.py │ └── slConfig.py └── version.py ├── requirements.txt ├── test ├── __init__.py ├── sl │ ├── __init__.py │ ├── tet_sl_base_predict.py │ ├── tet_sl_base_softmax.py │ ├── tet_sl_base_data_span.py │ ├── tet_sl_base_data_conll.py │ ├── tet_sl_base_span.py │ ├── tet_sl_base_crf_ernie.py │ ├── tet_sl_base_crf.py │ └── tet_sl_base_grid.py ├── tc │ ├── __init__.py │ ├── tet_tc_base_predict_multiclass.py │ ├── tet_tc_base_predict_multilabel.py │ ├── tet_tc_base_multi_class.py │ ├── tet_tc_base_multi_label_dbloss.py │ ├── tet_tc_base_multi_label_focalloss.py │ ├── tet_tc_base_multi_label.py │ ├── tet_tc_base_multi_label_isadv.py │ └── tet_tc_base_multi_label_choice.py ├── corpus │ ├── __init__.py │ ├── pos_to_conll.py │ └── conll_to_pos.py ├── output │ ├── __init__.py │ ├── text_summary │ │ └── __init__.py │ ├── sequence_labeling │ │ └── __init__.py │ ├── text_regression │ │ └── __init__.py │ └── text_classification │ │ └── __init__.py ├── tr │ ├── __init__.py │ ├── tet_tr_base_predict.py │ └── tet_tr_base_train.py └── ts │ ├── __init__.py │ ├── tet_ts_base_pred.py │ └── tet_ts_base_train.py ├── .gitignore ├── setup.py └── pytorch-loss.md /pytorch_nlu/corpus/text_regression/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | transformers==4.8.2 2 | tensorboardX==2.4 3 | numpy==1.18.1 4 | torch==1.9.0 5 | -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2021/8/12 23:50 4 | # @author : Mo 5 | # @function: 6 | -------------------------------------------------------------------------------- /test/sl/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2021/8/27 19:23 4 | # @author : Mo 5 | # @function: -------------------------------------------------------------------------------- /test/tc/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2021/8/27 19:23 4 | # @author : 
Mo 5 | # @function: -------------------------------------------------------------------------------- /pytorch_nlu/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2021/9/27 23:18 4 | # @author : Mo 5 | # @function: -------------------------------------------------------------------------------- /test/corpus/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2021/8/24 23:09 4 | # @author : Mo 5 | # @function: -------------------------------------------------------------------------------- /test/output/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2021/7/27 19:28 4 | # @author : Mo 5 | # @function: -------------------------------------------------------------------------------- /test/tr/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2024/1/29 18:06 4 | # @author : Mo 5 | # @function: 6 | -------------------------------------------------------------------------------- /test/ts/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2023/2/2 21:42 4 | # @author : Mo 5 | # @function: 6 | -------------------------------------------------------------------------------- /pytorch_nlu/corpus/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2021/8/26 19:37 4 | # @author : Mo 5 | # @function: -------------------------------------------------------------------------------- /pytorch_nlu/output/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2021/9/27 23:32 4 | # @author : Mo 5 | # @function: -------------------------------------------------------------------------------- /test/output/text_summary/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2021/7/27 19:28 4 | # @author : Mo 5 | # @function: -------------------------------------------------------------------------------- /test/output/sequence_labeling/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2021/7/27 19:28 4 | # @author : Mo 5 | # @function: -------------------------------------------------------------------------------- /test/output/text_regression/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2021/7/27 19:28 4 | # @author : Mo 5 | # @function: -------------------------------------------------------------------------------- /test/output/text_classification/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2021/7/27 19:28 4 | # @author : Mo 5 | # @function: -------------------------------------------------------------------------------- 
/pytorch_nlu/pytorch_textclassification/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2020/11/17 21:52 4 | # @author : Mo 5 | # @function: -------------------------------------------------------------------------------- /pytorch_nlu/pytorch_textregression/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2020/11/17 21:52 4 | # @author : Mo 5 | # @function: 6 | -------------------------------------------------------------------------------- /pytorch_nlu/pytorch_textsummary/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2020/11/17 21:52 4 | # @author : Mo 5 | # @function: 6 | 7 | -------------------------------------------------------------------------------- /pytorch_nlu/corpus/text_regression/negative_sentence/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yongzhuo/Pytorch-NLU/HEAD/pytorch_nlu/corpus/text_regression/negative_sentence/README.md -------------------------------------------------------------------------------- /pytorch_nlu/corpus/text_regression/negative_sentence/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2023/12/12 10:27 4 | # @author : Mo 5 | # @function: 6 | -------------------------------------------------------------------------------- /pytorch_nlu/pytorch_sequencelabeling/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2021/7/7 20:53 4 | # @author : Mo 5 | # @function: sequence-labeling of pytorch 6 | 7 | -------------------------------------------------------------------------------- /pytorch_nlu/version.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2020/12/21 22:24 4 | # @author : Mo 5 | # @function: version of Pytorch-NLU 6 | 7 | 8 | __version__ = "0.0.2" 9 | -------------------------------------------------------------------------------- /pytorch_nlu/corpus/text_classification/tnews/README.md: -------------------------------------------------------------------------------- 1 | # 数据集 2 | ##数据来源: 3 | - url: https://github.com/aceimnorstuvwxz/toutiao-multilevel-text-classfication-dataset 4 | - 详情: 总共2914000样例,分布于1000+个多层的类别中。通常使用的数据集有: 5 | - [TNEWS-Tianchi](https://tianchi.aliyun.com/competition/entrance/531841/information): 阿里云天池(Tianchi)版TNEWS, 今日头条提供的中文新闻标题分类语料, 数据集来自今日头条的新闻版块, 链接为Tianchi, 共63360样例, 15个类别; 6 | - [TNEWS-CLUE](https://storage.googleapis.com/cluebenchmark/tasks/tnews_public.zip): CLUE版TNEWS, 今日头条提供的中文新闻标题分类语料, 数据集来自今日头条的新闻版块, 链接为CLUE, 共73360样例, 15个类别; 7 | - 来源:今日头条客户端。 8 | 9 | ##备注: 10 | 这里训练、验证、测试集各只取了132个样例 11 | 12 | -------------------------------------------------------------------------------- /test/sl/tet_sl_base_predict.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2021/7/25 19:30 4 | # @author : Mo 5 | # @function: 测试预测模块 6 | 7 | 8 | # 适配linux 9 | import platform 10 | import json 11 | import sys 12 | import os 13 
| path_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) 14 | path_sys = os.path.join(path_root, "pytorch_nlu", "pytorch_sequencelabeling") 15 | sys.path.append(path_sys) 16 | print(path_root) 17 | print(path_sys) 18 | 19 | from slPredict import SequenceLabelingPredict 20 | 21 | 22 | if __name__ == "__main__": 23 | 24 | path_config = "../output/sequence_labeling/model_BERT/sl.config" 25 | tcp = SequenceLabelingPredict(path_config) 26 | texts = [{"text": "平乐县,古称昭州,隶属于广西壮族自治区桂林市,位于广西东北部,桂林市东南部,东临钟山县,南接昭平,西北毗邻阳朔,北连恭城,总面积1919.34平方公里。"}, 27 | {"text": "平乐县主要旅游景点有榕津千年古榕、冷水石景苑、仙家温泉、桂江风景区、漓江风景区等,平乐县为漓江分界点,平乐以北称漓江,以南称桂江,是著名的大桂林旅游区之一。"}, 28 | {"text": "印岭玲珑,昭水晶莹,环绕我平中。青年的乐园,多士受陶熔。生活自觉自治,学习自发自动。五育并重,手脑并用。迎接新潮流,建设新平中"}, 29 | {"text": "桂林山水甲天下, 阳朔山水甲桂林"}, 30 | ] 31 | res = tcp.predict(texts) 32 | print(res) 33 | while True: 34 | print("请输入:") 35 | question = input() 36 | res = tcp.predict([{"text": question}]) 37 | print(res) 38 | 39 | -------------------------------------------------------------------------------- /test/tc/tet_tc_base_predict_multiclass.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2021/7/25 19:30 4 | # @author : Mo 5 | # @function: predict model, 预测模块-多类分类 6 | 7 | 8 | # 适配linux 9 | import platform 10 | import json 11 | import sys 12 | import os 13 | path_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) 14 | path_sys = os.path.join(path_root, "pytorch_nlu", "pytorch_textclassification") 15 | sys.path.append(path_sys) 16 | print(path_root) 17 | # os.environ["CUDA_VISIBLE_DEVICES"] = "-1" 18 | from tcPredict import TextClassificationPredict 19 | 20 | 21 | if __name__ == "__main__": 22 | 23 | path_config = "../output/text_classification/model_BERT/tc.config" 24 | tcp = TextClassificationPredict(path_config) 25 | texts = [{"text": "平乐县,古称昭州,隶属于广西壮族自治区桂林市,位于广西东北部,桂林市东南部,东临钟山县,南接昭平,西北毗邻阳朔,北连恭城,总面积1919.34平方公里。"}, 26 | {"text": "平乐县主要旅游景点有榕津千年古榕、冷水石景苑、仙家温泉、桂江风景区、漓江风景区等,平乐县为漓江分界点,平乐以北称漓江,以南称桂江,是著名的大桂林旅游区之一。"}, 27 | {"text": "印岭玲珑,昭水晶莹,环绕我平中。青年的乐园,多士受陶熔。生活自觉自治,学习自发自动。五育并重,手脑并用。迎接新潮流,建设新平中"}, 28 | {"text": "桂林山水甲天下, 阳朔山水甲桂林"}, 29 | ] 30 | res = tcp.predict(texts, logits_type="softmax") 31 | print(res) 32 | while True: 33 | print("请输入:") 34 | question = input() 35 | res = tcp.predict([{"text": question}], logits_type="softmax") 36 | print(res) 37 | 38 | -------------------------------------------------------------------------------- /pytorch_nlu/corpus/sequence_labeling/ner_china_people_daily_1998_conll/README.md: -------------------------------------------------------------------------------- 1 | # 数据集(现代汉语多级加工语料库) 2 | ##数据来源: 3 | - url: https://opendata.pku.edu.cn/dataset.xhtml?persistentId=doi:10.18170/DVN/SEYRX5 4 | - 详情: 北京大学计算语言学研究所从1992年起开始研究现代汉语语料库的多级加工,1999年4月至2002年4月历时3年完成的1998年全年《人民日报》的标注语料库。现代汉语多级加工语料库包括5200万字的基本加工语料库(词语切分、词性标注、命名实体标注、注音)、2800万字的同形标注语料库,此外,还有56万字语料标注了并列结构。 5 | 6 | ##备注: 7 | 这里训练、验证、测试集各只取了132个样例 8 | 9 | ##作者信息: 10 | - 发布日期 2018-03-08 11 | - 标题 现代汉语多级加工语料库 12 | - 作者 俞士汶 (北京大学) 13 | - 段慧明 (北京大学) 14 | - 吴云芳 (北京大学) 15 | - 联系人 俞士汶 (北京大学) 16 | - 段慧明 (北京大学) 17 | - 吴云芳 (北京大学) 18 | - 提交日期 2018-03-05 19 | - 描述 北京大学计算语言学研究所从1992年起开始研究现代汉语语料库的多级加工,1999年4月至2002年4月历时3年完成的1998年全年《人民日报》的标注语料库。现代汉语多级加工语料库包括5200万字的基本加工语料库(词语切分、词性标注、命名实体标注、注音)、2800万字的同形标注语料库,此外,还有56万字语料标注了并列结构。 20 | - 学科 计算机与信息科学 21 | 22 | ## 使用条款 23 | - 北京大学开放研究数据平台>北京大学计算语言学研究所>综合型语言知识库(CLKB)>现代汉语多级加工语料库 24 | - 弃权声明 社区准则以及良好的科学实践都期望通过引用给予适当的功劳,请使用上面由Dataverse生成的数据引用。CC0 
- "Public Domain Dedication" CC0 25 | - 版本 1.0 这是第一个已发布版本。 Dataverse Admin 2018-3-8 26 | ``` 27 | @data{DVN/SEYRX5_2018, 28 | author = {Yu, Shiwen and Duan, Huiming and Wu, Yunfang}, 29 | publisher = {Peking University Open Research Data Platform}, 30 | title = "{Corpus of Multi-level Processing for Modern Chinese}", 31 | year = {2018}, 32 | version = {V1}, 33 | doi = {10.18170/DVN/SEYRX5}, 34 | url = {https://doi.org/10.18170/DVN/SEYRX5} 35 | } 36 | ``` 37 | 38 | -------------------------------------------------------------------------------- /pytorch_nlu/corpus/sequence_labeling/ner_china_people_daily_1998_span/README.md: -------------------------------------------------------------------------------- 1 | # 数据集(现代汉语多级加工语料库) 2 | ##数据来源: 3 | - url: https://opendata.pku.edu.cn/dataset.xhtml?persistentId=doi:10.18170/DVN/SEYRX5 4 | - 详情: 北京大学计算语言学研究所从1992年起开始研究现代汉语语料库的多级加工,1999年4月至2002年4月历时3年完成的1998年全年《人民日报》的标注语料库。现代汉语多级加工语料库包括5200万字的基本加工语料库(词语切分、词性标注、命名实体标注、注音)、2800万字的同形标注语料库,此外,还有56万字语料标注了并列结构。 5 | 6 | ##备注: 7 | 这里训练、验证、测试集各只取了132个样例 8 | 9 | ##作者信息: 10 | - 发布日期 2018-03-08 11 | - 标题 现代汉语多级加工语料库 12 | - 作者 俞士汶 (北京大学) 13 | - 段慧明 (北京大学) 14 | - 吴云芳 (北京大学) 15 | - 联系人 俞士汶 (北京大学) 16 | - 段慧明 (北京大学) 17 | - 吴云芳 (北京大学) 18 | - 提交日期 2018-03-05 19 | - 描述 北京大学计算语言学研究所从1992年起开始研究现代汉语语料库的多级加工,1999年4月至2002年4月历时3年完成的1998年全年《人民日报》的标注语料库。现代汉语多级加工语料库包括5200万字的基本加工语料库(词语切分、词性标注、命名实体标注、注音)、2800万字的同形标注语料库,此外,还有56万字语料标注了并列结构。 20 | - 学科 计算机与信息科学 21 | 22 | ## 使用条款 23 | - 北京大学开放研究数据平台>北京大学计算语言学研究所>综合型语言知识库(CLKB)>现代汉语多级加工语料库 24 | - 弃权声明 社区准则以及良好的科学实践都期望通过引用给予适当的功劳,请使用上面由Dataverse生成的数据引用。CC0 - "Public Domain Dedication" CC0 25 | - 版本 1.0 这是第一个已发布版本。 Dataverse Admin 2018-3-8 26 | ``` 27 | @data{DVN/SEYRX5_2018, 28 | author = {Yu, Shiwen and Duan, Huiming and Wu, Yunfang}, 29 | publisher = {Peking University Open Research Data Platform}, 30 | title = "{Corpus of Multi-level Processing for Modern Chinese}", 31 | year = {2018}, 32 | version = {V1}, 33 | doi = {10.18170/DVN/SEYRX5}, 34 | url = {https://doi.org/10.18170/DVN/SEYRX5} 35 | } 36 | ``` 37 | 38 | -------------------------------------------------------------------------------- /test/tc/tet_tc_base_predict_multilabel.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2021/7/25 19:30 4 | # @author : Mo 5 | # @function: predict model, 预测模块-多标签分类 6 | 7 | 8 | # 适配linux 9 | import platform 10 | import json 11 | import sys 12 | import os 13 | path_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) 14 | path_sys = os.path.join(path_root, "pytorch_nlu", "pytorch_textclassification") 15 | sys.path.append(path_sys) 16 | print(path_root) 17 | # os.environ["CUDA_VISIBLE_DEVICES"] = "-1" 18 | from tcPredict import TextClassificationPredict 19 | 20 | 21 | if __name__ == "__main__": 22 | 23 | path_config = "../output/text_classification/model_BERT/tc.config" 24 | tcp = TextClassificationPredict(path_config) 25 | texts = [{"text": "平乐县,古称昭州,隶属于广西壮族自治区桂林市,位于广西东北部,桂林市东南部,东临钟山县,南接昭平,西北毗邻阳朔,北连恭城,总面积1919.34平方公里。"}, 26 | {"text": "平乐县主要旅游景点有榕津千年古榕、冷水石景苑、仙家温泉、桂江风景区、漓江风景区等,平乐县为漓江分界点,平乐以北称漓江,以南称桂江,是著名的大桂林旅游区之一。"}, 27 | {"text": "印岭玲珑,昭水晶莹,环绕我平中。青年的乐园,多士受陶熔。生活自觉自治,学习自发自动。五育并重,手脑并用。迎接新潮流,建设新平中"}, 28 | {"text": "桂林山水甲天下, 阳朔山水甲桂林"}, 29 | ] 30 | res = tcp.predict(texts, logits_type="sigmoid") 31 | print(res) 32 | while True: 33 | print("请输入:") 34 | question = input() 35 | res = tcp.predict([{"text": question}], logits_type="sigmoid") 36 | print(res) 37 | 38 | 39 | 40 | 
-------------------------------------------------------------------------------- /test/ts/tet_ts_base_pred.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2021/7/25 9:30 4 | # @author : Mo 5 | # @function: predict model, 预测模块 6 | 7 | 8 | # 适配linux 9 | from argparse import Namespace 10 | import logging as logger 11 | import traceback 12 | import sys 13 | import os 14 | path_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) 15 | path_sys = os.path.join(path_root, "pytorch_nlu", "pytorch_textsummary") 16 | sys.path.append(path_sys) 17 | print(path_root) 18 | print(path_sys) 19 | 20 | from tsPredict import TextSummaryPredict 21 | from tsTools import load_json 22 | from tsOffice import Office 23 | from tsData import DataSet 24 | 25 | 26 | if __name__ == "__main__": 27 | 28 | path_config = "../output/text_summary/model_BERT/tc.config" 29 | 30 | tcp = TextSummaryPredict(path_config) 31 | texts = [{'text': ['平乐县', '古称昭州', '隶属于广西壮族自治区桂林市', '位于广西东北部', '桂林市东南部', '东临钟山县', 32 | '南接昭平', '西北毗邻阳朔', '北连恭城', '总面积1919.34平方公里。']}, 33 | {'text': ['平乐县主要旅游景点有榕津千年古榕、冷水石景苑、仙家温泉、桂江风景区、漓江风景区等', '平乐县为漓江分界点', 34 | '平乐以北称漓江', '以南称桂江', '是著名的大桂林旅游区之一。']}, 35 | {'text': ['印岭玲珑', '昭水晶莹', '环绕我平中。青年的乐园', '多士受陶熔。生活自觉自治', '学习自发自动。五育并重', 36 | '手脑并用。迎接新潮流', '建设新平中']}, 37 | {'text': ['桂林山水甲天下', '阳朔山水甲桂林']}] 38 | res = tcp.predict(texts, logits_type="sigmoid") 39 | print(str(res).encode("utf-8", "ignore").decode("utf-8", "ignore")) 40 | 41 | -------------------------------------------------------------------------------- /pytorch_nlu/corpus/text_regression/negative_sentence/dev.json: -------------------------------------------------------------------------------- 1 | {"text": "不用于否定句。", "label": [0,0]} 2 | {"text": "我听说过李晓为了帮助一位盲人昨天很晚才回家。", "label": [1,0]} 3 | {"text": "小玲取得的优异成绩正是她勤奋学习的结果。", "label": [1,0]} 4 | {"text": "一棍打一船", "label": [1,0]} 5 | {"text": "不喜欢说唱音乐。", "label": [0,0]} 6 | {"text": "这是你的。", "label": [1,0]} 7 | {"text": "海上生明月,天涯共此时。", "label": [1,0]} 8 | {"text": "桂花雨很香。", "label": [1,0]} 9 | {"text": "这项活动是有意义的。", "label": [1,0]} 10 | {"text": "李大钊同志对革命事业充满信心,怎么会惧怕反动军阀", "label": [0,0]} 11 | {"text": "无论如何,后几个星期六你不许再玩了。", "label": [0,0]} 12 | {"text": "月亮挂在天空;", "label": [1,0]} 13 | {"text": "实现四个现代化,难道是全国人民的伟大历史任务", "label": [1,0]} 14 | {"text": "我决非浪漫之人。", "label": [0,0]} 15 | {"text": "不正确", "label": [0,0]} 16 | {"text": "字斟句酌的态度是值得肯定的。", "label": [1,0]} 17 | {"text": "不准备到它家去度过晚上的时光。", "label": [0,0]} 18 | {"text": "你只需按按键。", "label": [1,0]} 19 | {"text": "人有悲欢离合,月有阴晴圆缺,此事古难全。", "label": [1,0]} 20 | {"text": "他不会来参加派对。", "label": [0,0]} 21 | {"text": "万里长城不能不说是历史上的一个伟大奇迹", "label": [1,0]} 22 | {"text": "我不会干这种事的,决不会。", "label": [0,0]} 23 | {"text": "箭可以射得到呢", "label": [1,0]} 24 | {"text": "这是我曾经听过的最好的故事。", "label": [1,0]} 25 | {"text": "只要拥有他们所有,任何人都会干得比他们出色", "label": [0,0]} 26 | {"text": "微风柔和地吹,柔和地爱抚我的面孔。", "label": [1,0]} 27 | {"text": "以乱易整,不武。", "label": [0,0]} 28 | {"text": "你安静!", "label": [1,0]} 29 | {"text": "斗牛场是多么欢乐阿!", "label": [1,0]} 30 | {"text": "今天你不必做作业。", "label": [0,0]} 31 | {"text": "他醒来时,几乎无法说话。", "label": [0,0]} 32 | {"text": "在那天边隐约闪亮的不就是黄河?", "label": [1,0]} 33 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 
| # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/11/01 10:17 4 | # @author :Mo 5 | # @function :setup of Pytorch-NLU 6 | 7 | 8 | from pytorch_nlu.version import __version__ 9 | from setuptools import find_packages, setup 10 | import codecs 11 | 12 | 13 | # Package meta-data. 
14 | NAME = "Pytorch-NLU" 15 | DESCRIPTION = "Pytorch-NLU" 16 | URL = "https://github.com/yongzhuo/Pytorch-NLU" 17 | EMAIL = "1903865025@qq.com" 18 | AUTHOR = "yongzhuo" 19 | LICENSE = "Apache" 20 | 21 | with codecs.open("README.md", "r", "utf-8") as reader: 22 | long_description = reader.read() 23 | with codecs.open("requirements.txt", "r", "utf-8") as reader: 24 | install_requires = list(map(lambda x: x.strip(), reader.readlines())) 25 | 26 | setup(name=NAME, 27 | version=__version__, 28 | description=DESCRIPTION, 29 | long_description=long_description, 30 | long_description_content_type="text/markdown", 31 | author=AUTHOR, 32 | author_email=EMAIL, 33 | url=URL, 34 | packages=find_packages(), 35 | install_requires=install_requires, 36 | package_data={"pytorch_nlu": ["*.*", "corpus/*", 37 | "pytorch_textclassification/*", 38 | "pytorch_sequencelabeling/*", 39 | "corpus/text_classification/*", 40 | "corpus/sequence_labeling/*", 41 | "corpus/text_classification/school/*", 42 | "corpus/text_classification/tnews/*", 43 | "corpus/sequence_labeling/ner_china_people_daily_1998_conll/*", 44 | "corpus/sequence_labeling/ner_china_people_daily_1998_span/*",]}, 45 | license=LICENSE, 46 | classifiers=["License :: OSI Approved :: Apache Software License", 47 | "Programming Language :: Python :: 3.4", 48 | "Programming Language :: Python :: 3.5", 49 | "Programming Language :: Python :: 3.6", 50 | "Programming Language :: Python :: 3.7", 51 | "Programming Language :: Python :: 3.8", 52 | "Programming Language :: Python :: 3.9", 53 | "Programming Language :: Python :: Implementation :: CPython", 54 | "Programming Language :: Python :: Implementation :: PyPy"],) 55 | 56 | 57 | if __name__ == "__main__": 58 | print("setup ok!") 59 | 60 | 61 | # 打包与安装 62 | # step: 63 | # 打开cmd 64 | # 到达安装目录 65 | # python setup.py build 66 | # python setup.py install 67 | 68 | # or 69 | 70 | # python setup.py bdist_wheel --universal 71 | # twine upload dist/* 72 | 73 | -------------------------------------------------------------------------------- /pytorch_nlu/corpus/text_summary/maths_toy/README.md: -------------------------------------------------------------------------------- 1 | # 数据集(哈工大LCSTS文本摘要数据集) 2 | ##数据来源: 3 | - url: http://icrc.hitsz.edu.cn/Article/show/139.html 4 | - 详情: LCSTS: A Large-Scale Chinese Short Text Summarization Dataset 5 | - 6 | ##备注: 7 | 这里训练、验证、测试集各只取了128/32个样例 8 | 9 | ##作者信息: 10 | Baotian Hu 11 | Qingcai Chen 12 | Fangze Zhu 13 | baotianchina@gmail.com 14 | qingcai.chen@gmail.com 15 | zhufangze123@gmail.com 16 | ``` 17 | 介绍 18 | Introduction: 19 | 20 | Automatic text summarization is widely regarded as the highly difficult problem, partially because of the lack of large text summarization data set. Due to the great challenge of constructing the large scale summaries for full text, we introduce a Large-scale Chinese Short Text Summarization dataset constructed from the Chinese microblogging website SinaWeibo. This corpus consists of over 2 million real Chinese short texts with short summaries given by the writer of each text. We also manually tagged the relevance of 10,666 short summaries with their corresponding short texts. Based on the corpus, we introduce recurrent neural network for the summary generation and achieve promising results, which not only shows the usefulness of the proposed corpus for short text summarization research, but also provides a baseline for further research on this topic. 
21 | 22 | 23 | ``` 24 | 25 | ## 使用条款 26 | ``` 27 | Copyright Notice: 28 | 29 | 1.Respect the privacy of personal information of the original source. 30 | 31 | 2.The original copyright of all the data of the Large Scale Chinese Short Text Summarization Dataset belongs to writers of the Weiboes, Intelligent Computing Research Center, Harbin Institute of Technology Shenzhen Graduate School collects, organizes, filters and purifies them. LCSTS is free to the public. 32 | 33 | 3.If you want to use the dataset for depth study, data providers (Intelligent Computing Research Center, Harbin Institute of Technology Shenzhen Graduate School) should be identified in your results. 34 | 35 | 4.The dataset is only for the specified applicant or study groups for research purposes. Without permission, it may not be used for any commercial purposes. 36 | 37 | 5.If the terms changed, the latest online version shall prevail. 38 | ``` 39 | ``` 40 | @article{DBLP:journals/corr/HuCZ15, 41 | author = {Baotian Hu and 42 | Qingcai Chen and 43 | Fangze Zhu}, 44 | title = {{LCSTS:} {A} Large Scale Chinese Short Text Summarization Dataset}, 45 | journal = {CoRR}, 46 | volume = {abs/1506.05865}, 47 | year = {2015}, 48 | url = {http://arxiv.org/abs/1506.05865}, 49 | eprinttype = {arXiv}, 50 | eprint = {1506.05865}, 51 | timestamp = {Mon, 13 Aug 2018 16:49:02 +0200}, 52 | biburl = {https://dblp.org/rec/journals/corr/HuCZ15.bib}, 53 | bibsource = {dblp computer science bibliography, https://dblp.org} 54 | } 55 | ``` 56 | 57 | -------------------------------------------------------------------------------- /test/ts/tet_ts_base_train.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2021/2/23 21:34 4 | # @author : Mo 5 | # @function: 文本摘要, text-summary 6 | 7 | 8 | # 适配linux 9 | import platform 10 | import json 11 | import sys 12 | import os 13 | path_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) 14 | path_sys = os.path.join(path_root, "pytorch_nlu", "pytorch_textsummary") 15 | sys.path.append(path_sys) 16 | print(path_root) 17 | print(path_sys) 18 | 19 | from tsTools import get_current_time 20 | from tsConfig import model_config 21 | from tsRun import TextSummary 22 | 23 | 24 | # 预训练模型地址, 本地win10默认只跑2步就评估保存模型 25 | if platform.system().lower() == 'windows': 26 | # pretrained_model_dir = "D:/pretrain_models/pytorch" 27 | pretrained_model_dir = "E:/DATA/bert-model/00_pytorch" 28 | evaluate_steps = 32 # 评估步数 29 | save_steps = 32 # 存储步数 30 | else: 31 | pretrained_model_dir = "/pretrain_models/pytorch" 32 | evaluate_steps = 320 # 评估步数 33 | save_steps = 320 # 存储步数 34 | ee = 0 35 | 36 | 37 | if __name__ == "__main__": 38 | 39 | # 训练-验证语料地址, 可以只输入训练地址 40 | path_corpus = os.path.join(path_root, "pytorch_nlu", "corpus", "text_summary", "maths_toy") 41 | path_train = os.path.join(path_corpus, "train.json") 42 | path_dev = os.path.join(path_corpus, "dev.json") 43 | 44 | model_config["evaluate_steps"] = evaluate_steps # 评估步数 45 | model_config["save_steps"] = save_steps # 存储步数 46 | model_config["path_train"] = path_train # 训练语料, 必须 47 | model_config["path_dev"] = path_dev # 验证语料, 可为None 48 | model_config["lr"] = 1e-5 # 学习率 49 | model_config["max_len"] = 256 # 最大序列长度 50 | model_config["batch_size"] = 32 # 批次大小 51 | model_config["loss_type"] = "SOFT_MARGIN_LOSS" # 损失函数类型 52 | model_config["is_dropout"] = True # 是否使用dropout 53 | model_config["is_adv"] = False # 是否使用对抗训练 54 | 55 | 56 | # 预训练模型适配的class 57 | model_type = 
["BERT", "ERNIE", "BERT_WWM", "ALBERT", "ROBERTA", "XLNET", "ELECTRA"] 58 | pretrained_model_name_or_path = { 59 | "BERT_WWM": "hfl/chinese-bert-wwm-ext", 60 | "ROBERTA": "hfl/chinese-roberta-wwm-ext", 61 | "ALBERT": "uer/albert-base-chinese-cluecorpussmall", 62 | "XLNET": "hfl/chinese-xlnet-mid", 63 | "ERNIE": "nghuyong/ernie-1.0-base-zh", 64 | # "ERNIE": "nghuyong/ernie-3.0-base-zh", 65 | "BERT": "bert-base-chinese", 66 | # "BERT": "hfl/chinese-macbert-base", 67 | 68 | } 69 | idx = 0 # 选择的预训练模型类型---model_type 70 | model_config["pretrained_model_name_or_path"] = pretrained_model_name_or_path[model_type[idx]] 71 | model_config["model_save_path"] = "../output/text_summary/model_{}".format(model_type[idx]) 72 | model_config["model_type"] = model_type[idx] 73 | # main 74 | lc = TextSummary(model_config) 75 | lc.process() 76 | lc.train() 77 | 78 | 79 | # shell 80 | # nohup python tcRun.py > tc.log 2>&1 & 81 | # tail -n 1000 -f tc.log 82 | # |myz| 83 | 84 | -------------------------------------------------------------------------------- /pytorch_nlu/pytorch_sequencelabeling/slPredict.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2021/7/25 19:30 4 | # @author : Mo 5 | # @function: predict model, 预测模块 6 | 7 | 8 | # 适配linux 9 | import sys 10 | import os 11 | path_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".")) 12 | sys.path.append(path_root) 13 | from tcConfig import model_config 14 | os.environ["CUDA_VISIBLE_DEVICES"] = model_config.get("CUDA_VISIBLE_DEVICES", "0") 15 | from slConfig import _SL_MODEL_SOFTMAX, _SL_MODEL_GRID, _SL_MODEL_SPAN, _SL_MODEL_CRF 16 | from slConfig import _SL_DATA_CONLL, _SL_DATA_SPAN 17 | from slTools import get_logger, load_json 18 | from slOffice import Office 19 | from slData import Corpus 20 | 21 | from argparse import Namespace 22 | 23 | 24 | class SequenceLabelingPredict: 25 | def __init__(self, path_config): 26 | """ 初始化 """ 27 | self.load_config(path_config) 28 | self.load_model() 29 | 30 | def load_config(self, path_config): 31 | """ 加载超参数 """ 32 | config = load_json(path_config) 33 | self.config = Namespace(**config) 34 | self.logger = get_logger(self.config.model_save_path) 35 | self.l2i, self.i2l = self.config.l2i, self.config.i2l 36 | # 数据预处理 类 37 | self.corpus = Corpus(self.config, self.logger) 38 | 39 | def load_model(self): 40 | """ 加载模型 """ 41 | self.office = Office(config=self.config, logger=self.logger) 42 | self.office.load_model() 43 | 44 | def process(self, texts): 45 | """ 数据预处理, process """ 46 | # token 转 idx, 训练集/验证集 47 | datas_xy, _ = self.corpus.read_texts_from_json(texts, keys=self.config.xy_keys_predict) 48 | if self.config.task_type.upper() in [_SL_MODEL_SPAN]: 49 | sl_preprocess = self.corpus.preprocess_span 50 | elif self.config.task_type.upper() in [_SL_MODEL_GRID]: 51 | sl_preprocess = self.corpus.preprocess_grid 52 | else: 53 | sl_preprocess = self.corpus.preprocess_common 54 | dataset = sl_preprocess(datas_xy, self.config.l2i, l2i_conll=self.config.l2i_conll, sl_ctype=self.config.sl_ctype, max_len=self.config.max_len) 55 | return dataset 56 | 57 | def predict(self, texts): 58 | """ 预测 """ 59 | dataset = self.process(texts) 60 | res = self.office.predict(dataset) 61 | return res 62 | 63 | 64 | if __name__ == "__main__": 65 | 66 | path_config = "../output/sequence_labeling/model_ERNIE/sl.config" 67 | tcp = SequenceLabelingPredict(path_config) 68 | texts = [{"text": 
"平乐县,古称昭州,隶属于广西壮族自治区桂林市,位于广西东北部,桂林市东南部,东临钟山县,南接昭平,西北毗邻阳朔,北连恭城,总面积1919.34平方公里。"}, 69 | {"text": "平乐县主要旅游景点有榕津千年古榕、冷水石景苑、仙家温泉、桂江风景区、漓江风景区等,平乐县为漓江分界点,平乐以北称漓江,以南称桂江,是著名的大桂林旅游区之一。"}, 70 | {"text": "印岭玲珑,昭水晶莹,环绕我平中。青年的乐园,多士受陶熔。生活自觉自治,学习自发自动。五育并重,手脑并用。迎接新潮流,建设新平中"}, 71 | {"text": "桂林山水甲天下, 阳朔山水甲桂林"}, 72 | ] 73 | res = tcp.predict(texts) 74 | print(res) 75 | while True: 76 | print("请输入:") 77 | question = input() 78 | res = tcp.predict([{"text": question}]) 79 | print(res) 80 | 81 | -------------------------------------------------------------------------------- /test/tc/tet_tc_base_multi_class.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2021/2/23 21:34 4 | # @author : Mo 5 | # @function: 多类分类, 根据label是否有|myz|分隔符判断是多类分类, 还是多标签分类 6 | 7 | 8 | # 适配linux 9 | import platform 10 | import json 11 | import sys 12 | import os 13 | path_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) 14 | path_sys = os.path.join(path_root, "pytorch_nlu", "pytorch_textclassification") 15 | sys.path.append(path_sys) 16 | print(path_root) 17 | # 分类下的引入, pytorch_textclassification 18 | from tcTools import get_current_time 19 | from tcRun import TextClassification 20 | from tcConfig import model_config 21 | 22 | 23 | # 预训练模型地址, 本地win10默认只跑2步就评估保存模型 24 | if platform.system().lower() == 'windows': 25 | # pretrained_model_dir = "D:/pretrain_models/pytorch" 26 | pretrained_model_dir = "E:/DATA/bert-model/00_pytorch" 27 | evaluate_steps = 32 # 评估步数 28 | save_steps = 32 # 存储步数 29 | else: 30 | pretrained_model_dir = "/pretrain_models/pytorch" 31 | evaluate_steps = 320 # 评估步数 32 | save_steps = 320 # 存储步数 33 | ee = 0 34 | 35 | 36 | if __name__ == "__main__": 37 | # 训练-验证语料地址, 可以只输入训练地址 38 | path_corpus = os.path.join(path_root, "pytorch_nlu", "corpus", "text_classification", "tnews") 39 | path_train = os.path.join(path_corpus, "train.json") 40 | path_dev = os.path.join(path_corpus, "dev.json") 41 | model_config["evaluate_steps"] = evaluate_steps # 评估步数 42 | model_config["save_steps"] = save_steps # 存储步数 43 | model_config["path_train"] = path_train # 训练模语料, 必须 44 | model_config["path_dev"] = path_dev # 验证语料, 可为None 45 | model_config["path_tet"] = None # 测试语料, 可为None 46 | # multi-class: 可选 None(BCE), BCE, BCE_LOGITS, MSE, FOCAL_LOSS, DICE_LOSS, LABEL_SMOOTH; 47 | # multi-label: SOFT_MARGIN_LOSS, PRIOR_MARGIN_LOSS, FOCAL_LOSS, CIRCLE_LOSS, DICE_LOSS, MIX_focal_prior, DB_LOSS, CB_LOSS等 48 | os.environ["CUDA_VISIBLE_DEVICES"] = str(model_config["CUDA_VISIBLE_DEVICES"]) 49 | 50 | # 预训练模型适配的class 51 | model_type = ["BERT", "ERNIE", "BERT_WWM", "ALBERT", "ROBERTA", "XLNET", "ELECTRA"] 52 | pretrained_model_name_or_path = { 53 | "BERT_WWM": "hfl/chinese-bert-wwm-ext", 54 | "ROBERTA": "hfl/chinese-roberta-wwm-ext", 55 | "ALBERT": "uer/albert-base-chinese-cluecorpussmall", 56 | "XLNET": "hfl/chinese-xlnet-mid", 57 | "ERNIE": "nghuyong/ernie-1.0-base-zh", 58 | # "ERNIE": "nghuyong/ernie-3.0-base-zh", 59 | "BERT": "bert-base-chinese", 60 | # "BERT": "hfl/chinese-macbert-base", 61 | 62 | } 63 | idx = 0 # 选择的预训练模型类型---model_type, 0为BERT, 1为ernie 64 | model_config["pretrained_model_name_or_path"] = pretrained_model_name_or_path[model_type[idx]] 65 | # model_config["model_save_path"] = "../output/text_classification/model_{}".format(model_type[idx] + "_" + str(get_current_time())) 66 | model_config["model_save_path"] = "../output/text_classification/model_{}".format(model_type[idx]) 67 | model_config["model_type"] = 
model_type[idx] 68 | # main 69 | lc = TextClassification(model_config) 70 | lc.process() 71 | lc.train() 72 | 73 | 74 | # shell 75 | # nohup python tcRun.py > tc.log 2>&1 & 76 | # tail -n 1000 -f tc.log 77 | # |myz| 78 | 79 | -------------------------------------------------------------------------------- /test/tc/tet_tc_base_multi_label_dbloss.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2021/2/23 21:34 4 | # @author : Mo 5 | # @function: 使用分布平衡损失[DB-NTR], db-loss(distribution balanced loss) 6 | 7 | 8 | # 适配linux 9 | import platform 10 | import json 11 | import sys 12 | import os 13 | path_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) 14 | path_sys = os.path.join(path_root, "pytorch_nlu", "pytorch_textclassification") 15 | sys.path.append(path_sys) 16 | print(path_root) 17 | print(path_sys) 18 | # 分类下的引入, pytorch_textclassification 19 | from tcTools import get_current_time 20 | from tcRun import TextClassification 21 | from tcConfig import model_config 22 | 23 | 24 | # 预训练模型地址, 本地win10默认只跑2步就评估保存模型 25 | if platform.system().lower() == 'windows': 26 | # pretrained_model_dir = "D:/pretrain_models/pytorch" 27 | pretrained_model_dir = "E:/DATA/bert-model/00_pytorch" 28 | evaluate_steps = 32 # 评估步数 29 | save_steps = 32 # 存储步数 30 | else: 31 | pretrained_model_dir = "/pretrain_models/pytorch" 32 | evaluate_steps = 320 # 评估步数 33 | save_steps = 320 # 存储步数 34 | ee = 0 35 | 36 | 37 | if __name__ == "__main__": 38 | # 训练-验证语料地址, 可以只输入训练地址 39 | path_corpus = os.path.join(path_root, "pytorch_nlu", "corpus", "text_classification", "school") 40 | path_train = os.path.join(path_corpus, "train.json") 41 | path_dev = os.path.join(path_corpus, "dev.json") 42 | model_config["evaluate_steps"] = evaluate_steps # 评估步数 43 | model_config["save_steps"] = save_steps # 存储步数 44 | model_config["path_train"] = path_train # 训练模语料, 必须 45 | model_config["path_dev"] = path_dev # 验证语料, 可为None 46 | model_config["path_tet"] = None # 测试语料, 可为None 47 | # 损失函数类型, 48 | # multi-class: 可选 None(BCE), BCE, BCE_LOGITS, MSE, FOCAL_LOSS, DICE_LOSS, LABEL_SMOOTH 49 | # multi-label: SOFT_MARGIN_LOSS, PRIOR_MARGIN_LOSS, FOCAL_LOSS, CIRCLE_LOSS, DICE_LOSS, MIX_focal_prior, DB_LOSS, CB_LOSS等 50 | model_config["loss_type"] = "CB_LOSS" 51 | 52 | # 预训练模型适配的class 53 | model_type = ["BERT", "ERNIE", "BERT_WWM", "ALBERT", "ROBERTA", "XLNET", "ELECTRA"] 54 | pretrained_model_name_or_path = { 55 | "BERT_WWM": "hfl/chinese-bert-wwm-ext", 56 | "ROBERTA": "hfl/chinese-roberta-wwm-ext", 57 | "ALBERT": "uer/albert-base-chinese-cluecorpussmall", 58 | "XLNET": "hfl/chinese-xlnet-mid", 59 | "ERNIE": "nghuyong/ernie-1.0-base-zh", 60 | # "ERNIE": "nghuyong/ernie-3.0-base-zh", 61 | "BERT": "bert-base-chinese", 62 | # "BERT": "hfl/chinese-macbert-base", 63 | 64 | } 65 | idx = 0 # 选择的预训练模型类型---model_type, 0为BERT, 66 | model_config["pretrained_model_name_or_path"] = pretrained_model_name_or_path[model_type[idx]] 67 | # model_config["model_save_path"] = "../output/text_classification/model_{}".format(model_type[idx] + "_" + str(get_current_time())) 68 | model_config["model_save_path"] = "../output/text_classification/model_{}".format(model_type[idx]) 69 | model_config["model_type"] = model_type[idx] 70 | # main 71 | lc = TextClassification(model_config) 72 | lc.process() 73 | lc.train() 74 | 75 | 76 | # shell 77 | # nohup python tcRun.py > tc.log 2>&1 & 78 | # tail -n 1000 -f tc.log 79 | # |myz| 80 | 81 | 
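# 补充示意 / weighting sketch: the CB_LOSS configured above follows the class-balanced loss of
# Cui et al. (2019), which weights each class by its "effective number of samples":
# w_c = (1 - beta) / (1 - beta ** n_c). A minimal sketch of the weight computation only --
# the function name and beta value are illustrative, not the repository's internal implementation.
import numpy as np

def class_balanced_weights(label_counts, beta=0.999):
    """依据各类样本数计算class-balanced权重, 并归一化到类别总数."""
    effective_num = 1.0 - np.power(beta, np.asarray(label_counts, dtype=np.float64))
    weights = (1.0 - beta) / effective_num
    return weights / weights.sum() * len(label_counts)

# e.g. class_balanced_weights([1000, 100, 10]) up-weights rare classes less aggressively than plain
# inverse frequency, because beta caps the benefit of very small counts.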
-------------------------------------------------------------------------------- /test/sl/tet_sl_base_softmax.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2021/2/23 21:34 4 | # @author : Mo 5 | # @function: 序列标注, 命名实体识别, SOFTMAX 6 | 7 | 8 | # 适配linux 9 | import platform 10 | import json 11 | import sys 12 | import os 13 | path_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) 14 | path_sys = os.path.join(path_root, "pytorch_nlu", "pytorch_sequencelabeling") 15 | sys.path.append(path_sys) 16 | print(path_root) 17 | print(path_sys) 18 | # 序列标注下的引入, pytorch_sequencelabeling 19 | from slTools import get_current_time 20 | from slRun import SequenceLabeling 21 | from slConfig import model_config 22 | 23 | 24 | # 预训练模型地址, 本地win10默认只跑2步就评估保存模型 25 | if platform.system().lower() == 'windows': 26 | pretrained_model_dir = "D:/pretrain_models/pytorch" 27 | evaluate_steps = 2 # 评估步数 28 | save_steps = 2 # 存储步数 29 | else: 30 | pretrained_model_dir = "/pretrain_models/pytorch" 31 | evaluate_steps = 320 # 评估步数 32 | save_steps = 320 # 存储步数 33 | ee = 0 34 | 35 | # 预训练模型适配的class 36 | model_type = ["BERT", "ERNIE", "BERT_WWM", "ALBERT", "ROBERTA", "XLNET", "ELECTRA"] 37 | pretrained_model_name_or_path = { 38 | "BERT_WWM": "hfl/chinese-bert-wwm-ext", 39 | "ROBERTA": "hfl/chinese-roberta-wwm-ext", 40 | "ALBERT": "uer/albert-base-chinese-cluecorpussmall", 41 | "XLNET": "hfl/chinese-xlnet-mid", 42 | "ERNIE": "nghuyong/ernie-1.0-base-zh", 43 | # "ERNIE": "nghuyong/ernie-3.0-base-zh", 44 | "BERT": "bert-base-chinese", 45 | # "BERT": "hfl/chinese-macbert-base", 46 | 47 | } 48 | 49 | 50 | if __name__ == "__main__": 51 | # 训练-验证语料地址, 可以只输入训练地址 52 | path_corpus = os.path.join(path_root, "pytorch_nlu", "corpus", "sequence_labeling", "ner_china_people_daily_1998_conll") 53 | path_train = os.path.join(path_corpus, "train.conll") 54 | path_dev = os.path.join(path_corpus, "dev.conll") 55 | model_config["evaluate_steps"] = evaluate_steps # 评估步数 56 | model_config["save_steps"] = save_steps # 存储步数 57 | model_config["path_train"] = path_train # 训练模语料, 必须 58 | model_config["path_dev"] = path_dev # 验证语料, 可为None 59 | model_config["path_tet"] = None # 测试语料, 可为None 60 | # 一种格式 文件以.conll结尾, 或者corpus_type=="DATA-CONLL" 61 | # 另一种格式 文件以.span结尾, 或者corpus_type=="DATA-SPAN" 62 | model_config["corpus_type"] = "DATA-CONLL" # 语料数据格式, "DATA-CONLL", "DATA-SPAN" 63 | model_config["task_type"] = "SL-SOFTMAX" # 任务类型, "SL-SOFTMAX", "SL-CRF", "SL-SPAN" 64 | 65 | model_config["lr"] = 1e-5 # 学习率, 依据选择的预训练模型自己选择, 1e-5, 2e-5, 5e-5, 8e-5, 1e-4, 4e-4 66 | model_config["dense_lr"] = 1e-5 # CRF层学习率/全连接层学习率, 1e-5, 1e-4, 1e-3 67 | idx = 0 # 选择的预训练模型类型---model_type, 0为BERT, 68 | model_config["pretrained_model_name_or_path"] = pretrained_model_name_or_path[model_type[idx]] 69 | 70 | # model_config["model_save_path"] = "../output/sequence_labeling/model_{}".format(model_type[idx] + "_" + str(get_current_time())) 71 | model_config["model_save_path"] = "../output/sequence_labeling/model_{}".format(model_type[idx]) 72 | model_config["model_type"] = model_type[idx] 73 | # main 74 | lc = SequenceLabeling(model_config) 75 | lc.process() 76 | lc.train() 77 | 78 | 79 | # shell 80 | # nohup python slRun.py > sl.log 2>&1 & 81 | # tail -n 1000 -f sl.log 82 | 83 | -------------------------------------------------------------------------------- /pytorch_nlu/pytorch_textsummary/tsGraph.py: 
-------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2020/11/17 21:35 4 | # @author : Mo 5 | # @function: graph of pre-train model 6 | 7 | 8 | # torch 9 | from transformers import BertPreTrainedModel 10 | import torch 11 | 12 | from tsConfig import PRETRAINED_MODEL_CLASSES 13 | 14 | 15 | class TSGraph(BertPreTrainedModel): 16 | def __init__(self, graph_config, tokenizer): 17 | """ 18 | Pytorch Graph of TextSummary, Pre-Trained Model based 19 | config: 20 | config: json, params of graph, eg. {"num_labels":17, "model_type":"BERT"} 21 | Returns: 22 | output: Tuple, Tensor of logits and loss 23 | Url: https://github.com/yongzhuo 24 | """ 25 | # 预训练语言模型读取 26 | self.graph_config = graph_config 27 | pretrained_config, pretrained_tokenizer, pretrained_model = PRETRAINED_MODEL_CLASSES[graph_config.model_type] 28 | self.pretrained_config = pretrained_config.from_pretrained(graph_config.pretrained_model_name_or_path, output_hidden_states=graph_config.output_hidden_states) 29 | self.pretrained_config.update({"gradient_checkpointing": True}) 30 | # self.tokenizer = pretrained_tokenizer.from_pretrained(graph_config.pretrained_model_name_or_path) 31 | # self.tokenizer = tokenizer 32 | super(TSGraph, self).__init__(self.pretrained_config) 33 | if self.graph_config.is_train: 34 | self.pretrain_model = pretrained_model.from_pretrained(graph_config.pretrained_model_name_or_path, config=self.pretrained_config) 35 | self.pretrain_model.resize_token_embeddings(len(tokenizer)) 36 | else: 37 | self.pretrain_model = pretrained_model(self.pretrained_config) 38 | self.pretrain_model.resize_token_embeddings(len(tokenizer)) 39 | # tokenizer.model_max_length = self.model.config.max_position_embeddings 40 | # 如果用隐藏层输出 41 | self.dense = torch.nn.Linear(self.pretrained_config.hidden_size, 1) 42 | 43 | # 池化层 44 | self.global_maxpooling = torch.nn.AdaptiveMaxPool1d(1) 45 | self.global_avgpooling = torch.nn.AdaptiveAvgPool1d(1) 46 | 47 | # 激活层/随即失活层 48 | self.softmax = torch.nn.Softmax(dim=-1) 49 | self.sigmoid = torch.nn.Sigmoid() 50 | self.dropout = torch.nn.Dropout() 51 | 52 | def forward(self, input_ids, attention_mask, token_type_ids, mask_cls, cls_ids, labels=None): 53 | output = self.pretrain_model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids) 54 | top_vec = output.last_hidden_state 55 | if self.graph_config.is_dropout: 56 | top_vec = self.dropout(top_vec) 57 | sents_vec = top_vec[torch.arange(top_vec.size(0)).unsqueeze(1), cls_ids] 58 | sents_vec = sents_vec * mask_cls[:, :, None].float() 59 | h = self.dense(sents_vec).squeeze(-1) 60 | # sent_scores = torch.nn.Sigmoid()(h) * mask_cls.float() 61 | sent_scores = h * mask_cls.float() 62 | logits = sent_scores.squeeze(-1) 63 | # inference 64 | if self.graph_config.is_fc_sigmoid: 65 | return self.sigmoid(logits) 66 | elif self.graph_config.is_fc_softmax: 67 | return self.softmax(logits) 68 | return logits 69 | 70 | -------------------------------------------------------------------------------- /test/tc/tet_tc_base_multi_label_focalloss.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2021/2/23 21:34 4 | # @author : Mo 5 | # @function: 使用聚焦损失, focal-loss 6 | 7 | 8 | # 适配linux 9 | import platform 10 | import json 11 | import sys 12 | import os 13 | path_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) 14 | path_sys = 
os.path.join(path_root, "pytorch_nlu", "pytorch_textclassification") 15 | sys.path.append(path_sys) 16 | print(path_root) 17 | print(path_sys) 18 | # 分类下的引入, pytorch_textclassification 19 | from tcTools import get_current_time 20 | from tcRun import TextClassification 21 | from tcConfig import model_config 22 | 23 | 24 | # 预训练模型地址, 本地win10默认只跑2步就评估保存模型 25 | if platform.system().lower() == 'windows': 26 | # pretrained_model_dir = "D:/pretrain_models/pytorch" 27 | pretrained_model_dir = "E:/DATA/bert-model/00_pytorch" 28 | evaluate_steps = 32 # 评估步数 29 | save_steps = 32 # 存储步数 30 | else: 31 | pretrained_model_dir = "/pretrain_models/pytorch" 32 | evaluate_steps = 320 # 评估步数 33 | save_steps = 320 # 存储步数 34 | ee = 0 35 | 36 | 37 | if __name__ == "__main__": 38 | # 训练-验证语料地址, 可以只输入训练地址 39 | path_corpus = os.path.join(path_root, "pytorch_nlu", "corpus", "text_classification", "school") 40 | path_train = os.path.join(path_corpus, "train.json") 41 | path_dev = os.path.join(path_corpus, "dev.json") 42 | model_config["evaluate_steps"] = evaluate_steps # 评估步数 43 | model_config["save_steps"] = save_steps # 存储步数 44 | model_config["path_train"] = path_train # 训练模语料, 必须 45 | model_config["path_dev"] = path_dev # 验证语料, 可为None 46 | model_config["path_tet"] = None # 测试语料, 可为None 47 | # 损失函数类型, 48 | # multi-class: 可选 None(BCE), BCE, BCE_LOGITS, MSE, FOCAL_LOSS, DICE_LOSS, LABEL_SMOOTH 49 | # multi-label: SOFT_MARGIN_LOSS, PRIOR_MARGIN_LOSS, FOCAL_LOSS, CIRCLE_LOSS, DICE_LOSS, MIX_focal_prior, DB_LOSS, CB_LOSS等 50 | model_config["loss_type"] = "FOCAL_LOSS" 51 | os.environ["CUDA_VISIBLE_DEVICES"] = str(model_config["CUDA_VISIBLE_DEVICES"]) 52 | 53 | # 预训练模型适配的class 54 | model_type = ["BERT", "ERNIE", "BERT_WWM", "ALBERT", "ROBERTA", "XLNET", "ELECTRA"] 55 | pretrained_model_name_or_path = { 56 | "BERT_WWM": "hfl/chinese-bert-wwm-ext", 57 | "ROBERTA": "hfl/chinese-roberta-wwm-ext", 58 | "ALBERT": "uer/albert-base-chinese-cluecorpussmall", 59 | "XLNET": "hfl/chinese-xlnet-mid", 60 | "ERNIE": "nghuyong/ernie-1.0-base-zh", 61 | # "ERNIE": "nghuyong/ernie-3.0-base-zh", 62 | "BERT": "bert-base-chinese", 63 | # "BERT": "hfl/chinese-macbert-base", 64 | 65 | } 66 | idx = 0 # 选择的预训练模型类型---model_type, 0为BERT, 67 | model_config["pretrained_model_name_or_path"] = pretrained_model_name_or_path[model_type[idx]] 68 | # model_config["model_save_path"] = "../output/text_classification/model_{}".format(model_type[idx] + "_" + str(get_current_time())) 69 | model_config["model_save_path"] = "../output/text_classification/model_{}".format(model_type[idx]) 70 | model_config["model_type"] = model_type[idx] 71 | # main 72 | lc = TextClassification(model_config) 73 | lc.process() 74 | lc.train() 75 | 76 | 77 | # shell 78 | # nohup python tcRun.py > tc.log 2>&1 & 79 | # tail -n 1000 -f tc.log 80 | # |myz| 81 | 82 | -------------------------------------------------------------------------------- /test/sl/tet_sl_base_data_span.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2021/2/23 21:34 4 | # @author : Mo 5 | # @function: 序列标注, 命名实体识别, CRF, 条件随机场 6 | 7 | 8 | # 适配linux 9 | import platform 10 | import json 11 | import sys 12 | import os 13 | path_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) 14 | path_sys = os.path.join(path_root, "pytorch_nlu", "pytorch_sequencelabeling") 15 | sys.path.append(path_sys) 16 | print(path_root) 17 | print(path_sys) 18 | # 分类下的引入, pytorch_textclassification 19 | from slTools import 
get_current_time 20 | from slRun import SequenceLabeling 21 | from slConfig import model_config 22 | 23 | 24 | # 预训练模型地址, 本地win10默认只跑2步就评估保存模型 25 | if platform.system().lower() == 'windows': 26 | pretrained_model_dir = "D:/pretrain_models/pytorch" 27 | evaluate_steps = 2 # 评估步数 28 | save_steps = 2 # 存储步数 29 | else: 30 | pretrained_model_dir = "/pretrain_models/pytorch" 31 | evaluate_steps = 320 # 评估步数 32 | save_steps = 320 # 存储步数 33 | ee = 0 34 | 35 | 36 | # 预训练模型适配的class 37 | model_type = ["BERT", "ERNIE", "BERT_WWM", "ALBERT", "ROBERTA", "XLNET", "ELECTRA"] 38 | pretrained_model_name_or_path = { 39 | "BERT_WWM": "hfl/chinese-bert-wwm-ext", 40 | "ROBERTA": "hfl/chinese-roberta-wwm-ext", 41 | "ALBERT": "uer/albert-base-chinese-cluecorpussmall", 42 | "XLNET": "hfl/chinese-xlnet-mid", 43 | "ERNIE": "nghuyong/ernie-1.0-base-zh", 44 | # "ERNIE": "nghuyong/ernie-3.0-base-zh", 45 | "BERT": "bert-base-chinese", 46 | # "BERT": "hfl/chinese-macbert-base", 47 | 48 | } 49 | 50 | 51 | if __name__ == "__main__": 52 | # 训练-验证语料地址, 可以只输入训练地址 53 | path_corpus = os.path.join(path_root, "pytorch_nlu", "corpus", "sequence_labeling", "ner_china_people_daily_1998_span") 54 | path_train = os.path.join(path_corpus, "train.span") 55 | path_dev = os.path.join(path_corpus, "dev.span") 56 | model_config["evaluate_steps"] = evaluate_steps # 评估步数 57 | model_config["save_steps"] = save_steps # 存储步数 58 | model_config["path_train"] = path_train # 训练模语料, 必须 59 | model_config["path_dev"] = path_dev # 验证语料, 可为None 60 | model_config["path_tet"] = None # 测试语料, 可为None 61 | 62 | # 一种格式 文件以.conll结尾, 或者corpus_type=="DATA-CONLL" 63 | # 另一种格式 文件以.span结尾, 或者corpus_type=="DATA-SPAN" 64 | model_config["corpus_type"] = "DATA-SPAN" # 语料数据格式, "DATA-CONLL", "DATA-SPAN" 65 | model_config["xy_keys"] = ["text", "label"] # SPAN格式的数据, text, label在file中对应的keys 66 | model_config["lr"] = 1e-5 # 学习率, 1e-5, 2e-5, 5e-5, 8e-5, 1e-4, 4e-4 67 | model_config["dense_lr"] = 1e-5 # CRF层学习率/全连接层学习率, 1e-5, 1e-4, 1e-3 68 | 69 | 70 | idx = 0 # 选择的预训练模型类型---model_type, 0为BERT, 71 | model_config["pretrained_model_name_or_path"] = pretrained_model_name_or_path[model_type[idx]] 72 | # model_config["model_save_path"] = "../output/sequence_labeling/model_{}".format(model_type[idx] + "_" + str(get_current_time())) 73 | model_config["model_save_path"] = "../output/sequence_labeling/model_{}".format(model_type[idx]) 74 | model_config["model_type"] = model_type[idx] 75 | # main 76 | lc = SequenceLabeling(model_config) 77 | lc.process() 78 | lc.train() 79 | 80 | 81 | # shell 82 | # nohup python slRun.py > sl.log 2>&1 & 83 | # tail -n 1000 -f sl.log 84 | 85 | -------------------------------------------------------------------------------- /pytorch_nlu/corpus/sequence_labeling/ner_china_people_daily_1998_span/test.span: -------------------------------------------------------------------------------- 1 | {"label": [{"type": "LOC", "ent": "欧", "pos": [15, 15]}, {"type": "LOC", "ent": "美", "pos": [16, 16]}, {"type": "LOC", "ent": "港", "pos": [18, 18]}, {"type": "LOC", "ent": "台", "pos": [19, 19]}], "text": "我们变而以书会友,以书结缘,把欧美、港台流行的食品类图谱、画册、工具书汇集一堂。"} 2 | {"label": [], "text": "为了跟踪国际最新食品工艺、流行趋势,大量搜集海外专业书刊资料是提高技艺的捷径。"} 3 | {"label": [], "text": "其中线装古籍逾千册;民国出版物几百种;珍本四册、稀见本四百余册,出版时间跨越三百余年。"} 4 | {"label": [], "text": "有的古木交柯,春机荣欣,从诗人句中得之,而入画中,观之令人心驰。"} 5 | {"label": [], "text": "不过重在晋趣,略增明人气息,妙在集古有道、不露痕迹罢了。"} 6 | {"label": [], "text": "其实非汉非唐,又是什么与什么呢?"} 7 | {"label": [{"type": "PER", "ent": "国正", "pos": [0, 1]}], "text": "国正学长的文章与诗词,早就读过一些,很是喜欢。"} 8 | {"label": [], "text": 
"“文化大革命”中,茶馆作为“四旧”、“传播封、资、修的场所”被关闭了。"} 9 | {"label": [], "text": "几株数人才能合抱的大榕树挡住了烈日,树下凉风拂面。"} 10 | {"label": [], "text": "他正准备掏钱,妈妈轻声说:“老师,这茶不卖,我们要换粮食的……”"} 11 | {"label": [], "text": "沏茶时,一杯清水被新茶的细芽嫩叶染绿,春色满杯,清香幽远。"} 12 | {"label": [{"type": "LOC", "ent": "玉峰", "pos": [41, 42]}, {"type": "LOC", "ent": "重庆", "pos": [53, 54]}, {"type": "LOC", "ent": "玉峰山", "pos": [58, 60]}], "text": "和往年一样,清明节刚过,我的中学老师就千里迢迢寄来新采制的“雨前茶”,这是一种名叫玉峰云雾茶的绿茶,生长在重庆市郊的玉峰山麓。"} 13 | {"label": [], "text": "过了一会儿,他左手握住方向盘,腾出右手从身后摸出件东西扔给我———啊,我的手机!"} 14 | {"label": [], "text": "院门口一位修鞋的老头儿见我出来,翻着眼白从老花镜的镜片上方瞅了我一眼,“哼”一声:“哟,您还出来啦?”"} 15 | {"label": [], "text": "仅仅迎来送往,那礼节就多得不得了。"} 16 | {"label": [], "text": "喊过了也就过了,我没有把它当回事。"} 17 | {"label": [{"type": "LOC", "ent": "华", "pos": [0, 0]}, {"type": "PER", "ent": "韩素音", "pos": [4, 6]}, {"type": "LOC", "ent": "大足", "pos": [13, 14]}, {"type": "LOC", "ent": "大足石窟", "pos": [18, 21]}], "text": "华裔作家韩素音女士曾三次到大足,称“大足石窟是一座未被开发的金矿”。"} 18 | {"label": [{"type": "LOC", "ent": "日本", "pos": [0, 1]}, {"type": "PER", "ent": "石川一成", "pos": [6, 9]}, {"type": "LOC", "ent": "宝顶大佛湾", "pos": [19, 23]}, {"type": "LOC", "ent": "中华", "pos": [28, 29]}], "text": "日本知名学者石川一成先生曾撰文说:面对宝顶大佛湾造像,看中华民族囊括外来文化的能力和创造能力,不禁使我目瞪口呆。"} 19 | {"label": [{"type": "LOC", "ent": "佛湾", "pos": [2, 3]}], "text": "漫游佛湾还看到市俗化很浓的、反映贫民生活的场景《六道轮回图》,有融儒家思想于佛教教义的《父母恩重经变图》,还有说明宗教、哲理的《锁六耗图》等。"} 20 | {"label": [{"type": "LOC", "ent": "佛湾", "pos": [1, 2]}, {"type": "PER", "ent": "释迦", "pos": [58, 59]}], "text": "这佛湾最具传奇色彩的当属《九龙浴太子图》,它是以石灰岩为本色调雕出的九龙头,正中的巨龙头张着大嘴,汩汩泉水喷泻而出,释迦太子浴在泉下……远远望去,崖畔藤萝重绕,飞泉雾气腾腾,九龙恍如从云中飞出,气势夺人。"} 21 | {"label": [], "text": "这千手千姿无一重复,是我见到过的千手佛中最美,内蕴最丰富的一龛了。"} 22 | {"label": [{"type": "PER", "ent": "宋神宗", "pos": [8, 10]}, {"type": "ORG", "ent": "礼部", "pos": [15, 16]}, {"type": "PER", "ent": "杨次公", "pos": [18, 20]}], "text": "可见牧牛图是根据宋神宗时,官拜礼部郎杨次公的“牧牛颂”而创作的。"} 23 | {"label": [{"type": "LOC", "ent": "宝顶山", "pos": [25, 27]}, {"type": "PER", "ent": "密宗", "pos": [31, 32]}], "text": "最大的龛图竟有八十多米,最小的也有十来米,还有记载宝顶山沿革和密宗史实的七通碑刻及各种题记十七则和两座保存很好的舍利塔。"} 24 | {"label": [{"type": "LOC", "ent": "华夏", "pos": [4, 5]}], "text": "它折射出华夏文明的绚丽光环,使这片古老的土地更加异彩纷呈,充满魅力。"} 25 | {"label": [], "text": "同时,画册前后还附加了盲文页码,既增强了“盲童画集”的特点,也构成了一种很特别的装饰效果。"} 26 | {"label": [], "text": "盲童则不然,他们对这个世界的感知只能靠想象和有限的触觉,通过触摸感觉形体,通过知识认识颜色。"} 27 | {"label": [], "text": "就这些盲童的作品本身而言,也是十分富有特色的。"} 28 | {"label": [], "text": "但愿这种“新孝道”仅仅是一个玩笑。"} 29 | {"label": [], "text": "然而,你在公共汽车上看到的却往往“相反”。"} 30 | {"label": [], "text": "把儿子惯成这副模样,这位被砍的母亲有没有责任呢?"} 31 | {"label": [], "text": "———一口恶痰直向这位母亲脸上飞去。"} 32 | {"label": [], "text": "有一种解释:封建社会的父母们望子成龙、望女成凤太过心切,往往失望,“龙”未成而得“犬”,“凤”未成而得“鸡”,理想到现实一落千丈,父母们的一片爱心岂不遭到了嘲弄?"} 33 | -------------------------------------------------------------------------------- /test/sl/tet_sl_base_data_conll.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2021/2/23 21:34 4 | # @author : Mo 5 | # @function: 序列标注, 命名实体识别, CRF, 条件随机场 6 | 7 | 8 | # 适配linux 9 | import platform 10 | import json 11 | import sys 12 | import os 13 | path_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) 14 | path_sys = os.path.join(path_root, "pytorch_nlu", "pytorch_sequencelabeling") 15 | sys.path.append(path_sys) 16 | print(path_root) 17 | print(path_sys) 18 | # 分类下的引入, pytorch_textclassification 19 | from slTools import get_current_time 20 | from slRun import SequenceLabeling 21 | from slConfig import model_config 22 | 23 | 24 | # 预训练模型地址, 
本地win10默认只跑2步就评估保存模型 25 | if platform.system().lower() == 'windows': 26 | pretrained_model_dir = "D:/pretrain_models/pytorch" 27 | evaluate_steps = 2 # 评估步数 28 | save_steps = 2 # 存储步数 29 | else: 30 | pretrained_model_dir = "/pretrain_models/pytorch" 31 | evaluate_steps = 320 # 评估步数 32 | save_steps = 320 # 存储步数 33 | ee = 0 34 | 35 | 36 | # 预训练模型适配的class 37 | model_type = ["BERT", "ERNIE", "BERT_WWM", "ALBERT", "ROBERTA", "XLNET", "ELECTRA"] 38 | pretrained_model_name_or_path = { 39 | "BERT_WWM": "hfl/chinese-bert-wwm-ext", 40 | "ROBERTA": "hfl/chinese-roberta-wwm-ext", 41 | "ALBERT": "uer/albert-base-chinese-cluecorpussmall", 42 | "XLNET": "hfl/chinese-xlnet-mid", 43 | "ERNIE": "nghuyong/ernie-1.0-base-zh", 44 | # "ERNIE": "nghuyong/ernie-3.0-base-zh", 45 | "BERT": "bert-base-chinese", 46 | # "BERT": "hfl/chinese-macbert-base", 47 | 48 | } 49 | 50 | 51 | if __name__ == "__main__": 52 | # 训练-验证语料地址, 可以只输入训练地址 53 | path_corpus = os.path.join(path_root, "pytorch_nlu", "corpus", "sequence_labeling", "ner_china_people_daily_1998_conll") 54 | path_train = os.path.join(path_corpus, "train.conll") 55 | path_dev = os.path.join(path_corpus, "dev.conll") 56 | model_config["evaluate_steps"] = evaluate_steps # 评估步数 57 | model_config["save_steps"] = save_steps # 存储步数 58 | model_config["path_train"] = path_train # 训练模语料, 必须 59 | model_config["path_dev"] = path_dev # 验证语料, 可为None 60 | model_config["path_tet"] = None # 测试语料, 可为None 61 | # 一种格式 文件以.conll结尾, 或者corpus_type=="DATA-CONLL" 62 | # 另一种格式 文件以.span结尾, 或者corpus_type=="DATA-SPAN" 63 | model_config["corpus_type"] = "DATA-CONLL" # 语料数据格式, "DATA-CONLL", "DATA-SPAN" 64 | model_config["xy_keys"] = [0, 1] # CONLL格式的数据, text, label在file中对应的keys, colln时候选择[0,1]等integer 65 | model_config["lr"] = 1e-5 # 学习率, 1e-5, 2e-5, 5e-5, 8e-5, 1e-4, 4e-4 66 | model_config["dense_lr"] = 1e-5 # CRF层学习率/全连接层学习率, 1e-5, 1e-4, 1e-3 67 | 68 | 69 | idx = 0 # 选择的预训练模型类型---model_type, 0为BERT, 70 | model_config["pretrained_model_name_or_path"] = pretrained_model_name_or_path[model_type[idx]] 71 | # model_config["model_save_path"] = "../output/sequence_labeling/model_{}".format(model_type[idx] + "_" + str(get_current_time())) 72 | model_config["model_save_path"] = "../output/sequence_labeling/model_{}".format(model_type[idx]) 73 | model_config["model_type"] = model_type[idx] 74 | # main 75 | lc = SequenceLabeling(model_config) 76 | lc.process() 77 | lc.train() 78 | 79 | 80 | # shell 81 | # nohup python slRun.py > sl.log 2>&1 & 82 | # tail -n 1000 -f sl.log 83 | 84 | -------------------------------------------------------------------------------- /test/tc/tet_tc_base_multi_label.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2021/2/23 21:34 4 | # @author : Mo 5 | # @function: 多标签分类, 根据label是否有|myz|分隔符判断是多类分类, 还是多标签分类 6 | 7 | 8 | # 适配linux 9 | import platform 10 | import json 11 | import sys 12 | import os 13 | path_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) 14 | path_sys = os.path.join(path_root, "pytorch_nlu", "pytorch_textclassification") 15 | sys.path.append(path_sys) 16 | print(path_root) 17 | print(path_sys) 18 | # 分类下的引入, pytorch_textclassification 19 | from tcTools import get_current_time 20 | from tcRun import TextClassification 21 | from tcConfig import model_config 22 | 23 | 24 | # 预训练模型地址, 本地win10默认只跑2步就评估保存模型 25 | if platform.system().lower() == 'windows': 26 | # pretrained_model_dir = "D:/pretrain_models/pytorch" 27 | pretrained_model_dir = 
"E:/DATA/bert-model/00_pytorch" 28 | evaluate_steps = 32 # 评估步数 29 | save_steps = 32 # 存储步数 30 | else: 31 | pretrained_model_dir = "/pretrain_models/pytorch" 32 | evaluate_steps = 320 # 评估步数 33 | save_steps = 320 # 存储步数 34 | ee = 0 35 | 36 | 37 | if __name__ == "__main__": 38 | # 训练-验证语料地址, 可以只输入训练地址 39 | path_corpus = os.path.join(path_root, "pytorch_nlu", "corpus", "text_classification", "school") 40 | path_train = os.path.join(path_corpus, "train.json") 41 | path_dev = os.path.join(path_corpus, "dev.json") 42 | model_config["evaluate_steps"] = evaluate_steps # 评估步数 43 | model_config["save_steps"] = save_steps # 存储步数 44 | model_config["path_train"] = path_train # 训练模语料, 必须 45 | model_config["path_dev"] = path_dev # 验证语料, 可为None 46 | model_config["path_tet"] = None # 测试语料, 可为None 47 | # 损失函数类型, 48 | # multi-class: 可选 None(BCE), BCE, BCE_LOGITS, MSE, FOCAL_LOSS, DICE_LOSS, LABEL_SMOOTH 49 | # multi-label: SOFT_MARGIN_LOSS, PRIOR_MARGIN_LOSS, FOCAL_LOSS, CIRCLE_LOSS, DICE_LOSS, MIX_focal_prior, DB_LOSS, CB_LOSS等 50 | model_config["loss_type"] = "SOFT_MARGIN_LOSS" 51 | os.environ["CUDA_VISIBLE_DEVICES"] = str(model_config["CUDA_VISIBLE_DEVICES"]) 52 | 53 | # 预训练模型适配的class 54 | model_type = ["BERT", "ERNIE", "BERT_WWM", "ALBERT", "ROBERTA", "XLNET", "ELECTRA"] 55 | pretrained_model_name_or_path = { 56 | "BERT_WWM": "hfl/chinese-bert-wwm-ext", 57 | "ROBERTA": "hfl/chinese-roberta-wwm-ext", 58 | "ALBERT": "uer/albert-base-chinese-cluecorpussmall", 59 | "XLNET": "hfl/chinese-xlnet-mid", 60 | "ERNIE": "nghuyong/ernie-1.0-base-zh", 61 | # "ERNIE": "nghuyong/ernie-3.0-base-zh", 62 | "BERT": "bert-base-chinese", 63 | # "BERT": "hfl/chinese-macbert-base", 64 | 65 | } 66 | idx = 0 # 选择的预训练模型类型---model_type, 0为BERT, 67 | model_config["pretrained_model_name_or_path"] = pretrained_model_name_or_path[model_type[idx]] 68 | # model_config["model_save_path"] = "../output/text_classification/model_{}".format(model_type[idx] + "_" + str(get_current_time())) 69 | model_config["model_save_path"] = "../output/text_classification/model_{}".format(model_type[idx]) 70 | model_config["model_type"] = model_type[idx] 71 | # main 72 | lc = TextClassification(model_config) 73 | lc.process() 74 | lc.train() 75 | 76 | 77 | # shell 78 | # nohup python tcRun.py > tc.log 2>&1 & 79 | # tail -n 1000 -f tc.log 80 | # |myz| 81 | 82 | -------------------------------------------------------------------------------- /test/sl/tet_sl_base_span.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2021/2/23 21:34 4 | # @author : Mo 5 | # @function: 序列标注, 使用SPAN架构的网络, 即start_id = [0,2,0,0.....] 6 | # end_id = [0,0,0,2,0...] 
7 | 
8 | 
9 | # 适配linux
10 | import platform
11 | import json
12 | import sys
13 | import os
14 | path_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
15 | path_sys = os.path.join(path_root, "pytorch_nlu", "pytorch_sequencelabeling")
16 | sys.path.append(path_sys)
17 | print(path_root)
18 | print(path_sys)
19 | # 序列标注下的引入, pytorch_sequencelabeling
20 | from slTools import get_current_time
21 | from slRun import SequenceLabeling
22 | from slConfig import model_config
23 | 
24 | 
25 | # 预训练模型地址, 本地win10默认只跑2步就评估保存模型
26 | if platform.system().lower() == 'windows':
27 |     pretrained_model_dir = "D:/pretrain_models/pytorch"
28 |     evaluate_steps = 2  # 评估步数
29 |     save_steps = 2  # 存储步数
30 | else:
31 |     pretrained_model_dir = "/pretrain_models/pytorch"
32 |     evaluate_steps = 320  # 评估步数
33 |     save_steps = 320  # 存储步数
34 |     ee = 0
35 | 
36 | 
37 | # 预训练模型适配的class
38 | model_type = ["BERT", "ERNIE", "BERT_WWM", "ALBERT", "ROBERTA", "XLNET", "ELECTRA"]
39 | pretrained_model_name_or_path = {
40 |     "BERT_WWM": "hfl/chinese-bert-wwm-ext",
41 |     "ROBERTA": "hfl/chinese-roberta-wwm-ext",
42 |     "ALBERT": "uer/albert-base-chinese-cluecorpussmall",
43 |     "XLNET": "hfl/chinese-xlnet-mid",
44 |     "ERNIE": "nghuyong/ernie-1.0-base-zh",
45 |     # "ERNIE": "nghuyong/ernie-3.0-base-zh",
46 |     "BERT": "bert-base-chinese",
47 |     # "BERT": "hfl/chinese-macbert-base",
48 | 
49 | }
50 | 
51 | 
52 | if __name__ == "__main__":
53 |     # 训练-验证语料地址, 可以只输入训练地址
54 |     path_corpus = os.path.join(path_root, "pytorch_nlu", "corpus", "sequence_labeling", "ner_china_people_daily_1998_conll")
55 |     path_train = os.path.join(path_corpus, "train.conll")
56 |     path_dev = os.path.join(path_corpus, "dev.conll")
57 |     model_config["evaluate_steps"] = evaluate_steps  # 评估步数
58 |     model_config["save_steps"] = save_steps  # 存储步数
59 |     model_config["path_train"] = path_train  # 训练语料, 必须
60 |     model_config["path_dev"] = path_dev  # 验证语料, 可为None
61 |     model_config["path_tet"] = None  # 测试语料, 可为None
62 |     # 一种格式 文件以.conll结尾, 或者corpus_type=="DATA-CONLL"
63 |     # 另一种格式 文件以.span结尾, 或者corpus_type=="DATA-SPAN"
64 |     model_config["corpus_type"] = "DATA-CONLL"  # 语料数据格式, "DATA-CONLL", "DATA-SPAN"
65 |     model_config["task_type"] = "SL-SPAN"  # 任务类型, "SL-SOFTMAX", "SL-CRF", "SL-SPAN"
66 | 
67 |     model_config["lr"] = 1e-5  # 学习率, 依据选择的预训练模型自己选择, 1e-5, 2e-5, 5e-5, 8e-5, 1e-4, 4e-4
68 |     model_config["dense_lr"] = 1e-5  # CRF层学习率/全连接层学习率, 1e-5, 1e-4, 1e-3
69 |     idx = 0  # 选择的预训练模型类型---model_type, 0为BERT
70 |     model_config["pretrained_model_name_or_path"] = pretrained_model_name_or_path[model_type[idx]]
71 | 
72 |     # model_config["model_save_path"] = "../output/sequence_labeling/model_{}".format(model_type[idx] + "_" + str(get_current_time()))
73 |     model_config["model_save_path"] = "../output/sequence_labeling/model_{}".format(model_type[idx])
74 |     model_config["model_type"] = model_type[idx]
75 |     # main
76 |     lc = SequenceLabeling(model_config)
77 |     lc.process()
78 |     lc.train()
79 | 
80 | 
81 | # shell
82 | # nohup python slRun.py > sl.log 2>&1 &
83 | # tail -n 1000 -f sl.log
84 | 
85 | 
-------------------------------------------------------------------------------- /test/sl/tet_sl_base_crf_ernie.py: --------------------------------------------------------------------------------
1 | # !/usr/bin/python
2 | # -*- coding: utf-8 -*-
3 | # @time    : 2021/2/23 21:34
4 | # @author  : Mo
5 | # @function: 序列标注, 命名实体识别, CRF, 条件随机场
6 | 
7 | 
8 | # 适配linux
9 | import platform
10 | import json
11 | import sys
12 | import os
13 | path_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
14 | path_sys = 
os.path.join(path_root, "pytorch_nlu", "pytorch_sequencelabeling") 15 | sys.path.append(path_sys) 16 | print(path_root) 17 | print(path_sys) 18 | # 分类下的引入, pytorch_textclassification 19 | from slTools import get_current_time 20 | from slRun import SequenceLabeling 21 | from slConfig import model_config 22 | 23 | 24 | if platform.system().lower() == 'windows': 25 | pretrained_model_dir = "D:/pretrain_models/pytorch" 26 | evaluate_steps = 2 # 评估步数 27 | save_steps = 2 # 存储步数 28 | else: 29 | pretrained_model_dir = "/pretrain_models/pytorch" 30 | evaluate_steps = 320 # 评估步数 31 | save_steps = 320 # 存储步数 32 | ee = 0 33 | 34 | # 预训练模型适配的class 35 | model_type = ["BERT", "ERNIE", "BERT_WWM", "ALBERT", "ROBERTA", "XLNET", "ELECTRA"] 36 | pretrained_model_name_or_path = { 37 | "BERT_WWM": "hfl/chinese-bert-wwm-ext", 38 | "ROBERTA": "hfl/chinese-roberta-wwm-ext", 39 | "ALBERT": "uer/albert-base-chinese-cluecorpussmall", 40 | "XLNET": "hfl/chinese-xlnet-mid", 41 | "ERNIE": "nghuyong/ernie-1.0-base-zh", 42 | # "ERNIE": "nghuyong/ernie-3.0-base-zh", 43 | "BERT": "bert-base-chinese", 44 | # "BERT": "hfl/chinese-macbert-base", 45 | 46 | } 47 | 48 | 49 | if __name__ == "__main__": 50 | # 训练-验证语料地址, 可以只输入训练地址 51 | path_corpus = os.path.join(path_root, "pytorch_nlu", "corpus", "sequence_labeling", "ner_china_people_daily_1998_conll") 52 | path_train = os.path.join(path_corpus, "train.conll") 53 | path_dev = os.path.join(path_corpus, "dev.conll") 54 | model_config["evaluate_steps"] = evaluate_steps # 评估步数 55 | model_config["save_steps"] = save_steps # 存储步数 56 | model_config["path_train"] = path_train # 训练模语料, 必须 57 | model_config["path_dev"] = path_dev # 验证语料, 可为None 58 | model_config["path_tet"] = None # 测试语料, 可为None 59 | # 一种格式 文件以.conll结尾, 或者corpus_type=="DATA-CONLL" 60 | # 另一种格式 文件以.span结尾, 或者corpus_type=="DATA-SPAN" 61 | model_config["corpus_type"] = "DATA-CONLL"# 语料数据格式, "DATA-CONLL", "DATA-SPAN" 62 | model_config["task_type"] = "SL-CRF" # 任务类型, "SL-SOFTMAX", "SL-CRF", "SL-SPAN" 63 | 64 | model_config["dense_lr"] = 1e-3 # 最后一层的学习率, CRF层学习率/全连接层学习率, 1e-5, 1e-4, 1e-3 65 | model_config["lr"] = 5e-5 # 学习率, 1e-5, 2e-5, 5e-5, 8e-5, 1e-4, 4e-4 66 | model_config["max_len"] = 156 # 最大文本长度, None和-1则为自动获取覆盖0.95数据的文本长度, 0则取训练语料的最大长度, 具体的数值就是强制padding到max_len 67 | 68 | idx = 1 # 选择的预训练模型类型---model_type, 0为BERT, 1为ERNIE 69 | model_config["pretrained_model_name_or_path"] = pretrained_model_name_or_path[model_type[idx]] 70 | 71 | # model_config["model_save_path"] = "../output/sequence_labeling/model_{}".format(model_type[idx] + "_" + str(get_current_time())) 72 | model_config["model_save_path"] = "../output/sequence_labeling/model_{}".format(model_type[idx]) 73 | model_config["model_type"] = model_type[idx] 74 | # main 75 | lc = SequenceLabeling(model_config) 76 | lc.process() 77 | lc.train() 78 | 79 | 80 | # shell 81 | # nohup python slRun.py > sl.log 2>&1 & 82 | # tail -n 1000 -f sl.log 83 | 84 | -------------------------------------------------------------------------------- /test/tc/tet_tc_base_multi_label_isadv.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2021/2/23 21:34 4 | # @author : Mo 5 | # @function: 使用对抗训练, is-adv 6 | 7 | 8 | # 适配linux 9 | import platform 10 | import json 11 | import sys 12 | import os 13 | path_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) 14 | path_sys = os.path.join(path_root, "pytorch_nlu", "pytorch_textclassification") 15 | sys.path.append(path_sys) 16 | 
print(path_root) 17 | print(path_sys) 18 | # 分类下的引入, pytorch_textclassification 19 | from tcTools import get_current_time 20 | from tcRun import TextClassification 21 | from tcConfig import model_config 22 | 23 | 24 | # 预训练模型地址, 本地win10默认只跑2步就评估保存模型 25 | if platform.system().lower() == 'windows': 26 | # pretrained_model_dir = "D:/pretrain_models/pytorch" 27 | pretrained_model_dir = "E:/DATA/bert-model/00_pytorch" 28 | evaluate_steps = 32 # 评估步数 29 | save_steps = 32 # 存储步数 30 | else: 31 | pretrained_model_dir = "/pretrain_models/pytorch" 32 | evaluate_steps = 320 # 评估步数 33 | save_steps = 320 # 存储步数 34 | ee = 0 35 | 36 | 37 | if __name__ == "__main__": 38 | # 训练-验证语料地址, 可以只输入训练地址 39 | path_corpus = os.path.join(path_root, "pytorch_nlu", "corpus", "text_classification", "school") 40 | path_train = os.path.join(path_corpus, "train.json") 41 | path_dev = os.path.join(path_corpus, "dev.json") 42 | model_config["evaluate_steps"] = evaluate_steps # 评估步数 43 | model_config["save_steps"] = save_steps # 存储步数 44 | model_config["path_train"] = path_train # 训练模语料, 必须 45 | model_config["path_dev"] = path_dev # 验证语料, 可为None 46 | model_config["path_tet"] = None # 测试语料, 可为None 47 | model_config["is_adv"] = True # 使用对抗训练, 即扰动embedding 48 | # 损失函数类型, 49 | # multi-class: 可选 None(BCE), BCE, BCE_LOGITS, MSE, FOCAL_LOSS, DICE_LOSS, LABEL_SMOOTH 50 | # multi-label: SOFT_MARGIN_LOSS, PRIOR_MARGIN_LOSS, FOCAL_LOSS, CIRCLE_LOSS, DICE_LOSS, MIX_focal_prior, DB_LOSS, CB_LOSS等 51 | model_config["loss_type"] = "SOFT_MARGIN_LOSS" 52 | # os.environ["CUDA_VISIBLE_DEVICES"] = str(model_config["CUDA_VISIBLE_DEVICES"]) 53 | 54 | # 预训练模型适配的class 55 | model_type = ["BERT", "ERNIE", "BERT_WWM", "ALBERT", "ROBERTA", "XLNET", "ELECTRA"] 56 | pretrained_model_name_or_path = { 57 | "BERT_WWM": "hfl/chinese-bert-wwm-ext", 58 | "ROBERTA": "hfl/chinese-roberta-wwm-ext", 59 | "ALBERT": "uer/albert-base-chinese-cluecorpussmall", 60 | "XLNET": "hfl/chinese-xlnet-mid", 61 | "ERNIE": "nghuyong/ernie-1.0-base-zh", 62 | # "ERNIE": "nghuyong/ernie-3.0-base-zh", 63 | "BERT": "bert-base-chinese", 64 | # "BERT": "hfl/chinese-macbert-base", 65 | 66 | } 67 | idx = 0 # 选择的预训练模型类型---model_type, 0为BERT, 68 | model_config["pretrained_model_name_or_path"] = pretrained_model_name_or_path[model_type[idx]] 69 | # model_config["model_save_path"] = "../output/text_classification/model_{}".format(model_type[idx] + "_" + str(get_current_time())) 70 | model_config["model_save_path"] = "../output/text_classification/model_{}".format(model_type[idx]) 71 | model_config["model_type"] = model_type[idx] 72 | # main 73 | lc = TextClassification(model_config) 74 | lc.process() 75 | lc.train() 76 | 77 | 78 | # shell 79 | # nohup python tcRun.py > tc.log 2>&1 & 80 | # tail -n 1000 -f tc.log 81 | # |myz| 82 | 83 | -------------------------------------------------------------------------------- /test/sl/tet_sl_base_crf.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2021/2/23 21:34 4 | # @author : Mo 5 | # @function: 序列标注, 命名实体识别, CRF, 条件随机场 6 | 7 | 8 | # 适配linux 9 | import platform 10 | import json 11 | import sys 12 | import os 13 | path_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) 14 | path_sys = os.path.join(path_root, "pytorch_nlu", "pytorch_sequencelabeling") 15 | sys.path.append(path_sys) 16 | print(path_root) 17 | print(path_sys) 18 | # 分类下的引入, pytorch_textclassification 19 | from slTools import get_current_time 20 | from slRun import 
SequenceLabeling
21 | from slConfig import model_config
22 | 
23 | 
24 | # 预训练模型目录, 本地win10默认只跑2步就评估保存模型
25 | if platform.system().lower() == 'windows':
26 |     pretrained_model_dir = "D:/pretrain_models/pytorch"
27 |     evaluate_steps = 2  # 评估步数
28 |     save_steps = 2  # 存储步数
29 | else:
30 |     pretrained_model_dir = "/pretrain_models/pytorch"
31 |     evaluate_steps = 320  # 评估步数
32 |     save_steps = 320  # 存储步数
33 |     ee = 0
34 | 
35 | # 预训练模型适配的class
36 | model_type = ["BERT", "ERNIE", "BERT_WWM", "ALBERT", "ROBERTA", "XLNET", "ELECTRA"]
37 | pretrained_model_name_or_path = {
38 |     "BERT_WWM": "hfl/chinese-bert-wwm-ext",
39 |     "ROBERTA": "hfl/chinese-roberta-wwm-ext",
40 |     "ALBERT": "uer/albert-base-chinese-cluecorpussmall",
41 |     "XLNET": "hfl/chinese-xlnet-mid",
42 |     "ERNIE": "nghuyong/ernie-1.0-base-zh",
43 |     # "ERNIE": "nghuyong/ernie-3.0-base-zh",
44 |     "BERT": "bert-base-chinese",
45 |     # "BERT": "hfl/chinese-macbert-base",
46 | 
47 | }
48 | 
49 | 
50 | if __name__ == "__main__":
51 |     # 训练-验证语料地址, 可以只输入训练地址
52 |     path_corpus = os.path.join(path_root, "pytorch_nlu", "corpus", "sequence_labeling", "ner_china_people_daily_1998_conll")
53 |     path_train = os.path.join(path_corpus, "train.conll")
54 |     path_dev = os.path.join(path_corpus, "dev.conll")
55 |     model_config["evaluate_steps"] = evaluate_steps  # 评估步数
56 |     model_config["save_steps"] = save_steps  # 存储步数
57 |     model_config["path_train"] = path_train  # 训练语料, 必须
58 |     model_config["path_dev"] = path_dev  # 验证语料, 可为None
59 |     model_config["path_tet"] = None  # 测试语料, 可为None
60 |     # 一种格式 文件以.conll结尾, 或者corpus_type=="DATA-CONLL"
61 |     # 另一种格式 文件以.span结尾, 或者corpus_type=="DATA-SPAN"
62 |     model_config["corpus_type"] = "DATA-CONLL"  # 语料数据格式, "DATA-CONLL", "DATA-SPAN"
63 |     model_config["task_type"] = "SL-CRF"  # 任务类型, "SL-SOFTMAX", "SL-CRF", "SL-SPAN"
64 | 
65 |     model_config["dense_lr"] = 1e-5  # 最后一层的学习率, CRF层学习率/全连接层学习率, 1e-5, 1e-4, 1e-3
66 |     model_config["lr"] = 1e-5  # 学习率, 1e-5, 2e-5, 5e-5, 8e-5, 1e-4, 4e-4
67 |     model_config["max_len"] = 156  # 最大文本长度, None和-1则为自动获取覆盖0.95数据的文本长度, 0则取训练语料的最大长度, 具体的数值就是强制padding到max_len
68 | 
69 |     idx = 0  # 选择的预训练模型类型---model_type, 0为BERT
70 |     model_config["pretrained_model_name_or_path"] = pretrained_model_name_or_path[model_type[idx]]
71 | 
72 |     # model_config["model_save_path"] = "../output/sequence_labeling/model_{}".format(model_type[idx] + "_" + str(get_current_time()))
73 |     model_config["model_save_path"] = "../output/sequence_labeling/model_{}".format(model_type[idx])
74 |     model_config["model_type"] = model_type[idx]
75 |     # main
76 |     lc = SequenceLabeling(model_config)
77 |     lc.process()
78 |     lc.train()
79 | 
80 | 
81 | # shell
82 | # nohup python slRun.py > sl.log 2>&1 &
83 | # tail -n 1000 -f sl.log
84 | 
85 | 
-------------------------------------------------------------------------------- /test/sl/tet_sl_base_grid.py: --------------------------------------------------------------------------------
1 | # !/usr/bin/python
2 | # -*- coding: utf-8 -*-
3 | # @time    : 2021/9/8 22:44
4 | # @author  : Mo
5 | # @function: 序列标注, 使用全局(网格)指针网络(GLOBAL或GRID)架构的网络, eg.grid_id = [[0, 0, 0], [0, 1, 0], [0, 0, 0]]
6 | 
7 | 
8 | # 适配linux
9 | import platform
10 | import json
11 | import sys
12 | import os
13 | path_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
14 | path_sys = os.path.join(path_root, "pytorch_nlu", "pytorch_sequencelabeling")
15 | sys.path.append(path_sys)
16 | print(path_root)
17 | print(path_sys)
18 | # 序列标注下的引入, pytorch_sequencelabeling
19 | from slTools import get_current_time
20 | from slRun import SequenceLabeling
21 | 
from slConfig import model_config
22 | 
23 | 
24 | # 预训练模型地址, 本地win10默认只跑2步就评估保存模型
25 | if platform.system().lower() == 'windows':
26 |     # pretrained_model_dir = "D:/pretrain_models/pytorch"
27 |     pretrained_model_dir = "E:/DATA/bert-model/00_pytorch"
28 |     evaluate_steps = 2  # 评估步数
29 |     save_steps = 2  # 存储步数
30 | else:
31 |     pretrained_model_dir = "/pretrain_models/pytorch"
32 |     evaluate_steps = 320  # 评估步数
33 |     save_steps = 320  # 存储步数
34 |     ee = 0
35 | 
36 | 
37 | # 预训练模型适配的class
38 | model_type = ["BERT", "ERNIE", "BERT_WWM", "ALBERT", "ROBERTA", "XLNET", "ELECTRA"]
39 | pretrained_model_name_or_path = {
40 |     "BERT_WWM": "hfl/chinese-bert-wwm-ext",
41 |     "ROBERTA": "hfl/chinese-roberta-wwm-ext",
42 |     "ALBERT": "uer/albert-base-chinese-cluecorpussmall",
43 |     "XLNET": "hfl/chinese-xlnet-mid",
44 |     "ERNIE": "nghuyong/ernie-1.0-base-zh",
45 |     # "ERNIE": "nghuyong/ernie-3.0-base-zh",
46 |     "BERT": "bert-base-chinese",
47 |     # "BERT": "hfl/chinese-macbert-base",
48 | 
49 | }
50 | 
51 | 
52 | if __name__ == "__main__":
53 |     # 训练-验证语料地址, 可以只输入训练地址
54 |     path_corpus = os.path.join(path_root, "pytorch_nlu", "corpus", "sequence_labeling", "ner_china_people_daily_1998_conll")
55 |     path_train = os.path.join(path_corpus, "train.conll")
56 |     path_dev = os.path.join(path_corpus, "dev.conll")
57 |     model_config["evaluate_steps"] = evaluate_steps  # 评估步数
58 |     model_config["save_steps"] = save_steps  # 存储步数
59 |     model_config["path_train"] = path_train  # 训练语料, 必须
60 |     model_config["path_dev"] = path_dev  # 验证语料, 可为None
61 |     model_config["path_tet"] = None  # 测试语料, 可为None
62 |     # 一种格式 文件以.conll结尾, 或者corpus_type=="DATA-CONLL"
63 |     # 另一种格式 文件以.span结尾, 或者corpus_type=="DATA-SPAN"
64 |     model_config["corpus_type"] = "DATA-CONLL"  # 语料数据格式, "DATA-CONLL", "DATA-SPAN"
65 |     model_config["task_type"] = "SL-GRID"  # 任务类型, "SL-SOFTMAX", "SL-CRF", "SL-SPAN", "SL-GRID"
66 | 
67 |     model_config["loss_type"] = "CIRCLE_LOSS"  # 因为0比较多所以必须使用circle_loss, 否则不收敛
68 | 
69 |     model_config["lr"] = 1e-5  # 学习率, 依据选择的预训练模型自己选择, 1e-5, 2e-5, 5e-5, 8e-5, 1e-4, 4e-4
70 |     model_config["dense_lr"] = 1e-5  # CRF层学习率/全连接层学习率, 1e-5, 1e-4, 1e-3
71 |     model_config["max_len"] = 156  # 最大文本长度, None和-1则为自动获取覆盖0.95数据的文本长度, 0则取训练语料的最大长度, 具体的数值就是强制padding到max_len
72 | 
73 |     idx = 0  # 选择的预训练模型类型---model_type, 0为BERT
74 |     model_config["pretrained_model_name_or_path"] = pretrained_model_name_or_path[model_type[idx]]
75 | 
76 |     # model_config["model_save_path"] = "../output/sequence_labeling/model_{}".format(model_type[idx] + "_" + str(get_current_time()))
77 |     model_config["model_save_path"] = "../output/sequence_labeling/model_{}".format(model_type[idx])
78 |     model_config["model_type"] = model_type[idx]
79 |     # main
80 |     lc = SequenceLabeling(model_config)
81 |     lc.process()
82 |     lc.train()
83 | 
84 | 
85 | # shell
86 | # nohup python slRun.py > sl.log 2>&1 &
87 | # tail -n 1000 -f sl.log
88 | 
89 | 
-------------------------------------------------------------------------------- /pytorch_nlu/pytorch_textregression/README.md: --------------------------------------------------------------------------------
1 | 
2 | # [***pytorch-textregression***](https://github.com/yongzhuo/Pytorch-NLU/pytorch_textregression)
3 | >>> pytorch-textregression是一个以pytorch和transformers为基础,专注于中文文本回归的轻量级自然语言处理工具,支持单值回归、多值回归等。
4 | 
5 | 
6 | ## 目录
7 | * [项目地址](#项目地址)
8 | * [数据格式](#数据格式)
9 | * [使用方式](#使用方式)
10 | * [Reference](#reference)
11 | 
12 | 
13 | ## 项目地址
14 | - pytorch-textregression: 
[https://github.com/yongzhuo/Pytorch-NLU/pytorch_textregression](https://github.com/yongzhuo/Pytorch-NLU/pytorch_textregression)
15 | 
16 | 
17 | ## 数据格式
18 | ```
19 | 1. 文本回归 (txt格式, 每行为一个json):
20 | 
21 | 1.1 单个得分格式:
22 | {"text": "你安静!", "label": [1]}
23 | {"text": "斗牛场是多么欢乐阿!", "label": [1]}
24 | {"text": "今天你不必做作业。", "label": [0]}
25 | {"text": "他醒来时,几乎无法说话。", "label": [0]}
26 | {"text": "在那天边隐约闪亮的不就是黄河?", "label": [1]}
27 | 
28 | 1.2 多个得分格式:
29 | {"text": "你安静!", "label": [1,0]}
30 | {"text": "斗牛场是多么欢乐阿!", "label": [1,0]}
31 | {"text": "今天你不必做作业。", "label": [0,0]}
32 | {"text": "他醒来时,几乎无法说话。", "label": [0,0]}
33 | {"text": "在那天边隐约闪亮的不就是黄河?", "label": [1,0]}
34 | 
35 | ```
36 | 
37 | 
38 | ## 使用方式
39 | 更多样例详见 test/tr 目录
40 | ```bash
41 | python tet_tr_base_train.py    # 训练
42 | python tet_tr_base_predict.py  # 预测
43 | ```
44 | - 1. 需要配置好预训练模型目录, 即变量 pretrained_model_dir、pretrained_model_name_or_path、idx等;
45 | - 2. 需要配置好自己的语料地址, 即字典 model_config["path_train"]、model_config["path_dev"];
46 | - 3. cd到该脚本目录下运行普通的命令行即可, 例如: python trRun.py, python trPredict.py
47 | ## 文本回归(TR), Text-Regression
48 | ```bash
49 | # 适配linux
50 | import platform
51 | import json
52 | import sys
53 | import os
54 | path_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
55 | path_sys = os.path.join(path_root, "pytorch_nlu", "pytorch_textregression")
56 | sys.path.append(path_sys)
57 | print(path_root)
58 | # 回归下的引入, pytorch_textregression
59 | from trConfig import model_config
60 | from trTools import get_current_time
61 | from trRun import TextRegression
62 | 
63 | # 训练-验证语料地址, 可以只输入训练地址
64 | path_corpus = os.path.join(path_root, "pytorch_nlu", "corpus", "text_regression", "negative_sentence")
65 | path_train = os.path.join(path_corpus, "train.json")
66 | model_config["evaluate_steps"] = 320  # 评估步数
67 | model_config["save_steps"] = 320  # 存储步数
68 | model_config["path_train"] = path_train
69 | model_config["path_dev"] = os.path.join(path_corpus, "dev.json")
70 | 
71 | # 预训练模型适配的class
72 | model_type = ["BERT", "ERNIE", "BERT_WWM", "ALBERT", "ROBERTA", "XLNET", "ELECTRA"]
73 | pretrained_model_name_or_path = {
74 |     "BERT_WWM": "hfl/chinese-bert-wwm-ext",
75 |     "ROBERTA": "hfl/chinese-roberta-wwm-ext",
76 |     "ALBERT": "uer/albert-base-chinese-cluecorpussmall",
77 |     "XLNET": "hfl/chinese-xlnet-mid",
78 |     "ERNIE": "nghuyong/ernie-1.0-base-zh",
79 |     # "ERNIE": "nghuyong/ernie-3.0-base-zh",
80 |     "BERT": "bert-base-chinese",
81 |     # "BERT": "hfl/chinese-macbert-base",
82 | 
83 | }
84 | idx = 1  # 选择的预训练模型类型---model_type, 1为ERNIE
85 | model_config["pretrained_model_name_or_path"] = pretrained_model_name_or_path[model_type[idx]]
86 | model_config["model_save_path"] = "../output/text_regression/model_{}".format(model_type[idx])
87 | model_config["model_type"] = model_type[idx]
88 | 
89 | # os.environ["CUDA_VISIBLE_DEVICES"] = str(model_config["CUDA_VISIBLE_DEVICES"])
90 | 
91 | # main
92 | lc = TextRegression(model_config)
93 | lc.process()
94 | lc.train()
95 | 
96 | ```
97 | 
98 | 
99 | ## Reference
100 | For citing this work, you can refer to the present GitHub project. For example, with BibTeX:
101 | ```
102 | @software{Pytorch-NLU,
103 |     url = {https://github.com/yongzhuo/Pytorch-NLU},
104 |     author = {Yongzhuo Mo},
105 |     title = {Pytorch-NLU},
106 |     year = {2021}
107 | }
108 | ```
109 | 希望对你有所帮助!
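## 预测示例
训练完成后可参考如下最小示例进行预测 (a minimal inference sketch, 参见 test/tr/tet_tr_base_predict.py; 其中 path_config 的具体取值为示例假设, 以训练时 model_save_path 目录下实际生成的 tc.config 为准):
```bash
# 预测, Text-Regression inference
import sys
import os
path_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
sys.path.append(os.path.join(path_root, "pytorch_nlu", "pytorch_textregression"))
from trPredict import TextRegressionPredict

path_config = "../output/text_regression/model_ERNIE/tc.config"  # 示例路径, 假设
tcp = TextRegressionPredict(path_config)
res = tcp.predict([{"text": "桂林山水甲天下, 阳朔山水甲桂林"}])
print(res)
```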
110 | 111 | -------------------------------------------------------------------------------- /pytorch_nlu/pytorch_textsummary/tsPredict.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2021/7/25 9:30 4 | # @author : Mo 5 | # @function: predict model, 预测模块 6 | 7 | 8 | # 适配linux 9 | from argparse import Namespace 10 | import logging as logger 11 | import traceback 12 | import sys 13 | import os 14 | path_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "")) 15 | sys.path.append(path_root) 16 | # print(path_root) 17 | 18 | from tsTools import load_json 19 | from tsOffice import Office 20 | from tsData import DataSet 21 | 22 | 23 | class TextSummaryPredict: 24 | def __init__(self, path_config, pretrained_model_name_or_path=None, logger=logger): 25 | """ 初始化 """ 26 | self.pretrained_model_name_or_path = pretrained_model_name_or_path 27 | self.logger = logger 28 | self.load_config(path_config) 29 | self.load_model() 30 | 31 | def load_config(self, path_config): 32 | """ 加载超参数 """ 33 | config = load_json(path_config) 34 | self.config = Namespace(**config) 35 | if self.pretrained_model_name_or_path: 36 | self.config.pretrained_model_name_or_path = self.pretrained_model_name_or_path 37 | # self.config.CUDA_VISIBLE_DEVICES = os.environ.get("CUDA_VISIBLE_DEVICES", "-1") 38 | self.real_model_save_path = os.path.split(path_config)[0] 39 | self.config.model_save_path = self.real_model_save_path 40 | self.l2i, self.i2l = self.config.l2i, self.config.i2l 41 | # 数据预处理 类 42 | self.corpus = DataSet(config=self.config, path_json=None, logger=self.logger) 43 | 44 | def load_model(self): 45 | """ 加载模型 """ 46 | self.office = Office(config=self.config, tokenizer=self.corpus.tokenizer, logger=self.logger) 47 | try: 48 | self.office.load_model_state() 49 | except Exception as e: 50 | self.logger.info(traceback.print_exc()) 51 | self.logger.info("self.office.load_model_state() is wrong, start self.office.load_model()") 52 | self.office.load_model() 53 | 54 | def process(self, texts): 55 | """ 数据预处理, process """ 56 | # token 转 idx, 训练集/验证集 57 | self.corpus.data_iter = self.corpus.read_texts_from_json(texts, keys=self.config.xy_keys) 58 | 59 | def predict(self, texts, logits_type="sigmoid", rounded=4): 60 | """ 分类模型预测 61 | config: 62 | texts : List, inputs of text, eg. {"num_labels":17, "model_type":"BERT"} 63 | logits_type: string, output-logits type, eg. "logits", "sigmoid", "softmax" 64 | rounded : int, rounded of float, eg. 3, 4, 6 65 | Returns: 66 | res : List, output of label-score, eg. 
67 | """ 68 | self.process(texts) 69 | res = self.office.predict(self.corpus, rounded=rounded, logits_type=logits_type) 70 | return res 71 | 72 | def predict_loop(self): 73 | while 1: 74 | print("请输入:") 75 | text = input() 76 | res = self.predict(text) 77 | print(res) 78 | 79 | 80 | if __name__ == "__main__": 81 | # BERT-base = 8109M 82 | path_config = "../output/text_summary/model_ERNIE/tc.config" 83 | 84 | tcp = TextSummaryPredict(path_config) 85 | texts = [{'text': ['平乐县', '古称昭州', '隶属于广西壮族自治区桂林市', '位于广西东北部', '桂林市东南部', '东临钟山县', 86 | '南接昭平', '西北毗邻阳朔', '北连恭城', '总面积1919.34平方公里。']}, 87 | {'text': ['平乐县主要旅游景点有榕津千年古榕、冷水石景苑、仙家温泉、桂江风景区、漓江风景区等', '平乐县为漓江分界点', 88 | '平乐以北称漓江', '以南称桂江', '是著名的大桂林旅游区之一。']}, 89 | {'text': ['印岭玲珑', '昭水晶莹', '环绕我平中。青年的乐园', '多士受陶熔。生活自觉自治', '学习自发自动。五育并重', 90 | '手脑并用。迎接新潮流', '建设新平中']}, 91 | {'text': ['桂林山水甲天下', '阳朔山水甲桂林']}] 92 | res = tcp.predict(texts, logits_type="sigmoid") 93 | print(str(res).encode("ignore").decode("ignore")) 94 | 95 | -------------------------------------------------------------------------------- /pytorch-loss.md: -------------------------------------------------------------------------------- 1 | # some nlp pytorch-loss 2 | - [通过互信息思想来缓解类别不平衡问题](https://spaces.ac.cn/archives/7615) 3 | - [将“softmax+交叉熵”推广到多标签分类问题](https://spaces.ac.cn/archives/7359) 4 | 5 | 6 | # 1. Prior-BCE 7 | 8 | ```bash 9 | class PriorMultiLabelSoftMarginLoss(nn.Module): 10 | def __init__(self, prior=None, num_labels=None, reduction="mean", eps=1e-9, tau=1.0): 11 | """PriorCrossEntropy 12 | categorical-crossentropy-with-prior 13 | urls: [通过互信息思想来缓解类别不平衡问题](https://spaces.ac.cn/archives/7615) 14 | args: 15 | prior: List, prior of label, 先验知识. eg. [0.6, 0.2, 0.1, 0.1] 16 | num_labels: int, num of labels, 类别数. eg. 10 17 | reduction: str, Specifies the reduction to apply to the output, 输出形式. 18 | eg.``'none'`` | ``'mean'`` | ``'sum'``. ``'none'`` 19 | eps: float, Minimum of maths, 极小值. eg. 1e-9 20 | tau: float, weight of prior in loss, 先验知识的权重, eg. ``1.0`` 21 | returns: 22 | Tensor of loss. 23 | examples: 24 | >>> loss = PriorCrossEntropy(prior)(logits, label) 25 | """ 26 | super(PriorMultiLabelSoftMarginLoss, self).__init__() 27 | self.loss_mlsm = torch.nn.MultiLabelSoftMarginLoss(reduction=reduction) 28 | if not prior: prior = np.array([1/num_labels for _ in range(num_labels)]) # 如果不存在就设置为num 29 | if type(prior) ==list: prior = np.array(prior) 30 | self.log_prior = torch.tensor(np.log(prior + eps)).unsqueeze(0) 31 | self.eps = eps 32 | self.tau = tau 33 | 34 | def forward(self, logits, labels): 35 | # 使用与输入label相同的device 36 | logits = logits + self.tau * self.log_prior.to(labels.device) 37 | loss = self.loss_mlsm(logits, labels) 38 | return loss 39 | 40 | ``` 41 | 42 | 43 | 44 | 45 | 46 | # 2. CircleLoss of MultiLabel 47 | ```bash 48 | from torch import nn 49 | import numpy as np 50 | import torch 51 | class MultiLabelCircleLoss(nn.Module): 52 | def __init__(self, reduction="mean", inf=1e12): 53 | """CircleLoss of MultiLabel, 多个目标类的多标签分类场景,希望“每个目标类得分都不小于每个非目标类的得分” 54 | 多标签分类的交叉熵(softmax+crossentropy推广, N选K问题), LSE函数的梯度恰好是softmax函数 55 | 让同类相似度与非同类相似度之间拉开一定的margin。 56 | - 使同类相似度比最大的非同类相似度更大。 57 | - 使最小的同类相似度比最大的非同类相似度更大。 58 | - 所有同类相似度都比所有非同类相似度更大。 59 | urls: [将“softmax+交叉熵”推广到多标签分类问题](https://spaces.ac.cn/archives/7359) 60 | args: 61 | reduction: str, Specifies the reduction to apply to the output, 输出形式. 62 | eg.``'none'`` | ``'mean'`` | ``'sum'``. ``'none'`` 63 | inf: float, Minimum of maths, 无穷大. eg. 1e12 64 | returns: 65 | Tensor of loss. 
41 | 
42 | 
43 | 
44 | 
45 | 
46 | # 2. CircleLoss of MultiLabel
47 | ```bash
48 | from torch import nn
49 | import numpy as np
50 | import torch
51 | class MultiLabelCircleLoss(nn.Module):
52 |     def __init__(self, reduction="mean", inf=1e12):
53 |         """CircleLoss of MultiLabel, 多个目标类的多标签分类场景,希望“每个目标类得分都不小于每个非目标类的得分”
54 |         多标签分类的交叉熵(softmax+crossentropy推广, N选K问题), LSE函数的梯度恰好是softmax函数
55 |         让同类相似度与非同类相似度之间拉开一定的margin。
56 |         - 使同类相似度比最大的非同类相似度更大。
57 |         - 使最小的同类相似度比最大的非同类相似度更大。
58 |         - 所有同类相似度都比所有非同类相似度更大。
59 |         urls: [将“softmax+交叉熵”推广到多标签分类问题](https://spaces.ac.cn/archives/7359)
60 |         args:
61 |             reduction: str, Specifies the reduction to apply to the output, 输出形式.
62 |                             eg.``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``
63 |             inf: float, Maximum of maths, 无穷大.  eg. 1e12
64 |         returns:
65 |             Tensor of loss.
66 |         examples:
67 |             >>> label, logits = [[1, 1, 1, 1], [0, 0, 0, 1]], [[0, 1, 1, 0], [1, 0, 0, 1],]
68 |             >>> label, logits = torch.tensor(label).float(), torch.tensor(logits).float()
69 |             >>> loss = MultiLabelCircleLoss()(logits, label)
70 |         """
71 |         super(MultiLabelCircleLoss, self).__init__()
72 |         self.reduction = reduction
73 |         self.inf = inf  # 无穷大
74 | 
75 |     def forward(self, logits, labels):
76 |         logits = (1 - 2 * labels) * logits              # <batch_size, num_labels>
77 |         logits_neg = logits - labels * self.inf         # <batch_size, num_labels>
78 |         logits_pos = logits - (1 - labels) * self.inf   # <batch_size, num_labels>
79 |         zeros = torch.zeros_like(logits[..., :1])       # <batch_size, 1>
80 |         logits_neg = torch.cat([logits_neg, zeros], dim=-1)  # <batch_size, num_labels+1>
81 |         logits_pos = torch.cat([logits_pos, zeros], dim=-1)  # <batch_size, num_labels+1>
82 |         neg_loss = torch.logsumexp(logits_neg, dim=-1)  # <batch_size,>
83 |         pos_loss = torch.logsumexp(logits_pos, dim=-1)  # <batch_size,>
84 |         loss = neg_loss + pos_loss
85 |         if "mean" == self.reduction:
86 |             loss = loss.mean()
87 |         else:
88 |             loss = loss.sum()
89 |         return loss
90 | 
91 | ```
92 | 希望对你有所帮助!
93 | 
94 | 
-------------------------------------------------------------------------------- /pytorch_nlu/corpus/text_classification/school/README.md: --------------------------------------------------------------------------------
1 | # 数据集
2 | ## 数据来源
3 | - url: https://github.com/FBI1314/textClassification/tree/master/multilabel_text_classfication/data
4 | - 详情: unknown, 来源未知; 多标签分类语料, 约22339条语料, 7个类别.
5 | 
6 | ## 备注
7 | 这里训练、验证、测试集各只取了132个样例
8 | 
9 | ## 数据分析
10 | ```bash
11 | import json
12 | file = open("train.json", encoding="utf-8")
13 | texts = file.readlines()
14 | 
15 | 
16 | label_text = {}
17 | for text in texts:
18 |     text_json = json.loads(text.strip())
19 |     label = text_json["label"]
20 |     text = text_json["text"]
21 |     if label not in label_text:
22 |         label_text[label] = [text]
23 |     else:
24 |         if len(label_text[label]) < 10:
25 |             label_text[label] += [text]
26 | 
27 | print(label_text)
28 | 
29 | 
30 | 
31 | id_to_label = {
32 |     "0": "校园活动",
33 |     "1": "文明礼仪",
34 |     "2": "文化课程",
35 |     "3": "课后活动",
36 |     "4": "性格情绪",
37 |     "5": "德育量化",
38 |     "6": "作息生活"}
39 | 
40 | 
41 | 
42 | train_data = {
43 |     '0': ['体育课被批评', '挑战性问题', '设备现场无问题', '加错分抵消', '校级活动', '啪啪啪', '请课"中"', '三自班级', '专门评价陈屌丝的', '特别奖'],
44 |     '1': ['个人卫生(指甲太长)', '专注之星季军奖励6分', '不戴红领巾校章', '个人卫生保持差', '小老师管理', '精神面貌好', '磨蹭', '优秀主持人', '语文作业完成质量高', '会话表演之星'],
45 |     '2': ['单科考试95分以上', '表达清晰有想法', '语文80分以上', '小组回答问题', '小小写手', '生物作业优秀', '期中测试80-89', '按时完成作业改错', '英语听写全对', '成绩退步明显'],
46 |     '3': ['下课乱跳大叫', '上课比较认真', '早到且迅速开始学习', '下位走动', '书皮没包没姓名签', '静校讲话', '语文家庭作业没完成', '书写干净整齐', '应付作业', '安静就餐不浪费'],
47 |     '4': ['愤怒', '带情绪练琴', '自己的事情自己做', '神经三八很废', '情绪过度', '控制自己的情绪', '跟她妈生气', '管理好情绪', '情绪消极', '不吵不闹'],
48 |     '5': ['秩序感优秀', '【好事】好人好事', '爱护班级卫生', '宿舍扣分', '上课乖', '乐于帮助他人', '文明有礼', '秩序之星', '教师助手', '尊师重道'],
49 |     '6': ['做位险的游戏', '态度消极', '举手回答问题2次', '各位活动积极参与奖', '动作迅速有效', '懒洋洋慢吞吞', '晚修吵闹', '起床迅速', '老师小助手月奖励', '举手回答'],
50 | 
51 |     '3#2': ['作业不改', '作业书写较乱', '按要求完成语文作业', '优化认真辅导、批改。', '订正多次', '写字进步', '成绩进步或优秀', '语文作业完美', '堂听满分', '中文经典5分'],
52 |     '1#3': ['自觉预习', '书写凌乱', '课前准备小标兵', '完成13号的语文作业', '卷面脏', '未做放学卫生值日', '今日工作突出', '没带美术用具', '扫地逃跑', '认真写1号作业'],
53 |     '1#5': ['值日班长、卫生督导员', '榜样作用', '坐姿补端', '课堂合作愉快!', '爱劳动讲卫生', '清洁流动红旗', '扫地偷懒', '主动为班级加分', '献爱心积极分子', '上课期间穿拖鞋'],
54 |     '3#5': ['连带纪律适当', '不文明说脏话', '给力小助手', '说脏话、起外号、打人', '履行职责', '遵守纪律三操认真', '发脾气破坏公物', '顶撞老师', '温柔又专心', '到校准时'],
55 |     '5#2': ['带头读书', '善于分享交流', '班干部或组长嘉奖', '引领带动帮助同学', '文明发言会倾听', '数学小老师', '班级职责不到位', '好样的小老师', '分享好点子', '劳动积极工作负责'],
56 |     '1#6': ['回答3次及以上', '回答3次', '音乐课积极发言', '运动会', '没值日', '主动练琴', '课前一支歌表现差', '发言小达人', 
'辛苦劳动', '语文课答问'], 57 | '3#6': ['政治缺交', '快乐晨读', '作业拖欠', '今日事今日毕作业完成', '主动帮助他人', '3点半作业拖拉没写完', '自习离位', '做与课堂无关事', '背诵不按时', '安全平台按时完成'], 58 | '3#1': ['积极背单词', '乱扔果皮纸屑', '主动做作业', '没有预习课文', '超额完成作业', '桌子没摆正', '配合打扫好班级卫生', '做清洁认真', '数学课表扬', '做卫生认真'], 59 | '4#3': ['大课间纪律差', '数学课堂注意力集中', '自习课纪律差大吵大闹', '字写得好'], 60 | '1#2': ['预习小能手', '大型集会说话', '不卫生学习差有神经病', '答问声音响亮', '我会领读', '参加辩论赛', '积极发言,善于表达', '举手是我的日常操作', '课堂积极动脑举手答题', '板演'], 61 | '5#6': ['上课说话屡教不改', '就寝时讲话吵闹', '两操检查', '上课玩东西,不认真', '站队做到快、静、齐', '做作业讲话', '队列中讲话', '上课扭头', '打架斗殴', '餐厅安静回位'], 62 | '2#6': ['能运用学过的词语句子', '作业速度太慢', '唱歌', '积极思考有自己的想法', '回答问题积极声音洪亮', '作业速度快', '回答问题声音洪亮', '数学课回答声音洪亮', '物理作业及订正认真', 63 | '回答问题声音洪亮干脆'], '4#5': ['乱发脾气', '情绪失控-说不文明话'], '4#1': ['吃饭安静有序'], '4#6': ['玩得很开心', '过度兴奋', '发生肢体冲突'], 64 | '4#2': ['关心班级']} 65 | 66 | 67 | 68 | 69 | val_data = { 70 | '0': ['我的眼睛', '大神经', '老师特别表扬', '上周表现良好'], 71 | '1': ['小小图书管理员', '班级打扫卫生', '奖励', '教室卫生达人', '不及时打扫卫生', '升旗小能手', '餐点不挑食不浪费', '二级巨星', '小组表现不好', '歌曲变歌词小能手'], 72 | '2': ['课前提问回答不完整', '提出质疑', '作业完成情况优秀', '我会算', '看拼音写词语纸展览', '作业完成不好', '80分以下', '读得好', '非常优秀', '考试低分'], 73 | '3': ['运动会打架', '认真完成聪明题', '静校未及时,说话', '准时返校', '上课打闹,屡教不改', '课堂表现类', '午餐后自我管理差', '部分作业没完成', '晚拖讲话', '国庆作业'], 74 | '4': ['发火,生气', '管理情绪', '开朗乐观'], 75 | '5': ['善良,有孝心', '说脏话起外号', '携带零食', '队列安静有序', '课堂集体评比第3名', '温柔听话', '雷同作业', '集合速度快', '喜欢偷偷说别人', '关心、帮助他人'], 76 | '6': ['动作拖拉慢吞吞', '跑步第一', '心不在焉', '9点半之前没上床睡觉', '运动会第一名', '做事拖拉慢吞吞', '太累了', '体育运动棒', '课前准备没有拿书出来', '课堂上举手积极发言'], 77 | '3#5': ['课堂搞东西,没认真听', '不团结', '化学老师的得力助手', '未完成老师布置的作业', '追跑打闹,打骂同学'], 78 | '3#2': ['测验90-94.A-', '长江作业未交', '美术作业棒棒哒', '学霸笔记', '主动学习看书', '作文不合格', '单元考试进步很大', '订正过关', '作文修改后得优加', '期中考前十名'], 79 | '1#3': ['内务脏乱、静校慢'], 80 | '3#6': ['老师不在,讲话', '动作迅速不讲话', '干扰同桌', '大课间good', '按时到校值日', '做事不安静', '到班不及时', '不写笔记', '考试的时候讲话', '按时上交材料'], 81 | '4#5': ['善良可爱美丽大方'], 82 | '5#6': ['乱起哄', '不好好做眼保健操'], '1#5': ['有错不改脾气倔', '流动红旗'], 83 | '1#2': ['创新表演'], 84 | '2#6': ['聪明活泼']} 85 | 86 | ``` -------------------------------------------------------------------------------- /pytorch_nlu/corpus/sequence_labeling/ner_china_people_daily_1998_span/train.span: -------------------------------------------------------------------------------- 1 | {"label": [{"type": "LOC", "ent": "厦门", "pos": [7, 8]}, {"type": "LOC", "ent": "金门", "pos": [10, 11]}], "text": "海钓比赛地点在厦门与金门之间的海域。"} 2 | {"label": [], "text": "这座依山傍水的博物馆由国内一流的设计师主持设计,整个建筑群精美而恢宏。"} 3 | {"label": [], "text": "但作为一个共产党员、人民公仆,应当胸怀宽阔,真正做到“先天下之忧而忧,后天下之乐而乐”,淡化个人的名利得失和宠辱悲喜,把改革大业摆在首位,这样才能超越自我,摆脱世俗,有所作为。"} 4 | {"label": [], "text": "在发达国家,急救保险十分普及,已成为社会保障体系的重要组成部分。"} 5 | {"label": [{"type": "LOC", "ent": "日", "pos": [0, 0]}, {"type": "LOC", "ent": "俄", "pos": [1, 1]}, {"type": "LOC", "ent": "日", "pos": [16, 16]}, {"type": "LOC", "ent": "俄", "pos": [17, 17]}], "text": "日俄两国国内政局都充满变数,尽管日俄关系目前是历史最佳时期,但其脆弱性不言自明。"} 6 | {"label": [{"type": "PER", "ent": "克马尔", "pos": [0, 2]}, {"type": "PER", "ent": "让娜", "pos": [6, 7]}, {"type": "ORG", "ent": "家委会", "pos": [33, 35]}], "text": "克马尔的女儿让娜今年读五年级,她所在的班上有30多名同学,该班的“家委会”由10名家长组成。"} 7 | {"label": [], "text": "参加步行的有男有女,有年轻人,也有中年人。"} 8 | {"label": [{"type": "ORG", "ent": "沙特队", "pos": [0, 2]}, {"type": "PER", "ent": "佩雷拉", "pos": [5, 7]}], "text": "沙特队教练佩雷拉:两支队都想胜,因此都作出了最大的努力。"} 9 | {"label": [], "text": "这种混乱局面导致有些海域使用者的合法权益难以得到维护。"} 10 | {"label": [{"type": "PER", "ent": "鲁宾", "pos": [0, 1]}, {"type": "LOC", "ent": "美国", "pos": [24, 25]}, {"type": "LOC", "ent": "中国", "pos": [34, 35]}, {"type": "ORG", "ent": "众议院", "pos": [58, 60]}, {"type": "LOC", "ent": "美国", "pos": [77, 78]}, {"type": "LOC", 
"ent": "美国", "pos": [92, 93]}], "text": "鲁宾明确指出,对政府的这种指控完全没有事实根据,美国政府不想也没有向中国转让敏感技术,事实真相总有一天会大白于天下;众议院的这种做法令人“非常失望”,将使美国的商业卫星产业受到威胁,使美国的竞争力受到损害。"} 11 | {"label": [], "text": "体育场每天早6时至8时向群众免费开放,体育馆、游泳馆等则增加综合服务,延长开放时间,采取灵活收费。"} 12 | {"label": [], "text": "再看内容,图文并茂,简短的文字,准确地反映了五十六个民族的风土人情和文化传统,把各民族的主要特点讲得很到位。"} 13 | {"label": [{"type": "ORG", "ent": "美国“哥伦比亚”号航天飞机", "pos": [0, 12]}], "text": "美国“哥伦比亚”号航天飞机上的宇航员今天一边进行实验,一边继续抢修出了故障的二氧化碳清除装置。"} 14 | {"label": [], "text": "从前剥削阶级在劳动生产率很低的情况下占有有限的剩余产品,主要是为了满足其家族豪华生活的需要,难有多大力量扩大再生产;现在资产阶级在社会化大生产条件下榨取大量剩余价值,除供自身需求外,主要又转化为资本,用以剥削新的雇佣劳动,从而使私人资本越来越扩大,生产资料也越来越集中。"} 15 | {"label": [{"type": "ORG", "ent": "全国人民代表大会澳门特别行政区筹备委员会", "pos": [0, 19]}, {"type": "LOC", "ent": "北京", "pos": [32, 33]}, {"type": "LOC", "ent": "人民大会堂", "pos": [34, 38]}, {"type": "ORG", "ent": "国务院", "pos": [42, 44]}, {"type": "ORG", "ent": "筹委会", "pos": [49, 51]}, {"type": "PER", "ent": "钱其琛", "pos": [56, 58]}, {"type": "LOC", "ent": "澳门特别行政区", "pos": [70, 76]}, {"type": "ORG", "ent": "筹委会", "pos": [85, 87]}, {"type": "LOC", "ent": "澳门", "pos": [119, 120]}], "text": "全国人民代表大会澳门特别行政区筹备委员会第一次全体会议今天上午在北京人民大会堂开幕,国务院副总理、筹委会主任委员钱其琛在致开幕词中指出,筹建澳门特别行政区的工作已经启动,筹委会面临的工作是大量的、紧迫的,筹委们任重道远,希望大家齐心协力为澳门的平稳过渡、政权顺利交接作出贡献。"} 16 | {"label": [], "text": "而经济社会的活动过程已经使整个经济学界深信:宏观经济的变化必须以微观经济为基础。"} 17 | {"label": [], "text": "本病好发于面、颈、背、躯干及外生殖器,可见于各种年龄。"} 18 | {"label": [{"type": "LOC", "ent": "北京市", "pos": [0, 2]}, {"type": "LOC", "ent": "怀柔县", "pos": [3, 5]}, {"type": "LOC", "ent": "四川省", "pos": [25, 27]}, {"type": "ORG", "ent": "江油市华丰中学", "pos": [28, 34]}], "text": "北京市怀柔县参试学生普遍感觉第四节课饥饿感消失了,四川省江油市华丰中学选用豆奶和复合营养素后,试验组男生的贫血率下降13个百分点,而对照组只降低0.44个百分点。"} 19 | {"label": [], "text": "党的宗旨和当干部的目的决定了党员干部要乐于奉献,甘愿吃亏,吃苦在前,享受在后,把困难和危险留给自己,把方便和安全让给群众。"} 20 | {"label": [{"type": "ORG", "ent": "党中央", "pos": [5, 7]}, {"type": "ORG", "ent": "国务院", "pos": [9, 11]}], "text": "今年年初,党中央、国务院根据国内外经济形势的变化,及时作出扩大内需、保持经济持续快速增长的重大决策。"} 21 | {"label": [], "text": "发现而定位,确立而研究,一个文学流派就能提供许多富有意味的理论话题。"} 22 | {"label": [{"type": "PER", "ent": "粟光前", "pos": [4, 6]}], "text": "党委书记粟光前当机立断:“我们都是他的家人,都是他的亲兄弟,我签!”"} 23 | {"label": [{"type": "ORG", "ent": "世行", "pos": [2, 3]}], "text": "熟悉世行运作的人士说,贷款一旦被推迟,将在几个月、甚至可能更长的时间后才能被重新考虑。"} 24 | {"label": [], "text": "在我们的队伍中,确实有些只讲哥们儿义气不讲党的原则,只图实惠不讲干部形象的事。"} 25 | {"label": [], "text": "我估计她会说像疤痕,像笨拙女人手下的针线活儿。"} 26 | {"label": [{"type": "PER", "ent": "周恩来", "pos": [0, 2]}], "text": "周恩来总理说,那就送一株万古常青的友谊红杉吧!"} 27 | {"label": [{"type": "LOC", "ent": "巴", "pos": [2, 2]}, {"type": "LOC", "ent": "印", "pos": [11, 11]}], "text": "在与巴重开会谈问题上,印声明再次宣称,“没有任何第三方介入的余地”。"} 28 | {"label": [{"type": "ORG", "ent": "江西省委老干部局", "pos": [12, 19]}], "text": "不久前,记者就这些问题赴江西省委老干部局进行了采访。"} 29 | {"label": [], "text": "一是发动广大职工积极参与企业民主管理,接受职工监督。"} 30 | {"label": [], "text": "她说:“如今,我们真是过上了安居乐业的好日子。”"} 31 | {"label": [], "text": "倘若对客观环境的变化反应迟钝,就很难迅速地作出正确判断和决策,也就难免陷于被动甚至四处碰壁。"} 32 | {"label": [], "text": "然而作为新的经济增长点,必将成为新的消费热点。"} 33 | {"label": [], "text": "被解雇后,失业者的首要任务是四处寻找工作,而联邦和州政府也给予适当救济和支持。"} 34 | -------------------------------------------------------------------------------- /pytorch_nlu/pytorch_textclassification/tcGraph.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2020/11/17 21:35 4 | # @author : Mo 5 | # @function: graph of pre-train model 6 | 7 | 8 | from tcConfig import PRETRAINED_MODEL_CLASSES 9 | from tcLayer import FCLayer 10 | # torch 11 | from transformers import BertPreTrainedModel 12 | import torch 13 | 14 
| 15 | class TCGraph(BertPreTrainedModel): 16 | def __init__(self, graph_config, tokenizer): 17 | """ 18 | Pytorch Graph of TextClassification, Pre-Trained Model based 19 | config: 20 | config: json, params of graph, eg. {"num_labels":17, "model_type":"BERT"} 21 | Returns: 22 | output: Tuple, Tensor of logits and loss 23 | """ 24 | # 预训练语言模型读取 25 | self.graph_config = graph_config 26 | pretrained_config, pretrained_tokenizer, pretrained_model = PRETRAINED_MODEL_CLASSES[graph_config.model_type] 27 | self.pretrained_config = pretrained_config.from_pretrained(graph_config.pretrained_model_name_or_path, output_hidden_states=graph_config.output_hidden_states) 28 | self.pretrained_config.update({"gradient_checkpointing": True}) 29 | super(TCGraph, self).__init__(self.pretrained_config) 30 | if self.graph_config.is_train: 31 | self.pretrain_model = pretrained_model.from_pretrained(graph_config.pretrained_model_name_or_path, config=self.pretrained_config) 32 | self.pretrain_model.resize_token_embeddings(len(tokenizer)) 33 | else: 34 | self.pretrain_model = pretrained_model(self.pretrained_config) # 推理时候只需要加载超参数, 不需要预训练模型的权重 35 | self.pretrain_model.resize_token_embeddings(len(tokenizer)) 36 | # # tokenizer.model_max_length = self.model.config.max_position_embeddings 37 | # 如果用隐藏层输出 38 | if self.graph_config.output_hidden_states: 39 | # self.dense = FCLayer(int(self.pretrained_config.hidden_size*len(self.graph_config.output_hidden_states)*3), self.graph_config.num_labels, 40 | # is_dropout=self.graph_config.is_dropout, is_active=self.graph_config.is_active, active_type=self.graph_config.active_type) 41 | self.dense = FCLayer( 42 | int(self.pretrained_config.hidden_size * len(self.graph_config.output_hidden_states)), 43 | self.graph_config.num_labels, 44 | is_dropout=self.graph_config.is_dropout, is_active=self.graph_config.is_active, 45 | active_type=self.graph_config.active_type) 46 | else: 47 | self.dense = FCLayer(self.pretrained_config.hidden_size, self.graph_config.num_labels, is_dropout=self.graph_config.is_dropout, 48 | is_active=self.graph_config.is_active, active_type=self.graph_config.active_type) 49 | 50 | # 池化层 51 | self.global_maxpooling = torch.nn.AdaptiveMaxPool1d(1) 52 | self.global_avgpooling = torch.nn.AdaptiveAvgPool1d(1) 53 | # 激活层/随即失活层 54 | self.softmax = torch.nn.Softmax() 55 | self.sigmoid = torch.nn.Sigmoid() 56 | self.dropout = torch.nn.Dropout 57 | 58 | def forward(self, input_ids, attention_mask, token_type_ids, labels=None): 59 | output = self.pretrain_model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids) 60 | # output, 输出可以自己改 61 | if self.graph_config.output_hidden_states: 62 | x = output[2] 63 | hidden_states_idx = [i for i in range(len(x))] 64 | 65 | ### pool, [max-pool, avg-pool, cls] 66 | # x_cat = torch.cat([x[i] for i in self.graph_config.output_hidden_states if i in hidden_states_idx], dim=-1) 67 | # x_max = self.global_maxpooling(x_cat.permute(0, 2, 1)).squeeze(dim=-1) 68 | # x_avg = self.global_avgpooling(x_cat.permute(0, 2, 1)).squeeze(dim=-1) 69 | # x_cls = x_cat[:, 0, :] 70 | # x_merge = torch.cat([x_max, x_avg, x_cls], dim=-1) 71 | 72 | ### cls-concat 73 | cls = torch.cat([x[i][:, 0, :] for i in self.graph_config.output_hidden_states if i in hidden_states_idx], dim=-1) 74 | 75 | # cls = self.dropout(p=self.graph_config.dropout_rate)(cls) 76 | else: 77 | cls = output[0][:, 0, :] # cls 78 | logits = self.dense(cls) # fc 79 | # inference 80 | if self.graph_config.is_fc_sigmoid: 81 | return self.sigmoid(logits) 82 | elif 
self.graph_config.is_fc_softmax: 83 | return self.softmax(logits) 84 | return logits 85 | 86 | -------------------------------------------------------------------------------- /pytorch_nlu/pytorch_textclassification/tcPredict.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2021/7/25 9:30 4 | # @author : Mo 5 | # @function: predict model, 预测模块 6 | 7 | 8 | # 适配linux 9 | import logging as logger 10 | import sys 11 | import os 12 | import traceback 13 | 14 | path_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".")) 15 | sys.path.append(path_root) 16 | # os.environ["CUDA_VISIBLE_DEVICES"] = model_config.get("CUDA_VISIBLE_DEVICES", "0") 17 | from tcConfig import model_config 18 | from tcTools import load_json 19 | from tcOffice import Office 20 | from tcData import Corpus 21 | 22 | from argparse import Namespace 23 | 24 | 25 | class TextClassificationPredict: 26 | def __init__(self, path_config, pretrained_model_name_or_path=None, logger=logger): 27 | """ 初始化 """ 28 | self.pretrained_model_name_or_path = pretrained_model_name_or_path 29 | self.logger = logger 30 | self.load_config(path_config) 31 | self.load_model() 32 | 33 | def load_config(self, path_config): 34 | """ 加载超参数 """ 35 | config = load_json(path_config) 36 | self.config = Namespace(**config) 37 | if self.pretrained_model_name_or_path: 38 | self.config.pretrained_model_name_or_path = self.pretrained_model_name_or_path 39 | self.config.CUDA_VISIBLE_DEVICES = model_config.get("CUDA_VISIBLE_DEVICES", "") 40 | self.real_model_save_path = os.path.split(path_config)[0] 41 | self.config.model_save_path = self.real_model_save_path 42 | self.l2i, self.i2l = self.config.l2i, self.config.i2l 43 | # 数据预处理 类 44 | self.corpus = Corpus(config=self.config, logger=self.logger) 45 | 46 | def load_model(self): 47 | """ 加载模型 """ 48 | self.office = Office(config=self.config, tokenizer=self.corpus.tokenizer, logger=self.logger) 49 | try: 50 | self.office.load_model_state() 51 | except Exception as e: 52 | self.logger.info(traceback.print_exc()) 53 | self.logger.info("self.office.load_model_state() is wrong, start self.office.load_model()") 54 | self.office.load_model() 55 | 56 | def process(self, texts): 57 | """ 数据预处理, process """ 58 | # token 转 idx, 训练集/验证集 59 | datas_xy, _ = self.corpus.read_texts_from_json(texts, keys=self.config.xy_keys) 60 | dataset = self.corpus.preprocess(datas_xy, self.config.l2i, max_len=self.config.max_len) 61 | return dataset 62 | 63 | def predict(self, texts, logits_type="sigmoid", rounded=4, use_logits=False): 64 | """ 分类模型预测 65 | config: 66 | texts : List, inputs of text, eg. {"num_labels":17, "model_type":"BERT"} 67 | logits_type: string, output-logits type, eg. "logits", "sigmoid", "softmax" 68 | rounded : int, rounded of float, eg. 3, 4, 6 69 | use_logits: Bool, only reture logits, eg. True 70 | Returns: 71 | res : List, output of label-score, eg. 
72 | """ 73 | dataset = self.process(texts) 74 | res = self.office.predict(dataset, rounded=rounded, logits_type=logits_type, use_logits=use_logits) 75 | return res 76 | 77 | def predict_loop(self): 78 | while 1: 79 | print("请输入:") 80 | text = input() 81 | res = self.predict(text) 82 | print(res) 83 | 84 | 85 | if __name__ == "__main__": 86 | # BERT-base = 8109M 87 | path_config = "../output/text_classification/model_ERNIE/tc.config" 88 | 89 | tcp = TextClassificationPredict(path_config) 90 | texts = [{"text": "平乐县,古称昭州,隶属于广西壮族自治区桂林市,位于广西东北部,桂林市东南部,东临钟山县,南接昭平,西北毗邻阳朔,北连恭城,总面积1919.34平方公里。"}, 91 | {"text": "平乐县主要旅游景点有榕津千年古榕、冷水石景苑、仙家温泉、桂江风景区、漓江风景区等,平乐县为漓江分界点,平乐以北称漓江,以南称桂江,是著名的大桂林旅游区之一。"}, 92 | {"text": "印岭玲珑,昭水晶莹,环绕我平中。青年的乐园,多士受陶熔。生活自觉自治,学习自发自动。五育并重,手脑并用。迎接新潮流,建设新平中"}, 93 | {"text": "桂林山水甲天下, 阳朔山水甲桂林"}, 94 | ] 95 | res = tcp.predict(texts, logits_type="sigmoid") 96 | print(res) 97 | # tcp.office.config.model_save_path = tcp.office.config.model_save_path + "_state" 98 | # tcp.office.save_model_state() 99 | 100 | while True: 101 | print("请输入:") 102 | question = input() 103 | res = tcp.predict([{"text": question}], logits_type="sigmoid") 104 | # print(res) 105 | print([sorted(r.items(), key=lambda x:x[1], reverse=True) for r in res]) 106 | 107 | -------------------------------------------------------------------------------- /pytorch_nlu/pytorch_textregression/trPredict.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2021/7/25 9:30 4 | # @author : Mo 5 | # @function: predict model, 预测模块 6 | 7 | 8 | # 适配linux 9 | import logging as logger 10 | import sys 11 | import os 12 | path_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".")) 13 | sys.path.append(path_root) 14 | # os.environ["CUDA_VISIBLE_DEVICES"] = model_config.get("CUDA_VISIBLE_DEVICES", "0") 15 | from trConfig import model_config 16 | from trTools import load_json 17 | from trOffice import Office 18 | from trData import Corpus 19 | 20 | from argparse import Namespace 21 | 22 | 23 | class TextRegressionPredict: 24 | def __init__(self, path_config, pretrained_model_name_or_path=None, logger=logger): 25 | """ 初始化 """ 26 | self.pretrained_model_name_or_path = pretrained_model_name_or_path 27 | self.logger = logger 28 | self.load_config(path_config) 29 | self.load_model() 30 | 31 | def load_config(self, path_config): 32 | """ 加载超参数 """ 33 | config = load_json(path_config) 34 | self.config = Namespace(**config) 35 | if self.pretrained_model_name_or_path: 36 | self.config.pretrained_model_name_or_path = self.pretrained_model_name_or_path 37 | # if os.environ.get("CUDA_VISIBLE_DEVICES", "-1") == "-1": 38 | # self.config.CUDA_VISIBLE_DEVICES = "-1" 39 | self.config.CUDA_VISIBLE_DEVICES = model_config.get("CUDA_VISIBLE_DEVICES", "") # os.environ.get("CUDA_VISIBLE_DEVICES", "-1") 40 | self.real_model_save_path = os.path.split(path_config)[0] 41 | self.config.model_save_path = self.real_model_save_path 42 | # 数据预处理 类 43 | self.corpus = Corpus(config=self.config, logger=self.logger) 44 | 45 | def load_model(self): 46 | """ 加载模型 """ 47 | self.office = Office(config=self.config, tokenizer=self.corpus.tokenizer, logger=self.logger) 48 | try: 49 | self.office.load_model_state() 50 | except Exception as e: 51 | self.logger.info("self.office.load_model_state() is wrong, start self.office.load_model()") 52 | self.office.load_model() 53 | 54 | def process(self, texts): 55 | """ 数据预处理, process """ 56 | # token 转 idx, 训练集/验证集 57 | datas_xy, _ = 
self.corpus.read_texts_from_json(texts, keys=self.config.xy_keys) 58 | dataset = self.corpus.preprocess(datas_xy, max_len=self.config.max_len) 59 | return dataset 60 | 61 | def predict(self, texts, rounded=4, use_logits=False): 62 | """ 分类模型预测 63 | config: 64 | texts : List, inputs of text, eg. {"num_labels":17, "model_type":"BERT"} 65 | rounded : int, rounded of float, eg. 3, 4, 6 66 | use_logits: Bool, only reture logits, eg. True 67 | Returns: 68 | res : List, output of label-score, eg. 69 | """ 70 | dataset = self.process(texts) 71 | res = self.office.predict(dataset, rounded=rounded, use_logits=use_logits) 72 | return res 73 | 74 | def predict_loop(self): 75 | while 1: 76 | print("请输入:") 77 | text = input() 78 | res = self.predict(text) 79 | print(res) 80 | 81 | 82 | if __name__ == "__main__": 83 | # BERT-base = 8109M 84 | # path_config = "../output/text_classification/model_ERNIE/tc.config" 85 | path_config = "../output/text_regression/model_ERNIE/tc.config" 86 | # path_config = "../output/text_regression/model_BERT/tc.config" 87 | 88 | tcp = TextRegressionPredict(path_config) 89 | texts = [{"text": "平乐县,古称昭州,隶属于广西壮族自治区桂林市,位于广西东北部,桂林市东南部,东临钟山县,南接昭平,西北毗邻阳朔,北连恭城,总面积1919.34平方公里。"}, 90 | {"text": "平乐县主要旅游景点有榕津千年古榕、冷水石景苑、仙家温泉、桂江风景区、漓江风景区等,平乐县为漓江分界点,平乐以北称漓江,以南称桂江,是著名的大桂林旅游区之一。"}, 91 | {"text": "印岭玲珑,昭水晶莹,环绕我平中。青年的乐园,多士受陶熔。生活自觉自治,学习自发自动。五育并重,手脑并用。迎接新潮流,建设新平中"}, 92 | {"text": "桂林山水甲天下, 阳朔山水甲桂林"}, 93 | {"text": "她不喜欢你"}, 94 | {"text": "不怎么样"}, 95 | {"text": "流水落花, 不不不, 是伤春"}, 96 | ] 97 | res = tcp.predict(texts) 98 | print(res) 99 | # tcp.office.config.model_save_path = tcp.office.config.model_save_path + "_state" 100 | # tcp.office.save_model_state() 101 | 102 | while True: 103 | print("请输入:") 104 | question = input() 105 | res = tcp.predict([{"text": question}]) 106 | print(res) 107 | # print([sorted(r.items(), key=lambda x:x[1], reverse=True) for r in res]) 108 | 109 | -------------------------------------------------------------------------------- /test/tc/tet_tc_base_multi_label_choice.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2021/2/23 21:34 4 | # @author : Mo 5 | # @function: choice, model_config可配置参数 6 | 7 | 8 | # 适配linux 9 | import platform 10 | import json 11 | import sys 12 | import os 13 | path_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) 14 | path_sys = os.path.join(path_root, "pytorch_nlu", "pytorch_textclassification") 15 | sys.path.append(path_sys) 16 | print(path_root) 17 | print(path_sys) 18 | # 分类下的引入, pytorch_textclassification 19 | from tcTools import get_current_time 20 | from tcRun import TextClassification 21 | from tcConfig import model_config 22 | 23 | 24 | # 预训练模型地址, 本地win10默认只跑2步就评估保存模型 25 | if platform.system().lower() == 'windows': 26 | # pretrained_model_dir = "D:/pretrain_models/pytorch" 27 | pretrained_model_dir = "E:/DATA/bert-model/00_pytorch" 28 | evaluate_steps = 32 # 评估步数 29 | save_steps = 32 # 存储步数 30 | else: 31 | pretrained_model_dir = "/pretrain_models/pytorch" 32 | evaluate_steps = 320 # 评估步数 33 | save_steps = 320 # 存储步数 34 | ee = 0 35 | 36 | 37 | if __name__ == "__main__": 38 | # 训练-验证语料地址, 可以只输入训练地址 39 | path_corpus = os.path.join(path_root, "pytorch_nlu", "corpus", "text_classification", "school") 40 | path_train = os.path.join(path_corpus, "train.json") 41 | path_dev = os.path.join(path_corpus, "dev.json") 42 | model_config["evaluate_steps"] = evaluate_steps # 评估步数 43 | model_config["save_steps"] = save_steps # 
存储步数 44 | model_config["path_train"] = path_train # 训练模语料, 必须 45 | model_config["path_dev"] = path_dev # 验证语料, 可为None 46 | model_config["path_tet"] = None # 测试语料, 可为None 47 | 48 | ## 参数配置 choice 49 | model_config["lr"] = 3e-5 # 5e-5, 1e-5 # 学习率 50 | model_config["max_len"] = 128 # 最大文本长度, padding 51 | model_config["batch_size"] = 64 # 批尺寸 52 | model_config["warmup_steps"] = 1000 # 预热步数 53 | model_config["is_active"] = False # fc是否加激活函数 54 | model_config["is_dropout"] = True # 是否随机丢弃 55 | model_config["is_adv"] = True # 是否对抗训练 56 | # model_config["len_rate"] = 0.01 # 参与训练数据的样本数比率(如win10下少量数据跑通) 57 | model_config["epochs"] = 16 # 训练轮次 # 21 # 32 58 | 59 | model_config["output_hidden_states"] = [0, 1, 2, 3] # 输出多层 # [0, 1, 2, 3] # [1, 2, 11, 12] # [8, 9, 10, 11, 12] # [0,1, 5,6, 11,12] # [1, 3, 5] 60 | model_config["loss_type"] = "PRIOR_MARGIN_LOSS" # 损失函数 61 | # model_config["loss_type"] = "FOCAL_LOSS" 62 | # model_config["loss_type"] = "PRIOR_MARGIN_LOSS" 63 | # model_config["loss_type"] = "MIX_focal_prior" 64 | # model_config["loss_type"] = "MIX_prior_bce" 65 | # model_config["loss_type"] = "MIX_focal_bce" 66 | # model_config["loss_type"] = "BCE_MULTI" 67 | # model_config["loss_type"] = "BCE_LOGITS" 68 | # model_config["loss_type"] = "CIRCLE_LOSS" 69 | # model_config["loss_type"] = "CB_LOSS" 70 | # model_config["loss_type"] = "DB_LOSS" 71 | model_config["label_sep"] = "|myz|" # 多标签分类类别标签分隔符 72 | 73 | # 预训练模型适配的class 74 | model_type = ["BERT", "ERNIE", "BERT_WWM", "ALBERT", "ROBERTA", "XLNET", "ELECTRA"] 75 | pretrained_model_name_or_path = { 76 | "BERT_WWM": "hfl/chinese-bert-wwm-ext", 77 | "ROBERTA": "hfl/chinese-roberta-wwm-ext", 78 | "ALBERT": "uer/albert-base-chinese-cluecorpussmall", 79 | "XLNET": "hfl/chinese-xlnet-mid", 80 | "ERNIE": "nghuyong/ernie-1.0-base-zh", 81 | # "ERNIE": "nghuyong/ernie-3.0-base-zh", 82 | "BERT": "bert-base-chinese", 83 | # "BERT": "hfl/chinese-macbert-base", 84 | 85 | } 86 | idx = 0 # 选择的预训练模型类型---model_type, 0为BERT, 87 | model_config["pretrained_model_name_or_path"] = pretrained_model_name_or_path[model_type[idx]] 88 | # model_config["model_save_path"] = "../output/text_classification/model_{}".format(model_type[idx] + "_" + str(get_current_time())) 89 | model_config["model_save_path"] = "../output/text_classification/model_{}".format(model_type[idx]) 90 | model_config["model_type"] = model_type[idx] 91 | model_config["ADDITIONAL_SPECIAL_TOKENS"] = ["+","-", "=", ":", ".", "(", ")", "≈", "%", 92 | "∥", "<", ">", "⊙", "≌", "。"] # 新增特殊字符 93 | # main 94 | lc = TextClassification(model_config) 95 | lc.process() 96 | lc.train() 97 | 98 | 99 | # shell 100 | # nohup python tet_tc_base_multi_label_choice.py > tc.tet_tc_base_multi_label_choice.py.log 2>&1 & 101 | # tail -n 1000 -f tc.tet_tc_base_multi_label_choice.py.log 102 | # |myz| 103 | 104 | -------------------------------------------------------------------------------- /test/tr/tet_tr_base_predict.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2021/7/25 9:30 4 | # @author : Mo 5 | # @function: predict model, 预测模块 6 | 7 | 8 | # 适配linux 9 | import logging as logger 10 | import sys 11 | import os 12 | path_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) 13 | path_sys = os.path.join(path_root, "pytorch_nlu", "pytorch_textregression") 14 | sys.path.append(path_sys) 15 | print(path_root) 16 | print(path_sys) 17 | # os.environ["CUDA_VISIBLE_DEVICES"] = 
model_config.get("CUDA_VISIBLE_DEVICES", "0") 18 | from trConfig import model_config 19 | from trTools import load_json 20 | from trOffice import Office 21 | from trData import Corpus 22 | 23 | from argparse import Namespace 24 | 25 | 26 | class TextRegressionPredict: 27 | def __init__(self, path_config, pretrained_model_name_or_path=None, logger=logger): 28 | """ 初始化 """ 29 | self.pretrained_model_name_or_path = pretrained_model_name_or_path 30 | self.logger = logger 31 | self.load_config(path_config) 32 | self.load_model() 33 | 34 | def load_config(self, path_config): 35 | """ 加载超参数 """ 36 | config = load_json(path_config) 37 | self.config = Namespace(**config) 38 | if self.pretrained_model_name_or_path: 39 | self.config.pretrained_model_name_or_path = self.pretrained_model_name_or_path 40 | # if os.environ.get("CUDA_VISIBLE_DEVICES", "-1") == "-1": 41 | # self.config.CUDA_VISIBLE_DEVICES = "-1" 42 | self.config.CUDA_VISIBLE_DEVICES = model_config.get("CUDA_VISIBLE_DEVICES", "") # os.environ.get("CUDA_VISIBLE_DEVICES", "-1") 43 | self.real_model_save_path = os.path.split(path_config)[0] 44 | self.config.model_save_path = self.real_model_save_path 45 | # 数据预处理 类 46 | self.corpus = Corpus(config=self.config, logger=self.logger) 47 | 48 | def load_model(self): 49 | """ 加载模型 """ 50 | self.office = Office(config=self.config, tokenizer=self.corpus.tokenizer, logger=self.logger) 51 | try: 52 | self.office.load_model_state() 53 | except Exception as e: 54 | self.logger.info("self.office.load_model_state() is wrong, start self.office.load_model()") 55 | self.office.load_model() 56 | 57 | def process(self, texts): 58 | """ 数据预处理, process """ 59 | # token 转 idx, 训练集/验证集 60 | datas_xy, _ = self.corpus.read_texts_from_json(texts, keys=self.config.xy_keys) 61 | dataset = self.corpus.preprocess(datas_xy, max_len=self.config.max_len) 62 | return dataset 63 | 64 | def predict(self, texts, rounded=4, use_logits=False): 65 | """ 分类模型预测 66 | config: 67 | texts : List, inputs of text, eg. {"num_labels":17, "model_type":"BERT"} 68 | rounded : int, rounded of float, eg. 3, 4, 6 69 | use_logits: Bool, only reture logits, eg. True 70 | Returns: 71 | res : List, output of label-score, eg. 
72 | """ 73 | dataset = self.process(texts) 74 | res = self.office.predict(dataset, rounded=rounded, use_logits=use_logits) 75 | return res 76 | 77 | def predict_loop(self): 78 | while 1: 79 | print("请输入:") 80 | text = input() 81 | res = self.predict(text) 82 | print(res) 83 | 84 | 85 | if __name__ == "__main__": 86 | # BERT-base = 8109M 87 | # path_config = "../output/text_classification/model_ERNIE/tc.config" 88 | path_config = "../output/text_regression/model_ERNIE/tc.config" 89 | # path_config = "../output/text_regression/model_BERT/tc.config" 90 | 91 | tcp = TextRegressionPredict(path_config) 92 | texts = [{"text": "平乐县,古称昭州,隶属于广西壮族自治区桂林市,位于广西东北部,桂林市东南部,东临钟山县,南接昭平,西北毗邻阳朔,北连恭城,总面积1919.34平方公里。"}, 93 | {"text": "平乐县主要旅游景点有榕津千年古榕、冷水石景苑、仙家温泉、桂江风景区、漓江风景区等,平乐县为漓江分界点,平乐以北称漓江,以南称桂江,是著名的大桂林旅游区之一。"}, 94 | {"text": "印岭玲珑,昭水晶莹,环绕我平中。青年的乐园,多士受陶熔。生活自觉自治,学习自发自动。五育并重,手脑并用。迎接新潮流,建设新平中"}, 95 | {"text": "桂林山水甲天下, 阳朔山水甲桂林"}, 96 | {"text": "她不喜欢你"}, 97 | {"text": "不怎么样"}, 98 | {"text": "流水落花, 不不不, 是伤春"}, 99 | ] 100 | res = tcp.predict(texts) 101 | print(res) 102 | # tcp.office.config.model_save_path = tcp.office.config.model_save_path + "_state" 103 | # tcp.office.save_model_state() 104 | 105 | while True: 106 | print("请输入:") 107 | question = input() 108 | res = tcp.predict([{"text": question}]) 109 | print(res) 110 | # print([sorted(r.items(), key=lambda x:x[1], reverse=True) for r in res]) 111 | 112 | -------------------------------------------------------------------------------- /pytorch_nlu/corpus/text_classification/school/test.json: -------------------------------------------------------------------------------- 1 | {"label": "2", "text": "不写假期作业"} 2 | {"label": "3", "text": "没戴校牌"} 3 | {"label": "3", "text": "列队"} 4 | {"label": "1", "text": "独立完成实践作业"} 5 | {"label": "3", "text": "上课坐端正不讲话"} 6 | {"label": "3", "text": "积极完成学习活动"} 7 | {"label": "1", "text": "作业合格"} 8 | {"label": "2", "text": "报听写全对"} 9 | {"label": "5", "text": "欺负低年级小同学"} 10 | {"label": "3", "text": "未阅读"} 11 | {"label": "3|myz|2", "text": "回答问题条理清晰"} 12 | {"label": "0", "text": "五星班级"} 13 | {"label": "3", "text": "上课不认真听讲,乱动"} 14 | {"label": "3", "text": "字体优美"} 15 | {"label": "3", "text": "与同学闹矛盾"} 16 | {"label": "3", "text": "抄写全对"} 17 | {"label": "3", "text": "追跑打架"} 18 | {"label": "5", "text": "关注别人"} 19 | {"label": "2", "text": "甲+优"} 20 | {"label": "0", "text": "情绪"} 21 | {"label": "3", "text": "优学派作业认真"} 22 | {"label": "3", "text": "不带作业回校"} 23 | {"label": "3", "text": "课前准备很好"} 24 | {"label": "5", "text": "撕纸"} 25 | {"label": "3|myz|6", "text": "集会乱说话"} 26 | {"label": "3", "text": "私自到学校对面买东西"} 27 | {"label": "3", "text": "静音小组"} 28 | {"label": "2", "text": "练习卷100"} 29 | {"label": "0", "text": "没有他小明线难过呗"} 30 | {"label": "2", "text": "小测成绩尚可"} 31 | {"label": "2", "text": "作业过关"} 32 | {"label": "6", "text": "声音哄亮"} 33 | {"label": "3", "text": "钢琴课认真"} 34 | {"label": "2", "text": "名列前茅"} 35 | {"label": "3", "text": "练习优秀3分"} 36 | {"label": "1", "text": "拓展延伸优秀"} 37 | {"label": "3", "text": "晚自习讲话"} 38 | {"label": "1", "text": "地面和桌面干净整洁"} 39 | {"label": "1", "text": "储物柜不整齐"} 40 | {"label": "3", "text": "早读最认真"} 41 | {"label": "6", "text": "三操之星"} 42 | {"label": "1", "text": "大清洁未扫地"} 43 | {"label": "1", "text": "积极打扫"} 44 | {"label": "1", "text": "积极参赛"} 45 | {"label": "3", "text": "课堂坐姿"} 46 | {"label": "3", "text": "作业认真订正"} 47 | {"label": "1", "text": "踊跃回答"} 48 | {"label": "3", "text": "学习走神"} 49 | {"label": "3", "text": "作业马虎,应付老师"} 50 | {"label": "5", "text": "目无尊长"} 51 | {"label": "1", "text": "汇报表现突出"} 52 | {"label": 
"5", "text": "见义勇为"} 53 | {"label": "1", "text": "仪容仪表差"} 54 | {"label": "2", "text": "回答不出来或错"} 55 | {"label": "2", "text": "未交物理作业"} 56 | {"label": "3", "text": "按规定操作"} 57 | {"label": "3", "text": "学具齐备"} 58 | {"label": "3", "text": "玩危险游戏"} 59 | {"label": "3", "text": "恪尽职守"} 60 | {"label": "2", "text": "背诵作业不过关"} 61 | {"label": "5", "text": "没事来打人或骂人"} 62 | {"label": "3", "text": "优秀值日生"} 63 | {"label": "2", "text": "单词王"} 64 | {"label": "3", "text": "动手动脑小精灵"} 65 | {"label": "2", "text": "化学小测满分"} 66 | {"label": "3", "text": "上课走动"} 67 | {"label": "2", "text": "进步非常大"} 68 | {"label": "3", "text": "按时完成作业!已交!"} 69 | {"label": "3", "text": "上课走神(提醒3次)"} 70 | {"label": "2", "text": "测验80分以上"} 71 | {"label": "3", "text": "管不住嘴巴"} 72 | {"label": "3", "text": "未扎头发"} 73 | {"label": "3", "text": "过单词给力"} 74 | {"label": "3", "text": "回家作业没完成"} 75 | {"label": "5", "text": "积极为班级服务"} 76 | {"label": "3", "text": "上课不带课本/资料"} 77 | {"label": "2", "text": "单元测满分"} 78 | {"label": "1", "text": "我是行动派"} 79 | {"label": "3|myz|2", "text": "考试94分"} 80 | {"label": "1", "text": "及时喝水"} 81 | {"label": "3", "text": "课堂表现棒棒哒"} 82 | {"label": "3", "text": "日有所诵"} 83 | {"label": "2", "text": "百分作业"} 84 | {"label": "1", "text": "桌面邋遢"} 85 | {"label": "1", "text": "积极贡献"} 86 | {"label": "0", "text": "天真"} 87 | {"label": "2", "text": "个人单科前十加3"} 88 | {"label": "1", "text": "课堂思维活跃,有想法"} 89 | {"label": "2", "text": "周末作业完成棒"} 90 | {"label": "3", "text": "语文预习不认真"} 91 | {"label": "5", "text": "打驾"} 92 | {"label": "3", "text": "礼仪操不认真"} 93 | {"label": "3", "text": "上课玩游戏"} 94 | {"label": "3", "text": "红领巾没戴"} 95 | {"label": "1", "text": "英语小组长榜样"} 96 | {"label": "1", "text": "抽屉非常乱"} 97 | {"label": "2", "text": "阅读达人"} 98 | {"label": "1", "text": "值日不达标"} 99 | {"label": "3", "text": "作业没有补交"} 100 | {"label": "0", "text": "长的不好看扣分"} 101 | {"label": "1", "text": "寝室卫生差"} 102 | {"label": "2", "text": "数学作业3星"} 103 | {"label": "5", "text": "带食物出饭堂"} 104 | {"label": "1", "text": "积极举手回答问题正确"} 105 | {"label": "3", "text": "科学课"} 106 | {"label": "3", "text": "数学笔记没抄完"} 107 | {"label": "3", "text": "晨读(晨学)认真"} 108 | {"label": "3", "text": "上课不学习"} 109 | {"label": "1", "text": "上课认真,举手积极"} 110 | {"label": "2", "text": "成绩100"} 111 | {"label": "2", "text": "数学作业完成情况良好"} 112 | {"label": "3", "text": "学校补作业"} 113 | {"label": "3", "text": "数学小测或单元"} 114 | {"label": "2", "text": "作业已纠错"} 115 | {"label": "3", "text": "本单元词语全部学会"} 116 | {"label": "2", "text": "听写附加分"} 117 | {"label": "5", "text": "骂人很大声"} 118 | {"label": "2", "text": "主动订正"} 119 | {"label": "6", "text": "拖沓慢吞吞"} 120 | {"label": "3", "text": "字不太漂亮呀"} 121 | {"label": "3|myz|2", "text": "统测统批,年级第一"} 122 | {"label": "5", "text": "关心他人,乐于助人"} 123 | {"label": "1", "text": "早到"} 124 | {"label": "3", "text": "【课堂】注意力集中"} 125 | {"label": "5", "text": "善小助人"} 126 | {"label": "3", "text": "按规定穿班服"} 127 | {"label": "1", "text": "积极扫地"} 128 | {"label": "3", "text": "看俩课外课"} 129 | {"label": "3", "text": "班查:做操不认真"} 130 | {"label": "3", "text": "作业签名"} 131 | {"label": "3|myz|5", "text": "不尊重师长"} 132 | {"label": "1", "text": "英语课发言优秀"} 133 | -------------------------------------------------------------------------------- /test/corpus/pos_to_conll.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2021/8/24 23:09 4 | # @author : Mo 5 | # @function: transform conll to span, 将CONLL格式的数据转化为MYZ格式{pos:[1,3]} 6 | 7 | 8 | import logging 9 | 10 | 11 | def txt_write(lines, path: str, model: 
str = "w", encoding: str = "utf-8"): 12 | """ 13 | Write Line of list to file 14 | Args: 15 | lines: lines of list which need save 16 | path: path of save file, such as "txt" 17 | model: type of write, such as "w", "a+" 18 | encoding: type of encoding, such as "utf-8", "gbk" 19 | """ 20 | 21 | try: 22 | file = open(path, model, encoding=encoding) 23 | file.writelines(lines) 24 | file.close() 25 | except Exception as e: 26 | logging.info(str(e)) 27 | def save_json(lines, path, encoding: str = "utf-8", indent: int = 4): 28 | """ 29 | Write Line of List to file 30 | Args: 31 | lines: lines of list[str] which need save 32 | path: path of save file, such as "json.txt" 33 | encoding: type of encoding, such as "utf-8", "gbk" 34 | """ 35 | 36 | with open(path, "w", encoding=encoding) as fj: 37 | fj.write(json.dumps(lines, ensure_ascii=False, indent=indent)) 38 | fj.close() 39 | def transform_span_to_conll(sent, label, sl_ctype): 40 | """将span格式数据(pos, SPAN)转化为CONLL的形式 41 | transform span to conll 42 | Args: 43 | label : List, span-pos, eg. [{"type":"city", "ent":"沪", "pos":[2:3]}] 44 | sent : str, sent of one sample, eg. "macropodus是叉尾斗鱼" 45 | sl_ctype : str, type of corpus, 数据格式sl-type, eg. "BIO", "BMES", "BIOES" 46 | Returns: 47 | res : List, eg. [("鱼", "O")] 48 | """ 49 | label_str = ["O"] * len(sent) 50 | for i, yi in enumerate(label): 51 | yi_pos = yi.get("pos", [0, 1]) 52 | yi_type = yi.get("type", "") 53 | # yi_e = yi.get("ent", "") 54 | yi_pos_0 = yi_pos[0] 55 | yi_pos_1 = yi_pos[1] 56 | # 截取的最大长度, 防止溢出 57 | if yi_pos_1 >= len(sent): 58 | break 59 | if sl_ctype in ["BIO", "OIB"]: 60 | for id in range(yi_pos[1] - yi_pos[0]): 61 | label_str[yi_pos_0 + id] = "I-" + yi_type 62 | label_str[yi_pos_1] = "I-" + yi_type 63 | label_str[yi_pos_0] = "B-" + yi_type 64 | elif sl_ctype in ["BMES"]: # 专门用于CWS分词标注等 65 | label_str[yi_pos_1] = "E-" + yi_type 66 | label_str[yi_pos_0] = "B-" + yi_type 67 | for id in range(yi_pos[1] - yi_pos[0]): 68 | label_str[yi_pos_0 + id] = "M-" + yi_type 69 | if yi_pos_0==yi_pos_1: 70 | label_str[yi_pos_0] = "S-" + yi_type 71 | elif sl_ctype in ["BIOES"]: 72 | label_str[yi_pos_1] = "E-" + yi_type 73 | label_str[yi_pos_0] = "B-" + yi_type 74 | for id in range(yi_pos[1] - yi_pos[0]): 75 | label_str[yi_pos_0 + id] = "I-" + yi_type 76 | if yi_pos_0 == yi_pos_1: 77 | label_str[yi_pos_0] = "S-" + yi_type 78 | res = [] 79 | for i in range(len(label_str)): 80 | res.append((sent[i], label_str[i])) 81 | return res 82 | def read_corpus(corpus_path, encoding="utf-8", keys=["text", "label"]): 83 | """读取MYZ类型数据 84 | read corpus for sequence-labeling 85 | Args: 86 | corpus_path: String, path/origin text, eg. 
"ner.conll" 87 | Returns: 88 | data: List, [{...}] 89 | """ 90 | with open(corpus_path, "r", encoding=encoding) as fo: 91 | xys = [] 92 | count = 0 93 | for line in fo: 94 | count += 1 95 | # if count > 32: 96 | # break 97 | if not line: 98 | continue 99 | # 最初想定义可配置化, 但是后期实验较多, 还是设置成一般形式, 可自己定义 100 | line_json = json.loads(line.strip()) 101 | x, y = line_json.get(keys[0], ""), line_json.get(keys[1], []) 102 | xys.append((x, y)) 103 | fo.close() 104 | return xys 105 | 106 | 107 | if __name__ == '__main__': 108 | import json 109 | import sys 110 | import os 111 | path_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) 112 | sys.path.append(path_root) 113 | print(path_root) 114 | path = path_root + "/corpus/sequence_labeling/ner_china_people_daily_1998_span/" 115 | for t in ["train", "dev", "test"]: 116 | t = t + ".span" 117 | data = read_corpus(path + t) 118 | res = [] 119 | for d in data: 120 | label = transform_span_to_conll(d[0], d[1], sl_ctype="BIO") 121 | label_strs = [li[0] + " " + li[1] + "\n" for li in label] + ["\n"] 122 | res += label_strs 123 | txt_write(res, path + t.replace(".span", ".conll")) 124 | 125 | ee = 0 126 | 127 | # transform span to conll, 将SPAN格式{pos:[1,3]}数据转化为CONLL格式 128 | 129 | -------------------------------------------------------------------------------- /pytorch_nlu/corpus/text_classification/school/dev.json: -------------------------------------------------------------------------------- 1 | {"label": "3", "text": "运动会打架"} 2 | {"label": "2", "text": "课前提问回答不完整"} 3 | {"label": "3", "text": "认真完成聪明题"} 4 | {"label": "1", "text": "小小图书管理员"} 5 | {"label": "4", "text": "发火,生气"} 6 | {"label": "3", "text": "静校未及时,说话"} 7 | {"label": "5", "text": "善良,有孝心"} 8 | {"label": "2", "text": "提出质疑"} 9 | {"label": "2", "text": "作业完成情况优秀"} 10 | {"label": "2", "text": "我会算"} 11 | {"label": "2", "text": "看拼音写词语纸展览"} 12 | {"label": "3", "text": "准时返校"} 13 | {"label": "3", "text": "上课打闹,屡教不改"} 14 | {"label": "1", "text": "班级打扫卫生"} 15 | {"label": "6", "text": "动作拖拉慢吞吞"} 16 | {"label": "2", "text": "作业完成不好"} 17 | {"label": "3", "text": "课堂表现类"} 18 | {"label": "3", "text": "午餐后自我管理差"} 19 | {"label": "2", "text": "80分以下"} 20 | {"label": "3", "text": "部分作业没完成"} 21 | {"label": "3", "text": "晚拖讲话"} 22 | {"label": "3", "text": "国庆作业"} 23 | {"label": "3|myz|5", "text": "课堂搞东西,没认真听"} 24 | {"label": "2", "text": "读得好"} 25 | {"label": "3", "text": "项目完成"} 26 | {"label": "1", "text": "奖励"} 27 | {"label": "1", "text": "教室卫生达人"} 28 | {"label": "3", "text": "背诵古诗认真"} 29 | {"label": "3", "text": "作业未补完"} 30 | {"label": "3", "text": "大奖"} 31 | {"label": "3", "text": "吃饭违纪"} 32 | {"label": "2", "text": "非常优秀"} 33 | {"label": "1", "text": "不及时打扫卫生"} 34 | {"label": "2", "text": "考试低分"} 35 | {"label": "1", "text": "升旗小能手"} 36 | {"label": "3|myz|2", "text": "测验90-94.A-"} 37 | {"label": "3", "text": "好好睡觉"} 38 | {"label": "3", "text": "没有读书"} 39 | {"label": "2", "text": "诵读能手"} 40 | {"label": "2", "text": "葫芦丝过关"} 41 | {"label": "2", "text": "古诗词达人"} 42 | {"label": "3|myz|2", "text": "长江作业未交"} 43 | {"label": "2", "text": "写作明星"} 44 | {"label": "2", "text": "语文练习册满分"} 45 | {"label": "3", "text": "路队纪律差"} 46 | {"label": "5", "text": "说脏话起外号"} 47 | {"label": "1", "text": "餐点不挑食不浪费"} 48 | {"label": "3", "text": "科学课积极认真!"} 49 | {"label": "2", "text": "周练12甲+"} 50 | {"label": "3", "text": "午觉乖"} 51 | {"label": "2", "text": "错字较多"} 52 | {"label": "2", "text": "学习进步奖"} 53 | {"label": "2", "text": "英语作文"} 54 | {"label": "2", "text": "负责任,好组长"} 55 | {"label": "1|myz|3", "text": "内务脏乱、静校慢"} 56 
| {"label": "3", "text": "写坐姿端正"} 57 | {"label": "1", "text": "二级巨星"} 58 | {"label": "2", "text": "中段考试成绩优秀"} 59 | {"label": "3", "text": "组长工作能力"} 60 | {"label": "2", "text": "月考加分"} 61 | {"label": "3", "text": "没带语文书"} 62 | {"label": "3|myz|5", "text": "不团结"} 63 | {"label": "2", "text": "听算全对"} 64 | {"label": "2", "text": "错题出错"} 65 | {"label": "3", "text": "趴好休息"} 66 | {"label": "3", "text": "写字不认真,书写潦草"} 67 | {"label": "1", "text": "小组表现不好"} 68 | {"label": "1", "text": "歌曲变歌词小能手"} 69 | {"label": "2", "text": "背书优秀"} 70 | {"label": "3", "text": "安静听课"} 71 | {"label": "3", "text": "没带课本试卷作业"} 72 | {"label": "3|myz|6", "text": "老师不在,讲话"} 73 | {"label": "2", "text": "小组比赛获胜"} 74 | {"label": "3", "text": "活动课遵守纪律"} 75 | {"label": "3", "text": "在学校表现好"} 76 | {"label": "2", "text": "听写80分以上"} 77 | {"label": "3", "text": "第二课堂纪律差"} 78 | {"label": "2", "text": "作文写的真棒"} 79 | {"label": "2", "text": "较出色的完成了作业!"} 80 | {"label": "1", "text": "不讲个人卫生"} 81 | {"label": "5", "text": "携带零食"} 82 | {"label": "3", "text": "课上插嘴"} 83 | {"label": "3|myz|6", "text": "动作迅速不讲话"} 84 | {"label": "2", "text": "字词认读准确"} 85 | {"label": "5", "text": "队列安静有序"} 86 | {"label": "3", "text": "值日未尽责"} 87 | {"label": "3|myz|2", "text": "美术作业棒棒哒"} 88 | {"label": "1", "text": "自己收拾书包"} 89 | {"label": "3", "text": "作业工整向日葵"} 90 | {"label": "3", "text": "上课喝水吃东西"} 91 | {"label": "2", "text": "数学没交家庭作业"} 92 | {"label": "3", "text": "电脑管理员认真负责"} 93 | {"label": "3", "text": "买零食"} 94 | {"label": "3", "text": "午睡喧闹"} 95 | {"label": "3", "text": "一级棒"} 96 | {"label": "3", "text": "小组不认真"} 97 | {"label": "2", "text": "物理作业不认真"} 98 | {"label": "5", "text": "课堂集体评比第3名"} 99 | {"label": "2", "text": "语文测试状元"} 100 | {"label": "2", "text": "考试计算全对"} 101 | {"label": "1", "text": "空气污浊"} 102 | {"label": "3", "text": "英语作业未交"} 103 | {"label": "2", "text": "作业质量较差"} 104 | {"label": "3|myz|2", "text": "学霸笔记"} 105 | {"label": "3", "text": "说维语"} 106 | {"label": "3", "text": "预习课文"} 107 | {"label": "4|myz|5", "text": "善良可爱美丽大方"} 108 | {"label": "2", "text": "小测第一"} 109 | {"label": "1", "text": "作业干净工整"} 110 | {"label": "3", "text": "未完成规定任务"} 111 | {"label": "3|myz|6", "text": "干扰同桌"} 112 | {"label": "3", "text": "上课疯玩"} 113 | {"label": "2", "text": "亲子口算"} 114 | {"label": "2", "text": "英语90分以上"} 115 | {"label": "3", "text": "放学排队讲话"} 116 | {"label": "2", "text": "金榜题名"} 117 | {"label": "5", "text": "温柔听话"} 118 | {"label": "3", "text": "上课自己走神"} 119 | {"label": "5", "text": "雷同作业"} 120 | {"label": "5", "text": "集合速度快"} 121 | {"label": "3", "text": "捣乱"} 122 | {"label": "3", "text": "课间学习"} 123 | {"label": "3", "text": "上课看闲书"} 124 | {"label": "2", "text": "数学校本"} 125 | {"label": "3", "text": "专心"} 126 | {"label": "1", "text": "物品杂乱"} 127 | {"label": "1", "text": "光盘行动、合理膳食"} 128 | {"label": "3", "text": "自习违纪"} 129 | {"label": "1", "text": "课桌收拾干净整洁"} 130 | {"label": "3|myz|2", "text": "主动学习看书"} 131 | {"label": "5", "text": "喜欢偷偷说别人"} 132 | {"label": "0", "text": "我的眼睛"} 133 | -------------------------------------------------------------------------------- /pytorch_nlu/corpus/text_classification/school/train.json: -------------------------------------------------------------------------------- 1 | {"label": "2", "text": "单科考试95分以上"} 2 | {"label": "2", "text": "表达清晰有想法"} 3 | {"label": "3", "text": "下课乱跳大叫"} 4 | {"label": "0", "text": "体育课被批评"} 5 | {"label": "2", "text": "语文80分以上"} 6 | {"label": "2", "text": "小组回答问题"} 7 | {"label": "3", "text": "上课比较认真"} 8 | {"label": "3", "text": "早到且迅速开始学习"} 9 | {"label": "2", "text": "小小写手"} 10 | {"label": "2", 
"text": "生物作业优秀"} 11 | {"label": "3", "text": "下位走动"} 12 | {"label": "3", "text": "书皮没包没姓名签"} 13 | {"label": "3", "text": "静校讲话"} 14 | {"label": "3", "text": "语文家庭作业没完成"} 15 | {"label": "1", "text": "个人卫生(指甲太长)"} 16 | {"label": "3", "text": "书写干净整齐"} 17 | {"label": "5", "text": "秩序感优秀"} 18 | {"label": "3", "text": "应付作业"} 19 | {"label": "5", "text": "【好事】好人好事"} 20 | {"label": "3", "text": "安静就餐不浪费"} 21 | {"label": "5", "text": "爱护班级卫生"} 22 | {"label": "3", "text": "打扫卫生缺席"} 23 | {"label": "2", "text": "期中测试80-89"} 24 | {"label": "2", "text": "按时完成作业改错"} 25 | {"label": "2", "text": "英语听写全对"} 26 | {"label": "5", "text": "宿舍扣分"} 27 | {"label": "3", "text": "没做卫生"} 28 | {"label": "3|myz|2", "text": "作业不改"} 29 | {"label": "1|myz|3", "text": "自觉预习"} 30 | {"label": "2", "text": "成绩退步明显"} 31 | {"label": "2", "text": "信息技术作品优秀"} 32 | {"label": "1|myz|5", "text": "值日班长、卫生督导员"} 33 | {"label": "2", "text": "作业9分"} 34 | {"label": "3|myz|2", "text": "作业书写较乱"} 35 | {"label": "2", "text": "英语值日报告出色"} 36 | {"label": "2", "text": "语文成绩优秀"} 37 | {"label": "3", "text": "值日生不尽责"} 38 | {"label": "3", "text": "作业未更正"} 39 | {"label": "3", "text": "秋游纪律表扬"} 40 | {"label": "1", "text": "专注之星季军奖励6分"} 41 | {"label": "5", "text": "上课乖"} 42 | {"label": "3", "text": "没有带名著"} 43 | {"label": "1", "text": "不戴红领巾校章"} 44 | {"label": "3", "text": "路队吵闹"} 45 | {"label": "2", "text": "化学单元考80分以上"} 46 | {"label": "3", "text": "作业弄丢"} 47 | {"label": "3", "text": "上课走出座位"} 48 | {"label": "2", "text": "小检测90分以上"} 49 | {"label": "3|myz|2", "text": "按要求完成语文作业"} 50 | {"label": "3", "text": "留校走人"} 51 | {"label": "6", "text": "做位险的游戏"} 52 | {"label": "3", "text": "作业C认真"} 53 | {"label": "2", "text": "数学作业优A"} 54 | {"label": "1", "text": "个人卫生保持差"} 55 | {"label": "1", "text": "小老师管理"} 56 | {"label": "3", "text": "能认真完成《补习卷》"} 57 | {"label": "2", "text": "语家没完成"} 58 | {"label": "3", "text": "没带课本学具"} 59 | {"label": "3", "text": "上课讲话、做小动作"} 60 | {"label": "3", "text": "中午到教室大吵大闹"} 61 | {"label": "0", "text": "挑战性问题"} 62 | {"label": "2", "text": "本周政治周考不及格。"} 63 | {"label": "1", "text": "精神面貌好"} 64 | {"label": "2", "text": "表达有条理"} 65 | {"label": "3", "text": "不认真完成作业"} 66 | {"label": "1", "text": "磨蹭"} 67 | {"label": "2", "text": "单科第一"} 68 | {"label": "2", "text": "课堂作业优"} 69 | {"label": "2", "text": "作业能手"} 70 | {"label": "2", "text": "取得100分"} 71 | {"label": "3", "text": "损坏劳动工具"} 72 | {"label": "6", "text": "态度消极"} 73 | {"label": "3", "text": "安静吃饭"} 74 | {"label": "3|myz|5", "text": "连带纪律适当"} 75 | {"label": "2", "text": "考试进步5-9名"} 76 | {"label": "1", "text": "优秀主持人"} 77 | {"label": "3", "text": "个人平时积分"} 78 | {"label": "2", "text": "考试90到94分"} 79 | {"label": "5", "text": "乐于帮助他人"} 80 | {"label": "2", "text": "计算正确率高"} 81 | {"label": "3", "text": "小组认真积极"} 82 | {"label": "3", "text": "胜利者"} 83 | {"label": "1", "text": "语文作业完成质量高"} 84 | {"label": "5", "text": "文明有礼"} 85 | {"label": "1", "text": "会话表演之星"} 86 | {"label": "2", "text": "单科测验100分"} 87 | {"label": "3|myz|2", "text": "优化认真辅导、批改。"} 88 | {"label": "5|myz|2", "text": "带头读书"} 89 | {"label": "3", "text": "自习讲话"} 90 | {"label": "3", "text": "口算未订正"} 91 | {"label": "3", "text": "午托不认真"} 92 | {"label": "1", "text": "积极回答问题好"} 93 | {"label": "1", "text": "教学活动中表现很棒"} 94 | {"label": "1|myz|6", "text": "回答3次及以上"} 95 | {"label": "3", "text": "英语课随意讲话"} 96 | {"label": "1", "text": "卫生脏乱差"} 97 | {"label": "6", "text": "举手回答问题2次"} 98 | {"label": "3", "text": "55"} 99 | {"label": "1|myz|3", "text": "书写凌乱"} 100 | {"label": "3", "text": "遵守纪律有进步"} 101 | {"label": "5", "text": "秩序之星"} 102 | 
{"label": "2", "text": "考试全优"} 103 | {"label": "3", "text": "课前准备好,座位干净"} 104 | {"label": "2", "text": "作业默写全对"} 105 | {"label": "3", "text": "午餐表现好"} 106 | {"label": "3", "text": "破坏环境卫生"} 107 | {"label": "3", "text": "经常缺交作业"} 108 | {"label": "3", "text": "口算小达人"} 109 | {"label": "3", "text": "没做作业"} 110 | {"label": "2", "text": "读书笔记差"} 111 | {"label": "2", "text": "不会整理"} 112 | {"label": "3", "text": "作业书写潦草的"} 113 | {"label": "3", "text": "健心卡"} 114 | {"label": "1", "text": "座位有垃圾,桌凳歪"} 115 | {"label": "6", "text": "各位活动积极参与奖"} 116 | {"label": "1", "text": "没有认真完成作业"} 117 | {"label": "3", "text": "就寝讲话"} 118 | {"label": "2", "text": "期中考试全年级第一名"} 119 | {"label": "2", "text": "数学课堂表现出色"} 120 | {"label": "2", "text": "考试最优秀"} 121 | {"label": "1", "text": "家务能手"} 122 | {"label": "3|myz|6", "text": "政治缺交"} 123 | {"label": "3", "text": "上课玩玩具、说话"} 124 | {"label": "3", "text": "乱跳台阶"} 125 | {"label": "5|myz|2", "text": "善于分享交流"} 126 | {"label": "1", "text": "乱丢垃圾!"} 127 | {"label": "6", "text": "动作迅速有效"} 128 | {"label": "2", "text": "书写棒棒的"} 129 | {"label": "1", "text": "宿舍内务没做好"} 130 | {"label": "3", "text": "上课离开座位"} 131 | {"label": "2", "text": "妙笔生花(日记作文)"} 132 | {"label": "3", "text": "干卫生拖拉我错了"} 133 | -------------------------------------------------------------------------------- /pytorch_nlu/pytorch_textregression/trConfig.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2020/11/19 21:48 4 | # @author : Mo 5 | # @function: config of transformers and graph-model 6 | 7 | 8 | # model算法超参数 9 | model_config = { 10 | "path_finetune": "", 11 | "CUDA_VISIBLE_DEVICES": "0", # 环境, GPU-CPU, "-1"/"0"/"1"/"2"... 12 | "USE_TORCH": "1", # transformers使用torch, 因为脚本是torch写的 13 | "output_hidden_states": None, # [6,11] # 输出层, 即取第几层transformer的隐藏输出, list 14 | "pretrained_model_name_or_path": "", # 预训练模型地址 15 | "model_save_path": "save_path", # 训练模型保存-训练完毕模型目录 16 | "config_name": "tc.config", # 训练模型保存-超参数文件名 17 | "model_name": "tc.model", # 训练模型保存-全量模型 18 | "path_train": None, # 验证语料地址, 必传, string 19 | "path_dev": None, # 验证语料地址, 必传, 可为None 20 | "path_tet": None, # 验证语料地址, 必传, 可为None 21 | 22 | "task_type": "TR", # 任务类型, 依据数据类型自动更新, "TC-MULTI-CLASS", "TC-MULTI-LABEL", TC为text-classification的缩写 23 | "model_type": "BERT", # 预训练模型类型, 如bert, roberta, ernie 24 | "loss_type": "MAE_SMOOTH", # "MAE_SMOOTH", "MAE", "MSE" 25 | "eval_type": "MSE", 26 | 27 | "batch_size": 32, # 批尺寸 28 | "num_labels": 0, # 类别数, 自动更新 29 | "max_len": -1, # 最大文本长度, -1则为自动获取覆盖0.95数据的文本长度, 0为取得最大文本长度作为maxlen 30 | "epochs": 5, # 训练轮次 31 | "lr": 1e-5, # 学习率 32 | 33 | "grad_accum_steps": 1, # 梯度积累多少步 34 | "max_grad_norm": 1.0, # 最大标准化梯度 35 | "weight_decay": 5e-4, # 模型参数l2权重 36 | "dropout_rate": 0.1, # 随即失活概率 37 | "adam_eps": 1e-8, # adam优化器超参 38 | "seed": 2024, # 随机种子, 3407, 2021 39 | 40 | "stop_epochs": 4, # 早停轮次 41 | "evaluate_steps": 320, # 评估步数 42 | "save_steps": 320, # 存储步数 43 | "warmup_steps": -1, # 预热步数, -1为默认0.5 epoch, 整数 1-N为 默认步数 44 | "ignore_index": 0, # 忽略的index 45 | "max_steps": -1, # 最大步数, -1表示取满epochs 46 | "is_train": True, # 是否训练, 另外一个人不是(而是预测) 47 | "is_cuda": True, # 是否使用gpu, 另外一个不是gpu(而是cpu) 48 | "is_adv": False, # 是否使用对抗训练(默认FGM) 49 | "is_dropout": True, # 最后几层输出是否使用随即失活 50 | "is_active": True, # 最后几层输出是否使用激活函数, 如FCLayer/SpanLayer层 51 | "active_type": "SIGMOID", # 最后几层输出使用的激活函数, 可填写RELU/SIGMOID/TANH/MISH/SWISH/GELU 52 | 53 | "save_best_mertics_key": ["micro_avg", "mse"], # 模型存储的判别指标, index-1可选: [micro_avg, macro_avg, 
weighted_avg], 54 | # index-2可选: [precision, recall, f1-score] 55 | "multi_label_threshold": 0.5, # 多标签分类时候生效, 大于该阈值则认为预测对的 56 | "xy_keys": ["text", "label"], # text,label在file中对应的keys 57 | "label_sep": "|myz|", # "|myz|" 多标签数据分割符, 用于多标签分类语料中 58 | "len_rate": 1, # 训练数据和验证数据占比, float, 0-1闭区间 59 | "adv_emb_name": "word_embeddings.", # emb_name这个参数要换成你模型中embedding的参数名, model.embeddings.word_embeddings.weight 60 | "adv_eps": 1.0, # 梯度权重epsilon 61 | 62 | "ADDITIONAL_SPECIAL_TOKENS": ["[macropodus]", "[macadam]"], # 新增特殊字符 63 | "flag_add_new_tokens": False, # 是否新增特殊字符, 默认不新增 64 | "flag_only_char": False, # 字符编码是单个字char还是拆分, 默认bpe拆分 65 | 66 | "prior": None, # 类别先验分布, 自动设置, 为一个label_num类别数个元素的list, json无法保存np.array 67 | "l2i": None, 68 | "i2l": None, 69 | "len_corpus": None, # 训练语料长度 70 | "prior_count": None, # 每个类别样本频次 71 | } 72 | 73 | 74 | import os 75 | # os.environ["CUDA_VISIBLE_DEVICES"] = model_config.get("CUDA_VISIBLE_DEVICES", "2") 76 | os.environ["USE_TORCH"] = model_config.get("USE_TORCH", "1") 77 | from transformers import BertTokenizer, RobertaTokenizer, AlbertTokenizer, XLNetTokenizer, ElectraTokenizer, XLMTokenizer, AutoTokenizer 78 | from transformers import BertConfig, RobertaConfig, AlbertConfig, XLNetConfig, ElectraConfig, XLMConfig, AutoConfig 79 | from transformers import BertModel, RobertaModel, AlbertModel, XLNetModel, ElectraModel, XLMModel, AutoModel 80 | # from transformers import LongformerTokenizer, LongformerConfig, LongformerModel 81 | from transformers import DebertaTokenizer, DebertaConfig, DebertaModel 82 | from transformers import GPT2Tokenizer, GPT2Config, GPT2Model 83 | from transformers import T5Tokenizer, T5Config, T5Model 84 | 85 | 86 | # transformers类等 87 | PRETRAINED_MODEL_CLASSES = { 88 | # "LONGFORMER": (LongformerConfig, LongformerTokenizer, LongformerModel), 89 | "ELECTRA": (ElectraConfig, ElectraTokenizer, ElectraModel), 90 | "DEBERTA": (DebertaConfig, DebertaTokenizer, DebertaModel), 91 | "ROBERTA": (AutoConfig, AutoTokenizer, AutoModel), # (RobertaConfig, RobertaTokenizer, RobertaModel), # 92 | "ALBERT": (AlbertConfig, AlbertTokenizer, AlbertModel), 93 | "MACBERT": (AutoConfig, BertTokenizer, BertModel), 94 | "XLNET": (XLNetConfig, XLNetTokenizer, XLNetModel), 95 | "ERNIE": (BertConfig, BertTokenizer, BertModel), 96 | "NEZHA": (BertConfig, BertTokenizer, BertModel), 97 | "BERT": (BertConfig, BertTokenizer, BertModel), 98 | "GPT2": (GPT2Config, GPT2Tokenizer, GPT2Model), 99 | "AUTO": (AutoConfig, AutoTokenizer, AutoModel), 100 | "XLM": (XLMConfig, XLMTokenizer, XLMModel), 101 | "T5": (T5Config, T5Tokenizer, T5Model), 102 | } 103 | 104 | -------------------------------------------------------------------------------- /pytorch_nlu/corpus/sequence_labeling/ner_china_people_daily_1998_span/dev.span: -------------------------------------------------------------------------------- 1 | {"label": [], "text": "在这里恕弟不恭之罪,敢在尊前一诤:前人论书,每曰“字字有来历,笔笔有出处”,细读公字,何尝跳出前人藩篱,自隶变而后,直至明季,兄有何新出?"} 2 | {"label": [{"type": "ORG", "ent": "青岛海牛队", "pos": [5, 9]}, {"type": "ORG", "ent": "广州松日队", "pos": [11, 15]}], "text": "相比之下,青岛海牛队和广州松日队的雨中之战虽然也是0∶0,但乏善可陈。"} 3 | {"label": [], "text": "理由多多,最无奈的却是:5月恰逢双重考试,她攻读的博士学位论文要通考;她任教的两所学校,也要在这段时日大考。"} 4 | {"label": [], "text": "分工,各有各的责任;合作,正副经理之间,全厂的事,不管由谁分管,也不管你有什么事找到谁,绝不会把你推给第二个人。"} 5 | {"label": [{"type": "PER", "ent": "胡", "pos": [0, 0]}], "text": "胡老说,当画画疲倦时就到院里去看看,给这盆花浇点水,给那棵花剪剪枝,回来再接着画,画累了再出去,如此循环往复,脑体结合,有益健康,胜过吃药。"} 6 | {"label": [], "text": "当前国有大中型企业改制中存在的问题。"} 7 | {"label": [], "text": 
"试验证明,在吸无过滤嘴的香烟时,香烟燃烧过程中产生的尼古丁14%至20%都进了口腔,即使是有过滤嘴,进入口腔的尼古丁仍会有5%到12%。"} 8 | {"label": [{"type": "ORG", "ent": "市委", "pos": [10, 11]}, {"type": "PER", "ent": "张敬涛", "pos": [14, 16]}], "text": "去年十二月二十四日,市委书记张敬涛召集县市主要负责同志研究信访工作时,提出三问:『假如上访群众是我们的父母姐妹,你会用什么样的感情对待他们?"} 9 | {"label": [], "text": "对贫困母亲,只要我们帮她们一点点,她们就能站起来,就能多接受一些教育,就能多一些尊严。"} 10 | {"label": [], "text": "小小的纪念章托在手心,殷红得像一滴血、一蓬火。"} 11 | {"label": [{"type": "PER", "ent": "胡锦涛", "pos": [0, 2]}, {"type": "ORG", "ent": "中国共产党", "pos": [6, 10]}, {"type": "ORG", "ent": "罗社会民主主义党", "pos": [36, 43]}, {"type": "LOC", "ent": "罗", "pos": [47, 47]}], "text": "胡锦涛强调,中国共产党愿在党际关系四项原则的基础上,继续致力于发展同包括罗社会民主主义党在内的罗主要政党之间的党际交流和合作。"} 12 | {"label": [{"type": "ORG", "ent": "人权会", "pos": [2, 4]}, {"type": "ORG", "ent": "古巴代表团", "pos": [6, 10]}, {"type": "ORG", "ent": "司法部", "pos": [14, 16]}, {"type": "PER", "ent": "卡洛斯·阿马特", "pos": [18, 24]}, {"type": "LOC", "ent": "美国", "pos": [32, 33]}, {"type": "LOC", "ent": "古巴", "pos": [35, 36]}, {"type": "LOC", "ent": "古巴", "pos": [47, 48]}], "text": "参加人权会的古巴代表团团长、司法部长卡洛斯·阿马特在表决前指出,美国对古巴的经济封锁是“发生在古巴的唯一的大规模侵犯人权案”。"} 13 | {"label": [], "text": "那么,这里我们应该引申出一个重要结论,即社会主义精神文明建设也应当以改革的精神来进行。"} 14 | {"label": [], "text": "与此同时,作者同一题材的长篇侦破小说《鱼孽》也出版发行。"} 15 | {"label": [{"type": "ORG", "ent": "新华社", "pos": [0, 2]}, {"type": "LOC", "ent": "北京", "pos": [3, 4]}, {"type": "ORG", "ent": "国务院", "pos": [10, 12]}, {"type": "PER", "ent": "李岚清", "pos": [16, 18]}, {"type": "LOC", "ent": "中南海", "pos": [22, 24]}, {"type": "ORG", "ent": "美国前商务部", "pos": [28, 33]}, {"type": "PER", "ent": "芭芭拉·弗兰克林", "pos": [35, 42]}], "text": "新华社北京5月7日电国务院副总理李岚清今天在中南海会见了美国前商务部长芭芭拉·弗兰克林。"} 16 | {"label": [{"type": "LOC", "ent": "不来梅", "pos": [21, 23]}], "text": "按照赛程,a、b两组的出线球队编为e组,在不来梅进行一轮单循环赛,在第一阶段小组赛中已经交手的球队不再比赛,成绩带入第二阶段。"} 17 | {"label": [{"type": "LOC", "ent": "海卫1", "pos": [5, 7]}, {"type": "LOC", "ent": "“旅行者”号探测器", "pos": [15, 23]}], "text": "根据测算,海卫1表面温度已经从“旅行者”号探测器1989年造访时的零下236摄氏度上升到零下234摄氏度。"} 18 | {"label": [], "text": "胶卷冲洗是没有废品率的行业,摄影作品往往是不可重复的,尤其是具有重要意义的作品,一旦出现问题无法弥补。"} 19 | {"label": [{"type": "LOC", "ent": "北京", "pos": [2, 3]}, {"type": "PER", "ent": "袁(日希)", "pos": [12, 16]}, {"type": "ORG", "ent": "国家新闻出版署", "pos": [20, 26]}, {"type": "LOC", "ent": "京", "pos": [30, 30]}, {"type": "LOC", "ent": "中国", "pos": [58, 59]}], "text": "本报北京5月14日讯记者袁(日希)报道:国家新闻出版署今天在京召开关于做好纪念党的十一届三中全会召开20周年、庆祝新中国成立50周年图书出版工作会议。"} 20 | {"label": [{"type": "ORG", "ent": "中国检察日报社影视部", "pos": [15, 24]}], "text": "十八集电视连续剧《红树林》正由中国检察日报社影视部摄制。"} 21 | {"label": [{"type": "ORG", "ent": "前卫寰岛队", "pos": [5, 9]}, {"type": "PER", "ent": "高峰", "pos": [12, 13]}], "text": "有心无力的前卫寰岛队只靠高峰扳回一球。"} 22 | {"label": [], "text": "但所有这些都未能阻止他在大选中获胜。"} 23 | {"label": [{"type": "ORG", "ent": "总社新闻研究所", "pos": [6, 12]}, {"type": "ORG", "ent": "中国新闻学院", "pos": [14, 19]}, {"type": "PER", "ent": "毛泽东", "pos": [36, 38]}, {"type": "PER", "ent": "邓小平", "pos": [40, 42]}, {"type": "PER", "ent": "江泽民", "pos": [44, 46]}], "text": "这个出版社与总社新闻研究所、中国新闻学院合作编撰的国家“九五”重点图书《毛泽东、邓小平、江泽民新闻宣传思想研究》已取得实质性进展。"} 24 | {"label": [], "text": "作者对普通老百姓生活的熟稔、对平民语言的自如运用和整篇的传奇色彩相互映衬,构成作品的特色。"} 25 | {"label": [], "text": "1.由遗传基因所决定的遗传性癌症:如肾母细胞瘤、视网膜母细胞瘤等。"} 26 | {"label": [{"type": "ORG", "ent": "辽宁队", "pos": [17, 19]}, {"type": "PER", "ent": "李", "pos": [23, 23]}, {"type": "ORG", "ent": "五牛队", "pos": [36, 38]}], "text": "上半时进行到20多分钟,右路进攻的辽宁队17号李从底线一记漂亮的斜射首破五牛队城门。"} 27 | {"label": [{"type": "LOC", "ent": "台湾", "pos": [2, 3]}, {"type": "PER", "ent": "克林顿", "pos": [7, 9]}, {"type": "LOC", "ent": "中国", "pos": [15, 16]}, 
{"type": "LOC", "ent": "美国", "pos": [22, 23]}, {"type": "LOC", "ent": "中", "pos": [25, 25]}, {"type": "LOC", "ent": "美国", "pos": [37, 38]}, {"type": "LOC", "ent": "中国", "pos": [43, 44]}, {"type": "LOC", "ent": "中", "pos": [51, 51]}, {"type": "LOC", "ent": "美", "pos": [52, 52]}], "text": "关于台湾问题,克林顿说,他将向中国领导人重申美国向中方作出的承诺,再次表明美国坚持一个中国的政策,遵守中美三个联合公报的原则。"} 28 | {"label": [{"type": "PER", "ent": "何泰权", "pos": [0, 2]}, {"type": "PER", "ent": "姜京珍", "pos": [4, 6]}], "text": "何泰权/姜京珍的拼搏精神再一次感染了现场的每一位观众,人们不断用热烈的掌声倾注着对这两位悲壮的赛场斗士的尊敬之情。"} 29 | {"label": [{"type": "PER", "ent": "陈伊玲", "pos": [52, 54]}], "text": "最出乎意外的是,1979年作为全国高校统考的语文作文试题,要求根据两千字的《第二次考试》改写成八百字的《陈伊玲的故事》。"} 30 | {"label": [{"type": "LOC", "ent": "黄河", "pos": [28, 29]}, {"type": "LOC", "ent": "青海省", "pos": [43, 45]}, {"type": "LOC", "ent": "曲麻莱县", "pos": [46, 49]}, {"type": "LOC", "ent": "玛多县", "pos": [51, 53]}, {"type": "LOC", "ent": "称多县", "pos": [55, 57]}], "text": "这3张卫星图片分别记录了70年代、80年代和90年代中期黄河源的生态环境状况,范围包括青海省曲麻莱县、玛多县和称多县等地,面积近4万平方公里。"} 31 | {"label": [{"type": "PER", "ent": "金大中", "pos": [5, 7]}], "text": "今年2月,金大中新政府成立后,社会舆论要求惩治对金融危机负有重大责任者。"} 32 | {"label": [], "text": "作为重点国有企业的领导干部,必须时刻牢记自己身上的责任,坚定共产主义理想和社会主义信念,坚持全心全意为人民服务的宗旨,遵守党纪国法,正确处理好权力、责任与利益的关系,自觉维护国家和人民的利益,廉洁自律,自觉做到自重、自省、自警、自励,自觉同职工群众同甘苦、共患难,自觉抵御各种腐朽思想和生活方式的侵蚀。"} 33 | -------------------------------------------------------------------------------- /pytorch_nlu/pytorch_sequencelabeling/slConfig.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2021/7/8 20:42 4 | # @author : Mo 5 | # @function: config of sequence-labeling, 超参数/类 6 | 7 | 8 | import os 9 | os.environ["USE_TORCH"] = "1" 10 | from transformers import BertTokenizer, RobertaTokenizer, AlbertTokenizer, XLNetTokenizer, ElectraTokenizer, XLMTokenizer, AutoTokenizer 11 | from transformers import BertConfig, RobertaConfig, AlbertConfig, XLNetConfig, ElectraConfig, XLMConfig, AutoConfig 12 | from transformers import BertModel, RobertaModel, AlbertModel, XLNetModel, ElectraModel, XLMModel, AutoModel 13 | # from transformers import LongformerTokenizer, LongformerConfig, LongformerModel 14 | from transformers import GPT2Tokenizer, GPT2Config, GPT2Model 15 | from transformers import T5Tokenizer, T5Config, T5Model 16 | 17 | 18 | PRETRAINED_MODEL_CLASSES = { 19 | # "LONGFORMER": (LongformerConfig, LongformerTokenizer, LongformerModel), 20 | "ELECTRA": (ElectraConfig, ElectraTokenizer, ElectraModel), 21 | "ROBERTA": (RobertaConfig, RobertaTokenizer, RobertaModel), 22 | "ALBERT": (AlbertConfig, AlbertTokenizer, AlbertModel), 23 | "XLNET": (XLNetConfig, XLNetTokenizer, XLNetModel), 24 | "ERNIE": (BertConfig, BertTokenizer, BertModel), 25 | "NEZHA": (BertConfig, BertTokenizer, BertModel), 26 | "BERT": (BertConfig, BertTokenizer, BertModel), 27 | "GPT2": (GPT2Config, GPT2Tokenizer, GPT2Model), 28 | "AUTO": (AutoConfig, AutoTokenizer, AutoModel), 29 | "XLM": (XLMConfig, XLMTokenizer, XLMModel), 30 | "T5": (T5Config, T5Tokenizer, T5Model) 31 | } 32 | 33 | 34 | # 标识符 35 | _SL_MODEL_SOFTMAX = "SL-SOFTMAX" 36 | _SL_MODEL_GRID = "SL-GRID" # 网格, 即矩阵, Global-Pointer 37 | _SL_MODEL_SPAN = "SL-SPAN" 38 | _SL_MODEL_CRF = "SL-CRF" 39 | _SL_DATA_CONLL = "DATA-CONLL" # conll 40 | _SL_DATA_SPAN = "DATA-SPAN" # span 41 | 42 | 43 | # model算法超参数 44 | model_config = { 45 | "CUDA_VISIBLE_DEVICES": "1", # 环境, GPU-CPU, "-1"/"0"/"1"/"2"... 46 | "output_hidden_states": None, # 输出层, 即取第几层transformer的隐藏输出, list, eg. 
[6,11], None, [-1] 47 | "pretrained_model_name_or_path": "", # 预训练模型地址 48 | "model_save_path": "model", # 训练模型保存-训练完毕模型目录 49 | "config_name": "sl.config", # 训练模型保存-超参数文件名 50 | "model_name": "sl.model", # 训练模型保存-全量模型 51 | 52 | "path_train": None, # 验证语料地址, 必传, string 53 | "path_dev": None, # 验证语料地址, 必传, 可为None 54 | "path_tet": None, # 验证语料地址, 必传, 可为None 55 | 56 | "corpus_type": "DATA-SPAN", # 语料数据格式, "DATA-CONLL", "DATA-SPAN" 57 | "task_type": "SL-SPAN", # 任务类型, "SL-SOFTMAX", "SL-CRF", "SL-SPAN", "SL-GRID", "sequence_labeling" 58 | "model_type": "BERT", # 预训练模型类型, 如BERT/ROBERTA/ERNIE/ELECTRA/ALBERT 59 | "loss_type": "MARGIN_LOSS", # 损失函数类型, 可选 None(BCE), BCE, MSE, FOCAL_LOSS, 60 | # multi-label: MARGIN_LOSS, PRIOR_MARGIN_LOSS, CIRCLE_LOSS等 61 | # 备注: "SL-GRID"类型不要用BCE、PRIOR_MARGIN_LOSS 62 | "batch_size": 32, # 批尺寸 63 | "num_labels": 0, # 类别数, 自动更新 64 | "max_len": 128, # 最大文本长度, None和-1则为自动获取覆盖0.95数据的文本长度, 0则取训练语料的最大长度, 具体的数值就是强制padding到max_len 65 | "epochs": 16, # 训练轮次 66 | "dense_lr": 1e-3, # CRF层学习率/全连接层学习率, CRF时候与lr保持100-1000倍的大小差距 67 | "lr": 1e-5, # 学习率 68 | 69 | "grad_accum_steps": 1, # 梯度积累多少步 70 | "max_grad_norm": 1.0, # 最大标准化梯度 71 | "weight_decay": 0.99, # lr学习率衰减系数 72 | "dropout_rate": 0.1, # 随机失活概率 73 | "adam_eps": 1e-8, # adam优化器超参 74 | "seed": 2021, # 随机种子 75 | 76 | "stop_epochs": 4, # 连续N轮无增长早停轮次 77 | "evaluate_steps": 320, # 评估步数 78 | "save_steps": 320, # 存储步数 79 | "warmup_steps": -1, # 预热步数, -1为取 0.5 的epoch步数 80 | "ignore_index": 0, # 忽略的index 81 | "max_steps": -1, # 最大步数, -1表示取满epochs 82 | "is_soft_label": True, # 是否使用软标签, soft-label 83 | "is_train": True, # 是否训练, 另外一个人不是(而是预测) 84 | "is_cuda": True, # 是否使用gpu, 另外一个不是gpu(而是cpu) 85 | "is_adv": False, # 是否使用对抗训练(默认FGM) 86 | "is_dropout": True, # 最后几层输出是否使用随即失活 87 | "is_active": True, # 最后几层输出是否使用激活函数, 如FCLayer/SpanLayer层 88 | "active_type": "GELU", # 最后几层输出使用的激活函数, 可填写RELU/SIGMOID/TANH/MISH/SWISH/GELU 89 | 90 | "save_best_mertics_key": ["micro_avg", "f1-score"], # 模型存储的判别指标, index-1可选: [micro_avg, macro_avg, weighted_avg], 91 | # index-2可选: [precision, recall, f1-score] 92 | "multi_label_threshold": 0.5, # 多标签分类时候生效, 大于该阈值则认为预测对的 93 | "grid_pointer_threshold": 0, # 网格(全局)指针网络阈值, 大于该阈值则认为预测对的 94 | "xy_keys_predict": ["text", "label"], # 读取数据的格式, predict预测的时候用 95 | # "xy_keys": ["text", "label"], # SPAN格式的数据, text, label在file中对应的keys 96 | "xy_keys": [0, 1], # CONLL格式的数据, text, label在file中对应的keys, colln时候选择[0,1]等integer 97 | "label_sep": "|myz|", # "|myz|" 多标签数据分割符, 用于多标签分类语料中 98 | "sl_ctype": "BIO", # 数据格式sl-type, BIO, BMES, BIOES, 只在"corpus_type": "MYX", "task_type": "SL-CRL"或"SL-SOFTMAX"时候生效 99 | "head_size": 64, # task_type=="SL-GRID"用 100 | 101 | # 是否对抗学习 102 | "adv_emb_name": "word_embeddings.", # emb_name这个参数要换成你模型中embedding的参数名, model.embeddings.word_embeddings.weight 103 | "adv_eps": 1.0, # 梯度权重epsilon 104 | 105 | "ADDITIONAL_SPECIAL_TOKENS": ["", ""], # 新增特殊字符 106 | "prior": None, # 类别先验分布, 自动设置, 为一个label_num类别数个元素的list, json无法保存np.array 107 | "l2i_conll": None, 108 | "l2i": None, 109 | "i2l":None, 110 | } 111 | 112 | -------------------------------------------------------------------------------- /pytorch_nlu/pytorch_textsummary/tsConfig.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2020/11/19 21:48 4 | # @author : Mo 5 | # @function: config of transformers and graph-model 6 | 7 | 8 | _TS_MODEL_BERTSUM = "TS_MODEL_BERTSUM" 9 | 10 | 11 | # cuda设置 12 | import platform 13 | if platform.system().lower() == "windows": 14 | 
CUDA_VISIBLE_DEVICES = "0" 15 | else: 16 | CUDA_VISIBLE_DEVICES = "0" 17 | 18 | 19 | # model算法超参数 20 | model_config = { 21 | "path_finetune": "", 22 | "CUDA_VISIBLE_DEVICES": CUDA_VISIBLE_DEVICES, # 环境, GPU-CPU, "-1"/"0"/"1"/"2"... 23 | "USE_TORCH": "1", # transformers使用torch, 因为脚本是torch写的 24 | "output_hidden_states": None, # [6,11] # 输出层, 即取第几层transformer的隐藏输出, list 25 | "pretrained_model_name_or_path": "", # 预训练模型地址 26 | "model_save_path": "save_path", # 训练模型保存-训练完毕模型目录 27 | "config_name": "tc.config", # 训练模型保存-超参数文件名 28 | "model_name": "tc.model", # 训练模型保存-全量模型 29 | "path_train": None, # 验证语料地址, 必传, string 30 | "path_dev": None, # 验证语料地址, 必传, 可为None 31 | "path_tet": None, # 验证语料地址, 必传, 可为None 32 | 33 | "tokenizer_type": "BASE", # tokenizer解析的类型, 默认transformers自带的, 可设"CHAR"(单个字符的, 不使用bpe等词根的) 34 | "task_type": _TS_MODEL_BERTSUM, # 任务类型, 依据数据类型自动更新, "TS_MODEL_BERTSUM", TS为text-summary的缩写 35 | "model_type": "BERT", # 预训练模型类型, 如bert, roberta, ernie 36 | "loss_type": "BCE", # "BCE", # 损失函数类型, 37 | # multi-class: 可选 None(BCE), BCE, BCE_LOGITS, MSE, FOCAL_LOSS, DICE_LOSS, LABEL_SMOOTH, MIX; 38 | # multi-label: SOFT_MARGIN_LOSS, PRIOR_MARGIN_LOSS, FOCAL_LOSS, CIRCLE_LOSS, DICE_LOSS, MIX等 39 | 40 | "batch_size": 32, # 批尺寸 41 | "num_labels": 0, # 类别数, 自动更新 42 | "max_len": 0, # 最大文本长度, -1则为自动获取覆盖0.95数据的文本长度, 0为取得最大文本长度作为maxlen 43 | "epochs": 21, # 训练轮次 44 | "lr": 1e-5, # 学习率 45 | 46 | "grad_accum_steps": 1, # 梯度积累多少步 47 | "max_grad_norm": 1.0, # 最大标准化梯度 48 | "weight_decay": 0.99, # lr衰减 49 | "dropout_rate": 0.1, # 随即失活概率 50 | "adam_eps": 1e-8, # adam优化器超参 51 | "seed": 2021, # 随机种子, 3407, 2021 52 | 53 | "stop_epochs": 4, # 早停轮次 54 | "evaluate_steps": 320, # 评估步数 55 | "save_steps": 320, # 存储步数 56 | "warmup_steps": -1, # 预热步数 57 | "ignore_index": 0, # 忽略的index 58 | "max_steps": -1, # 最大步数, -1表示取满epochs 59 | "is_train": True, # 是否训练, 另外一个人不是(而是预测) 60 | "is_cuda": True, # 是否使用gpu, 另外一个不是gpu(而是cpu) 61 | "is_adv": False, # 是否使用对抗训练(默认FGM) 62 | "is_dropout": True, # 最后几层输出是否使用随即失活 63 | "is_active": True, # 最后几层输出是否使用激活函数, 如FCLayer/SpanLayer层 64 | "active_type": "RELU", # 最后几层输出使用的激活函数, 可填写RELU/SIGMOID/TANH/MISH/SWISH/GELU 65 | "is_fc_sigmoid": False, # 最后一层是否使用sigmoid(训练时灵活配置, 存储模型时加上方便推理[如->onnx->tf-serving的时候]) 66 | "is_fc_softmax": False, # 最后一层是否使用softmax(训练时灵活配置, 存储模型时加上方便推理[如->onnx->tf-serving的时候]) 67 | 68 | "save_best_mertics_key": ["micro_avg", "f1-score"], # ["macro avg", "f1-score"], # 模型存储的判别指标, index-1可选: [micro_avg, macro_avg, weighted_avg], 69 | # index-2可选: [precision, recall, f1-score] 70 | "multi_label_threshold": 0.5, # 多标签分类时候生效, 大于该阈值则认为预测对的 71 | "xy_keys": ["text", "label"], # text,label在file中对应的keys 72 | "label_sep": "|myz|", # "|myz|" 多标签数据分割符, 用于多标签分类语料中 73 | "len_rate": 1, # 训练数据和验证数据占比, float, 0-1闭区间 74 | "adv_emb_name": "word_embeddings.", # emb_name这个参数要换成你模型中embedding的参数名, model.embeddings.word_embeddings.weight 75 | "adv_eps": 1.0, # 梯度权重epsilon 76 | 77 | "ADDITIONAL_SPECIAL_TOKENS": ["[macropodus]", "[macadam]"], # 新增特殊字符 78 | "len_corpus": None, # 训练样本数, 自动更新 79 | "prior_count": None, # 各个类别频次, 自动更新 80 | "prior": None, # 类别先验分布, 自动更新, 为一个label_num类别数个元素的list, json无法保存np.array 81 | "l2i": None, 82 | "i2l": None, 83 | } 84 | 85 | 86 | import os 87 | # os.environ["CUDA_VISIBLE_DEVICES"] = model_config.get("CUDA_VISIBLE_DEVICES", "2") 88 | os.environ["USE_TORCH"] = model_config.get("USE_TORCH", "1") 89 | from transformers import BertTokenizer, RobertaTokenizer, AlbertTokenizer, XLNetTokenizer, ElectraTokenizer, XLMTokenizer, AutoTokenizer 90 | from transformers import 
BertConfig, RobertaConfig, AlbertConfig, XLNetConfig, ElectraConfig, XLMConfig, AutoConfig 91 | from transformers import BertModel, RobertaModel, AlbertModel, XLNetModel, ElectraModel, XLMModel, AutoModel 92 | # from transformers import LongformerTokenizer, LongformerConfig, LongformerModel 93 | from transformers import GPT2Tokenizer, GPT2Config, GPT2Model 94 | from transformers import T5Tokenizer, T5Config, T5Model 95 | 96 | 97 | # transformers类等 98 | PRETRAINED_MODEL_CLASSES = { 99 | # "LONGFORMER": (LongformerConfig, LongformerTokenizer, LongformerModel), 100 | "ELECTRA": (ElectraConfig, ElectraTokenizer, ElectraModel), 101 | "ROBERTA": (AutoConfig, AutoTokenizer, AutoModel), # (RobertaConfig, RobertaTokenizer, RobertaModel), # 102 | "ALBERT": (AlbertConfig, AlbertTokenizer, AlbertModel), 103 | "MACBERT": (AutoConfig, BertTokenizer, BertModel), 104 | "XLNET": (XLNetConfig, XLNetTokenizer, XLNetModel), 105 | "ERNIE": (BertConfig, BertTokenizer, BertModel), 106 | "NEZHA": (BertConfig, BertTokenizer, BertModel), 107 | "BERT": (BertConfig, BertTokenizer, BertModel), 108 | "GPT2": (GPT2Config, GPT2Tokenizer, GPT2Model), 109 | "AUTO": (AutoConfig, AutoTokenizer, AutoModel), 110 | "XLM": (XLMConfig, XLMTokenizer, XLMModel), 111 | "T5": (T5Config, T5Tokenizer, T5Model) 112 | } 113 | 114 | -------------------------------------------------------------------------------- /pytorch_nlu/pytorch_textclassification/tcConfig.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2020/11/19 21:48 4 | # @author : Mo 5 | # @function: config of transformers and graph-model 6 | 7 | 8 | _TC_MULTI_CLASS = "TC-MULTI-CLASS" 9 | _TC_MULTI_LABEL = "TC-MULTI-LABEL" 10 | 11 | 12 | # model算法超参数 13 | model_config = { 14 | "path_finetune": "", 15 | "CUDA_VISIBLE_DEVICES": "0", # 环境, GPU-CPU, "-1"/"0"/"1"/"2"... 
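# 说明: CUDA_VISIBLE_DEVICES 须在 import torch/transformers 之前写入 os.environ 才会生效(CUDA的通用约定); 本文件末尾对应的 os.environ["CUDA_VISIBLE_DEVICES"] 设置当前处于注释状态; 多卡可填 "0,1", "-1" 表示屏蔽全部GPU、仅用CPU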
16 | "USE_TORCH": "1", # transformers使用torch, 因为脚本是torch写的 17 | "output_hidden_states": None, # [6,11] # 输出层, 即取第几层transformer的隐藏输出, list 18 | "pretrained_model_name_or_path": "", # 预训练模型地址 19 | "model_save_path": "save_path", # 训练模型保存-训练完毕模型目录 20 | "config_name": "tc.config", # 训练模型保存-超参数文件名 21 | "model_name": "tc.model", # 训练模型保存-全量模型 22 | "path_train": None, # 验证语料地址, 必传, string 23 | "path_dev": None, # 验证语料地址, 必传, 可为None 24 | "path_tet": None, # 验证语料地址, 必传, 可为None 25 | 26 | "tokenizer_type": "BASE", # tokenizer解析的类型, 默认transformers自带的, 可设"CHAR"(单个字符的, 不使用bpe等词根的) 27 | "task_type": "TC-MULTI-CLASS", # 任务类型, 依据数据类型自动更新, "TC-MULTI-CLASS", "TC-MULTI-LABEL", TC为text-classification的缩写 28 | "model_type": "BERT", # 预训练模型类型, 如bert, roberta, ernie 29 | "loss_type": "BCE", # "BCE", # 损失函数类型, 30 | # multi-class: 可选 None(BCE), BCE, BCE_LOGITS, MSE, FOCAL_LOSS, DICE_LOSS, LABEL_SMOOTH, MIX; 31 | # multi-label: SOFT_MARGIN_LOSS, PRIOR_MARGIN_LOSS, FOCAL_LOSS, CIRCLE_LOSS, DICE_LOSS, MIX等 32 | "batch_size": 32, # 批尺寸 33 | "num_labels": 0, # 类别数, 自动更新 34 | "max_len": 0, # 最大文本长度(不超过512), -1则为自动获取覆盖0.95数据的文本长度, 0为取得最大文本长度作为maxlen 35 | "epochs": 21, # 训练轮次 36 | "lr": 1e-5, # 学习率 37 | 38 | "grad_accum_steps": 1, # 梯度积累多少步 39 | "max_grad_norm": 1.0, # 最大标准化梯度 40 | "weight_decay": 5e-4, # 模型参数l2权重 41 | "dropout_rate": 0.1, # 随即失活概率 42 | "adam_eps": 1e-8, # adam优化器超参 43 | "seed": 2022, # 随机种子, 3407, 2021 44 | 45 | "stop_epochs": 4, # 早停轮次 46 | "evaluate_steps": 320, # 评估步数 47 | "save_steps": 320, # 存储步数 48 | "warmup_steps": -1, # 预热步数, -1表示取一个epoch的1/2, 其他可设具体步数 49 | "ignore_index": 0, # 忽略的index 50 | "max_steps": -1, # 最大步数, -1表示取满epochs 51 | "is_train": True, # 是否训练, 另外一个人不是(而是预测) 52 | "is_cuda": True, # 是否使用gpu, 另外一个不是gpu(而是cpu) 53 | "is_adv": False, # 是否使用对抗训练(默认FGM) 54 | "is_dropout": True, # 最后几层输出是否使用随即失活 55 | "is_active": True, # 最后几层输出是否使用激活函数, 如FCLayer/SpanLayer层 56 | "active_type": "RELU", # 最后几层输出使用的激活函数, 可填写RELU/SIGMOID/TANH/MISH/SWISH/GELU 57 | "is_fc_sigmoid": False, # 最后一层是否使用sigmoid(训练时灵活配置, 存储模型时加上方便推理[如->onnx->tf-serving的时候]) 58 | "is_fc_softmax": False, # 最后一层是否使用softmax(训练时灵活配置, 存储模型时加上方便推理[如->onnx->tf-serving的时候]) 59 | 60 | "save_best_mertics_key": ["micro_avg", "f1-score"], # 模型存储的判别指标, index-1可选: [micro_avg, macro_avg, weighted_avg], 61 | # index-2可选: [precision, recall, f1-score] 62 | "multi_label_threshold": 0.5, # 多标签分类时候生效, 大于该阈值则认为预测对的 63 | "xy_keys": ["text", "label"], # text,label在file中对应的keys 64 | "label_sep": "|myz|", # "|myz|" 多标签数据分割符, 用于多标签分类语料中 65 | "len_rate": 1, # 训练数据和验证数据占比, float, 0-1闭区间 66 | "adv_emb_name": "word_embeddings.", # emb_name这个参数要换成你模型中embedding的参数名, model.embeddings.word_embeddings.weight 67 | "adv_eps": 1.0, # 梯度权重epsilon 68 | 69 | "ADDITIONAL_SPECIAL_TOKENS": ["[macropodus]", "[macadam]"], # 新增特殊字符 70 | "len_corpus": None, # 训练语料长度 71 | "prior_count": None, # 每个类别样本频次 72 | "prior": None, # 类别先验分布, 自动设置, 为一个label_num类别数个元素的list, json无法保存np.array 73 | "l2i": None, 74 | "i2l": None, 75 | } 76 | 77 | 78 | import os 79 | # os.environ["CUDA_VISIBLE_DEVICES"] = model_config.get("CUDA_VISIBLE_DEVICES", "2") 80 | os.environ["USE_TORCH"] = model_config.get("USE_TORCH", "1") 81 | from transformers import BertTokenizer, RobertaTokenizer, AlbertTokenizer, XLNetTokenizer, ElectraTokenizer, XLMTokenizer, AutoTokenizer 82 | from transformers import BertConfig, RobertaConfig, AlbertConfig, XLNetConfig, ElectraConfig, XLMConfig, AutoConfig 83 | from transformers import BertModel, RobertaModel, AlbertModel, XLNetModel, ElectraModel, XLMModel, AutoModel 84 | # from 
transformers import LongformerTokenizer, LongformerConfig, LongformerModel 85 | from transformers import DebertaTokenizer, DebertaConfig, DebertaModel 86 | from transformers import GPT2Tokenizer, GPT2Config, GPT2Model 87 | from transformers import T5Tokenizer, T5Config, T5Model 88 | 89 | 90 | # transformers类等 91 | PRETRAINED_MODEL_CLASSES = { 92 | # "LONGFORMER": (LongformerConfig, LongformerTokenizer, LongformerModel), 93 | "ELECTRA": (ElectraConfig, ElectraTokenizer, ElectraModel), 94 | "DEBERTA": (DebertaConfig, DebertaTokenizer, DebertaModel), 95 | "ROBERTA": (AutoConfig, AutoTokenizer, AutoModel), # (RobertaConfig, RobertaTokenizer, RobertaModel), # 96 | "ALBERT": (AlbertConfig, AlbertTokenizer, AlbertModel), 97 | "MACBERT": (AutoConfig, BertTokenizer, BertModel), 98 | "XLNET": (XLNetConfig, XLNetTokenizer, XLNetModel), 99 | "ERNIE": (BertConfig, BertTokenizer, BertModel), 100 | "NEZHA": (BertConfig, BertTokenizer, BertModel), 101 | "BERT": (BertConfig, BertTokenizer, BertModel), 102 | "GPT2": (GPT2Config, GPT2Tokenizer, GPT2Model), 103 | "AUTO": (AutoConfig, AutoTokenizer, AutoModel), 104 | "XLM": (XLMConfig, XLMTokenizer, XLMModel), 105 | "T5": (T5Config, T5Tokenizer, T5Model), 106 | } 107 | 108 | -------------------------------------------------------------------------------- /pytorch_nlu/pytorch_textregression/trGraph.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2020/11/17 21:35 4 | # @author : Mo 5 | # @function: graph of pre-train model 6 | 7 | 8 | from trLayer import PriorMultiLabelSoftMarginLoss, MultiLabelCircleLoss, LabelSmoothingCrossEntropy 9 | from trLayer import FCLayer, FocalLoss, DiceLoss 10 | from trConfig import PRETRAINED_MODEL_CLASSES 11 | # torch 12 | from transformers import BertPreTrainedModel 13 | import torch 14 | 15 | 16 | class TRGraph(BertPreTrainedModel): 17 | def __init__(self, graph_config, tokenizer): 18 | """ 19 | Pytorch Graph of TextRegression, Pre-Trained Model based 20 | config: 21 | config: json, params of graph, eg. 
{"num_labels":17, "model_type":"BERT"} 22 | Returns: 23 | output: Tuple, Tensor of logits and loss 24 | Url: https://github.com/yongzhuo 25 | """ 26 | # 预训练语言模型读取 27 | self.graph_config = graph_config 28 | pretrained_config, pretrained_tokenizer, pretrained_model = PRETRAINED_MODEL_CLASSES[graph_config.model_type] 29 | self.pretrained_config = pretrained_config.from_pretrained(graph_config.pretrained_model_name_or_path, output_hidden_states=graph_config.output_hidden_states) 30 | self.pretrained_config.update({"gradient_checkpointing": True}) 31 | # self.pretrained_config.update({"gradient_checkpointing": True, "max_position_embeddings": graph_config.max_len}) 32 | # self.tokenizer = pretrained_tokenizer.from_pretrained(graph_config.pretrained_model_name_or_path) 33 | # self.tokenizer = tokenizer 34 | super(TRGraph, self).__init__(self.pretrained_config) 35 | if self.graph_config.is_train: 36 | self.pretrain_model = pretrained_model.from_pretrained(graph_config.pretrained_model_name_or_path, config=self.pretrained_config) 37 | self.pretrain_model.resize_token_embeddings(len(tokenizer)) 38 | else: 39 | self.pretrain_model = pretrained_model(self.pretrained_config) 40 | self.pretrain_model.resize_token_embeddings(len(tokenizer)) 41 | # # tokenizer.model_max_length = self.model.config.max_position_embeddings 42 | # 如果用隐藏层输出 43 | if self.graph_config.output_hidden_states: 44 | # self.dense = FCLayer(int(self.pretrained_config.hidden_size*len(self.graph_config.output_hidden_states)*3), self.graph_config.num_labels, 45 | # is_dropout=self.graph_config.is_dropout, is_active=self.graph_config.is_active, active_type=self.graph_config.active_type) 46 | self.dense = FCLayer( 47 | int(self.pretrained_config.hidden_size * len(self.graph_config.output_hidden_states)), 48 | self.graph_config.num_labels, 49 | is_dropout=self.graph_config.is_dropout, is_active=self.graph_config.is_active, 50 | active_type=self.graph_config.active_type) 51 | else: 52 | self.dense = FCLayer(self.pretrained_config.hidden_size, self.graph_config.num_labels, is_dropout=self.graph_config.is_dropout, 53 | is_active=self.graph_config.is_active, active_type=self.graph_config.active_type) 54 | # # 池化层 55 | # self.global_maxpooling = torch.nn.AdaptiveMaxPool1d(1) 56 | # self.global_avgpooling = torch.nn.AdaptiveAvgPool1d(1) 57 | # 损失函数, loss 58 | self.loss_type = self.graph_config.loss_type if self.graph_config.loss_type else "MSE" 59 | # self.task_type = self.graph_config.task_type if self.graph_config.task_type else "BCE" 60 | self.loss_ce = torch.nn.CrossEntropyLoss(ignore_index=0) 61 | self.loss_mae_smooth = torch.nn.SmoothL1Loss() 62 | self.loss_mse = torch.nn.MSELoss() 63 | self.loss_mae = torch.nn.L1Loss() 64 | # 激活层/随即失活层 65 | self.softmax = torch.nn.Softmax(dim=-1) 66 | self.sigmoid = torch.nn.Sigmoid() 67 | self.dropout = torch.nn.Dropout 68 | 69 | def forward(self, input_ids, attention_mask, token_type_ids, labels=None): 70 | output = self.pretrain_model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids) 71 | if self.graph_config.output_hidden_states: 72 | x = output[2] 73 | hidden_states_idx = [i for i in range(len(x))] 74 | # ### pool, [max-pool, avg-pool, cls] 75 | # x_cat = torch.cat([x[i] for i in self.graph_config.output_hidden_states if i in hidden_states_idx], dim=-1) 76 | # x_max = self.global_maxpooling(x_cat.permute(0, 2, 1)).squeeze(dim=-1) 77 | # x_avg = self.global_avgpooling(x_cat.permute(0, 2, 1)).squeeze(dim=-1) 78 | # x_cls = x_cat[:, 0, :] 79 | # x_merge = torch.cat([x_max, x_avg, 
x_cls], dim=-1) 80 | # cls = self.dropout(p=self.graph_config.dropout_rate)(x_merge) 81 | ### cls 82 | cls = torch.cat([x[i][:, 0, :] for i in self.graph_config.output_hidden_states if i in hidden_states_idx], dim=-1) 83 | else: # CLS 84 | cls = output[0][:, 0, :] # cls 85 | logits = self.dense(cls) # full-connect: FCLayer 86 | if labels is not None: # loss 87 | # L1 Loss、L2 Loss、Smooth L1 Loss 88 | if self.loss_type.upper() == "MAE_SMOOTH": # L1平滑损失 89 | loss = self.loss_mae_smooth(logits.view(-1), labels.view(-1)) 90 | elif self.loss_type.upper() == "MAE": # L1损失 91 | loss = self.loss_mae(logits.view(-1), labels.view(-1)) 92 | else: # L2损失 93 | loss = self.loss_mse(logits.view(-1), labels.view(-1)) 94 | return loss, logits 95 | else: 96 | return logits 97 | 98 | -------------------------------------------------------------------------------- /pytorch_nlu/corpus/text_regression/negative_sentence/train.json: -------------------------------------------------------------------------------- 1 | {"text": "安静点", "label": [1,0]} 2 | {"text": "幼儿园的小朋友一个也不不喜欢王老师。", "label": [0,0]} 3 | {"text": "对于这件事,咱们什么也别说。", "label": [0,0]} 4 | {"text": "官方:崔万军不再担任广州男篮主教练", "label": [0,0]} 5 | {"text": "当你听到这段英雄事迹的时候,难道你不觉得我们的战士是最可爱的吗?", "label": [1,0]} 6 | {"text": "她没有在唱歌。", "label": [0,0]} 7 | {"text": "希望可以给大家学习带来帮助。", "label": [1,0]} 8 | {"text": "他不做家庭作业。", "label": [0,0]} 9 | {"text": "由俭入奢易,由奢入俭难。", "label": [1,0]} 10 | {"text": "我没有汽车。", "label": [0,0]} 11 | {"text": "因人废言", "label": [1,0]} 12 | {"text": "心似双丝网,中有千千结。", "label": [1,0]} 13 | {"text": "我曾经是一名学生。", "label": [1,0]} 14 | {"text": "图书馆除了星期日,每天都开放。", "label": [0,0]} 15 | {"text": "这树上有许多鸟巢", "label": [1,0]} 16 | {"text": "如果我们选择居住", "label": [1,0]} 17 | {"text": "今天是晴天。", "label": [1,0]} 18 | {"text": "是一只蜗牛,跑得不快。", "label": [0,0]} 19 | {"text": "却克伤于矢,流血及屦,未绝鼓音", "label": [0,0]} 20 | {"text": "罔敷求先王。", "label": [0,0]} 21 | {"text": "无可辩驳", "label": [1,0]} 22 | {"text": "这应该不难的", "label": [0,0]} 23 | {"text": "他不是受贿的那种人。", "label": [0,0]} 24 | {"text": "班主任难道不应该以身作则吗?", "label": [1,0]} 25 | {"text": "是的一点不爱了。", "label": [0,0]} 26 | {"text": "颂古非今", "label": [1,0]} 27 | {"text": "这么笨。", "label": [1,0]} 28 | {"text": "不懂", "label": [0,0]} 29 | {"text": "她不是在跳舞,而是在锻炼。", "label": [0,0]} 30 | {"text": "随意砍伐树木,破坏绿化", "label": [1,0]} 31 | {"text": "这些书中有几本我不喜欢。", "label": [0,0]} 32 | {"text": "小米并不是很友好。", "label": [0,0]} 33 | {"text": "死亡也无法叫我开口。", "label": [0,0]} 34 | {"text": "Help!救命!", "label": [1,0]} 35 | {"text": "他不开汽车。", "label": [0,0]} 36 | {"text": "后来的我们,过的不怎么样", "label": [0,0]} 37 | {"text": "做完了值日工作,我抓紧时间早读。", "label": [1,0]} 38 | {"text": "天亮前不能不赶到目的地.", "label": [1,0]} 39 | {"text": "小草怀着无比崇敬的心情仰望着慈眉善目的太阳。", "label": [1,0]} 40 | {"text": "天气真好!", "label": [1,0]} 41 | {"text": "他不会蠢到投资那项事业的地步。", "label": [0,0]} 42 | {"text": "这就是他迟到的原因。", "label": [1,0]} 43 | {"text": "他说:“哦,我是要办大事业的。”", "label": [1,0]} 44 | {"text": "约翰想要一些杂志。", "label": [1,0]} 45 | {"text": "张民对学习很不认真。", "label": [0,0]} 46 | {"text": "那是绝对不行的。", "label": [0,0]} 47 | {"text": "汤姆在学习汉语。", "label": [1,0]} 48 | {"text": "没人知道如何治疗这种可怕的疾病。", "label": [0,0]} 49 | {"text": "全体起立!", "label": [1,0]} 50 | {"text": "要做一个好孩子!", "label": [1,0]} 51 | {"text": "我不是程序员。", "label": [0,0]} 52 | {"text": "用臣之计,毋战而略地,不攻而下城,传檄而千里定", "label": [0,0]} 53 | {"text": "他今天不能不来。", "label": [1,0]} 54 | {"text": "他没有兄弟姐妹。", "label": [0,0]} 55 | {"text": "这两本书都不是英国出版的。", "label": [0,0]} 56 | {"text": "君了食无求饱,居无求安", "label": [0,0]} 57 | {"text": "这是你的钢笔", "label": [1,0]} 58 | {"text": "那我就不能多讲了。", 
"label": [0,0]} 59 | {"text": "我现在没做什么事情。", "label": [0,0]} 60 | {"text": "这里有瓶牛奶。", "label": [1,0]} 61 | {"text": "我并不完全同意。", "label": [0,0]} 62 | {"text": "睡觉前把门锁上。", "label": [1,0]} 63 | {"text": "拿着梨,心里老觉得丢了什么,对了,是诚实。", "label": [1,0]} 64 | {"text": "这真是伟大的奇观啊!", "label": [1,0]} 65 | {"text": "艾玛不是歌手", "label": [0,0]} 66 | {"text": "他不是笨孩子", "label": [0,0]} 67 | {"text": "你们要保护环境。", "label": [1,0]} 68 | {"text": "杭州的西湖像一颗光彩夺目的明珠。", "label": [1,0]} 69 | {"text": "以不济可", "label": [1,0]} 70 | {"text": "我不喝可乐。", "label": [0,0]} 71 | {"text": "是何异于刺人而杀之,曰:“非我也,兵也。”", "label": [0,0]} 72 | {"text": "没一个认识李白", "label": [0,0]} 73 | {"text": "没有哪个人不喜欢闪闪发光的东西", "label": [1,0]} 74 | {"text": "我没有订过这葡萄酒。", "label": [0,0]} 75 | {"text": "狗熊身上脏得发臭,房间乱七八糟。", "label": [1,0]} 76 | {"text": "毫无疑问,我能在两个小时内完成这项任务。", "label": [1,0]} 77 | {"text": "我没有任何外国朋友。", "label": [0,0]} 78 | {"text": "孩子们都喜欢他", "label": [1,0]} 79 | {"text": "束手无策,等待死神的到来。", "label": [1,0]} 80 | {"text": "这不是我喜欢的,我做这件事纯粹是出于责任感。", "label": [0,0]} 81 | {"text": "冲天香阵透长安,满城尽带黄金甲。", "label": [1,0]} 82 | {"text": "我几乎未曾离开过北京。", "label": [0,0]} 83 | {"text": "难道这里不热吗!", "label": [1,0]} 84 | {"text": "桌子在门后面。", "label": [1,0]} 85 | {"text": "这样做。", "label": [1,0]} 86 | {"text": "他不能回家", "label": [0,0]} 87 | {"text": "春宵一刻值千金,花有清香月有阴。", "label": [1,0]} 88 | {"text": "她不相信它变成了一个好孩子。", "label": [0,0]} 89 | {"text": "我没有一次不是见到你就想起我的兄弟来。", "label": [0,0]} 90 | {"text": "这件事让人高兴。", "label": [1,0]} 91 | {"text": "汤姆也不上学。", "label": [0,0]} 92 | {"text": "兴安岭,是多么会打扮自己啊:青松作衫,白桦为裙,还穿着绣花鞋", "label": [1,0]} 93 | {"text": "激光不是一种普通的光.", "label": [0,0]} 94 | {"text": "噌吰者,周景王之无射也。", "label": [1,0]} 95 | {"text": "杰森是最不可能通过考试的人,因为他几乎整个学期都在外面打工。", "label": [0,0]} 96 | {"text": "我不是奴才,别使唤我", "label": [0,0]} 97 | {"text": "王老师带病工作", "label": [1,0]} 98 | {"text": "我认为迈克没有看这场电影。", "label": [0,0]} 99 | {"text": "他英语说的很好。", "label": [1,0]} 100 | {"text": "最不可能的", "label": [0,0]} 101 | {"text": "一概抹煞", "label": [1,0]} 102 | {"text": "黑熊躲进大树洞里。", "label": [1,0]} 103 | {"text": "好多年过去了,谁也没看见狗熊办成了什么大事业。", "label": [1,0]} 104 | {"text": "警察叔叔持续作战,终于将犯罪分子一网打尽。", "label": [1,0]} 105 | {"text": "为人类造福。", "label": [1,0]} 106 | {"text": "今天必须去上学", "label": [1,0]} 107 | {"text": "我不是女孩。", "label": [0,0]} 108 | {"text": "11.28万起的国货,有猛禽外观却全系不配ESP,买它真需要勇气", "label": [0,0]} 109 | {"text": "我们不能过高地估计现代科学的价值。", "label": [0,0]} 110 | {"text": "黄山的松树大都生在石上", "label": [1,0]} 111 | {"text": "这是一部多有趣的电影啊!", "label": [1,0]} 112 | {"text": "难道能否认《红楼梦》不是一部很好的古典小说吗?", "label": [0,0]} 113 | {"text": "他画得多好啊,画了小鸟、孔雀和天鹅。", "label": [1,0]} 114 | {"text": "付不起", "label": [0,0]} 115 | {"text": "简直没什么可说的了。", "label": [0,0]} 116 | {"text": "世界很美好.", "label": [1,0]} 117 | {"text": "吉姆不做作业。", "label": [0,0]} 118 | {"text": "不是多重否定句:", "label": [0,0]} 119 | {"text": "那是绝对不行的。", "label": [0,0]} 120 | {"text": "妈妈说,她今天不回家吃饭了。", "label": [1,0]} 121 | {"text": "他不敢告诉她实情。", "label": [0,0]} 122 | {"text": "这个问题不须清楚。", "label": [0,0]} 123 | {"text": "毋庸讳言", "label": [0,0]} 124 | {"text": "我不得不赞美他的勇气。", "label": [1,0]} 125 | {"text": "是书生脚短", "label": [1,0]} 126 | {"text": "我几乎不懂英语。我很少看见人。", "label": [0,0]} 127 | {"text": "我已经完成我的作业了。", "label": [1,0]} 128 | {"text": "我不知道这个。没有消息就是好消息。", "label": [0,0]} 129 | -------------------------------------------------------------------------------- /pytorch_nlu/pytorch_textsummary/README.md: -------------------------------------------------------------------------------- 1 | 2 | # [***pytorch-textsummary***](https://github.com/yongzhuo/Pytorch-NLU/pytorch_textsummary) 
3 | >>> pytorch-textsummary是一个以pytorch和transformers为基础,专注于中文文本摘要的轻量级自然语言处理工具,支持抽取式摘要等。 4 | 5 | 6 | ## 目录 7 | * [数据](#数据) 8 | * [使用方式](#使用方式) 9 | * [paper](#paper) 10 | * [参考](#参考) 11 | 12 | 13 | ## 项目地址 14 | - pytorch-textsummary: [https://github.com/yongzhuo/Pytorch-NLU/pytorch_textsummary](https://github.com/yongzhuo/Pytorch-NLU/pytorch_textsummary) 15 | 16 | 17 | # 数据 18 | ## 数据来源 19 | 免责声明:以下数据集由公开渠道收集而成, 只做汇总说明; 科学研究、商用请联系原作者; 如有侵权, 请及时联系删除。 20 | * [chinese_abstractive_corpus](https://github.com/wonderfulsuccess/chinese_abstractive_corpus), 教育培训行业抽象式自动摘要中文语料:语料库收集了教育培训行业主流垂直媒体的历史文章(截止到2018年6月5日)大约24500条数据集。主要是为训练抽象式模型而整理,每条数据有summary(摘要)和text(正文),两个字段,Summary字段均为作者标注。 21 | * [NLPCC2017-task3-Single Document Summarization](http://tcci.ccf.org.cn/conference/2017/taskdata.php), NLPCC 2017 task3 单文档摘要; 22 | * [A Large-Scale Chinese Long-text Extractive Summarization Corpus](http://icrc.hitsz.edu.cn/info/1037/1411.htm), 哈工大长文本摘要数据; 23 | * [LCSTS: A Large-Scale Chinese Short Text Summarization Dataset](http://icrc.hitsz.edu.cn/info/1037/1141.htm), 哈工大LCSTS短文本摘要数据; 24 | * 生成式文本摘要可以用一些带标题的文章来训练; 25 | 26 | ## 数据格式 27 | ``` 28 | 1. 文本摘要 (txt格式, 每行为一个json): 29 | 30 | 1.1 抽取式文本摘要格式: 31 | {"label": [0, 1, 0, 0, 1, 0, 0, 0, 0, 0], "text": ["针对现有法向量估值算法都只能适用于某一类特定形状模型的问题。", "提出三维点云模糊分类的法向量估值算法。", "利用模糊推理系统对模型的点云数据分类。", "根据点云在不同形状区域的分布情况和曲率变化给出模糊规则。", "将点云分成属于平滑形状区域、薄片形状区域和尖锐形状区域三类。", "每类点云对应给出特定的法向量估值算法。", "由于任意模型形状分布的差别。", "其点云数据经过模糊分类后调用相应的估值算法次数会有差别。", "因此采用牙齿模型点云数据验证了算法的可行性。", "经过与三种典型算法比较可以看出本算法估算准确、简单可行。"]} 32 | {"label": [0, 0, 1, 1, 0, 0], "text": ["医院物联网是物联网技术在医疗行业应用的集中体现。", "在简单介绍医院物联网基本概念的基础上。", "结合物联网机制和医院的实际特点。", "探讨了适用于医院物联网的体系结构。", "并分析了构建中的关键技术。", "包括医院物联网的标准建设、中间件技术及嵌入式电子病历的研究与设计等。"]} 33 | 34 | ``` 35 | 36 | 37 | # 使用方式 38 | 更多样例sample详情见test/ts 39 | - 1. 需要配置好预训练模型目录, 即变量 pretrained_model_dir、pretrained_model_name_or_path、idx等; 40 | - 2. 需要配置好自己的语料地址, 即字典 model_config["path_train"]、model_config["path_dev"] 41 | - 3. 
cd到该脚本目录下运行普通的命令行即可, 例如: python3 tet_ts_base_train.py , python3 tet_ts_base_pred.py 42 | ## 文本摘要(TS), Text-Summary 43 | ```bash 44 | # !/usr/bin/python 45 | # -*- coding: utf-8 -*- 46 | # @time : 2021/2/23 21:34 47 | # @author : Mo 48 | # @function: 文本摘要, text-summary 49 | 50 | 51 | # 适配linux 52 | import platform 53 | import json 54 | import sys 55 | import os 56 | path_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) 57 | path_sys = os.path.join(path_root, "pytorch_nlu", "pytorch_textsummary") 58 | sys.path.append(path_sys) 59 | print(path_root) 60 | print(path_sys) 61 | 62 | from tsTools import get_current_time 63 | from tsConfig import model_config 64 | from tsRun import TextSummary 65 | 66 | 67 | # 预训练模型地址, 本地win10默认只跑2步就评估保存模型 68 | if platform.system().lower() == 'windows': 69 | # pretrained_model_dir = "D:/pretrain_models/pytorch" 70 | pretrained_model_dir = "E:/DATA/bert-model/00_pytorch" 71 | evaluate_steps = 32 # 评估步数 72 | save_steps = 32 # 存储步数 73 | else: 74 | pretrained_model_dir = "/pretrain_models/pytorch" 75 | evaluate_steps = 320 # 评估步数 76 | save_steps = 320 # 存储步数 77 | ee = 0 78 | 79 | 80 | if __name__ == "__main__": 81 | 82 | # 训练-验证语料地址, 可以只输入训练地址 83 | path_corpus = os.path.join(path_root, "pytorch_nlu", "corpus", "text_summary", "maths_toy") 84 | path_train = os.path.join(path_corpus, "train.json") 85 | path_dev = os.path.join(path_corpus, "dev.json") 86 | 87 | model_config["evaluate_steps"] = evaluate_steps # 评估步数 88 | model_config["save_steps"] = save_steps # 存储步数 89 | model_config["path_train"] = path_train 90 | model_config["path_dev"] = path_dev 91 | model_config["lr"] = 1e-5 # 测试语料, 可为None 92 | model_config["max_len"] = 256 # 测试语料, 可为None 93 | model_config["batch_size"] = 32 # 测试语料, 可为None 94 | model_config["loss_type"] = "SOFT_MARGIN_LOSS" # 测试语料, 可为None 95 | model_config["is_dropout"] = True # 96 | model_config["is_adv"] = False # 测试语料, 可为None 97 | 98 | 99 | # 预训练模型适配的class 100 | model_type = ["BERT", "ERNIE", "BERT_WWM", "ALBERT", "ROBERTA", "XLNET", "ELECTRA"] 101 | pretrained_model_name_or_path = { 102 | "BERT_WWM": pretrained_model_dir + "/chinese_wwm_pytorch", 103 | "ROBERTA": pretrained_model_dir + "/chinese_roberta_wwm_ext_pytorch", 104 | "ALBERT": pretrained_model_dir + "/albert_base_v1", 105 | "XLNET": pretrained_model_dir + "/chinese_xlnet_mid_pytorch", 106 | # "ERNIE": pretrained_model_dir + "/ERNIE_stable-1.0.1-pytorch", 107 | "ERNIE": pretrained_model_dir + "/ernie-tiny", 108 | "BERT": pretrained_model_dir + "/bert-base-chinese", 109 | # "BERT": pretrained_model_dir + "/mengzi-bert-base/", 110 | } 111 | idx = 0 # 选择的预训练模型类型---model_type 112 | model_config["pretrained_model_name_or_path"] = pretrained_model_name_or_path[model_type[idx]] 113 | model_config["model_save_path"] = "../output/text_summary/model_{}".format(model_type[idx]) 114 | model_config["model_type"] = model_type[idx] 115 | # main 116 | lc = TextSummary(model_config) 117 | lc.process() 118 | lc.train() 119 | 120 | 121 | # shell 122 | # nohup python tcRun.py > tc.log 2>&1 & 123 | # tail -n 1000 -f tc.log 124 | # |myz| 125 | ``` 126 | 127 | 128 | # paper 129 | ## 文本摘要(TS), Text-Summary 130 | * BertSum: [Fine-tune BERT for Extractive Summarization](https://arxiv.org/pdf/1903.10318.pdf) 131 | 132 | 133 | # 参考 134 | This library is inspired by and references following frameworks and papers. 
135 | 136 | * GPT2-NewsTitle: [https://github.com/liucongg/GPT2-NewsTitle](https://github.com/liucongg/GPT2-NewsTitle) 137 | * BertSum: [https://github.com/nlpyang/BertSum](https://github.com/nlpyang/BertSum) 138 | 139 | 140 | # Reference 141 | To cite this work, you can refer to this GitHub project, for example with BibTeX: 142 | ``` 143 | @software{Pytorch-NLU, 144 | url = {https://github.com/yongzhuo/Pytorch-NLU}, 145 | author = {Yongzhuo Mo}, 146 | title = {Pytorch-NLU}, 147 | year = {2021} 148 | } 149 | ``` 150 | *希望对你有所帮助! 151 | -------------------------------------------------------------------------------- /pytorch_nlu/pytorch_textsummary/tsRun.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2021/2/23 21:34 4 | # @author : Mo 5 | # @function: main programming, "训练时候logger不需要考虑" 6 | 7 | 8 | # 适配linux 9 | from collections import Counter 10 | from argparse import Namespace 11 | import random 12 | import copy 13 | import sys 14 | import os 15 | path_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "")) 16 | sys.path.append(path_root) 17 | # os.environ["CUDA_VISIBLE_DEVICES"] = "-1" 18 | 19 | from tsConfig import _TS_MODEL_BERTSUM 20 | from tsTools import get_logger 21 | from tsOffice import Office 22 | from tsData import DataSet 23 | 24 | 25 | class TextSummary: 26 | def __init__(self, config): 27 | self.config = Namespace(**config) 28 | self.logger = get_logger(self.config.model_save_path) 29 | self.l2i, self.i2l = {}, {} 30 | 31 | def process(self): 32 | """ 数据预处理, process """ 33 | # 数据读取 34 | # 训练集/验证集划分 35 | if self.config.path_dev: 36 | self.train_corpus = DataSet(self.config, self.config.path_train, self.logger) 37 | self.dev_corpus = DataSet(self.config, self.config.path_dev, self.logger) 38 | else: # 没有验证集的时候, 默认划分 4:1; 注意先切出dev, 再截断train, 否则dev为空 39 | self.train_corpus = DataSet(self.config, self.config.path_train, self.logger) 40 | xs_train, ys_train = self.train_corpus.data_iter 41 | len_rate_8 = int(len(ys_train) * 0.8) 42 | xs_dev, ys_dev = xs_train[len_rate_8:], ys_train[len_rate_8:] 43 | xs_train, ys_train = xs_train[:len_rate_8], ys_train[:len_rate_8] 44 | self.train_corpus.data_iter = xs_train, ys_train 45 | self.dev_corpus = DataSet(self.config, None, self.logger) 46 | self.dev_corpus.data_iter = xs_dev, ys_dev 47 | self.tet_corpus = DataSet(self.config, self.config.path_tet, self.logger) 48 | self.logger.info("read_corpus_from_json ok!") 49 | # 参数更新 50 | self.config.len_corpus = self.train_corpus.len_corpus 51 | self.config.prior_count = self.train_corpus.prior_count 52 | self.config.prior = self.train_corpus.prior 53 | self.config.task_type = _TS_MODEL_BERTSUM 54 | 55 | def train(self, path_save=None): 56 | """ 初始化训练 """ 57 | # 创建模型目录与储存超参信息 58 | if not os.path.exists(self.config.model_save_path): 59 | os.makedirs(self.config.model_save_path, exist_ok=True) 60 | # 训练 61 | self.office = Office(tokenizer=self.train_corpus.tokenizer, 62 | train_corpus=self.train_corpus, 63 | dev_corpus=self.dev_corpus, 64 | tet_corpus=self.tet_corpus, 65 | config=self.config, 66 | logger=self.logger) 67 | # 加载训练好的模型 68 | if path_save and path_save.strip(): 69 | try: 70 | self.office.load_model_state(path_save) 71 | except Exception as e: 72 | self.logger.info(str(e)) 73 | self.office.load_model(path_save) 74 | # 训练 75 | self.office.train_model() 76 | 77 | def eval(self): 78 | """ 验证评估 """ 79 | try: 80 | self.office.load_model_state() 81 | except Exception as e: 82 | self.logger.info(str(e))
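            # state_dict加载失败时回退到加载完整序列化模型 (fallback: load the whole serialized model instead of the state_dict)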
83 | self.office.load_model() 84 | tet_results = self.office.evaluate("tet") 85 | return tet_results 86 | 87 | 88 | if __name__ == "__main__": 89 | # 适配linux 90 | import platform 91 | import sys 92 | import os 93 | path_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) 94 | sys.path.append(path_root) 95 | print(path_root) 96 | # 分类下的引入, pytorch_textclassification 97 | from tsConfig import model_config 98 | from tsTools import get_current_time 99 | 100 | # 预训练模型地址, 本地win10默认只跑2步就评估保存模型 101 | if platform.system().lower() == 'windows': 102 | pretrained_model_dir = "D:/DATA/bert-model/00_pytorch" 103 | evaluate_steps = 32 # 评估步数 104 | save_steps = 32 # 存储步数 105 | else: 106 | pretrained_model_dir = "/pretrain_models/pytorch" 107 | path_ernie = "/home/moyzh/pretrain_models/pytorch/ernie-tiny" 108 | evaluate_steps = 320 # 评估步数 109 | save_steps = 320 # 存储步数 110 | ee = 0 111 | # 训练-验证语料地址, 可以只输入训练地址 112 | path_corpus = path_root + "/corpus/text_summary/maths_toy" 113 | path_train = os.path.join(path_corpus, "train.json") 114 | path_dev = os.path.join(path_corpus, "dev.json") 115 | 116 | model_config["evaluate_steps"] = evaluate_steps # 评估步数 117 | model_config["save_steps"] = save_steps # 存储步数 118 | model_config["path_train"] = path_train 119 | model_config["path_dev"] = path_dev 120 | model_config["lr"] = 5e-5 # 测试语料, 可为None 121 | model_config["max_len"] = 256 # 测试语料, 可为None 122 | model_config["batch_size"] = 32 # 测试语料, 可为None 123 | model_config["loss_type"] = "SOFT_MARGIN_LOSS" # 测试语料, 可为None 124 | model_config["is_adv"] = False # 测试语料, 可为None 125 | 126 | # 损失函数类型, 127 | # multi-class: 可选 None(BCE), BCE, BCE_LOGITS, MSE, FOCAL_LOSS, DICE_LOSS, LABEL_SMOOTH 128 | # multi-label: SOFT_MARGIN_LOSS, PRIOR_MARGIN_LOSS, FOCAL_LOSS, CIRCLE_LOSS, DICE_LOSS等 129 | # model_config["loss_type"] = "SOFT_MARGIN_LOSS" 130 | # model_config["loss_type"] = "MIX" 131 | # model_config["loss_type"] = "SOFT_MARGIN_LOSS" 132 | 133 | # 预训练模型适配的class 134 | model_type = ["BERT", "ERNIE", "BERT_WWM", "ALBERT", "ROBERTA", "XLNET", "ELECTRA"] 135 | pretrained_model_name_or_path = { 136 | "BERT_WWM": "hfl/chinese-bert-wwm-ext", 137 | "ROBERTA": "hfl/chinese-roberta-wwm-ext", 138 | "ALBERT": "uer/albert-base-chinese-cluecorpussmall", 139 | "XLNET": "hfl/chinese-xlnet-mid", 140 | "ERNIE": "nghuyong/ernie-1.0-base-zh", 141 | # "ERNIE": "nghuyong/ernie-3.0-base-zh", 142 | "BERT": "bert-base-chinese", 143 | # "BERT": "hfl/chinese-macbert-base", 144 | 145 | } 146 | idx = 1 # 选择的预训练模型类型---model_type 147 | model_config["pretrained_model_name_or_path"] = pretrained_model_name_or_path[model_type[idx]] 148 | model_config["model_save_path"] = "../output/text_summary/model_{}".format(model_type[idx]) 149 | model_config["model_type"] = model_type[idx] 150 | 151 | model_config["ADDITIONAL_SPECIAL_TOKENS"] = ["+","-", "=", ":", ".", "(", ")", "≈", "%", 152 | "∥", "<", ">", "⊙", "≌", "。"] # 新增特殊字符 153 | # main 154 | lc = TextSummary(model_config) 155 | lc.process() 156 | lc.train() 157 | 158 | 159 | # shell 160 | # nohup python tsRun.py > tc_multi_class.log 2>&1 & 161 | # tail -n 1000 -f tc_multi_class.log 162 | # |myz| 163 | 164 | -------------------------------------------------------------------------------- /test/corpus/conll_to_pos.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2021/8/24 23:09 4 | # @author : Mo 5 | # @function: transform conll to span, 将CONLL格式的数据转化为SPAN格式{pos:[1,3]} 6 | 7 | 8 | import logging 9 | 10 
| 11 | def txt_write(lines, path: str, model: str = "w", encoding: str = "utf-8"): 12 | """ 13 | Write Line of list to file 14 | Args: 15 | lines: lines of list which need save 16 | path: path of save file, such as "txt" 17 | model: type of write, such as "w", "a+" 18 | encoding: type of encoding, such as "utf-8", "gbk" 19 | """ 20 | 21 | try: 22 | file = open(path, model, encoding=encoding) 23 | file.writelines(lines) 24 | file.close() 25 | except Exception as e: 26 | logging.info(str(e)) 27 | def save_json(lines, path, encoding: str = "utf-8", indent: int = 4): 28 | """ 29 | Write Line of List to file 30 | Args: 31 | lines: lines of list[str] which need save 32 | path: path of save file, such as "json.txt" 33 | encoding: type of encoding, such as "utf-8", "gbk" 34 | """ 35 | 36 | with open(path, "w", encoding=encoding) as fj: 37 | fj.write(json.dumps(lines, ensure_ascii=False, indent=indent)) 38 | fj.close() 39 | def get_pos_from_common(words0, tag1): 40 | """从common模型的输出中重构标注, 即获取未知信息---position 41 | common analysis for sequence-labeling 42 | Args: 43 | words0: String/List, origin text, eg. "沪是上海" 44 | tag1 : List, common-output of labels, eg. ["S-city", "O", "B-city", "I-city"] 45 | Returns: 46 | reault: List, eg. [{"type":"city", "ent":"沪", "pos":[2:4]}] 47 | """ 48 | res = [] 49 | ws = "" 50 | start_pos_1 = 0 51 | end_pos_1 = 0 52 | sentence = "" 53 | types = "" 54 | for i in range(len(tag1)): 55 | if tag1[i].startswith("S-"): 56 | ws += words0[i] 57 | start_pos_1 = i 58 | end_pos_1 = i 59 | sentence += words0[i] 60 | types = tag1[i][2:] 61 | res.append([ws, start_pos_1, end_pos_1, types]) 62 | ws = "" 63 | types = "" 64 | 65 | if tag1[i].startswith("B-"): 66 | if len(ws) > 0: 67 | res.append([ws, start_pos_1, end_pos_1, types]) 68 | ws = "" 69 | types = "" 70 | if len(ws) == 0: 71 | ws += words0[i] 72 | start_pos_1 = i 73 | end_pos_1 = i 74 | sentence += words0[i] 75 | types = tag1[i][2:] 76 | 77 | elif tag1[i].startswith("I-"): 78 | if len(ws) > 0 and types == tag1[i][2:]: 79 | ws += words0[i] 80 | sentence += words0[i] 81 | end_pos_1 = i 82 | 83 | elif len(ws) > 0 and types != tag1[i][2:]: 84 | res.append([ws, start_pos_1, end_pos_1, types]) 85 | ws = "" 86 | types = "" 87 | 88 | if len(ws) == 0: 89 | ws += words0[i] 90 | start_pos_1 = i 91 | end_pos_1 = i 92 | sentence += words0[i] 93 | types = tag1[i][2:] 94 | 95 | elif tag1[i].startswith("M-"): 96 | if len(ws) > 0 and types == tag1[i][2:]: 97 | ws += words0[i] 98 | sentence += words0[i] 99 | end_pos_1 = i 100 | 101 | elif len(ws) > 0 and types != tag1[i][2:]: 102 | res.append([ws, start_pos_1, end_pos_1, types]) 103 | ws = "" 104 | types = "" 105 | 106 | if len(ws) == 0: 107 | ws += words0[i] 108 | start_pos_1 = i 109 | end_pos_1 = i 110 | sentence += words0[i] 111 | types = tag1[i][2:] 112 | 113 | elif tag1[i].startswith('E-'): 114 | if len(ws) > 0 and types == tag1[i][2:]: 115 | ws += words0[i] 116 | sentence += words0[i] 117 | end_pos_1 = i 118 | res.append([ws, start_pos_1, end_pos_1, types]) 119 | ws = "" 120 | types = "" 121 | 122 | if len(ws) > 0 and types != tag1[i][2:]: 123 | res.append([ws, start_pos_1, end_pos_1, types]) 124 | ws = "" 125 | ws += words0[i] 126 | start_pos_1 = i 127 | end_pos_1 = i 128 | sentence += words0[i] 129 | types = tag1[i][2:] 130 | res.append([ws, start_pos_1, end_pos_1, types]) 131 | ws = "" 132 | types = "" 133 | 134 | elif tag1[i] == "O": 135 | 136 | if len(ws) > 0: 137 | res.append([ws, start_pos_1, end_pos_1, types]) 138 | ws = "" 139 | types = "" 140 | 141 | sentence += words0[i] 142 | 
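        # 遍历到最后一个token后, 补录仍缓存在ws中未输出的实体片段 (flush the entity span still buffered at sequence end)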
143 | if i == len(tag1) - 1 and len(ws) > 0: 144 | res.append([ws, start_pos_1, end_pos_1, types]) 145 | ws = "" 146 | types = "" 147 | reault = [] 148 | for r in res: 149 | entity_dict = {} 150 | entity_dict["type"] = r[3] 151 | entity_dict["ent"] = r[0] 152 | entity_dict["pos"] = [r[1], r[2]] 153 | reault.append(entity_dict) 154 | return reault 155 | def read_corpus(corpus_path): 156 | """读取CONLL数据 157 | read corpus for sequence-labeling 158 | Args: 159 | corpus_path: String, path/origin text, eg. "ner.conll" 160 | Returns: 161 | data: List, 162 | """ 163 | data = [] 164 | with open(corpus_path, encoding="utf-8") as fr: 165 | lines = fr.readlines() 166 | sent_, tag_ = [], [] 167 | for line in lines: 168 | if line != "\n": 169 | [char, label] = line.strip().split() 170 | sent_.append(char) 171 | tag_.append(label) 172 | else: 173 | data.append((sent_, tag_)) 174 | sent_, tag_ = [], [] 175 | return data 176 | 177 | 178 | if __name__ == '__main__': 179 | import json 180 | import sys 181 | import os 182 | path_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) 183 | sys.path.append(path_root) 184 | print(path_root) 185 | path = path_root + "/corpus/sequence_labeling/ner_china_people_daily_1998_conll/" 186 | for t in ["train", "dev", "test"]: 187 | t = t + ".conll" 188 | data = read_corpus(path + t) 189 | res = [] 190 | for d in data: 191 | label = get_pos_from_common(d[0], d[1]) 192 | line = {"label":label, "text":"".join(d[0])} 193 | res.append(json.dumps(line, ensure_ascii=False) + "\n") 194 | txt_write(res, path + t + ".span") 195 | 196 | ee = 0 197 | # transform conll to span, 将CONLL格式的数据转化为SPAN格式{pos:[1,3]} 198 | 199 | 200 | -------------------------------------------------------------------------------- /pytorch_nlu/pytorch_textregression/trRun.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2021/2/23 21:34 4 | # @author : Mo 5 | # @function: main programing, "训练时候logger不需要考虑" 6 | 7 | 8 | # 适配linux 9 | import sys 10 | import os 11 | import gc 12 | path_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "")) 13 | sys.path.append(path_root) 14 | from trTools import get_logger 15 | from trOffice import Office 16 | from trData import Corpus 17 | 18 | from collections import Counter 19 | from argparse import Namespace 20 | import random 21 | 22 | 23 | class TextRegression: 24 | def __init__(self, config): 25 | self.config = Namespace(**config) 26 | self.logger = get_logger(self.config.model_save_path) 27 | 28 | def process(self): 29 | """ 数据预处理, process """ 30 | # 数据读取 31 | self.corpus = Corpus(self.config, self.logger) 32 | # 训练集/验证集划分 33 | if self.config.path_dev: 34 | xs_dev, ys_dev = self.corpus.read_corpus_from_json(self.config.path_dev, keys=self.config.xy_keys, len_rate=self.config.len_rate) 35 | xs_train, ys_train = self.corpus.read_corpus_from_json(self.config.path_train, keys=self.config.xy_keys, len_rate=self.config.len_rate) 36 | else: # 没有验证集的时候, 默认划分 4:1 37 | xs_train, ys_train = self.corpus.read_corpus_from_json(self.config.path_train, keys=self.config.xy_keys, len_rate=self.config.len_rate) 38 | len_rate_8 = int(len(ys_train) * 0.8) 39 | xs_dev, ys_dev = xs_train[len_rate_8:], ys_train[len_rate_8:] 40 | xs_train, ys_train = xs_train[:len_rate_8], ys_train[:len_rate_8] 41 | self.logger.info("read_corpus_from_json ok!") 42 | # 参数更新 43 | self.config.len_corpus = len(ys_train) + len(ys_dev) 44 | self.config.num_labels = len(ys_train[0]) 45 | 
self.config.max_len = self.corpus.len_max 46 | # token 转 idx, 训练集/验证集 47 | random.shuffle(xs_train) # shuffle扰动 48 | # xs_train = xs_train[:int(len(xs_train)*0.18*2)] ### len_rate 49 | self.train_data = self.corpus.preprocess(xs_train, max_len=self.config.max_len) 50 | self.dev_data = self.corpus.preprocess(xs_dev, max_len=self.config.max_len) 51 | # 测试集 52 | xs_tet, ys_tet = self.corpus.read_corpus_from_json(self.config.path_tet, keys=self.config.xy_keys, len_rate=self.config.len_rate) if self.config.path_tet else ([], []) 53 | self.tet_data = self.corpus.preprocess(xs_tet, max_len=self.config.max_len) if self.config.path_tet else None 54 | self.logger.info("self.corpus.preprocess ok!") 55 | 56 | def train(self, path_save=None): 57 | """ 初始化训练 """ 58 | # 创建模型目录与储存超参信息 59 | if not os.path.exists(self.config.model_save_path): 60 | os.makedirs(self.config.model_save_path, exist_ok=True) 61 | # 训练 62 | self.office = Office(tokenizer=self.corpus.tokenizer, 63 | train_corpus=self.train_data, 64 | dev_corpus=self.dev_data, 65 | tet_corpus=self.tet_data, 66 | config=self.config, 67 | logger=self.logger) 68 | # 加载训练好的模型 69 | if path_save and path_save.strip(): 70 | try: 71 | self.office.load_model_state(path_save) 72 | except Exception as e: 73 | self.logger.info(str(e)) 74 | self.office.load_model(path_save) 75 | # 训练 76 | self.office.train_model() 77 | 78 | def eval(self): 79 | """ 验证评估 """ 80 | try: 81 | self.office.load_model_state() 82 | except Exception as e: 83 | self.logger.info(str(e)) 84 | self.office.load_model() 85 | tet_results = self.office.evaluate("tet") 86 | return tet_results 87 | 88 | 89 | if __name__ == "__main__": 90 | # 适配linux 91 | import platform 92 | import sys 93 | import os 94 | path_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) 95 | sys.path.append(path_root) 96 | print(path_root) 97 | # 分类下的引入, pytorch_textclassification 98 | from trConfig import model_config 99 | from trTools import get_current_time 100 | 101 | # 预训练模型地址, 本地win10默认只跑2步就评估保存模型 102 | if platform.system().lower() == 'windows': 103 | pretrained_model_dir = "E:/DATA/bert-model/00_pytorch" 104 | # pretrained_model_dir = "E:/YXP/data/kg_points/training" 105 | evaluate_steps = 320 # 评估步数 106 | save_steps = 320 # 存储步数 107 | else: 108 | pretrained_model_dir = "/home/moyzh/pretrain_models/pytorch" 109 | evaluate_steps = 320 # 评估步数 110 | save_steps = 320 # 存储步数 111 | ee = 0 112 | # 训练-验证语料地址, 可以只输入训练地址 113 | path_corpus = path_root + "/corpus/text_regression/negative_sentence" 114 | path_train = os.path.join(path_corpus, "train.json") 115 | path_dev = os.path.join(path_corpus, "dev.json") 116 | model_config["evaluate_steps"] = evaluate_steps # 评估步数 117 | model_config["save_steps"] = save_steps # 存储步数 118 | model_config["path_train"] = path_train 119 | model_config["path_dev"] = path_dev 120 | # 损失函数类型, 121 | # multi-class: 可选 None(BCE), BCE, BCE_LOGITS, MSE, FOCAL_LOSS, DICE_LOSS, LABEL_SMOOTH 122 | # multi-label: SOFT_MARGIN_LOSS, PRIOR_MARGIN_LOSS, FOCAL_LOSS, CIRCLE_LOSS, DICE_LOSS等 123 | # model_config["loss_type"] = "SOFT_MARGIN_LOSS" 124 | model_config["loss_type"] = "MSE" 125 | model_config["active_type"] = "SIGMOID" 126 | # model_config["is_adv"] = True 30 0000 127 | model_config["len_rate"] = 0.001 128 | model_config["max_len"] = 512 129 | model_config["epochs"] = 21 130 | model_config["lr"] = 5e-5 131 | 132 | # 预训练模型适配的class 133 | model_type = ["BERT", "ERNIE", "BERT_WWM", "ALBERT", "ROBERTA", "XLNET", "ELECTRA"] 134 | pretrained_model_name_or_path = { 135 | "BERT_WWM": 
"hfl/chinese-bert-wwm-ext", 136 | "ROBERTA": "hfl/chinese-roberta-wwm-ext", 137 | "ALBERT": "uer/albert-base-chinese-cluecorpussmall", 138 | "XLNET": "hfl/chinese-xlnet-mid", 139 | "ERNIE": "nghuyong/ernie-1.0-base-zh", 140 | # "ERNIE": "nghuyong/ernie-3.0-base-zh", 141 | "BERT": "bert-base-chinese", 142 | # "BERT": "hfl/chinese-macbert-base", 143 | 144 | } 145 | idx = 1 # 选择的预训练模型类型---model_type 146 | model_config["pretrained_model_name_or_path"] = pretrained_model_name_or_path[model_type[idx]] 147 | # model_config["model_save_path"] = "../output/text_classification/model_{}".format(model_type[idx] + "_" + str(get_current_time())) 148 | model_config["model_save_path"] = "../output/text_regression/model_{}".format(model_type[idx]) 149 | model_config["model_type"] = model_type[idx] 150 | 151 | # os.environ["CUDA_VISIBLE_DEVICES"] = str(model_config["CUDA_VISIBLE_DEVICES"]) 152 | 153 | # main 154 | lc = TextRegression(model_config) 155 | lc.process() 156 | lc.train() 157 | 158 | 159 | # shell 160 | # nohup python tcRun.py > tc_multi_class.log 2>&1 & 161 | # tail -n 1000 -f tc_multi_class.log 162 | # |myz| 163 | 164 | -------------------------------------------------------------------------------- /test/tr/tet_tr_base_train.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2021/2/23 21:34 4 | # @author : Mo 5 | # @function: main programing, "训练时候logger不需要考虑" 6 | 7 | 8 | # 适配linux 9 | import platform 10 | import sys 11 | import os 12 | import gc 13 | path_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) 14 | path_sys = os.path.join(path_root, "pytorch_nlu", "pytorch_textregression") 15 | sys.path.append(path_sys) 16 | print(path_root) 17 | print(path_sys) 18 | from trConfig import model_config 19 | from trTools import get_logger 20 | from trOffice import Office 21 | from trData import Corpus 22 | 23 | from collections import Counter 24 | from argparse import Namespace 25 | import random 26 | 27 | 28 | class TextRegression: 29 | def __init__(self, config): 30 | self.config = Namespace(**config) 31 | self.logger = get_logger(self.config.model_save_path) 32 | 33 | def process(self): 34 | """ 数据预处理, process """ 35 | # 数据读取 36 | self.corpus = Corpus(self.config, self.logger) 37 | # 训练集/验证集划分 38 | if self.config.path_dev: 39 | xs_dev, ys_dev = self.corpus.read_corpus_from_json(self.config.path_dev, keys=self.config.xy_keys, len_rate=self.config.len_rate) 40 | xs_train, ys_train = self.corpus.read_corpus_from_json(self.config.path_train, keys=self.config.xy_keys, len_rate=self.config.len_rate) 41 | else: # 没有验证集的时候, 默认划分 4:1 42 | xs_train, ys_train = self.corpus.read_corpus_from_json(self.config.path_train, keys=self.config.xy_keys, len_rate=self.config.len_rate) 43 | len_rate_8 = int(len(ys_train) * 0.8) 44 | xs_dev, ys_dev = xs_train[len_rate_8:], ys_train[len_rate_8:] 45 | xs_train, ys_train = xs_train[:len_rate_8], ys_train[:len_rate_8] 46 | self.logger.info("read_corpus_from_json ok!") 47 | # 参数更新 48 | self.config.len_corpus = len(ys_train) + len(ys_dev) 49 | self.config.num_labels = len(ys_train[0]) 50 | self.config.max_len = self.corpus.len_max 51 | # token 转 idx, 训练集/验证集 52 | random.shuffle(xs_train) # shuffle扰动 53 | # xs_train = xs_train[:int(len(xs_train)*0.18*2)] ### len_rate 54 | self.train_data = self.corpus.preprocess(xs_train, max_len=self.config.max_len) 55 | self.dev_data = self.corpus.preprocess(xs_dev, max_len=self.config.max_len) 56 | # 测试集 57 | xs_tet, 
ys_tet = self.corpus.read_corpus_from_json(self.config.path_tet, keys=self.config.xy_keys, len_rate=self.config.len_rate) if self.config.path_tet else ([], []) 58 | self.tet_data = self.corpus.preprocess(xs_tet, max_len=self.config.max_len) if self.config.path_tet else None 59 | self.logger.info("self.corpus.preprocess ok!") 60 | 61 | def train(self, path_save=None): 62 | """ 初始化训练 """ 63 | # 创建模型目录与储存超参信息 64 | if not os.path.exists(self.config.model_save_path): 65 | os.makedirs(self.config.model_save_path, exist_ok=True) 66 | # 训练 67 | self.office = Office(tokenizer=self.corpus.tokenizer, 68 | train_corpus=self.train_data, 69 | dev_corpus=self.dev_data, 70 | tet_corpus=self.tet_data, 71 | config=self.config, 72 | logger=self.logger) 73 | # 加载训练好的模型 74 | if path_save and path_save.strip(): 75 | try: 76 | self.office.load_model_state(path_save) 77 | except Exception as e: 78 | self.logger.info(str(e)) 79 | self.office.load_model(path_save) 80 | # 训练 81 | self.office.train_model() 82 | 83 | def eval(self): 84 | """ 验证评估 """ 85 | try: 86 | self.office.load_model_state() 87 | except Exception as e: 88 | self.logger.info(str(e)) 89 | self.office.load_model() 90 | tet_results = self.office.evaluate("tet") 91 | return tet_results 92 | 93 | 94 | if __name__ == "__main__": 95 | # 预训练模型地址, 本地win10默认只跑2步就评估保存模型 96 | if platform.system().lower() == 'windows': 97 | pretrained_model_dir = "E:/DATA/bert-model/00_pytorch" 98 | # pretrained_model_dir = "E:/YXP/data/kg_points/training" 99 | evaluate_steps = 320 # 评估步数 100 | save_steps = 320 # 存储步数 101 | else: 102 | pretrained_model_dir = "/home/moyzh/pretrain_models/pytorch" 103 | evaluate_steps = 320 # 评估步数 104 | save_steps = 320 # 存储步数 105 | ee = 0 106 | # 训练-验证语料地址, 可以只输入训练地址 107 | path_corpus = path_root + "/pytorch_nlu/corpus/text_regression/negative_sentence" 108 | path_train = os.path.join(path_corpus, "train.json") 109 | path_dev = os.path.join(path_corpus, "dev.json") 110 | model_config["evaluate_steps"] = evaluate_steps # 评估步数 111 | model_config["save_steps"] = save_steps # 存储步数 112 | model_config["path_train"] = path_train 113 | model_config["path_dev"] = path_dev 114 | # 损失函数类型, 115 | # multi-class: 可选 None(BCE), BCE, BCE_LOGITS, MSE, FOCAL_LOSS, DICE_LOSS, LABEL_SMOOTH 116 | # multi-label: SOFT_MARGIN_LOSS, PRIOR_MARGIN_LOSS, FOCAL_LOSS, CIRCLE_LOSS, DICE_LOSS等 117 | # model_config["loss_type"] = "SOFT_MARGIN_LOSS" 118 | model_config["loss_type"] = "MSE" 119 | model_config["active_type"] = "SIGMOID" 120 | # model_config["is_adv"] = True 30 0000 121 | model_config["len_rate"] = 0.001 122 | model_config["max_len"] = 512 123 | model_config["epochs"] = 21 124 | model_config["lr"] = 5e-5 125 | model_config["warmup_steps"] = 2 126 | model_config["batch_size"] = 16 # 16 # 32 127 | 128 | # 预训练模型适配的class 129 | model_type = ["BERT", "ERNIE", "BERT_WWM", "ALBERT", "ROBERTA", "XLNET", "ELECTRA"] 130 | pretrained_model_name_or_path = { 131 | "BERT_WWM": pretrained_model_dir + "/chinese_wwm_pytorch", 132 | "ROBERTA": pretrained_model_dir + "/chinese_roberta_wwm_ext_pytorch", 133 | "ALBERT": pretrained_model_dir + "/albert_base_v1", 134 | "XLNET": pretrained_model_dir + "/chinese_xlnet_mid_pytorch", 135 | # "ERNIE": pretrained_model_dir + "/ERNIE_stable-1.0.1-pytorch", 136 | "ERNIE": pretrained_model_dir + "/ernie-tiny", 137 | "BERT": pretrained_model_dir + "/bert-base-chinese", 138 | # "BERT": pretrained_model_dir + "/mengzi-bert-base/", 139 | } 140 | idx = 1 # 选择的预训练模型类型---model_type 141 | model_config["pretrained_model_name_or_path"] = 
pretrained_model_name_or_path[model_type[idx]] 142 | # model_config["model_save_path"] = "../output/text_classification/model_{}".format(model_type[idx] + "_" + str(get_current_time())) 143 | model_config["model_save_path"] = "../output/text_regression/model_{}".format(model_type[idx]) 144 | model_config["model_type"] = model_type[idx] 145 | 146 | # os.environ["CUDA_VISIBLE_DEVICES"] = str(model_config["CUDA_VISIBLE_DEVICES"]) 147 | 148 | # main 149 | lc = TextRegression(model_config) 150 | lc.process() 151 | lc.train() 152 | 153 | 154 | # shell 155 | # nohup python tcRun.py > tc_multi_class.log 2>&1 & 156 | # tail -n 1000 -f tc_multi_class.log 157 | # |myz| 158 | 159 | -------------------------------------------------------------------------------- /pytorch_nlu/corpus/text_classification/tnews/dev.json: -------------------------------------------------------------------------------- 1 | {"label": "finance", "text": "新榨季糖料种植形势乐观"} 2 | {"label": "politics", "text": "最高检:深挖黑恶势力保护伞杜绝有案不查"} 3 | {"label": "politics", "text": "基地组织网上发表声明证实本-拉登已死亡"} 4 | {"label": "game", "text": "《棒棒堂》迎春节多送10%金币活动"} 5 | {"label": "finance", "text": "天然橡胶:七年一梦 牛去熊来"} 6 | {"label": "finance", "text": "开放式基金哑铃配置 侧重选股能力"} 7 | {"label": "society", "text": "3名村民用猪肉磨成粉造假胎盘被擒"} 8 | {"label": "sports", "text": "好帽被吹阿联急得转圈跺脚 末节仅歇2分钟彰显地位"} 9 | {"label": "education", "text": "北京高考志愿开始填报 考试院专家提醒"} 10 | {"label": "politics", "text": "救援人员找到阿富汗载43人失踪客机机尾"} 11 | {"label": "stocks", "text": "汇丰升中铝评级至增持 目标价维持8.5港元"} 12 | {"label": "entertainment", "text": "李光洁:宝贝计划不想过早透露(图)"} 13 | {"label": "society", "text": "组图:杭州举行万人相亲大会"} 14 | {"label": "sports", "text": "欧洲围棋大会开幕式举行 气氛火爆“引发”火警"} 15 | {"label": "realty", "text": "房山中铁原香小镇均价14000元96折优惠"} 16 | {"label": "politics", "text": "驻俄大使:中国并不谋求在俄建立华人聚居区"} 17 | {"label": "education", "text": "江苏16日填报体艺类公办本科征求志愿"} 18 | {"label": "society", "text": "变电站设备房莫名下陷30厘米成危房(图)"} 19 | {"label": "realty", "text": "西三环唐宁府在售188平起现房均价28000元(图)"} 20 | {"label": "science", "text": "传红筹股回归A股进程暂停"} 21 | {"label": "game", "text": "《超级街头霸王4:街机版2012》公布"} 22 | {"label": "sports", "text": "特鲁西埃中国首次被喊下课 被曝刚愎自用搞乱球队"} 23 | {"label": "sports", "text": "切尔西魔兽战红军竟是带病登场 非洲杯期间已染怪病?"} 24 | {"label": "science", "text": "30倍光学变焦 索尼HX100上市后首降价"} 25 | {"label": "society", "text": "数十名居民阻止街道改建幼儿园(图)"} 26 | {"label": "stocks", "text": "银行保险利空消化还需时日"} 27 | {"label": "education", "text": "2010年高校招生全国统一考试大纲:物理"} 28 | {"label": "finance", "text": "结构当先商业银行草绘信贷新年图谱"} 29 | {"label": "realty", "text": "姚良松:从家居的角度看房地产发展趋势"} 30 | {"label": "science", "text": "黑客盯上世界杯"} 31 | {"label": "game", "text": "新浪UT首创“网游语音接口”概念"} 32 | {"label": "entertainment", "text": "胡静曾与老公签分手合同 恋爱三个月后就分手"} 33 | {"label": "science", "text": "i3芯5470独显 戴尔4G内存思跃15本促销"} 34 | {"label": "game", "text": "《战锤:战争黎明2》1.4补丁测试展开"} 35 | {"label": "stocks", "text": "港股认股证总成交金额28.73亿元 占大市17.65%"} 36 | {"label": "science", "text": "入门机再降300 索尼液晶欲破3000元"} 37 | {"label": "game", "text": "传黑客彻底攻陷PSN 索尼偷窥用户信息"} 38 | {"label": "finance", "text": "《信托公司参与股指期货交易业务指引》全文"} 39 | {"label": "education", "text": "盘点:那些最有“钱途”的留学专业"} 40 | {"label": "society", "text": "男子谎称有房产低价出售诈骗11万"} 41 | {"label": "politics", "text": "吴伯雄发表卸任感言:无怨无悔支持马主席到底"} 42 | {"label": "sports", "text": "两个秘密造就意甲最大黑马 200万投入惊成就第一防线"} 43 | {"label": "game", "text": "《幻想i时代》快来寻找你的水晶恋人"} 44 | {"label": "science", "text": "中电信推两款千元Android手机"} 45 | {"label": "science", "text": "独家专访梦工厂裘新:说我要另起炉灶纯属想象"} 46 | {"label": "realty", "text": "燕郊东方美墅一期售完二期4月中下旬开盘(图)"} 47 | {"label": "science", "text": "华东代理商携百余员工进入谷歌上海办公室抗议"} 48 
| {"label": "entertainment", "text": "何丽全回应暂停四台联颁大奖 称决定过于仓促"} 49 | {"label": "society", "text": "男子发微博求租飞机转运患肺脑水肿驴友"} 50 | {"label": "society", "text": "组图:戒毒所内的特殊婚礼"} 51 | {"label": "society", "text": "美院教授30幅名画疑被儿子女友盗走(图)"} 52 | {"label": "education", "text": "你问我来答:怎样才能静心读书"} 53 | {"label": "science", "text": "中电信计划向C网三年投资800亿"} 54 | {"label": "sports", "text": "篮网3少跻身新秀榜前30 DDM战术让洛佩斯必须升级"} 55 | {"label": "sports", "text": "秘史曝光劳尔竟是皇马改革阻力 巨鳄展蓝图不惜清洗他"} 56 | {"label": "science", "text": "国内最大货到付款快递网落户当当"} 57 | {"label": "finance", "text": "节后首批新基金集中放行"} 58 | {"label": "realty", "text": "中房协副会长提醒开发商不要助推楼市涨价"} 59 | {"label": "science", "text": "买就送好礼 佳能EF镜头精彩促销不断"} 60 | {"label": "realty", "text": "顺义moma万万树独栋别墅尾房98折预计8月推新盘"} 61 | {"label": "society", "text": "探访南科大首位准大学生:不想做官想当科学家"} 62 | {"label": "game", "text": "斗神推上帝之眼 称团战系统或更完善"} 63 | {"label": "finance", "text": "债基深陷破发门 浮亏拖累业绩"} 64 | {"label": "realty", "text": "顺义北京苏活65-140平米精装1-3居可享97折"} 65 | {"label": "entertainment", "text": "《筑梦2008》代表中国内地“申奥”(图)"} 66 | {"label": "game", "text": "《九州OL》暗影魔踪即将盛大开启"} 67 | {"label": "game", "text": "中央政治局国务委员刘延东视察网龙"} 68 | {"label": "finance", "text": "芝加哥农产品期价18日继续大涨"} 69 | {"label": "sports", "text": "街球王负伤弗朗西斯得机会 挑战掘金欲当突击队长"} 70 | {"label": "science", "text": "索尼延期发布火爆游戏GT赛车5"} 71 | {"label": "education", "text": "四川外语学院2010年招生章程"} 72 | {"label": "science", "text": "国际电子巨头集体裁员应对亏损"} 73 | {"label": "realty", "text": "香河人居建设之路发展论坛实录4"} 74 | {"label": "stocks", "text": "跌停板上明目张胆的利益输送何时休"} 75 | {"label": "politics", "text": "专家称北约撤军后利比亚关注点为治安问题"} 76 | {"label": "science", "text": "中央提出组建国家级广电网络公司"} 77 | {"label": "finance", "text": "细剖私募基金绩效 证券类私募信托收益呈橄榄形"} 78 | {"label": "science", "text": "微软紧急发布SMBv2安全漏洞补丁"} 79 | {"label": "politics", "text": "蒙古民主党宣称赢得总统大选"} 80 | {"label": "finance", "text": "郑白糖雄居去年全球商品期货交易量老大"} 81 | {"label": "sports", "text": "图文-F1西班牙站红牛头排 舒马赫第六算不算凯旋?"} 82 | {"label": "science", "text": "摩根大通下调金融界评级至中性"} 83 | {"label": "game", "text": "ASUS2011春季赛新增添Dota项目"} 84 | {"label": "education", "text": "复旦学生陈剑首位被MIT双学位录取"} 85 | {"label": "science", "text": "英特尔32nm时代带来了什么"} 86 | {"label": "game", "text": "《QQ炫舞》浪漫盛典深圳站精彩回顾"} 87 | {"label": "sports", "text": "图文-[季后赛]公牛85-96热火 波什跃身投篮"} 88 | {"label": "stocks", "text": "盘前:美股指期货小幅下滑"} 89 | {"label": "game", "text": "Guangmo复出成解说 与Miss搭档"} 90 | {"label": "stocks", "text": "法巴上调富力目标价"} 91 | {"label": "stocks", "text": "大盘面临压力 个股机会涌现"} 92 | {"label": "science", "text": "史玉柱做网络游戏"} 93 | {"label": "politics", "text": "巴勒斯坦拟于明年1月前举行大选"} 94 | {"label": "stocks", "text": "又一内幕交易人落网 证监会曝光细节处以30万罚款"} 95 | {"label": "realty", "text": "香河雀禧64平起1-2居板楼均价6300享98折(图)"} 96 | {"label": "entertainment", "text": "组图:格温-史蒂芬尼携子外出 七彩羽绒服抢镜"} 97 | {"label": "game", "text": "《口袋精灵》浪漫七夕盛宴余味犹存"} 98 | {"label": "education", "text": "考研历程:我知道这是一条艰辛的路"} 99 | {"label": "education", "text": "河南公务员面试27日举行 20分钟当场出结果"} 100 | {"label": "politics", "text": "联合国前雇员咬伤保安拒绝认罪"} 101 | {"label": "entertainment", "text": "布兰妮开唱裙摆飞扬 骑男舞伴身上姿势撩逗(图)"} 102 | {"label": "game", "text": "《魔兽争霸4》消息:游戏是肯定会有"} 103 | {"label": "stocks", "text": "申万看好高端装备制造等六行业"} 104 | {"label": "entertainment", "text": "黄晓明过年行慈善 帮百户玉树灾民越冬"} 105 | {"label": "stocks", "text": "佣金大战溃败 东北证券领跌A股"} 106 | {"label": "education", "text": "新GRE考试费用“半价优惠”"} 107 | {"label": "education", "text": "江苏南京09年1月自学考试成绩查询时间"} 108 | {"label": "education", "text": "2011年广东高考提前批录取6日开始"} 109 | {"label": "politics", "text": "古巴版维基百科称美国为当代帝国(图)"} 110 | {"label": "realty", "text": "地产板块成昨日抗跌英雄9月以来涨幅称冠"} 111 | {"label": "sports", 
"text": "巴萨青训再出三大妖童惊西甲 队长未来接班人位列其中"} 112 | {"label": "realty", "text": "华熙乐茂:打造京西全家人的购物中心(组图)"} 113 | {"label": "politics", "text": "台海军官学校17名同学涉嫌非法签赌"} 114 | {"label": "game", "text": "回合网游领头羊《天之翼OL》再战市场"} 115 | {"label": "science", "text": "酷黑隐型键设计 三星U490下月将上市"} 116 | {"label": "realty", "text": "城市综合体的前世今生"} 117 | {"label": "society", "text": "生殖器官畸形者被施暴 到医院确认自己系女孩"} 118 | {"label": "realty", "text": "世茂系完成家族分工 世茂股份再次增发"} 119 | {"label": "entertainment", "text": "黄晓明力挺Angelababy 李菲儿一怒斩发丝(图)"} 120 | {"label": "politics", "text": "美国鸟类专家称拉登藏身巴基斯坦边境房屋内"} 121 | {"label": "education", "text": "名师辅导:高考作文之记叙文三宗宝(图)"} 122 | {"label": "realty", "text": "昌平金隅万科城二期三居28日开盘均价15000(图)"} 123 | {"label": "society", "text": "贵州村官数十年坚守贫瘠山村感动总理(图)"} 124 | {"label": "stocks", "text": "救市新计划 英央行可能几周后开始收购银行资产"} 125 | {"label": "education", "text": "“异地高考” 关键是要推进录取公平"} 126 | {"label": "politics", "text": "世卫组织称津巴布韦霍乱已致3028人死亡"} 127 | {"label": "society", "text": "女子拿三把刀挟持药店营业员索100片安定(图)"} 128 | {"label": "entertainment", "text": "林志颖将发新EP开个唱 经营哲学自有一套(组图)"} 129 | {"label": "education", "text": "高考是否应改到周末 教育部将调研"} 130 | {"label": "sports", "text": "传蔡斌对裁判不满离场抗议:只希望比赛有公平环境"} 131 | {"label": "stocks", "text": "金融股红旗飘扬 沪指翻红"} 132 | {"label": "science", "text": "佳能新旗舰降价 一周数码相机降价排行"} 133 | -------------------------------------------------------------------------------- /pytorch_nlu/corpus/text_classification/tnews/train.json: -------------------------------------------------------------------------------- 1 | {"label": "politics", "text": "吴邦国启程出访东南亚三国"} 2 | {"label": "society", "text": "女孩多次在公交车上拍摄小偷作案(组图)"} 3 | {"label": "finance", "text": "外盘玩突袭 国内期市将迎虎头"} 4 | {"label": "politics", "text": "媒体披露克林顿女儿婚礼细节(图)"} 5 | {"label": "science", "text": "新浪搜狐不满腾讯霸道 欲退出互联网大会"} 6 | {"label": "finance", "text": "多空分歧巨大 期市三品种成交破百万手"} 7 | {"label": "sports", "text": "古力金志锡惊天一战回顾 几度劫争惊险逆转(多谱)"} 8 | {"label": "sports", "text": "公告-2011年中高协第三期初级教练员考试成绩表"} 9 | {"label": "education", "text": "2010年高考地理试题及答案(海南卷)"} 10 | {"label": "education", "text": "谁说美国学生“整天玩就能上大学”?"} 11 | {"label": "entertainment", "text": "组图:徐若�新片母亲形象曝光 为电影放弃代言"} 12 | {"label": "society", "text": "水库管理员未救溺水偷鱼者获缓刑"} 13 | {"label": "sports", "text": "马丁:一想到搭档姚就很开心 火箭下赛季值得期待"} 14 | {"label": "politics", "text": "日本静冈县知事为让钉子户砍树而辞职"} 15 | {"label": "sports", "text": "雷吉米勒火箭可进总决赛 詹姆斯包揽MVP和得分王"} 16 | {"label": "realty", "text": "朝阳海棠公社230-300跃层98折一层带50平花园"} 17 | {"label": "realty", "text": "华润橡树湾亿元级全精装景观示范区5月磅礴问世"} 18 | {"label": "politics", "text": "外交部提醒在英同胞注意出行安全"} 19 | {"label": "sports", "text": "朱挺:我不知道自己能不能留下来 期待幸运降临"} 20 | {"label": "education", "text": "热议:高考试题猜想究竟卖的什么药"} 21 | {"label": "entertainment", "text": "白玉兰奖最佳导演提名揭晓 徐纪周赵宝刚争奖"} 22 | {"label": "politics", "text": "两名被疑为西班牙埃塔组织头目者在法国落网"} 23 | {"label": "sports", "text": "阿森纳王牌70码奔袭如梅西 他敢抨击绝对大佬狗屎不如"} 24 | {"label": "education", "text": "中国赴美移民总数居第二 第一代移民已近217万"} 25 | {"label": "game", "text": "大众软件:西雅图夜 雪乐山巨人谢幕"} 26 | {"label": "finance", "text": "全球铂金市场三年来首度供应过剩"} 27 | {"label": "science", "text": "好乐买携品牌商推“半价封顶促销”"} 28 | {"label": "society", "text": "二人转演员男扮女装勾引男子抢劫"} 29 | {"label": "game", "text": "角色pk场英雄无敌在线穿越的策略对战"} 30 | {"label": "society", "text": "回家一开门撞见两小偷 两名男子奋力逮到一贼"} 31 | {"label": "education", "text": "北京09年6月1-26日中高考期间禁夜间施工"} 32 | {"label": "politics", "text": "菲律宾工厂爆炸致12人死亡7人受伤"} 33 | {"label": "stocks", "text": "快讯:两市保持窄幅震荡 成交量再度萎缩"} 34 | {"label": "education", "text": "2009年国家公务员部分部门面试真题汇总"} 35 | {"label": "stocks", "text": "盘前:外围市场普涨 美股期指上扬"} 36 | {"label": "science", "text": 
"堕落还是亲民!徕卡lux20跌破4K"} 37 | {"label": "sports", "text": "图文-[CBA]北京108-94胜佛山 马布里表情淡定"} 38 | {"label": "realty", "text": "深圳盐田现天价房:一套房每平米高达81893元"} 39 | {"label": "sports", "text": "半场技术统计-葡萄牙压倒性优势 射门次数16比2"} 40 | {"label": "education", "text": "重庆市昨日气温40度 中小学今起可放假(图)"} 41 | {"label": "education", "text": "2010年考研翻译长难句结构分析全攻略"} 42 | {"label": "game", "text": "一个人的反恐战争 《战地3》详细评析"} 43 | {"label": "sports", "text": "欧足联宣称意大利只是冰山一角 亚平宁之外更有阴暗角"} 44 | {"label": "finance", "text": "铜价上扬成谈判阻力 年铜加工费长协价或降3成"} 45 | {"label": "society", "text": "司机撞死路边的哥后跪求死者家属原谅"} 46 | {"label": "politics", "text": "伊朗宣布在核计划上取得一系列新进展"} 47 | {"label": "society", "text": "夫妇因为孩子取名起争执险些离婚"} 48 | {"label": "game", "text": "大作集中发布韩游三大厂商Q1财报纵览"} 49 | {"label": "science", "text": "1200W像素3寸大屏 爱国者T1260售1190"} 50 | {"label": "sports", "text": "C罗重返皇马首次触球训练 透露复出时间有望出战巴萨"} 51 | {"label": "game", "text": "《诸侯OL》内测乱世降临全解析"} 52 | {"label": "realty", "text": "永泰地产荣获2010中国房地产百强企业称号(组图)"} 53 | {"label": "realty", "text": "通州北京ONE精装97平起2-3居在售起价28000元"} 54 | {"label": "entertainment", "text": "林莉顶替Rosemary走秀 邓达智称不会影响合作"} 55 | {"label": "game", "text": "《倩女幽魂》庆五一 经验金钱大派送"} 56 | {"label": "politics", "text": "沙特要求教长缩短讲经时间防甲流传播"} 57 | {"label": "science", "text": "超炫电容屏 诺基亚S60音乐X6行货评测"} 58 | {"label": "stocks", "text": "合生创展完成高端地产布局"} 59 | {"label": "finance", "text": "基金调研探秘 3000点之上的挖股角逐"} 60 | {"label": "realty", "text": "海淀西山壹号院通透大户型4-5居现房在售(图)"} 61 | {"label": "stocks", "text": "第一资本金融集团90亿美元收购ING美国网银业务"} 62 | {"label": "entertainment", "text": "新《红楼梦》王熙凤戏里颠狂 姚笛演到精疲力尽"} 63 | {"label": "realty", "text": "大兴绿城阳光公馆60-120平米1-3居在售(图)"} 64 | {"label": "finance", "text": "美元回落 商品涨势欲卷土重来"} 65 | {"label": "society", "text": "男子驾摩托车撞人潜逃五年被抓获"} 66 | {"label": "society", "text": "男子因不满女友提出分手驾车将其撞伤"} 67 | {"label": "education", "text": "南方科大领军教授年薪高达百万 尝试自主招生"} 68 | {"label": "science", "text": "柯达向数码转型完成80%"} 69 | {"label": "finance", "text": "解除外方投资顾问 QDII基金首现单飞"} 70 | {"label": "politics", "text": "印尼罗肯火山今日两次喷发"} 71 | {"label": "finance", "text": "年末发行压轴戏 四大基金公司同台竞技"} 72 | {"label": "politics", "text": "陈水扁案宣判在即 绿营动作频频欲声援"} 73 | {"label": "stocks", "text": "港股认股证成交额38.39亿元 占大市11.76%"} 74 | {"label": "realty", "text": "朝阳CROSS朗廷50平0居尾房在售均价24000(图)"} 75 | {"label": "politics", "text": "智利圣何塞铜矿第12名被困矿工升井"} 76 | {"label": "politics", "text": "菲律宾南部连发3次7.3至7.4级地震"} 77 | {"label": "politics", "text": "司法部提出2010年反腐六项重点工作"} 78 | {"label": "education", "text": "2009年黑龙江省本科专业就业率“座次”排定"} 79 | {"label": "education", "text": "北京“穿越式”高考题难倒考生"} 80 | {"label": "finance", "text": "供求关系难改 PTA后市或近强远弱"} 81 | {"label": "stocks", "text": "外围股市的调整影响指数短线震荡"} 82 | {"label": "game", "text": "2010年NDS平台十大期待游戏来袭"} 83 | {"label": "stocks", "text": "重磅利空再袭来 下跌大盘再迎考验"} 84 | {"label": "finance", "text": "泰信蓝筹精选股票基金分红 每10份派0.5元"} 85 | {"label": "finance", "text": "古今通宝是骗子还是创新者"} 86 | {"label": "realty", "text": "顺义香悦四季95平起2-3居在售均价18000元"} 87 | {"label": "finance", "text": "美元反弹纽约油价中止连阳"} 88 | {"label": "society", "text": "高中课堂讲授生殖避孕知识"} 89 | {"label": "society", "text": "南京一名副教授组织聚众淫乱活动被起诉"} 90 | {"label": "realty", "text": "不知道未来楼市政策走向 消费者出现恐慌性购房"} 91 | {"label": "science", "text": "微软发Mango升级版本 4周完成98%用户升级"} 92 | {"label": "stocks", "text": "66亿资金外逃 主力换仓哪些股"} 93 | {"label": "realty", "text": "顺义成熟社区京汉铂寓周边交通配套解析"} 94 | {"label": "stocks", "text": "股市投资:别让自己陷在小巷里"} 95 | {"label": "sports", "text": "直击:皇帝上篮似人肉坦克 热火单节完成19分逆转"} 96 | {"label": "realty", "text": "内地楼市强劲反弹 港人投资热情再度升温"} 97 | {"label": "stocks", "text": "缩量调整 股指今年仍有望新高"} 98 | {"label": "realty", "text": "易宪容等三专家热议 
房地产市场泡沫到底大不大"} 99 | {"label": "society", "text": "陕西宝鸡男子患“巨舌症” 21年舌头长到25厘米"} 100 | {"label": "sports", "text": "5场6球勒沃库森新射手崛起 加盟三个月就获得大单合同"} 101 | {"label": "stocks", "text": "美国传奇投资家罗杰斯:伯南克不懂经济学"} 102 | {"label": "stocks", "text": "电子器件板块走高 立讯精密涨停"} 103 | {"label": "stocks", "text": "华谊兄弟:解禁利空逐步消化 关注对外业务拓展"} 104 | {"label": "politics", "text": "G20抗议活动中死亡男子疑遭警察推挤致死"} 105 | {"label": "entertainment", "text": "卫兰Facebook自曝喜讯 黎明称其跟上帝订婚"} 106 | {"label": "education", "text": "学历文凭和自考文凭到底有什么区别"} 107 | {"label": "sports", "text": "罗斯砍33分却绝杀失手 山猫火力虚弱仍爆冷胜公牛"} 108 | {"label": "realty", "text": "通州中建雅颂居两限房85平米起均价6000元(图)"} 109 | {"label": "stocks", "text": "三巨头340亿美元援助或打对折"} 110 | {"label": "game", "text": "《灵兽世界》两大卡通主题副本上线"} 111 | {"label": "sports", "text": "生存压力把他逼成篮板狂徒 大卫李雷霆是勇士模板"} 112 | {"label": "entertainment", "text": "旭日阳刚将上兔年春晚 董洁潘粤明同台秀团圆?"} 113 | {"label": "realty", "text": "保利地产“和乐中国”北京赛区12强出炉"} 114 | {"label": "society", "text": "女孩不堪辅导班男生骚扰欲买刀自卫"} 115 | {"label": "education", "text": "珠海2010上半年自考考务考籍工作通知"} 116 | {"label": "society", "text": "初一女孩被扎数百针眼续:确认8名男生对其殴打"} 117 | {"label": "game", "text": "动视暴雪CEO:希望PC取代游戏主机"} 118 | {"label": "sports", "text": "高洪波:集训越长对国足参赛越有利 不该是横征暴敛"} 119 | {"label": "society", "text": "犯罪团伙设计捉奸陷阱敲诈2万被擒"} 120 | {"label": "society", "text": "中学老师被学生及家长打昏续多位老师曾遭殴打"} 121 | {"label": "sports", "text": "夺冠一战验出御林军成色 工体登顶成就北京真爷们"} 122 | {"label": "sports", "text": "宏远恒强的秘密:不职业的CBA成就广东队一家独大"} 123 | {"label": "game", "text": "《梦幻天劫》4月7日更新公告"} 124 | {"label": "realty", "text": "房山城铁边上的家93平米2居现房在售均价4200"} 125 | {"label": "science", "text": "806MHz处理器 中恒GPS智能手机S90评测"} 126 | {"label": "politics", "text": "台当局新闻主管部门称将开放大陆歌手赴台"} 127 | {"label": "game", "text": "创意总监爆料《细胞分裂5》将拍电影"} 128 | {"label": "realty", "text": "润泽地产深耕十年 润泽公馆新品样板间正式亮相"} 129 | {"label": "society", "text": "12岁男孩被轿车撞飞后遭大客车碾压而死"} 130 | {"label": "sports", "text": "济州主帅变低调未戴两球领带 盼和泰达一起出线"} 131 | {"label": "society", "text": "拆迁房70岁老人遭外来人员围殴成脑震荡"} 132 | {"label": "entertainment", "text": "张庭人工受孕成功挺大肚坐轮椅做妇检(图)"} 133 | --------------------------------------------------------------------------------
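
As a minimal, self-contained sketch of how the `(config, tokenizer, model)` triples registered in `PRETRAINED_MODEL_CLASSES` (trConfig.py) get resolved at load time, roughly mirroring `TRGraph.__init__` in trGraph.py; the `load_backbone` helper and the `bert-base-chinese` checkpoint are illustrative assumptions, not part of the library:

```python
# Sketch only: resolve a pretrained backbone the way TRGraph.__init__ does,
# outside the library. load_backbone is a hypothetical helper; any local or
# HuggingFace-hub checkpoint path can be substituted.
from transformers import BertConfig, BertTokenizer, BertModel

PRETRAINED_MODEL_CLASSES = {
    "BERT": (BertConfig, BertTokenizer, BertModel),  # subset of the trConfig.py mapping
}

def load_backbone(model_type, pretrained_model_name_or_path):
    config_class, tokenizer_class, model_class = PRETRAINED_MODEL_CLASSES[model_type]
    # config first; trGraph.py passes graph_config.output_hidden_states here,
    # simplified to True in this sketch
    config = config_class.from_pretrained(pretrained_model_name_or_path,
                                          output_hidden_states=True)
    # in the library the tokenizer is built by the data module, not by the graph
    tokenizer = tokenizer_class.from_pretrained(pretrained_model_name_or_path)
    # weights last, bound to the already-customized config
    model = model_class.from_pretrained(pretrained_model_name_or_path, config=config)
    return config, tokenizer, model

if __name__ == "__main__":
    config, tokenizer, model = load_backbone("BERT", "bert-base-chinese")
    print(type(config).__name__, type(tokenizer).__name__, type(model).__name__)
```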